In [34]:
from exiftool import *
import glob
import os
import shutil
import sys
import pandas as pd

### Set the Work directories

In [80]:
image_dir = "/mnt/c/Users/bhatta53/Documents/drive/OneDrive - Nanyang Technological University/personal/redmi_note_7_backup_2021_06/Images"
work_dir = "/mnt/c/Users/bhatta53/Documents/tmp_image/"
metadata_file = work_dir + 'metadata.pickle'

In [81]:
if not os.path.isdir(work_dir):
    os.mkdir(work_dir)
print(f"Work dir: {work_dir}")

# instantiate exiftool


# discover the files 
image_ext = ['jpg', 'jpeg', 'nef', 'png', 'cr2', 'pef']
all_metadata = list()
for ext in image_ext:
    search_pattern = image_dir + '**/*.' 
    for ch in ext: 
        up = ch.upper()
        low = ch.lower()
        search_pattern = search_pattern + '[' + low + up+']'
    print(search_pattern)
    files = glob.glob(search_pattern, recursive=True)
    print(f'ext {ext}: {len(files)}')
    
  
    if files != []:
        # get the metadata using exiftool
        with exiftool.ExifTool() as et:
            metadata = et.get_metadata_batch(files)
        all_metadata = all_metadata + metadata 

import pickle
with open(metadata_file, 'wb') as handle:
    pickle.dump(all_metadata, handle, protocol=pickle.HIGHEST_PROTOCOL)

Work dir: /mnt/c/Users/bhatta53/Documents/tmp_image/
/mnt/c/Users/bhatta53/Documents/drive/OneDrive - Nanyang Technological University/personal/redmi_note_7_backup_2021_06/Images**/*.[jJ][pP][gG]
ext jpg: 10566
/mnt/c/Users/bhatta53/Documents/drive/OneDrive - Nanyang Technological University/personal/redmi_note_7_backup_2021_06/Images**/*.[jJ][pP][eE][gG]
ext jpeg: 2404
/mnt/c/Users/bhatta53/Documents/drive/OneDrive - Nanyang Technological University/personal/redmi_note_7_backup_2021_06/Images**/*.[nN][eE][fF]
ext nef: 0
/mnt/c/Users/bhatta53/Documents/drive/OneDrive - Nanyang Technological University/personal/redmi_note_7_backup_2021_06/Images**/*.[pP][nN][gG]
ext png: 0
/mnt/c/Users/bhatta53/Documents/drive/OneDrive - Nanyang Technological University/personal/redmi_note_7_backup_2021_06/Images**/*.[cC][rR][22]
ext cr2: 0
/mnt/c/Users/bhatta53/Documents/drive/OneDrive - Nanyang Technological University/personal/redmi_note_7_backup_2021_06/Images**/*.[pP][eE][fF]
ext pef: 0


In [99]:
# create an in-memory dataframe 
import pandas as pd 
import datetime 
with open(metadata_file, 'rb') as handle:
    metadata = pickle.load(handle)

complete_data = dict()
meta_tags = ['filename', 'width', 'height', 'timestamp']
for d in meta_tags:
   complete_data[d] = [] 

for data in metadata:
    
    f = data["SourceFile"]
    if 'File:ImageHeight' in data.keys():
        height, width  = data['File:ImageHeight'], data['File:ImageWidth']
    else:
        height, width = 0,0
    
    
    if "EXIF:DateTimeOriginal" in data.keys():
        ts = data["EXIF:DateTimeOriginal"]
    elif 'EXIF:GPSTimeStamp' in data.keys():
        # 2016:07:30 18:50:27
        ts = '{} {}'.format(data['EXIF:GPSDateStamp'],data['EXIF:GPSTimeStamp'])
    else:
        ts = ''
    complete_data['filename'].append(f)
    complete_data['width'].append(width)
    complete_data['height'].append(height)
    complete_data['timestamp'].append(ts)

df = pd.DataFrame(complete_data)

# for index, row in df.iterrows():
#     print(row['timestamp'])
    

def event_detector(df, threshold, start_date, end_date, time_dir, dummy=True):
    ''' from the dataframe, extract events. 
        An event is a day which has >= threshold images. 
        If consecutive days have events, then they are merged into a single event.
        The files are copied to the time_dir. dummy should be set to False to actually
        copy the files.
    '''
    sorted_df = df.sort_values(['timestamp'])
    mask = (df['timestamp'] >= start_date) & (df['timestamp'] <= end_date)
    relevant_df = sorted_df.loc[mask]
    
    event_dict = dict()
    keys = ['start_date', 'end_date', 'event_count']
    for k in keys:
        event_dict[k] = []
    
    # lets find events
    # event = a day with >= threshold number of photos 
    # if consecutive days >= threshold, then merge 
    curr_event = None 
    event_start = None
    event_end = None 
    last_date_str = None 
    for index, row in relevant_df.iterrows():
        curr_ts = row['timestamp'][:row['timestamp'].find(' ')]     
        curr_date = datetime.datetime.strptime(curr_ts, '%Y:%m:%d')
        end_date  = curr_date + datetime.timedelta(days=1)
        
        curr_date_str = curr_date.strftime('%Y:%m:%d')
        end_date_str = end_date.strftime('%Y:%m:%d')
        
        if curr_date_str == last_date_str:
            continue 
        
        today_events = (df['timestamp'] >= curr_date_str) & (df['timestamp'] < end_date_str)
        today_df = relevant_df.loc[today_events]
#         print(curr_date_str, len(today_df))
        event_count = len(today_df)
        if event_count >= threshold:
            # is it a new event ?
            if curr_event == None:
                curr_event = curr_date_str
                event_start = curr_date_str
                event_end = end_date_str
            # is it part of an old event
            else:
                event_end = curr_date_str
                
        else: # there were no events today
            if curr_event != None:
                event_dict['start_date'].append(event_start) 
                event_dict['end_date'].append(event_end) 
                events = (df['timestamp'] >= event_start) & (df['timestamp'] < event_end)
                event_count = 0
                for v in events:
                    if v:
                        event_count = event_count+1
                event_dict['event_count'].append(event_count)
                # create an event dir in the year dir 
                event_dir = time_dir + event_start.replace(':', '_')
                if not os.path.isdir(event_dir):
                    os.mkdir(event_dir)
                event_dir = event_dir + '/'
                event_df = relevant_df.loc[events]
                fc_ = 0
                for index, row in event_df.iterrows():
                    file = row['filename']
                    dest_file = os.path.basename(file)
                    ext = dest_file[dest_file.rfind('.'):]
                    ts = row['timestamp']
                    if ' ' in ts:
                        ts = ts[:ts.find(' ')]
                        ts = ts.replace(':', '_')
                    fname = ts + f'_e{fc_}'+ ext
                    fc_ = fc_ + 1
                    dest = event_dir +  fname
                    if not dummy:
                        shutil.copy2(file, dest)
                        
                # drop these images 
                df.drop()
                    
                curr_event = None 
            # copy the current day files to the year dir 
            fc_ = 0
            for index, row in today_df.iterrows():
                file = row['filename']
                dest_file = os.path.basename(file)
                ext = dest_file[dest_file.rfind('.'):]
                name = curr_date_str.replace(':', '_')
                if ' ' in name:
                    name = name[:name.find(' ')]
                fname = name + f'_{fc_}'+ ext
                dest = time_dir +  fname
#                 print(dest)
                fc_ = fc_ + 1
                if not dummy:
                    shutil.copy2(file, dest)
            
        
        last_date_str = curr_date_str
    print(start_date, event_dict['event_count'], event_dict['start_date'], event_dict['end_date'])
    
threshold = 15  
for year in range(2016,2022):
    # create a directory inside the work directory if not already present
    year_dir = work_dir + f'{year}'
    if not os.path.isdir(year_dir):
        os.mkdir(year_dir)
    event_detector(df, threshold, f'{year}:01:01',  f'{year}:12:31', year_dir+'/')        
  

    

2016:01:01 [22, 37, 23, 19, 27, 31, 25] ['2016:07:30', '2016:08:20', '2016:10:09', '2016:10:22', '2016:10:24', '2016:11:06', '2016:11:08'] ['2016:07:31', '2016:08:21', '2016:10:10', '2016:10:23', '2016:10:25', '2016:11:07', '2016:11:09']
2017:01:01 [] [] []
2018:01:01 [16, 19, 15, 170, 17, 19] ['2018:08:11', '2018:08:24', '2018:09:02', '2018:10:15', '2018:10:24', '2018:10:29'] ['2018:08:12', '2018:08:25', '2018:09:03', '2018:10:19', '2018:10:25', '2018:10:30']
2019:01:01 [101, 22, 16, 16, 18, 19, 18] ['2019:03:03', '2019:03:07', '2019:03:12', '2019:04:22', '2019:05:26', '2019:08:28', '2019:09:21'] ['2019:03:05', '2019:03:08', '2019:03:13', '2019:04:23', '2019:05:27', '2019:08:29', '2019:09:22']
2020:01:01 [15, 35, 83] ['2020:07:19', '2020:09:12', '2020:09:26'] ['2020:07:20', '2020:09:13', '2020:09:27']
2021:01:01 [17, 76, 23, 37, 36, 15, 69, 15, 36, 15, 21] ['2021:01:16', '2021:01:31', '2021:02:21', '2021:02:26', '2021:03:13', '2021:03:17', '2021:03:19', '2021:03:25', '2021:04:03', '20