In [23]:
import datetime
import pandas as pd
import exifread
import os
from datetime import timedelta
import shutil
from concurrent.futures import ThreadPoolExecutor
import math
import pyfastcopy

def list_files_in_directory(directory):
    file_paths = []

    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.lower().endswith(('.jpg', '.jpeg','.JPG')):
                file_path = os.path.join(root, file)
                file_paths.append(file_path)
            else:
                continue
    return file_paths

def convert_datetime(dt):
    try:
        dt = pd.to_datetime(dt, format='%Y-%m-%d %H-%M-%S')
        return dt.strftime('%Y%m%d_%H%M%S')
    except pd.errors.OutOfBoundsDatetime:
        return 'OutOfBoundsDatetime'

def clean_path(path):
    return os.path.normpath(path)

def process_image(image_path):
    with open(image_path, 'rb') as image_file:
        tags = exifread.process_file(image_file, details=False)
        
        directory = os.path.dirname(image_path)
        filename = os.path.basename(image_path)
        filetype_extension = os.path.splitext(filename)[1]
        make = tags.get('Image Make', 'N/A')
        model = tags.get('Image Model', 'N/A')
        datetime_original = tags.get('EXIF DateTimeOriginal', 'N/A')
        
        return {
            'SourceFile': image_path,
            'Directory': directory,
            'FileName': filename,
            'FileTypeExtension': filetype_extension,
            'Make': make,
            'Model': model,
            'DateTimeOriginal': datetime_original
        }

def read_exif(image_dir):
    file_paths = list_files_in_directory(image_dir)
    print(len(file_paths))
    with ThreadPoolExecutor(20) as executor:  # Adjust max_workers as needed
        image_metadata_list = list(executor.map(process_image, file_paths))
    exif_info = pd.DataFrame(image_metadata_list)
    return exif_info

def create_new_filenames(exif_info):
    exif_info = exif_info[exif_info["DateTimeOriginal"] != "N/A"]
    exif_info['Station'] = Station
    exif_info['Camera'] = Camera
    exif_info['DateTimeOriginal'] = pd.to_datetime(exif_info['DateTimeOriginal'], format='%Y:%m:%d %H:%M:%S')
    exif_info['FormattedDateTime'] = exif_info['DateTimeOriginal'].apply(convert_datetime)
    exif_info = exif_info.sort_values(by=['Station', 'Camera', 'DateTimeOriginal']).reset_index(drop=True)
    exif_info['diff'] = exif_info.groupby(['Station', 'Camera'])['DateTimeOriginal'].diff()
    exif_info['image_number']=exif_info.groupby(['Station','Camera']).cumcount()+1
    exif_info['Directory'] = exif_info['Directory'].apply(clean_path)
    exif_info['SourceFile'] = exif_info['SourceFile'].apply(clean_path)
    exif_info['Dest_subfolder_number'] = exif_info['image_number'].apply(lambda x: math.ceil(x / 10000)).astype(str)
    exif_info['Dest_Directory'] = (dest_dir + "\\" + exif_info['Dest_subfolder_number']).apply(clean_path)

    ### Add sequence number
    threshold = timedelta(seconds=1)
    Sequence = []
    for i in range(len(exif_info)):
        diff = exif_info['diff'][i]
        if pd.isna(diff) or diff > threshold:
            sequence = 1
        else:
            sequence = Sequence[i - 1] + 1
        Sequence.append(sequence)
    exif_info['Sequence'] = Sequence

    ### Construct new filename
    exif_info['FileNameNew'] = exif_info['Station'] + '_' + exif_info['Camera'] + '_' + exif_info['FormattedDateTime'] + '(' + exif_info['Sequence'].astype(str) + ')' + exif_info['FileTypeExtension']
    exif_info['DestFile'] = (exif_info['Dest_Directory'] + "\\" + exif_info['FileNameNew']).apply(clean_path)
    
    #copy_dir = []
    #for i in range(len(exif_info)):
        #d = exif_info['Directory'][i]
        #c = os.path.join(dest_drive, "\\".join(d.split("\\")[1:]))
        #copy_dir.append(c)

    #exif_info['copy_dir'] = copy_dir
    #exif_info['SourceFileNew_copy'] = exif_info.apply(lambda row: os.path.join(row['copy_dir'], row['FileNameNew']), axis=1)
    return exif_info

def rename_images(table):
    source_path = table['SourceFile']
    target_path = table['DestFile']
    os.rename(source_path, target_path)

def copy_images(table):
    src_files = table['SourceFile']
    dest_files = table['DestFile']
    with ThreadPoolExecutor(10) as exe:
        _ = [exe.submit(shutil.copy, src_path, dest_path) for src_path,dest_path in zip(src_files,dest_files)]

def copy_images_batch(table, batch_size=1000):
    src_files=table['SourceFile']
    dest_files=table['DestFile']
    with ThreadPoolExecutor(20) as exe:
        for i in range(0, len(src_files), batch_size):
            src_batch = src_files[i:i + batch_size]
            dest_batch = dest_files[i:i + batch_size]
            
            batch_tasks = [exe.submit(shutil.copy, src, dest) for src, dest in zip(src_batch, dest_batch)]
            # Wait for all tasks in the batch to complete before proceeding to the next batch
            _ = [task.result() for task in batch_tasks]
            print(f"First {i+1 * 1000} images copied at {datetime.datetime.now()}")

In [24]:
camera_dir = r"I:\Guzzler_data\2023\05092023-06122023(not yet included in above CameraTrap folder)\Gajaimata\Gajaimata1"
dest_dir = os.path.join(r"I:\\Camera_Trapping\\","\\".join(camera_dir.split("\\")[1:]))
print(dest_dir)
Station = camera_dir.split("\\")[-2]
print(Station)
Camera = camera_dir.split("\\")[-1]
print(Camera)

I:\\Camera_Trapping\\Guzzler_data\2023\05092023-06122023(not yet included in above CameraTrap folder)\Gajaimata\Gajaimata1
Gajaimata
Gajaimata1


In [25]:
start = datetime.datetime.now()
###Create Renaming Table
exif = read_exif(camera_dir)
renaming_table=create_new_filenames(exif)
end = datetime.datetime.now()
print(end - start)

5953


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exif_info['Station'] = Station
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exif_info['Camera'] = Camera
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exif_info['DateTimeOriginal'] = pd.to_datetime(exif_info['DateTimeOriginal'], format='%Y:%m:%d %H:%M:%S')
A value is trying to be set on a copy o

0:00:04.905862


In [5]:
### Copy and rename in batches, based on renaming table
start = datetime.datetime.now()

unique_directories = set(renaming_table['Dest_Directory'])
for d in unique_directories:
    if not os.path.exists(d):
        os.makedirs(d)

copy_images_batch(renaming_table)

end = datetime.datetime.now()
print(end - start)

NameError: name 'renaming_table' is not defined

In [None]:
### Copy and rename based on renaming table
start = datetime.datetime.now()

unique_directories = set(renaming_table['Dest_Directory'])
for d in unique_directories:
    if not os.path.exists(d):
        os.makedirs(d)

copy_images(renaming_table)

end = datetime.datetime.now()
print(end - start)

First 1000 images copied at 2023-10-20 10:16:18.606086
First 2000 images copied at 2023-10-20 10:16:27.493457
First 3000 images copied at 2023-10-20 10:16:41.013794
First 4000 images copied at 2023-10-20 10:16:59.139294
First 5000 images copied at 2023-10-20 10:17:24.540477
First 6000 images copied at 2023-10-20 10:17:53.117812
First 7000 images copied at 2023-10-20 10:18:27.001459
First 8000 images copied at 2023-10-20 10:19:08.088751
First 9000 images copied at 2023-10-20 10:19:49.303451
First 10000 images copied at 2023-10-20 10:20:37.117296
First 11000 images copied at 2023-10-20 10:20:41.648107
First 12000 images copied at 2023-10-20 10:20:50.868103
First 13000 images copied at 2023-10-20 10:21:05.305992
First 14000 images copied at 2023-10-20 10:21:24.033877
First 15000 images copied at 2023-10-20 10:21:45.849361
First 16000 images copied at 2023-10-20 10:22:12.729959
First 17000 images copied at 2023-10-20 10:22:44.192840
First 18000 images copied at 2023-10-20 10:23:20.188728
F

In [6]:
#### Only renaming code
start = datetime.datetime.now()

renaming_table.apply(rename_images, axis=1)

end = datetime.datetime.now()
print(end - start)

NameError: name 'renaming_table' is not defined