Using file hashes, find duplicate files and delete them.

Uses rules to determine the copy to keep.

Creates Windows shortcuts in place of the deleted files, each pointing to the kept copy.

In [None]:
import pandas as pd
import numpy as np
import os
from pathlib import Path, PurePath, PurePosixPath

In [None]:
# Show full cell contents (long file paths) instead of truncating them.
# None means "no limit"; the old -1 sentinel was deprecated in pandas 1.0
# and is rejected by newer pandas releases.
pd.set_option('display.max_colwidth', None)

In [None]:
# Work from the root of the M: drive; every relative path used below
# (hash cache, CSV outputs, files to delete) is resolved against it.
os.chdir('M:\\')

In [None]:
# Load the cached file hashes from a JSON Lines file (one record per line)
# in the current working directory. Later cells use its 'hash' and
# 'filename' columns.
file_hash_df = pd.read_json('file_hash_cache.json', orient='records', lines=True)
# Display the frame for inspection.
file_hash_df

In [None]:
# Map each hash to the list of filenames that share it.
hash_dict = file_hash_df.groupby(['hash'])['filename'].apply(list).to_dict()

In [None]:
# Keep only the hashes shared by two or more files, i.e. the actual duplicates.
dup_hash_dict = {
    file_hash: paths
    for file_hash, paths in hash_dict.items()
    if len(paths) >= 2
}

In [None]:
# Path fragments consulted by score_file_to_delete(). Matching is a plain,
# case-sensitive substring test against the whole path, which is why some
# entries are listed in two capitalisations.

# A path containing one of these fragments is preferred as the copy to KEEP.
keep_dirs = [
    'Erika/',
]
# A path containing one of these fragments is preferred for DELETION.
# Earlier entries weigh more, because the resulting score tuples are compared
# element by element. Entries without a trailing slash ('iPhone', 'APPLE',
# 'wetransfer') also match inside longer folder names such as '103APPLE'.
delete_dirs = [
    'iCloud Photos/Downloads/',
    'iPhone',
    'Camera Uploads/',
    'Photo Frame/',
    'sort/', 'Sort/',
    'misc/', 'Misc/',
    'APPLE',
    'To print/', 'To Print/',
    'My Music/',
    'wetransfer',
]

In [None]:
def score_file_to_delete(filename):
    """
    Build the sort key that decides which duplicate survives.

    Among files sharing a hash, the one with the smallest key is kept and
    every other copy is deleted. Keys are tuples compared element by
    element, so earlier components dominate:

    1. one entry per ``keep_dirs`` fragment: -1 if it occurs in the path,
       otherwise 0
    2. one entry per ``delete_dirs`` fragment: 1 if it occurs in the path,
       otherwise 0
    3. 1 if the base name contains "img" (any case), otherwise 0
    4. negated base-name length (longer names are kept)
    5. negated full-path length (longer paths are kept)
    """
    keep_part = [-int(fragment in filename) for fragment in keep_dirs]
    delete_part = [int(fragment in filename) for fragment in delete_dirs]
    name = os.path.basename(filename)
    name_part = [int('img' in name.lower()), -len(name), -len(filename)]
    return tuple(keep_part + delete_part + name_part)

In [None]:
def score_files_to_delete(filenames):
    """Pair every filename with its deletion score as ``(score, filename)``."""
    scored = []
    for path in filenames:
        scored.append((score_file_to_delete(path), path))
    return scored

In [None]:
def rank_files_to_delete(scores):
    """
    Order scored filenames from "keep" to "delete".

    ``scores`` is an iterable of ``(score, filename)`` pairs. The pairs are
    sorted in ascending order (equal scores fall back to comparing the
    filenames) and only the filenames are returned, so element 0 is the
    lowest-scoring file.
    """
    ordered = list(scores)
    ordered.sort()
    ranked = []
    for entry in ordered:
        ranked.append(entry[1])
    return ranked

In [None]:
# Regression cases for the ranking rules. Each entry lists paths that are
# duplicates of each other, with the copy that must be kept listed first.
test_info = [
    ['Erika/Pictures/Photos/2016-05-22/IMG_3014.PNG','Erika/Pictures/Photos/103APPLE/IMG_3014.PNG'],
    ['Claudio/Claudio Photos/2016-07/IMG_6044.JPG','Claudio/Claudio Photos/Claudio’s iPhone 5S white 2016-06-05 to 2018-02-11/2016-07-02_224949640_F9589_iOS.jpg'],
    ['Erika/Pictures/Photos/2015-07-12/IMG_2507.JPG','Erika/Pictures/Photos/2016-05-22/IMG_2507.JPG'],
    ['Erika/Pictures/Photos/2015-07-12/IMG_2507.JPG','Erika/Pictures/iCloud Photos/Downloads/2015/IMG_2507.JPG'],
    ['Erika/Pictures/Photos/113APPLE/IMG_3191.JPG','Erika/Pictures/iCloud Photos/Downloads/2017/IMG_3191.JPG'],
    ['Erika/Pictures/Photos/Photo Frame/101.jpg','Claudio/Claudio Photos/Claudio-Erika2.jpg'],
    ['Erika/Pictures/Photos/Alex-2015-05-30/IMG_9747.JPG','Erika/Pictures/Photos/Camera Uploads/2015-05-30 15.13.35.jpg'],
]
for t in test_info:
    scores = score_files_to_delete(t)
    ranked = rank_files_to_delete(scores)
    # The first-ranked file is the one that survives; it must be the expected one.
    assert ranked[0] == t[0], 'failed: ' + str(t) + ', scores=' + str(scores)

In [None]:
# For every duplicated hash, order its filenames so the copy to keep comes first.
scored_dict = {
    file_hash: rank_files_to_delete(score_files_to_delete(paths))
    for file_hash, paths in dup_hash_dict.items()
}

In [None]:
# Flatten to one (hash, file to keep, file to delete) tuple per file that
# will be deleted; the first ranked filename of each hash is the one kept.
dup_list = [
    (file_hash, ranked_paths[0], duplicate_path)
    for file_hash, ranked_paths in scored_dict.items()
    for duplicate_path in ranked_paths[1:]
]

In [None]:
# Peek at the first few (hash, keep, delete) tuples as a sanity check.
dup_list[:5]

In [None]:
# Tabulate the keep/delete pairs, ordered by the kept file and then by the
# file to delete.
filename_df = (
    pd.DataFrame(dup_list, columns=['hash', 'keep_filename', 'delete_filename'])
    .sort_values(['keep_filename', 'delete_filename'])
)
filename_df

In [None]:
# Save the keep/delete file pairs to CSV (path is relative to the current
# working directory).
filename_df.to_csv('Claudio/find-duplicates/delete_filename.csv')

In [None]:
# Summarise at directory level: the distinct (directory of kept file,
# directory of deleted file) pairs, sorted and renumbered from 0.
dir_df = (
    filename_df.assign(
        keep_dirname=filename_df['keep_filename'].apply(os.path.dirname),
        delete_dirname=filename_df['delete_filename'].apply(os.path.dirname),
    )[['keep_dirname', 'delete_dirname']]
    .drop_duplicates()
    .sort_values(['keep_dirname', 'delete_dirname'])
    .reset_index(drop=True)
)
dir_df

In [None]:
# The directory summary is exported as-is (same object as dir_df, not a copy).
delete_dirname_df = dir_df
# Disabled alternative: one row per keep_dirname with its delete_dirnames
# joined into a single comma-separated string.
#delete_dirname_df = pd.DataFrame(dir_df.groupby(['keep_dirname'])['delete_dirname'].apply(list).apply(lambda x: ','.join(x)).sort_index())
#delete_dirname_df

In [None]:
# Save the keep/delete directory pairs to CSV (path is relative to the
# current working directory).
delete_dirname_df.to_csv('Claudio/find-duplicates/delete_dirname.csv')

## Create shortcuts to replace deleted files

In [None]:
!pip install winshell

In [None]:
import winshell

In [None]:
%%time
# For every duplicate that is going to be deleted, create a Windows shortcut
# named "<delete_filename>.lnk" that points at the copy being kept.
# The shortcut target is built from the UNC share path, while the shortcut
# itself is placed via the current working directory.
# NOTE(review): presumably the share and the M: drive are the same location
# -- confirm.
root_dir = '\\\\WinFileSrv\\slowdata1'
for index, row in filename_df.iterrows():
    target_filename = os.path.realpath(os.path.join(root_dir, row.keep_filename))
    shortcut_filename = os.path.realpath(row.delete_filename) + '.lnk'
    # Stop immediately, naming the offending path, if the kept copy is missing:
    # a shortcut to a non-existent target would be useless once the duplicate
    # is deleted. An explicit raise is used rather than `assert`, because
    # assertions are skipped when Python runs with -O.
    if not Path(target_filename).is_file():
        raise FileNotFoundError('kept file not found: ' + target_filename)
    with winshell.shortcut(shortcut_filename) as shortcut:
        shortcut.path = target_filename

In [None]:
## Delete shortcuts

In [None]:
%%time
# Undo step, disabled on purpose: change the condition to True to remove the
# "<delete_filename>.lnk" shortcut of every row in filename_df.
# os.remove raises if a shortcut does not exist, which stops the loop.
if False:
    for index, row in filename_df.iterrows():
        shortcut_filename = os.path.realpath(row.delete_filename) + '.lnk'
        os.remove(shortcut_filename)

## Delete duplicate files

In [None]:
%%time
# Delete every duplicate listed in filename_df. Paths are relative to the
# current working directory. reset_index() gives a sequential counter so the
# progress output below fires on every 1000th row.
for index, row in filename_df.reset_index().iterrows():
    # Progress marker: echo every 1000th path.
    if index % 1000 == 0:
        print(row.delete_filename)
    if not Path(row.delete_filename).is_file():
        # Already gone (e.g. on a re-run) or not a regular file: nothing to do.
        continue
    if not Path(row.keep_filename).is_file():
        # Safety net: never remove a duplicate whose surviving copy is missing,
        # otherwise the last remaining copy of the content would be lost.
        print('skipped, kept copy is missing: ' + row.delete_filename)
        continue
    try:
        os.remove(row.delete_filename)
    except OSError as ex:
        # Best effort: report the failure (permissions, file in use, ...) and
        # carry on with the remaining files.
        print(ex)