Calculate hashes of all files.

Caches calculated hashes in a JSON file.

In [None]:
import glob
import os
import pandas as pd
import numpy as np
from joblib import Parallel, delayed
import hashlib
from pathlib import Path, PurePath, PurePosixPath

In [None]:
# Show full column contents (e.g. long file paths and hashes) without
# truncation. None is the supported "no limit" value; the old -1 sentinel was
# deprecated in pandas 1.0 and is rejected by newer pandas versions.
pd.set_option('display.max_colwidth', None)

In [None]:
# Work from the root of drive M: so that the relative glob pattern and the
# relative cache file name used in this notebook resolve against that drive.
os.chdir('M:\\')

In [None]:
%%time
# Walk everything below the current directory and record, for each regular
# file, its POSIX-style relative path, its size in bytes and its modification
# time truncated to whole seconds.
file_listing = []
for entry in map(Path, glob.iglob('**', recursive=True)):
    if not entry.is_file():
        continue
    info = entry.stat()
    file_listing.append((entry.as_posix(), info.st_size, int(info.st_mtime)))

In [None]:
# Number of regular files found by the scan.
len(file_listing)

In [None]:
# One row per scanned file; the tuple fields become named columns.
file_listing_df = pd.DataFrame(file_listing, columns=['filename', 'filesize', 'mtime'])
file_listing_df
# NOTE: filename is left as a regular column at this point; it is only moved
# into the index later, when the listing is joined with the hash cache.

In [None]:
# Derive the lower-cased extension of every file (leading dot included, or an
# empty string when the name has no extension).
file_listing_df['ext'] = file_listing_df['filename'].map(
    lambda name: os.path.splitext(name)[1].lower()
)
file_listing_df

In [None]:
# Extensions (lower-case, with leading dot) of the file types that get hashed.
exts = [
    '.avi', '.bmp', '.gif', '.jpeg', '.jpg', '.m4a',
    '.m4p', '.m4v', '.mov', '.mp2', '.mp3', '.mp4',
    '.pcd', '.png', '.tif', '.vob', '.wma', '.wmf',
]

In [None]:
# Keep only the rows whose extension is one of the wanted ones.
file_listing_df = file_listing_df.loc[file_listing_df['ext'].isin(exts)]
file_listing_df

In [None]:
# Load the hashes computed by earlier runs, indexed by filename.
# On the very first run the cache file does not exist yet, and a run that
# found no matching files leaves an empty file behind; in both cases start
# from an empty cache that has the same columns a real cache would have, so
# the join and the NaN checks that follow keep working.
cache_file = 'file_hash_cache.json'
if os.path.isfile(cache_file) and os.path.getsize(cache_file) > 0:
    file_hash_cache_df = pd.read_json(cache_file, orient='records', lines=True)
else:
    file_hash_cache_df = pd.DataFrame(columns=['filename', 'filesize', 'mtime', 'hash'])
file_hash_cache_df = file_hash_cache_df.set_index(['filename'])

In [None]:
# Left-join the cached entries onto the current listing by filename.
# Columns that exist on both sides get a '_cached' suffix on the cache side
# (presumably filesize and mtime, as written by this notebook's last cell —
# verify against the actual cache file). Files without a cache entry end up
# with NaN in every cache column, including 'hash'.
with_cache_df = file_listing_df.set_index(['filename']).join(file_hash_cache_df, how='left', rsuffix='_cached')
with_cache_df

In [None]:
# Select the files that must be (re-)hashed.
# A cached hash is only trusted while the file still has the size and mtime
# that were stored alongside it. Checking for a missing hash alone would keep
# a stale hash forever for any file modified after it was first cached.
# Rows without a cache entry have NaN in the cached columns, so they are
# selected by all three conditions.
cache_is_stale = (
    with_cache_df['hash'].isna()
    | (with_cache_df['filesize'] != with_cache_df['filesize_cached'])
    | (with_cache_df['mtime'] != with_cache_df['mtime_cached'])
)
# .copy() so the 'hash' column can be assigned later without touching
# with_cache_df.
need_hash_df = with_cache_df[cache_is_stale].copy()
need_hash_df

In [None]:
# Total size of all files that passed the extension filter, in GB
# (bytes * 1e-9).
with_cache_df.filesize.sum() * 1e-9

In [None]:
# Size of the files that still have to be hashed, in GB (bytes * 1e-9).
need_hash_df.filesize.sum() * 1e-9

In [None]:
def calc_file_hash(filename):
    """Return the SHA-256 hex digest of the file at ``filename``.

    The file is opened in binary mode and consumed in 64 KiB chunks, so a
    large file is never held in memory as a whole.
    """
    digest = hashlib.sha256()
    with open(filename, 'rb') as stream:
        # iter() with a sentinel keeps calling read() until it returns b''
        # (end of file).
        for block in iter(lambda: stream.read(64 * 1024), b''):
            digest.update(block)
    return digest.hexdigest()

In [None]:
%%time
# Hash every file that needs it, spreading the work over joblib workers
# (n_jobs=-1 asks joblib to use all available CPUs). The index of
# need_hash_df holds the filenames, and the results are assigned back in the
# same order they were submitted.
need_hash_df['hash'] = Parallel(n_jobs=-1)(delayed(calc_file_hash)(filename) for filename in need_hash_df.index.values)

In [None]:
# Inspect the freshly hashed rows.
need_hash_df

In [None]:
# Put the freshly computed hashes next to the cached ones. Both frames have a
# 'hash' column, so the one already in with_cache_df (the cached value) is
# renamed to 'hash_cached' and the new values land in 'hash'. Files that were
# not re-hashed have NaN in 'hash' after this join.
with_hash_df = with_cache_df.join(need_hash_df[['hash']], how='left', lsuffix='_cached')
with_hash_df

In [None]:
# For files that were not re-hashed in this run, fall back to the hash taken
# from the cache.
with_hash_df.loc[with_hash_df.hash.isna(), 'hash'] = with_hash_df.loc[with_hash_df.hash.isna(), 'hash_cached']

In [None]:
# Keep only the columns that make up a cache record and turn the filename
# index back into an ordinary column so it is written out with each record.
with_hash_df = with_hash_df[['filesize','mtime','hash']].reset_index()
with_hash_df

In [None]:
# Overwrite the cache file with one JSON object per line (filename, filesize,
# mtime, hash). Only files present in the current listing are written, so
# cache entries for files that no longer exist are dropped.
with_hash_df.to_json('file_hash_cache.json', orient='records', lines=True)