# Files Controleren met PathLib

In [6]:
import hashlib
import re
from datetime import datetime
from fnmatch import fnmatch
from pathlib import Path

import pandas as pd

# from analyze_files import recursive_walk   # recursive_walk()
# import analyze_files                       # analyze_files.recursive_walk()
# from analyze_files import *                # recursive_walk()

In [21]:
def recursive_walk(directory: str, glob_pattern: str = None, re_pattern: str = None):
    """Recursively travels through directory and it's subdirectories.
    
    directory: directory to search
    glob_pattern: (optional) file matching string with wild cards
    re_pattern: (optional) file template string with regular expression

    The function is a generator returning a dict for each file.

    The function gracefully exits on KeyboardInterrupt (Control-C)
    
    Examples:
    
    >>> all_files = list(recursive_walk('..'))
    
    >>> all_files = list(recursive_walk('..', glob_pattern = '*.csv'))

    >>> all_files = list(recursive_walk('..', re_pattern = '.*ca-.*'))
    """

    try:
        path = Path(directory)
    
        if re_pattern:
            regex = re.compile(re_pattern)
        
        for i, (root, dirs, files) in enumerate(path.walk()):
            print(i, end='\r')
            dirs[:] = [d for d in dirs if not d.startswith('.')]
            files[:] = [f for f in files if f[0] not in ".~"]
            if glob_pattern:
                files[:] = [f.name for f in root.glob(glob_pattern)]
            if re_pattern:
                files[:] = [f for f in files if regex.match(f)]
            
            # print(f"Directory path: {root}")
            # print(f"Directory Names: {dirs}")
            # print(f"Files Names: {files}")
            # print(80 * '-')
    
            for filename in files:
                file_path = root / Path(filename)
                                
                # info = Path(file_path)
                owner = file_path.owner()
                group = file_path.group()
    
                # Get file stats
                stat = file_path.stat()
                
                # File size in bytes
                size = stat.st_size
                
                # Creation time (platform dependent)
                ctime = datetime.fromtimestamp(stat.st_ctime)  # On Unix: metadata change time, on Windows: creation time
                
                # Modification time
                mtime = datetime.fromtimestamp(stat.st_mtime)
                
                # Last Access time
                atime = datetime.fromtimestamp(stat.st_atime)

                try:
                    with open(file_path, mode = 'rb') as f:
                        md5 = hashlib.md5(f.read()).hexdigest()
                except KeyboardInterrupt as ex:
                    raise(ex)
                except:
                    md5 = None
        
                yield {
                    "filename": filename,
                    "directory": str(root),
                    "size": size,
                    "owner": owner,
                    "group": group,
                    "created": ctime,
                    "modified": mtime,          
                    "accessed": atime,
                    "md5": md5,
                }

    except KeyboardInterrupt:
        return

### Do the walk

In [24]:
# Root directory to scan. Uses the current user's home directory instead of
# a hard-coded, machine-specific absolute path; change it to scan elsewhere.
directory = str(Path.home())

# Optional file name filters; None means "no filter", which matches the
# defaults of recursive_walk.
glob_pattern = None    # or e.g. '*.csv'
re_pattern = None      # or e.g. '.*ca-.*'

In [25]:
# Run the walk to completion and keep every file record (one dict per file)
# in memory for the analysis cells that follow.
all_files = [
    *recursive_walk(directory, glob_pattern=glob_pattern, re_pattern=re_pattern)
]

163

In [20]:
# Number of file records collected by the walk
len(all_files)

170158

### Put results in a Pandas DataFrame

In [11]:
# One row per file: all_files is a list of dicts, so the dict keys become the columns
df = pd.DataFrame(all_files)

In [12]:
# Preview the first 30 rows
df.head(30)

Unnamed: 0,filename,directory,size,owner,group,created,modified,accessed,md5
0,OneDrive,/Users/peter,352,peter,staff,2025-11-16 14:27:22.705283,2025-09-06 22:48:54.109681,2025-09-06 22:48:54.109854,
1,attributed_ports.geojson,/Users/peter/Femke,236214,peter,staff,2025-02-05 08:49:07.554563,2023-10-15 21:27:20.425996,2023-10-15 21:27:20.439581,13887acf8e0817a39df5f1b4e45274a9
2,ports_distance.py,/Users/peter/Femke,2021,peter,staff,2024-01-26 13:57:49.827479,2023-11-09 23:49:11.322718,2025-11-13 17:07:11.950977,47f91df47de9b499692457ada09b8673
3,port_distances.pickle,/Users/peter/Femke,2293775612,peter,staff,2023-11-11 17:53:00.462749,2023-11-11 17:53:00.462749,2023-12-16 12:26:06.066918,78946eab414a0d8b757c28faf440c816
4,portdata.csv,/Users/peter/Femke/data,36994,peter,staff,2023-10-13 09:55:12.230644,2023-09-22 10:08:20.000000,2023-11-04 11:54:03.963699,704d33f6dfde84c0e349865159eefe93
5,sailing_trajectories.csv,/Users/peter/Femke/data,4851522,peter,staff,2023-10-13 12:12:09.830781,2023-10-13 12:12:09.830781,2023-11-04 11:54:04.116680,6308396297528177de316a0e4741967d
6,port_turnarounds.csv,/Users/peter/Femke/data,4171303,peter,staff,2023-10-13 12:09:42.309564,2023-10-13 12:09:42.309564,2023-11-04 11:54:03.818949,a0db23acc87055a32c42f02e29042018
7,Hist_port_calls.csv,/Users/peter/Femke/data,11299864,peter,staff,2023-10-13 09:55:12.228010,2023-09-25 15:55:31.000000,2023-11-04 11:54:03.674104,73dbb9914944beeb6e56f4d9037ba613
8,20231010 email.md,/Users/peter/Femke/downloads/2023-10-11,1014,peter,staff,2023-10-25 21:56:26.769208,2023-10-11 08:01:11.155213,2023-10-25 21:56:28.118254,c0c45fe11b9d0d9820dac0edd798d799
9,model-goals.pdf,/Users/peter/Femke/downloads/2023-10-11,97528,peter,staff,2024-11-03 21:45:57.326529,2023-10-11 07:58:54.469511,2025-01-06 23:34:23.423592,0e5a81036a0b063931569eca5c3461e2


### Find duplicate files based on 'size'

In [13]:
# Columns that define "the same file" for this check.
# NOTE(review): equal size alone does not prove equal content; comparing the
# 'md5' column as well would be needed to confirm a real duplicate.
subset = ['size']
# keep=False flags every member of a duplicate group, not only the repeats
duplicates = df.duplicated(subset=subset, keep=False)
# Store the duplicate key as a tuple of the subset values. The assignment is
# aligned on the index, so only flagged rows get a value and the rest stay NaN.
df.loc[duplicates, 'duplicated'] = df[subset].apply(tuple, axis=1)

In [16]:
# Files that share their size with at least one other file, ignoring files
# of 10 bytes or less, ordered so that equal-sized files end up together.
(
    df.loc[df['duplicated'].notna() & (df['size'] > 10)]
      .sort_values('duplicated')
)

Unnamed: 0,filename,directory,size,owner,group,created,modified,accessed,md5,duplicated
1343,61756d75-6d73796e-6170706c.tagset,/Users/peter/Music/Audio Music Apps/Databases/...,181,peter,staff,2022-01-03 11:08:04.290129,2021-11-01 22:03:29.167331,2021-11-01 22:03:29.323484,2de6a04cdba79ed13580c47dfd70cc5f,"(181,)"
1359,61756678-72616163-6170706c.tagset,/Users/peter/Music/Audio Music Apps/Databases/...,181,peter,staff,2022-01-03 11:08:04.290805,2021-11-01 22:03:29.168562,2021-11-01 22:03:29.387890,2de6a04cdba79ed13580c47dfd70cc5f,"(181,)"
1339,61756678-62706173-6170706c.tagset,/Users/peter/Music/Audio Music Apps/Databases/...,181,peter,staff,2022-01-03 11:08:04.289961,2021-11-01 22:03:29.170906,2021-11-01 22:03:29.349865,2de6a04cdba79ed13580c47dfd70cc5f,"(181,)"
1417,61756678-6e626571-6170706c.tagset,/Users/peter/Music/Audio Music Apps/Databases/...,181,peter,staff,2022-01-03 11:08:04.291614,2021-11-01 22:03:29.169127,2021-11-01 22:03:29.333081,2de6a04cdba79ed13580c47dfd70cc5f,"(181,)"
1426,61756d75-4e694b38-2d4e492d.tagset,/Users/peter/Music/Audio Music Apps/Databases/...,181,peter,staff,2025-09-08 19:08:39.909319,2025-09-08 19:08:39.909319,2025-09-08 19:08:40.154402,2de6a04cdba79ed13580c47dfd70cc5f,"(181,)"
...,...,...,...,...,...,...,...,...,...,...
1084,incl_fleet_type.csv,/Users/peter/Femke/modelFemke/data/preprocessed,45519,peter,staff,2023-10-26 11:28:50.374017,2023-10-25 21:46:55.579916,2023-11-09 23:50:30.183799,ee273a1dad21fb15ccc4bc742c33f0f3,"(45519,)"
1085,shipdata_merge.csv,/Users/peter/Femke/modelFemke/data/preprocessed,241419,peter,staff,2023-10-31 23:10:48.662755,2023-10-26 12:09:49.313984,2023-11-09 23:50:30.322719,26ddc3a980f34e94fe0a345e07a59cf8,"(241419,)"
973,shipdata_merge.csv,/Users/peter/Femke/modelFemke/data/ships,241419,peter,staff,2023-10-26 11:28:57.306657,2023-10-25 23:29:17.845937,2023-11-09 23:50:34.788298,26ddc3a980f34e94fe0a345e07a59cf8,"(241419,)"
1093,Hist_port_calls.csv,/Users/peter/Femke/modelFemke/data/raw,11299864,peter,staff,2023-10-26 11:48:14.261748,2023-09-25 15:55:31.000000,2023-11-09 23:50:32.731198,73dbb9914944beeb6e56f4d9037ba613,"(11299864,)"


### Validate that filenames follow the format YYYYMMDD_XX_filename.ext (where XX is NU, NR or NS)

In [10]:
# Expected layout: <YYYYMMDD>_<two-character code>_<rest of the name>.
# The pattern is anchored with '^' because Series.str.extract searches
# anywhere in the string; without the anchor a name such as
# 'xx20231010_NU_a.txt' would wrongly be accepted.
regex = r'^(\d{8})_(\w{2})_.*'

# Names that do not match get NaN in both extracted columns.
df[['filename_dd', 'filename_classificatie']] = df['filename'].str.extract(regex)

# A filename is valid when the 8 digits form a real calendar date and the
# code is one of the allowed classifications. Missing values fail both checks
# (to_datetime coerces them to NaT, isin returns False), so no separate
# notna() tests are needed.
df['filename_ok'] = (
    pd.to_datetime(df['filename_dd'], format='%Y%m%d', errors='coerce').notna()
    & df['filename_classificatie'].isin(['NU', 'NR', 'NS'])
)

In [11]:
# Display the frame, now including the filename validation columns
df

Unnamed: 0,filename,directory,size,owner,group,created,modified,accessed,md5,duplicated,filename_dd,filename_classificatie,filename_ok
0,OneDrive,/Users/peter,352,peter,staff,2025-11-16 14:27:22.705283,2025-09-06 22:48:54.109681,2025-09-06 22:48:54.109854,,"(352,)",,,False
1,attributed_ports.geojson,/Users/peter/Femke,236214,peter,staff,2025-02-05 08:49:07.554563,2023-10-15 21:27:20.425996,2023-10-15 21:27:20.439581,13887acf8e0817a39df5f1b4e45274a9,,,,False
2,ports_distance.py,/Users/peter/Femke,2021,peter,staff,2024-01-26 13:57:49.827479,2023-11-09 23:49:11.322718,2025-11-13 17:07:11.950977,47f91df47de9b499692457ada09b8673,,,,False
3,port_distances.pickle,/Users/peter/Femke,2293775612,peter,staff,2023-11-11 17:53:00.462749,2023-11-11 17:53:00.462749,2023-12-16 12:26:06.066918,78946eab414a0d8b757c28faf440c816,,,,False
4,portdata.csv,/Users/peter/Femke/data,36994,peter,staff,2023-10-13 09:55:12.230644,2023-09-22 10:08:20.000000,2023-11-04 11:54:03.963699,704d33f6dfde84c0e349865159eefe93,"(36994,)",,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4230,Index.zip,/Users/peter/_Privé/Mark/CV Mark.pages,47787,peter,staff,2022-01-03 11:59:42.649269,2021-09-12 13:33:40.000000,2021-11-04 14:16:53.311504,200b954d81ed173d35f3339892d3bed6,,,,False
4231,Hardcover_bullet_black-13.png,/Users/peter/_Privé/Mark/CV Mark.pages/Data,7007,peter,staff,2022-01-03 11:59:42.649368,2021-09-12 13:33:40.000000,2021-11-04 14:16:53.304724,75ba0de08067f948e87c44a4960cb4e4,,,,False
4232,DocumentIdentifier,/Users/peter/_Privé/Mark/CV Mark.pages/Metadata,36,peter,staff,2022-01-03 11:59:42.649565,2021-09-12 13:33:40.000000,2021-11-04 14:16:53.313867,9d45a9a352d6e90140cd39110fb08a19,,,,False
4233,BuildVersionHistory.plist,/Users/peter/_Privé/Mark/CV Mark.pages/Metadata,220,peter,staff,2022-01-03 11:59:42.649604,2021-09-12 13:33:40.000000,2021-11-04 14:16:53.313024,3bed317941b73bccaffabf9c1d8045e4,,,,False
