# Sandbox Notebook

Experimental notebook for testing `home_media` package functionality.

Auto-reload is enabled to pick up changes to the package without restarting the kernel.

In [7]:
# Enable auto-reload for the home_media package
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
# Imports
import sys
from pathlib import Path
import yaml
import pandas as pd
import datetime

# Add parent directory to path so we can import home_media
sys.path.insert(0, str(Path.cwd().parent))

## Load Configuration

In [9]:
# Load config from config.yaml
config_path = Path.cwd().parent / "config.yaml"

with open(config_path, 'r') as f:
    config = yaml.safe_load(f)

print("Configuration loaded:")
print(f"photos_root_original: {config['photos_root_original']}")

Configuration loaded:
photos_root_original: \\tiger\photo\RAW


## List Subdirectories in photos_root_original

In [10]:
# Get the photos root directory
photos_root = Path(config['photos_root_original'])

print(f"Listing subdirectories in: {photos_root}\n")

# Check if path exists
if not photos_root.exists():
    print(f"ERROR: Path does not exist: {photos_root}")
elif not photos_root.is_dir():
    print(f"ERROR: Path is not a directory: {photos_root}")
else:
    # List all subdirectories
    subdirs = [d for d in photos_root.iterdir() if d.is_dir()]
    
    print(f"Found {len(subdirs)} subdirectories:\n")
    
    for subdir in sorted(subdirs):
        print(f"  üìÅ {subdir.name}")

Listing subdirectories in: \\tiger\photo\RAW

Found 29 subdirectories:

  üìÅ 1979
  üìÅ 1988
  üìÅ 2000
  üìÅ 2003
  üìÅ 2004
  üìÅ 2005
  üìÅ 2006
  üìÅ 2007
  üìÅ 2008
  üìÅ 2009
  üìÅ 2010
  üìÅ 2011
  üìÅ 2012
  üìÅ 2013
  üìÅ 2014
  üìÅ 2015
  üìÅ 2016
  üìÅ 2017
  üìÅ 2018
  üìÅ 2019
  üìÅ 2020
  üìÅ 2021
  üìÅ 2022
  üìÅ 2023
  üìÅ 2024
  üìÅ 2025
  üìÅ advantix
  üìÅ advantix copy
  üìÅ scanned


In [11]:
# Look at files in a specific subdirectory
subdir = "2025/01/01"
target_dir = photos_root / subdir

print(f"Examining directory: {target_dir}")
print(f"Subdirectory (relative): {subdir}")

if target_dir.exists() and target_dir.is_dir():
    file_list = [f for f in target_dir.iterdir() if f.is_file()]
    print(f"Number of files: {len(file_list)}")
    
    # Show first 5 files as examples
    print(f"\nFirst 5 files:")
    for f in file_list[:5]:
        print(f"  üìÑ {f.name}")
else:
    print(f"ERROR: Directory does not exist or is not accessible")

# Create a list to hold file metadata
file_data = []

if target_dir.exists() and target_dir.is_dir():
    for file_path in target_dir.iterdir():
        if file_path.is_file():
            # Get file stats
            stats = file_path.stat()
            
            # Get relative subdirectory path from photos_root_original
            rel_subdir = file_path.parent.relative_to(photos_root)
            
            file_data.append({
                'filename': file_path.name,
                'extension': file_path.suffix.lower(),
                'subdirectory': str(rel_subdir),
                'created': datetime.datetime.fromtimestamp(stats.st_ctime),
                'modified': datetime.datetime.fromtimestamp(stats.st_mtime),
                'size_bytes': stats.st_size
            })

# Create DataFrame
df = pd.DataFrame(file_data)

print(f"\nCreated DataFrame with {len(df)} files")
print(f"\nFirst 10 rows:")
df.head(10)

Examining directory: \\tiger\photo\RAW\2025\01\01
Subdirectory (relative): 2025/01/01
Number of files: 220

First 5 files:
  üìÑ 2025-01-01_00-28-40.jpg
  üìÑ 2025-01-01_00-28-40.jpg.xmp
  üìÑ 2025-01-01_00-28-40_001.jpg
  üìÑ 2025-01-01_00-28-40_001.jpg.xmp
  üìÑ 2025-01-01_00-28-40_01.jpg.xmp

Created DataFrame with 220 files

First 10 rows:


Unnamed: 0,filename,extension,subdirectory,created,modified,size_bytes
0,2025-01-01_00-28-40.jpg,.jpg,2025\01\01,2025-01-01 00:29:28.492810,2025-01-01 00:29:24.000000,1884153
1,2025-01-01_00-28-40.jpg.xmp,.xmp,2025\01\01,2025-09-04 14:52:42.438514,2025-09-05 07:04:46.686699,1383
2,2025-01-01_00-28-40_001.jpg,.jpg,2025\01\01,2025-05-02 12:09:18.121342,2025-01-01 00:29:24.000000,1884153
3,2025-01-01_00-28-40_001.jpg.xmp,.xmp,2025\01\01,2025-09-04 14:52:42.495514,2025-09-05 07:04:46.827275,1387
4,2025-01-01_00-28-40_01.jpg.xmp,.xmp,2025\01\01,2025-09-05 07:04:46.434260,2025-09-05 07:04:46.436888,1383
5,2025-01-01_00-28-55.jpg,.jpg,2025\01\01,2025-01-01 00:29:28.365809,2025-01-01 00:29:24.000000,1099626
6,2025-01-01_00-28-55.jpg.xmp,.xmp,2025\01\01,2025-09-04 14:52:42.566514,2025-09-05 07:04:46.928044,1383
7,2025-01-01_00-28-55_001.jpg,.jpg,2025\01\01,2025-05-02 12:09:15.869327,2025-01-01 00:29:24.000000,1099626
8,2025-01-01_00-28-55_001.jpg.xmp,.xmp,2025\01\01,2025-09-04 14:52:42.615515,2025-09-05 07:04:46.939296,1387
9,2025-01-01_00-28-55_01.jpg.xmp,.xmp,2025\01\01,2025-09-05 07:04:46.870264,2025-09-05 07:04:46.872339,1383


## Analyze Filename Patterns

In [12]:
# Analyze filename patterns to find common prefixes
from collections import defaultdict

# Group files by their base name (without extension)
file_groups = defaultdict(list)

if target_dir.exists() and target_dir.is_dir():
    for file_path in target_dir.iterdir():
        if file_path.is_file():
            # Get the stem (filename without extension)
            base_name = file_path.stem
            file_groups[base_name].append(file_path)

# Show some examples of grouped files
print(f"Total unique base names: {len(file_groups)}\n")
print("Examples of files with same base name:\n")

count = 0
for base_name, files in sorted(file_groups.items()):
    if len(files) > 1:  # Only show groups with multiple files
        print(f"{base_name}:")
        for f in files:
            print(f"  - {f.name}")
        print()
        count += 1
        if count >= 5:  # Show first 5 examples
            break

# Count how many base names have multiple files
multi_file_groups = {k: v for k, v in file_groups.items() if len(v) > 1}
print(f"\nBase names with multiple files: {len(multi_file_groups)}")
print(f"Base names with single file: {len(file_groups) - len(multi_file_groups)}")

Total unique base names: 182

Examples of files with same base name:

2025-01-01_10-22-31:
  - 2025-01-01_10-22-31.dng
  - 2025-01-01_10-22-31.jpg

2025-01-01_10-22-33:
  - 2025-01-01_10-22-33.dng
  - 2025-01-01_10-22-33.jpg

2025-01-01_11-01-07:
  - 2025-01-01_11-01-07.dng
  - 2025-01-01_11-01-07.jpg

2025-01-01_11-01-10:
  - 2025-01-01_11-01-10.dng
  - 2025-01-01_11-01-10.jpg

2025-01-01_11-01-12:
  - 2025-01-01_11-01-12.dng
  - 2025-01-01_11-01-12.jpg


Base names with multiple files: 38
Base names with single file: 144


## Create Grouped DataFrame (One Row Per Base Name)

In [13]:
# Create DataFrame with one row per base name (grouped files)
grouped_data = []

for base_name, files in sorted(file_groups.items()):
    # Get all extensions for this base name
    extensions = [f.suffix.lower() for f in files]
    
    # Get the earliest created and latest modified dates across all variants
    created_dates = [datetime.datetime.fromtimestamp(f.stat().st_ctime) for f in files]
    modified_dates = [datetime.datetime.fromtimestamp(f.stat().st_mtime) for f in files]
    
    # Get total size of all variants
    total_size = sum(f.stat().st_size for f in files)
    
    # Get relative subdirectory (same for all files in group)
    rel_subdir = files[0].parent.relative_to(photos_root)
    
    grouped_data.append({
        'base_name': base_name,
        'file_count': len(files),
        'extensions': ', '.join(sorted(extensions)),
        'subdirectory': str(rel_subdir),
        'created': min(created_dates),
        'modified': max(modified_dates),
        'total_size_bytes': total_size
    })

# Create grouped DataFrame
df_grouped = pd.DataFrame(grouped_data)

print(f"Created grouped DataFrame with {len(df_grouped)} unique base names")
print(f"\nFirst 10 rows:")
df_grouped.head(10)

Created grouped DataFrame with 182 unique base names

First 10 rows:


Unnamed: 0,base_name,file_count,extensions,subdirectory,created,modified,total_size_bytes
0,2025-01-01_00-28-40,1,.jpg,2025\01\01,2025-01-01 00:29:28.492810,2025-01-01 00:29:24.000000,1884153
1,2025-01-01_00-28-40.jpg,1,.xmp,2025\01\01,2025-09-04 14:52:42.438514,2025-09-05 07:04:46.686699,1383
2,2025-01-01_00-28-40_001,1,.jpg,2025\01\01,2025-05-02 12:09:18.121342,2025-01-01 00:29:24.000000,1884153
3,2025-01-01_00-28-40_001.jpg,1,.xmp,2025\01\01,2025-09-04 14:52:42.495514,2025-09-05 07:04:46.827275,1387
4,2025-01-01_00-28-40_01.jpg,1,.xmp,2025\01\01,2025-09-05 07:04:46.434260,2025-09-05 07:04:46.436888,1383
5,2025-01-01_00-28-55,1,.jpg,2025\01\01,2025-01-01 00:29:28.365809,2025-01-01 00:29:24.000000,1099626
6,2025-01-01_00-28-55.jpg,1,.xmp,2025\01\01,2025-09-04 14:52:42.566514,2025-09-05 07:04:46.928044,1383
7,2025-01-01_00-28-55_001,1,.jpg,2025\01\01,2025-05-02 12:09:15.869327,2025-01-01 00:29:24.000000,1099626
8,2025-01-01_00-28-55_001.jpg,1,.xmp,2025\01\01,2025-09-04 14:52:42.615515,2025-09-05 07:04:46.939296,1387
9,2025-01-01_00-28-55_01.jpg,1,.xmp,2025\01\01,2025-09-05 07:04:46.870264,2025-09-05 07:04:46.872339,1383


## One Row Per Image with Suffixes

In [15]:
# Create a more sophisticated grouping that handles suffixes before extensions
# For example: "2025-01-01_00-28-40.jpg" and "2025-01-01_00-28-40_001.jpg"
# should be grouped under base "2025-01-01_00-28-40"

from collections import defaultdict
import re

image_groups = defaultdict(list)

if target_dir.exists() and target_dir.is_dir():
    for file_path in target_dir.iterdir():
        if file_path.is_file():
            filename = file_path.name

            # Extract the base name by removing extension and any suffix pattern
            # Pattern: basename[_suffix].extension[.extension]
            # Example: "2025-01-01_00-28-40_001.jpg.xmp" -> base: "2025-01-01_00-28-40"

            # First, remove all extensions (handle .jpg.xmp cases)
            name_without_ext = filename
            while '.' in name_without_ext:
                name_without_ext = Path(name_without_ext).stem

            # Check if there's a suffix pattern like _001, _002, etc.
            # Look for underscore followed by digits at the end
            match = re.match(r'^(.+?)(_\d+)?$', name_without_ext)
            base_name = match[1] if match else name_without_ext
            # Calculate the suffix (everything after base_name)
            suffix = filename[len(base_name):]

            image_groups[base_name].append({
                'file_path': file_path,
                'suffix': suffix
            })

# Create DataFrame with one row per image
image_data = []

for base_name, files_info in sorted(image_groups.items()):
    # Get all suffixes
    suffixes = [info['suffix'] for info in files_info]

    # Get file paths
    file_paths = [info['file_path'] for info in files_info]

    # Get relative subdirectory
    rel_subdir = file_paths[0].parent.relative_to(photos_root)

    image_data.append({
        'base_name': base_name,
        'suffixes': suffixes,
        'file_count': len(suffixes),
        'subdirectory': str(rel_subdir)
    })

# Create DataFrame
df_images = pd.DataFrame(image_data)

print(f"Created image DataFrame with {len(df_images)} unique images")
print(f"\nFirst 10 rows:")
df_images.head(10)

Created image DataFrame with 49 unique images

First 10 rows:


Unnamed: 0,base_name,suffixes,file_count,subdirectory
0,2025-01-01_00-28-40,"[.jpg.xmp, _01.jpg.xmp, _001.jpg, _001.jpg.xmp...",5,2025\01\01
1,2025-01-01_00-28-55,"[.jpg.xmp, .jpg, _001.jpg, _001.jpg.xmp, _01.j...",5,2025\01\01
2,2025-01-01_00-28-57,"[_01.jpg.xmp, .jpg, _001.jpg.xmp, .jpg.xmp, _0...",5,2025\01\01
3,2025-01-01_10-22-31,"[.dng.xmp, .jpg, .dng, .jpg.xmp]",4,2025\01\01
4,2025-01-01_10-22-33,"[.jpg, .dng.xmp, .jpg.xmp, .dng]",4,2025\01\01
5,2025-01-01_11-01-07,"[.dng, .jpg, .jpg.xmp, .dng.xmp]",4,2025\01\01
6,2025-01-01_11-01-10,"[.jpg.xmp, .dng, .dng.xmp, .jpg]",4,2025\01\01
7,2025-01-01_11-01-12,"[.dng, .jpg.xmp, .jpg, .dng.xmp]",4,2025\01\01
8,2025-01-01_11-01-13,"[.dng.xmp, .jpg.xmp, .dng, .jpg]",4,2025\01\01
9,2025-01-01_11-06-48,"[.jpg, _01.jpg.xmp, _001.jpg.xmp, _001.jpg, .j...",5,2025\01\01
