In [43]:
import pandas as pd
import numpy as np


In [44]:

# Load the Lightroom edits table from its parquet file.
input_parquet_path = 'lightroom_edits.parquet'
df = pd.read_parquet(input_parquet_path)

In [45]:
# Find columns that hold a single distinct value across every row.
# NaN is counted as a value (dropna=False), so an all-NaN column is constant too.
constant_columns = [
    name for name in df.columns
    if df[name].nunique(dropna=False) == 1
]

print("Columns where every row is the same:", constant_columns)

# Constant columns carry no per-image information, so remove them.
df = df.drop(columns=constant_columns)

# Dropping rows where all columns are NaN is currently disabled:
#df = df.dropna(axis=0, how='all')




Columns where every row is the same: ['ColorGradeGlobalHue', 'ColorGradeGlobalSat', 'ColorGradeGlobalLum', 'ColorGradeShadowHue', 'ColorGradeShadowSat', 'ColorGradeHighlightHue', 'ColorGradeHighlightSat']


In [46]:
# For every column, count the rows that have a non-null value, then list the
# columns from the fewest non-null rows to the most.
edit_counts = sorted(
    ((name, df[name].notnull().sum()) for name in df.columns),
    key=lambda pair: pair[1],  # stable sort: ties keep column order
)

for col, num_edits in edit_counts:
    print(f"{col}: {num_edits} images with edits")


RedSaturation: 1 images with edits
GreenSaturation: 1 images with edits
BlueSaturation: 1 images with edits
HueAdjustmentAqua: 2 images with edits
rating: 5 images with edits
LuminanceAdjustmentPurple: 10 images with edits
LuminanceAdjustmentMagenta: 18 images with edits
SaturationAdjustmentAqua: 56 images with edits
HueAdjustmentPurple: 165 images with edits
HueAdjustmentGreen: 180 images with edits
HueAdjustmentYellow: 185 images with edits
SaturationAdjustmentGreen: 189 images with edits
SaturationAdjustmentMagenta: 190 images with edits
SaturationAdjustmentPurple: 197 images with edits
LuminanceAdjustmentAqua: 207 images with edits
LuminanceAdjustmentBlue: 210 images with edits
SaturationAdjustmentYellow: 212 images with edits
BlueHue: 280 images with edits
SaturationAdjustmentBlue: 283 images with edits
HueAdjustmentMagenta: 304 images with edits
HueAdjustmentOrange: 341 images with edits
SaturationAdjustmentOrange: 496 images with edits
SaturationAdjustmentRed: 2025 images with e

In [47]:
# For every column, count the images (rows) whose value is both non-null
# (not NaN) and different from 0.
edit_counts = []
for name in df.columns:
    values = df[name]
    # NaN != 0 evaluates to True, so the explicit notnull() check is required.
    is_edited = values.notnull() & (values != 0)
    edit_counts.append((name, is_edited.sum()))

# Sort ascending by count (fewest edited images first); ties keep column order.
edit_counts.sort(key=lambda pair: pair[1])

for col, num_edits in edit_counts:
    print(f"{col}: {num_edits} images with edits (not null and not zero)")


RedSaturation: 1 images with edits (not null and not zero)
GreenSaturation: 1 images with edits (not null and not zero)
BlueSaturation: 1 images with edits (not null and not zero)
HueAdjustmentAqua: 2 images with edits (not null and not zero)
rating: 5 images with edits (not null and not zero)
LuminanceAdjustmentPurple: 10 images with edits (not null and not zero)
LuminanceAdjustmentMagenta: 18 images with edits (not null and not zero)
ColorGradeHighlightLum: 38 images with edits (not null and not zero)
SaturationAdjustmentAqua: 56 images with edits (not null and not zero)
HueAdjustmentPurple: 165 images with edits (not null and not zero)
HueAdjustmentGreen: 180 images with edits (not null and not zero)
HueAdjustmentYellow: 185 images with edits (not null and not zero)
SaturationAdjustmentGreen: 189 images with edits (not null and not zero)
SaturationAdjustmentMagenta: 190 images with edits (not null and not zero)
SaturationAdjustmentPurple: 197 images with edits (not null and not zero

In [48]:
# Keep only the rows that have a camera model recorded (drop NaN camera_model).
has_camera_model = df['camera_model'].notna()
df = df.loc[has_camera_model]


In [49]:
# Exclude any rows where the filename ends with .dng (case-insensitive).
# na=False makes a missing filename count as "does not end with .dng"; without
# it the mask contains NaN and the `~` negation below raises a TypeError.
is_dng = df['filename'].str.lower().str.endswith('.dng', na=False)
df = df[~is_dng]


In [50]:
# Drop rows where all four tone sliders are exactly 0.
# Note: a NaN slider value compares unequal to 0, so a row with any missing
# slider value is NOT treated as all-zero and is kept.
zero_adjust_cols = ['Highlights2012', 'Shadows2012', 'Whites2012', 'Blacks2012']

# Only proceed when every required column is actually present in the frame;
# indexing df with a missing column would raise a KeyError.
existing_cols = [col for col in zero_adjust_cols if col in df.columns]
if len(existing_cols) == len(zero_adjust_cols):
    all_zero_mask = (df[zero_adjust_cols] == 0).all(axis=1)
    num_rows_to_drop = all_zero_mask.sum()
    print(f"Rows with all tone sliders (Highlights2012/Shadows2012/Whites2012/Blacks2012) == 0: {num_rows_to_drop}")
    df = df[~all_zero_mask]
else:
    missing_cols = [col for col in zero_adjust_cols if col not in df.columns]
    print(f"Skipping all-zero tone slider filter; missing columns: {missing_cols}")


Rows with all tone sliders (Highlights2012/Shadows2012/Whites2012/Blacks2012) == 0: 19


In [51]:
# Display the DataFrame as it stands after the filtering steps above.
df

Unnamed: 0,image_id,filename,folder,file_path,capture_time,rating,pick,camera_model,lens,aperture,...,ColorGradeMidtoneSat,ColorGradeMidtoneLum,ColorGradeHighlightLum,ShadowTint,RedHue,RedSaturation,GreenHue,GreenSaturation,BlueHue,BlueSaturation
1,618282,DSC08454.ARW,2025/2025-12-13/,2025/2025-12-13/DSC08454.ARW,2025-12-13T11:49:12.471,,0.0,ILCE-6700,18-50mm F2.8 DC DN | Contemporary 021,2.970854,...,0,0,0,,,,,,,
2,614396,DSC08454.ARW,2025/2025-12-13/,2025/2025-12-13/DSC08454.ARW,2025-12-13T11:49:12.471,,0.0,ILCE-6700,18-50mm F2.8 DC DN | Contemporary 021,2.970854,...,0,0,0,-3.0,23.0,,73.0,,,
3,625919,DSC08445.ARW,2025/2025-12-13/,2025/2025-12-13/DSC08445.ARW,2025-12-13T11:44:00.734,,0.0,ILCE-6700,18-50mm F2.8 DC DN | Contemporary 021,2.970854,...,17,15,0,-11.0,27.0,,72.0,,-13.0,
4,625291,DSC08445.ARW,2025/2025-12-13/,2025/2025-12-13/DSC08445.ARW,2025-12-13T11:44:00.734,,0.0,ILCE-6700,18-50mm F2.8 DC DN | Contemporary 021,2.970854,...,0,0,0,-7.0,24.0,,77.0,,,
5,625126,DSC08445.ARW,2025/2025-12-13/,2025/2025-12-13/DSC08445.ARW,2025-12-13T11:44:00.734,,0.0,ILCE-6700,18-50mm F2.8 DC DN | Contemporary 021,2.970854,...,0,0,0,-14.0,26.0,,75.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3444,295,IMG_8054.CR2,2024/2024-12-29/,2024/2024-12-29/IMG_8054.CR2,2024-12-29T16:05:59.29,,0.0,Canon EOS 600D,EF-S18-55mm f/3.5-5.6 IS II,4.970854,...,0,0,0,,-12.0,,39.0,,-5.0,
3445,294,IMG_8053.CR2,2024/2024-12-29/,2024/2024-12-29/IMG_8053.CR2,2024-12-29T15:59:25.59,,0.0,Canon EOS 600D,EF-S18-55mm f/3.5-5.6 IS II,4.970854,...,0,0,0,-25.0,23.0,,67.0,,,
3446,7485,IMG_8052.CR2,2024/2024-12-29/,2024/2024-12-29/IMG_8052.CR2,2024-12-29T15:59:24.03,,0.0,Canon EOS 600D,EF-S18-55mm f/3.5-5.6 IS II,4.970854,...,0,0,0,-25.0,23.0,,67.0,,,
3447,7484,IMG_8051.CR2,2024/2024-12-29/,2024/2024-12-29/IMG_8051.CR2,2024-12-29T15:59:23.05,,0.0,Canon EOS 600D,EF-S18-55mm f/3.5-5.6 IS II,4.970854,...,0,0,0,-25.0,23.0,,67.0,,,


In [52]:
# Write the processed DataFrame to parquet without the index, then confirm the path.
output_parquet_path = "processed_images.parquet"
df.to_parquet(path=output_parquet_path, index=False)
print("Saved processed DataFrame to " + output_parquet_path)


Saved processed DataFrame to processed_images.parquet
