# Data Loader Test

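This notebook runs the data cleaning pipeline on the raw photo dataset (corrupted-date removal, de-duplication, Lyon bounding-box filter), checks the cleaned result, and writes a before/after comparison report to `reports/data_cleaning_report.md`.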

In [2]:
import sys
from datetime import datetime
from pathlib import Path

import pandas as pd
from IPython.display import Markdown

# Add project root to path
# Walk up from the working directory to the nearest folder that holds the
# `src/data_loader` module (as a file or as a package), so the notebook works
# whether the kernel starts in the repository root or in a sub-directory.
# When no such folder is found, fall back to the parent of the working directory.
_cwd = Path.cwd()
PROJECT_ROOT = next(
    (
        candidate
        for candidate in (_cwd, *_cwd.parents)
        if (candidate / "src" / "data_loader.py").is_file()
        or (candidate / "src" / "data_loader" / "__init__.py").is_file()
    ),
    _cwd.parent,
)

# Prepend only when the root is not already the first entry, so re-running
# this cell does not keep stacking duplicate entries onto sys.path.
if sys.path[:1] != [str(PROJECT_ROOT)]:
    sys.path.insert(0, str(PROJECT_ROOT))

from src.data_loader import (
    load_raw_data,
    remove_corrupted_dates,
    remove_duplicates,
    filter_lyon_bbox,
    get_data_stats,
    LYON_BBOX
)

## 1. Load Raw Data

In [3]:
# Load the unprocessed dataset through the project loader (called with no
# arguments), print how many rows came back, and preview the first rows.
# NOTE(review): the saved output shows a warning raised by the loader's
# pd.read_csv call — presumably a mixed-dtype warning; confirm in src.data_loader.
df_raw = load_raw_data()
print(f"Raw data: {len(df_raw):,} rows")
df_raw.head()

Raw data: 420,240 rows


  df = pd.read_csv(RAW_DATA_PATH, nrows=nrows)


Unnamed: 0,id,user,lat,long,tags,title,date_taken_minute,date_taken_hour,date_taken_day,date_taken_month,date_taken_year,date_upload_minute,date_upload_hour,date_upload_day,date_upload_month,date_upload_year
0,4395181099,30624617@N03,45.754858,4.82171,"chair,lyon,rhône,chaise,rhônealpes",Chaises avec vue,11.0,15,28,2,2010,23,20,28.0,2,2010.0
1,4394748717,35853470@N00,45.75327,4.862953,,,51.0,17,28,2,2010,52,17,28.0,2,2010.0
2,4394694699,11817998@N05,45.760655,4.846564,"365,iphone",59/365 - R46 V103 B163,29.0,17,28,2,2010,33,17,28.0,2,2010.0
3,4394803790,11545749@N06,45.784,4.874072,"nin,nineinchnails,gift,screening,toiou,avott",2010-01-29 Toiou Avott Lyon,15.0,20,28,1,2010,38,12,28.0,2,2010.0
4,4394803554,11545749@N06,45.784,4.874072,"lyon,nin,nineinchnails,gift,screening,toiou,avott",2010-01-28 Toiou Avott Lyon,10.0,20,28,1,2010,38,12,28.0,2,2010.0


## 2. Before Cleaning: Data Quality Issues

In [4]:
# Check date component ranges
# Print the smallest and largest value of each date-taken component in the raw
# table; values outside the calendar range expose corrupted rows.
print("Date component ranges in raw data:")
for label, column in (
    ("Year", "date_taken_year"),
    ("Month", "date_taken_month"),
    ("Day", "date_taken_day"),
    ("Hour", "date_taken_hour"),
):
    tag = f"{label}:"
    values = df_raw[column]
    # Pad the label to 7 characters so the min/max values line up in a column.
    print(f"  {tag:<7} {values.min()} - {values.max()}")

Date component ranges in raw data:
  Year:   1 - 2238
  Month:  1 - 2011
  Day:    1 - 2013
  Hour:   0 - 2013


In [5]:
# Count invalid date values
raw_issues = {
    'Invalid months (>12)': int((df_raw['date_taken_month'] > 12).sum()),
    'Invalid days (>31)': int((df_raw['date_taken_day'] > 31).sum()),
    'Invalid hours (>23)': int((df_raw['date_taken_hour'] > 23).sum()),
    'Invalid years (<1990 or >2025)': int(((df_raw['date_taken_year'] < 1990) | (df_raw['date_taken_year'] > 2025)).sum()),
    'Duplicate photos': len(df_raw) - df_raw['id'].nunique()
}

pd.DataFrame.from_dict(raw_issues, orient='index', columns=['Count'])

Unnamed: 0,Count
Invalid months (>12),84
Invalid days (>31),2
Invalid hours (>23),92
Invalid years (<1990 or >2025),414
Duplicate photos,252143


## 3. Apply Cleaning Pipeline

In [6]:
# Run the three cleaning stages in order. Every intermediate frame
# (df_step1, df_step2, df_clean) is kept under its own name so the row loss of
# each stage can be printed here and reused by the report cell.

# Step 1: Remove corrupted dates
df_step1 = remove_corrupted_dates(df_raw)
print(f"Step 1 - Remove corrupted dates: {len(df_raw):,} -> {len(df_step1):,} (removed {len(df_raw) - len(df_step1):,})")

# Step 2: Remove duplicates
# remove_duplicates returns a pair: the de-duplicated frame and the number of rows it dropped.
df_step2, dup_removed = remove_duplicates(df_step1)
print(f"Step 2 - Remove duplicates:      {len(df_step1):,} -> {len(df_step2):,} (removed {dup_removed:,})")

# Step 3: Filter to Lyon bbox
df_clean = filter_lyon_bbox(df_step2)
print(f"Step 3 - Lyon bbox filter:       {len(df_step2):,} -> {len(df_clean):,} (removed {len(df_step2) - len(df_clean):,})")

# Share of the raw rows that survived all three stages.
print(f"\nFinal: {len(df_clean):,} rows ({len(df_clean)/len(df_raw)*100:.1f}% of original)")

Step 1 - Remove corrupted dates: 420,240 -> 419,826 (removed 414)
Step 2 - Remove duplicates:      419,826 -> 167,954 (removed 251,872)
Step 3 - Lyon bbox filter:       167,954 -> 167,954 (removed 0)

Final: 167,954 rows (40.0% of original)


## 4. After Cleaning: Data Quality Check

In [7]:
# Check date component ranges after cleaning
# Print the min/max of each date-taken component and fail loudly if a value
# above its calendar maximum survived cleaning. The upper bounds are the same
# ones used to count invalid months, days and hours in the raw data; the year
# is printed only, because its valid window is decided by the cleaning code.
print("Date component ranges after cleaning:")
for label, column, upper_bound in (
    ("Year", "date_taken_year", None),
    ("Month", "date_taken_month", 12),
    ("Day", "date_taken_day", 31),
    ("Hour", "date_taken_hour", 23),
):
    tag = f"{label}:"
    values = df_clean[column]
    # Pad the label to 7 characters so the min/max values line up in a column.
    print(f"  {tag:<7} {values.min()} - {values.max()}")
    if upper_bound is not None:
        assert not (values > upper_bound).any(), (
            f"{label} values above {upper_bound} survived cleaning"
        )

Date component ranges after cleaning:
  Year:   1991 - 2019
  Month:  1 - 12
  Day:    1 - 31
  Hour:   0 - 23


In [8]:
# Final statistics
# get_data_stats returns a mapping of summary figures for the cleaned frame.
# It is displayed here one entry per row and read again by key
# (e.g. 'unique_photos', 'year_range') in the comparison and report cells.
stats = get_data_stats(df_clean)
pd.DataFrame.from_dict(stats, orient='index', columns=['Value'])

Unnamed: 0,Value
total_rows,167954
unique_photos,167954
unique_users,5145
lat_range,"(45.6552, 45.85495)"
lon_range,"(4.720312, 5.006709)"
year_range,"(1991, 2019)"
null_coords,0
empty_tags,41977
empty_titles,15776


## 5. Before vs After Comparison

In [9]:
# Side-by-side summary of the dataset before and after cleaning.
# Each entry is one table row, written as [before, after]; the values are
# pre-formatted strings so counts show thousands separators.
comparison_rows = {
    'Total rows': [
        f"{len(df_raw):,}",
        f"{len(df_clean):,}",
    ],
    'Unique photos': [
        f"{df_raw['id'].nunique():,}",
        f"{stats['unique_photos']:,}",
    ],
    'Unique users': [
        f"{df_raw['user'].nunique():,}",
        f"{stats['unique_users']:,}",
    ],
    'Year range': [
        f"{int(df_raw['date_taken_year'].min())} - {int(df_raw['date_taken_year'].max())}",
        f"{stats['year_range'][0]} - {stats['year_range'][1]}",
    ],
    'Month range': [
        f"{int(df_raw['date_taken_month'].min())} - {int(df_raw['date_taken_month'].max())}",
        f"{int(df_clean['date_taken_month'].min())} - {int(df_clean['date_taken_month'].max())}",
    ],
}
comparison = pd.DataFrame.from_dict(
    comparison_rows, orient='index', columns=['Before', 'After']
)

comparison

Unnamed: 0,Before,After
Total rows,420240,167954
Unique photos,168097,167954
Unique users,5158,5145
Year range,1 - 2238,1991 - 2019
Month range,1 - 2011,1 - 12


## 6. Generate Report

In [10]:
# Build the Markdown cleaning report from the frames and statistics computed
# in the earlier cells, then write it to <PROJECT_ROOT>/reports.
# The timestamp is the local wall-clock time at which this cell runs.
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

# NOTE(review): the three ✅ lines in the conclusion are static text; this cell
# does not derive them from any check on df_clean.
report = f"""# Data Cleaning Report

**Generated:** {timestamp}

## Summary

| Metric | Before | After | Change |
|--------|--------|-------|--------|
| Total rows | {len(df_raw):,} | {len(df_clean):,} | -{len(df_raw) - len(df_clean):,} ({(len(df_raw) - len(df_clean)) / len(df_raw) * 100:.1f}%) |
| Unique photos | {df_raw['id'].nunique():,} | {stats['unique_photos']:,} | - |
| Unique users | {df_raw['user'].nunique():,} | {stats['unique_users']:,} | - |

## Cleaning Steps

1. **Remove corrupted dates:** {len(df_raw):,} → {len(df_step1):,} (removed {len(df_raw) - len(df_step1):,})
2. **Remove duplicates:** {len(df_step1):,} → {len(df_step2):,} (removed {dup_removed:,})
3. **Lyon bbox filter:** {len(df_step2):,} → {len(df_clean):,} (removed {len(df_step2) - len(df_clean):,})

## Date Range Comparison

| Component | Before | After |
|-----------|--------|-------|
| Year | {int(df_raw['date_taken_year'].min())} – {int(df_raw['date_taken_year'].max())} | {stats['year_range'][0]} – {stats['year_range'][1]} |
| Month | {int(df_raw['date_taken_month'].min())} – {int(df_raw['date_taken_month'].max())} | {int(df_clean['date_taken_month'].min())} – {int(df_clean['date_taken_month'].max())} |

## Conclusion

- ✅ All date values now valid
- ✅ No duplicate photos
- ✅ All coordinates in Lyon bbox
- ⚠️ {stats['empty_tags']:,} photos ({stats['empty_tags']/len(df_clean)*100:.0f}%) have no tags
- ⚠️ {stats['empty_titles']:,} photos ({stats['empty_titles']/len(df_clean)*100:.0f}%) have no title
"""

# Save report
report_path = PROJECT_ROOT / "reports" / "data_cleaning_report.md"
# Create the reports folder on first run instead of failing with FileNotFoundError.
report_path.parent.mkdir(parents=True, exist_ok=True)
# The report contains non-ASCII characters (arrows, dashes, emoji); write it as
# UTF-8 explicitly so the result does not depend on the platform's default encoding.
report_path.write_text(report, encoding="utf-8")
print(f"Report saved to: {report_path}")

Report saved to: /Users/diegoaquino/IF4/DataMining/grandlyon-photo-clusters/reports/data_cleaning_report.md


In [None]:
# Render the report text as formatted Markdown so it can be reviewed inline
# without opening the saved file.
Markdown(report)