# 01 - Data Exploration & Profiling

**Objective:** Load the Flickr dataset, understand its schema, and identify data quality issues.

## Schema
| Column | Description |
|--------|-------------|
| id | Photo ID |
| user | Flickr user ID |
| lat, long | GPS coordinates |
| tags | Comma-separated tags |
| title | Photo title |
| date_taken_* | When the photo was taken (minute, hour, day, month, year) |
| date_upload_* | When the photo was uploaded |

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Display settings: show every column and up to 100 characters per cell
# when DataFrames are rendered.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 100)
# IPython magic (not plain Python): draw matplotlib figures inline in the notebook.
%matplotlib inline

print("Libraries loaded successfully!")

## 1. Load Dataset

In [None]:
# Read the raw Flickr CSV into memory and report its size.
DATA_PATH = '../data/flickr_data2.csv'
df = pd.read_csv(DATA_PATH)

# Shape and deep memory footprint (deep=True also counts string contents).
row_total, col_total = df.shape
footprint_mb = df.memory_usage(deep=True).sum() / 1024**2

print(f"Dataset shape: {row_total:,} rows × {col_total} columns")
print(f"\nMemory usage: {footprint_mb:.2f} MB")

In [None]:
# First look at the data: preview the first 10 rows
df.head(10)

In [None]:
# Data types and info: per-column dtype and non-null count, plus memory usage
df.info()

## 2. Missing Values Analysis

In [None]:
# Per-column null counts and percentages, keeping only columns that
# actually have missing values (worst first).
null_counts = df.isnull().sum()
missing = pd.DataFrame({
    'missing_count': null_counts,
    'missing_pct': (null_counts / len(df) * 100).round(2),
})
has_missing = missing['missing_count'] > 0
missing = missing.loc[has_missing].sort_values('missing_pct', ascending=False)

print("=== Missing Values ===")
if missing.empty:
    print("No missing values found!")
else:
    print(missing)

    # Horizontal bar chart of the missing percentage per affected column.
    fig, ax = plt.subplots(figsize=(10, 4))
    missing['missing_pct'].plot(kind='barh', ax=ax, color='coral')
    ax.set_xlabel('Missing %')
    ax.set_title('Missing Values by Column')
    plt.tight_layout()
    plt.show()

## 3. Duplicates Detection

In [None]:
# Three notions of "duplicate", each counted as the number of repeated
# occurrences after the first one.
dup_id = df['id'].duplicated().sum()                                # same photo ID
dup_rows = df.duplicated().sum()                                    # identical rows
dup_coords = df.duplicated(subset=['user', 'lat', 'long']).sum()    # same user, same location

row_total = len(df)
for dup_label, dup_total in (
    ("Duplicate photo IDs", dup_id),
    ("Exact duplicate rows", dup_rows),
    ("Duplicate (user + coordinates)", dup_coords),
):
    print(f"{dup_label}: {dup_total:,} ({dup_total/row_total*100:.2f}%)")

In [None]:
# When duplicate IDs exist, display up to 10 rows belonging to the first
# five repeated IDs so they can be inspected side by side.
if dup_id > 0:
    repeated_id_mask = df['id'].duplicated(keep=False)
    dup_ids = df.loc[repeated_id_mask, 'id'].unique()[:5]
    print("Sample duplicate photo IDs:")
    display(df.loc[df['id'].isin(dup_ids)].head(10))

## 4. GPS Coordinates Analysis

In [None]:
# GPS statistics
lat_col, lon_col = df['lat'], df['long']

print("=== GPS Coordinate Statistics ===")
print(f"\nLatitude range:  [{lat_col.min():.6f}, {lat_col.max():.6f}]")
print(f"Longitude range: [{lon_col.min():.6f}, {lon_col.max():.6f}]")

# Lyon approximate bounding box
LYON_LAT_MIN, LYON_LAT_MAX = 45.55, 45.95
LYON_LON_MIN, LYON_LON_MAX = 4.65, 5.10

# Rows whose latitude or longitude falls outside the bounding box.
# Explicit </> comparisons are False for NaN, so null coordinates are not
# counted here; they are reported separately below.
lat_outside = (lat_col < LYON_LAT_MIN) | (lat_col > LYON_LAT_MAX)
lon_outside = (lon_col < LYON_LON_MIN) | (lon_col > LYON_LON_MAX)
outside_lyon = df[lat_outside | lon_outside]
print(f"\nPoints outside Lyon area: {len(outside_lyon):,} ({len(outside_lyon)/len(df)*100:.2f}%)")

# Rows where either coordinate is missing.
null_coords = df[lat_col.isnull() | lon_col.isnull()]
print(f"Null coordinates: {len(null_coords):,}")

# Rows located exactly at (0, 0).
zero_coords = df[(lat_col == 0) & (lon_col == 0)]
print(f"Zero coordinates (0, 0): {len(zero_coords):,}")

In [None]:
# Side-by-side histograms of latitude and longitude, each with the Lyon
# bounding-box limits drawn as dashed red lines.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

hist_panels = [
    # (column, axis label, bar colour, lower bound, upper bound)
    ('lat', 'Latitude', 'steelblue', LYON_LAT_MIN, LYON_LAT_MAX),
    ('long', 'Longitude', 'darkorange', LYON_LON_MIN, LYON_LON_MAX),
]

for panel_ax, (column, axis_label, bar_color, lower, upper) in zip(axes, hist_panels):
    panel_ax.hist(df[column], bins=100, color=bar_color, alpha=0.7)
    panel_ax.axvline(lower, color='red', linestyle='--', label=f'Lyon min ({lower})')
    panel_ax.axvline(upper, color='red', linestyle='--', label=f'Lyon max ({upper})')
    panel_ax.set_xlabel(axis_label)
    panel_ax.set_ylabel('Frequency')
    panel_ax.set_title(f'{axis_label} Distribution')
    panel_ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Scatter plot of photo locations with the Lyon bounding box overlaid.
fig, ax = plt.subplots(figsize=(10, 10))

# Plot at most 50,000 randomly chosen rows (fixed seed) to keep rendering fast.
sample = df.sample(n=min(50000, len(df)), random_state=42)
ax.scatter(sample['long'], sample['lat'], alpha=0.1, s=1, c='blue')

# Bounding box: horizontal lines for the latitude limits, vertical lines
# for the longitude limits.
for lat_limit in (LYON_LAT_MIN, LYON_LAT_MAX):
    ax.axhline(lat_limit, color='red', linestyle='--', alpha=0.5)
for lon_limit in (LYON_LON_MIN, LYON_LON_MAX):
    ax.axvline(lon_limit, color='red', linestyle='--', alpha=0.5)

ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
ax.set_title('Photo Locations (50K sample)')
plt.show()

## 5. Date Analysis

In [None]:
# The "taken" timestamp is stored as separate components; report the value
# range and null count of each component present in the dataset.
date_cols = [
    'date_taken_year',
    'date_taken_month',
    'date_taken_day',
    'date_taken_hour',
    'date_taken_minute',
]

print("=== Date Component Statistics ===")
for col in date_cols:
    if col not in df.columns:
        continue
    component = df[col]
    print(f"{col}: min={component.min()}, max={component.max()}, null={component.isnull().sum()}")

In [None]:
# Create datetime column
def create_datetime(row):
    """Build a ``datetime`` from a row's ``date_taken_*`` components.

    Args:
        row: Mapping-like object (for example a DataFrame row) indexed by
            ``date_taken_year``, ``date_taken_month``, ``date_taken_day``,
            ``date_taken_hour`` and ``date_taken_minute``.

    Returns:
        A naive ``datetime`` for the given components, or ``None`` when a
        component is absent, not convertible to ``int`` (None, NaN,
        infinity, non-numeric text) or outside the valid calendar range.
    """
    try:
        return datetime(
            int(row['date_taken_year']),
            int(row['date_taken_month']),
            int(row['date_taken_day']),
            int(row['date_taken_hour']),
            int(row['date_taken_minute'])
        )
    except (KeyError, TypeError, ValueError, OverflowError):
        # Only bad-data errors map to None. A bare ``except`` would also hide
        # KeyboardInterrupt/SystemExit and genuine programming mistakes.
        return None

# Try the row-wise conversion on the first 1000 rows only and count how
# many of them fail to produce a datetime.
first_rows = df.iloc[:1000]
sample_dates = first_rows.apply(create_datetime, axis=1)
invalid_dates = sample_dates.isna().sum()
print(f"Invalid dates in first 1000 rows: {invalid_dates}")

# Flag implausible years across the whole dataset.
taken_year = df['date_taken_year']
future_year_total = (taken_year > 2026).sum()
old_year_total = (taken_year < 2000).sum()
print(f"\nYear range: {taken_year.min()} - {taken_year.max()}")
print(f"Future dates (year > 2026): {future_year_total}")
print(f"Very old dates (year < 2000): {old_year_total}")

In [None]:
# Bar chart of the number of photos per "taken" year, in chronological order.
fig, ax = plt.subplots(figsize=(12, 4))
photos_by_year = df['date_taken_year'].value_counts().sort_index()
photos_by_year.plot.bar(ax=ax, color='teal')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Photos')
ax.set_title('Photos by Year Taken')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Month distribution
fig, ax = plt.subplots(figsize=(10, 4))

# The chart uses twelve fixed Jan-Dec tick labels, so there must be exactly
# twelve bars in calendar order. Reindex the counts onto months 1-12: a
# month with no photos becomes a zero-height bar instead of shifting the
# labels or making set_xticklabels fail on a length mismatch.
taken_month = pd.to_numeric(df['date_taken_month'], errors='coerce')
month_counts = taken_month.value_counts().reindex(range(1, 13), fill_value=0)

# Values outside 1-12 cannot be placed on a calendar axis; report them
# rather than dropping them silently.
out_of_range_months = int((taken_month.notna() & ~taken_month.between(1, 12)).sum())
if out_of_range_months:
    print(f"Months outside 1-12 (excluded from chart): {out_of_range_months:,}")

month_counts.plot(kind='bar', ax=ax, color='purple')
ax.set_xlabel('Month')
ax.set_ylabel('Number of Photos')
ax.set_title('Photos by Month (Seasonality)')
ax.set_xticklabels(['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 
                   'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'], rotation=0)
plt.tight_layout()
plt.show()

## 6. Text Analysis (Tags & Titles)

In [None]:
# Tags analysis: a photo is considered untagged when its tags value is
# null or an empty string.
print("=== Tags Analysis ===")
tags_col = df['tags']
empty_tags = tags_col.isna() | tags_col.eq('')
untagged_total = empty_tags.sum()
print(f"Photos without tags: {untagged_total:,} ({untagged_total/len(df)*100:.1f}%)")

# Count tags per photo
def count_tags(tag_str):
    """Return the number of non-empty comma-separated tags in ``tag_str``.

    Tokens that are empty or whitespace-only (from doubled or trailing
    commas, or a blank value) are not counted, matching how the tag
    frequency analysis skips blank tokens.

    Args:
        tag_str: Raw tags value; may be a string, ``None`` or NaN.

    Returns:
        The tag count as an ``int``; 0 for missing or blank input.
    """
    if pd.isna(tag_str):
        return 0
    return sum(1 for token in str(tag_str).split(',') if token.strip())

# Store the per-photo tag count and summarise its distribution.
df['tag_count'] = df['tags'].map(count_tags)
tags_per_photo = df['tag_count']
print(f"\nTags per photo: mean={tags_per_photo.mean():.1f}, median={tags_per_photo.median():.0f}, max={tags_per_photo.max()}")

In [None]:
# Most common tags: flatten every photo's tag list into one list of
# normalised (stripped, lower-cased) tags, skipping blank tokens.
from collections import Counter

all_tags = [
    token.strip().lower()
    for raw_tags in df['tags'].dropna()
    for token in str(raw_tags).split(',')
    if token.strip()
]

tag_counts = Counter(all_tags)
print("\n=== Top 30 Tags ===")
for tag, count in tag_counts.most_common(30):
    print(f"{tag}: {count:,}")

In [None]:
# Title analysis: a photo is considered untitled when its title is null
# or an empty string.
print("\n=== Title Analysis ===")
empty_titles = df['title'].isnull() | (df['title'] == '')
print(f"Photos without title: {empty_titles.sum():,} ({empty_titles.sum()/len(df)*100:.1f}%)")

# Title length in characters (0 for missing titles).
# Cast to str before measuring: a CSV column can hold non-string values
# (e.g. a purely numeric title parsed as a number), and len() on those
# raises TypeError. The vectorised .str.len() also avoids a Python-level
# call per row.
df['title_len'] = df['title'].fillna('').astype(str).str.len()
print(f"Title length: mean={df['title_len'].mean():.1f}, median={df['title_len'].median():.0f}, max={df['title_len'].max()}")

In [None]:
# Sample titles: print up to 20 randomly chosen non-empty titles
# (fixed seed so the selection is reproducible).
print("\n=== Sample Titles ===")
non_empty_titles = df.loc[df['title'].notna() & (df['title'] != ''), 'title']
# Cap the sample size at the number of available titles; asking for more
# rows than exist makes Series.sample raise ValueError.
sample_titles = non_empty_titles.sample(min(20, len(non_empty_titles)), random_state=42)
for i, title in enumerate(sample_titles, 1):
    print(f"{i}. {title}")

## 7. User Analysis

In [None]:
# User statistics: how many distinct users there are and how photos are
# distributed across them.
photos_per_user = df['user'].value_counts()

user_total = df['user'].nunique()
mean_photos = photos_per_user.mean()
median_photos = photos_per_user.median()
top_user_photos = photos_per_user.max()
one_photo_users = photos_per_user.eq(1).sum()

print("=== User Statistics ===")
print(f"Unique users: {user_total:,}")
print(f"\nPhotos per user: mean={mean_photos:.1f}, median={median_photos:.0f}")
print(f"Max photos by single user: {top_user_photos:,}")
print(f"Users with only 1 photo: {one_photo_users:,}")

In [None]:
# Top users: value_counts() is already sorted by descending count, so the
# first ten entries are the ten most prolific users.
print("\n=== Top 10 Users by Photo Count ===")
print(photos_per_user.iloc[:10])

## 8. Summary of Issues Found

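Run the cell below to print a summary report. It reuses variables computed in Sections 3, 4 and 6 (duplicate counts, coordinate checks, empty tags/titles), so run those cells first.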

In [None]:
# Summary report: gather the findings of the earlier sections into one
# list of issue lines, print it, then print the planned cleaning steps.
banner = "=" * 60

print(banner)
print("DATA QUALITY ISSUES SUMMARY")
print(banner)

# Dates
future = (df['date_taken_year'] > 2026).sum()
old = (df['date_taken_year'] < 2000).sum()

# Checks reported whenever at least one offending row exists, listed in
# report order: duplicates, GPS, dates.
count_checks = [
    (dup_id, "⚠️ Duplicate photo IDs: {:,}"),
    (dup_rows, "⚠️ Exact duplicate rows: {:,}"),
    (len(null_coords), "⚠️ Null coordinates: {:,}"),
    (len(zero_coords), "⚠️ Zero (0,0) coordinates: {:,}"),
    (len(outside_lyon), "⚠️ Points outside Lyon bbox: {:,}"),
    (future, "⚠️ Future dates (>2026): {:,}"),
    (old, "⚠️ Very old dates (<2000): {:,}"),
]
issues = [template.format(found) for found, template in count_checks if found > 0]

# Text: only reported when more than 10% of the photos are affected.
share_checks = [
    (empty_tags.sum(), "ℹ️ Photos without tags: {:,} ({:.1f}%)"),
    (empty_titles.sum(), "ℹ️ Photos without title: {:,} ({:.1f}%)"),
]
for blank_total, template in share_checks:
    if blank_total > len(df) * 0.1:
        issues.append(template.format(blank_total, blank_total / len(df) * 100))

if issues:
    print("\n".join(issues))
else:
    print("✅ No major issues found!")

print("\n" + banner)
print("NEXT STEPS: Data Cleaning")
print(banner)
for next_step in (
    "1. Remove duplicate rows/IDs",
    "2. Filter invalid GPS coordinates",
    "3. Handle date parsing issues",
    "4. Create cleaned Parquet file for efficiency",
):
    print(next_step)

---

## Save Profiling Results

In [None]:
# Save intermediate results as a JSON report.
import json
from pathlib import Path

# pandas reductions such as .sum() return NumPy scalars. np.int64 is not
# JSON-serialisable, so with ``default=str`` those counts were written as
# strings ("123") instead of numbers. Cast everything to built-in
# int/float so the report contains real JSON numbers.
profiling_stats = {
    'total_rows': len(df),
    'unique_users': int(df['user'].nunique()),
    'duplicate_ids': int(dup_id),
    'duplicate_rows': int(dup_rows),
    'null_coords': len(null_coords),
    'outside_lyon': len(outside_lyon),
    'empty_tags': int(empty_tags.sum()),
    'empty_titles': int(empty_titles.sum()),
    'year_range': (int(df['date_taken_year'].min()), int(df['date_taken_year'].max())),
    'lat_range': (float(df['lat'].min()), float(df['lat'].max())),
    'lon_range': (float(df['long'].min()), float(df['long'].max())),
}

# Create the output directory if needed so a fresh checkout does not fail
# with FileNotFoundError.
report_path = Path('../reports/profiling_stats.json')
report_path.parent.mkdir(parents=True, exist_ok=True)

with report_path.open('w', encoding='utf-8') as f:
    # default=str stays only as a last-resort fallback for unexpected
    # types; every value above is already a native JSON type.
    json.dump(profiling_stats, f, indent=2, default=str)

print("Profiling stats saved to reports/profiling_stats.json")