# # 1. Data Exploration
# Explore MovieLens 100K dataset

# %%
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Make the parent directory importable so the project-level config resolves.
sys.path.append('..')
from config import CONFIG

# %%
# Load raw data: read the three MovieLens 100K tables into DataFrames.
# Column names are supplied explicitly, so no row of any file is treated as a header.
rating_columns = ['user_id', 'item_id', 'rating', 'timestamp']
user_columns = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
item_info_columns = ['item_id', 'title', 'release_date', 'video_release', 'imdb_url']
# One column per genre label (presumably 0/1 membership flags -- confirm against
# the dataset README).
genre_columns = [
    'unknown', 'Action', 'Adventure', 'Animation', 'Children',
    'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
    'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance',
    'Sci-Fi', 'Thriller', 'War', 'Western',
]

# u.data is tab-separated.
ratings = pd.read_csv(
    '../data/raw/ml-100k/u.data',
    sep='\t',
    names=rating_columns,
)
# u.user is pipe-separated.
users = pd.read_csv(
    '../data/raw/ml-100k/u.user',
    sep='|',
    names=user_columns,
)
# u.item is pipe-separated and decoded as latin-1.
items = pd.read_csv(
    '../data/raw/ml-100k/u.item',
    sep='|',
    encoding='latin-1',
    names=item_info_columns + genre_columns,
)

# %%
# Basic statistics: print headline counts for the ratings table and the
# sparsity of the implied user x item matrix.
banner = "=" * 50
n_ratings = len(ratings)
n_users = ratings['user_id'].nunique()
n_items = ratings['item_id'].nunique()

print(banner)
print("DATASET OVERVIEW")
print(banner)
print(f"Total ratings: {n_ratings:,}")
print(f"Total users: {n_users}")
print(f"Total items: {n_items}")
# Sparsity = share of (user, item) pairs that have no rating.
sparsity = 1 - n_ratings / (n_users * n_items)
print(f"Sparsity: {sparsity:.2%}")


# %%
# Rating distribution: three side-by-side panels describing how ratings are
# spread, saved to ../experiments/results/data_distribution.png and then shown.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# Rating values: bar chart of how often each rating value occurs, ordered by
# rating value (sort_index) rather than by frequency.
ratings['rating'].value_counts().sort_index().plot(kind='bar', ax=axes[0])
axes[0].set_title('Rating Distribution')
axes[0].set_xlabel('Rating')

# Ratings per user: histogram of the number of ratings each user_id has.
ratings.groupby('user_id').size().plot(kind='hist', bins=50, ax=axes[1])
axes[1].set_title('Ratings per User')
axes[1].set_xlabel('Number of Ratings')

# Ratings per item: histogram of the number of ratings each item_id has.
ratings.groupby('item_id').size().plot(kind='hist', bins=50, ax=axes[2])
axes[2].set_title('Ratings per Item')
axes[2].set_xlabel('Number of Ratings')

plt.tight_layout()
# savefig does not create missing directories; make sure the output folder
# exists so a fresh checkout does not fail with FileNotFoundError.
Path('../experiments/results').mkdir(parents=True, exist_ok=True)
plt.savefig('../experiments/results/data_distribution.png')
plt.show()

# %%
# User demographics: three side-by-side panels built from the users table,
# saved to ../experiments/results/user_demographics.png and then shown.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# Age distribution: histogram of the 'age' column in 20 bins.
users['age'].plot(kind='hist', bins=20, ax=axes[0])
axes[0].set_title('Age Distribution')

# Gender distribution: bar chart of the count of each 'gender' value.
users['gender'].value_counts().plot(kind='bar', ax=axes[1])
axes[1].set_title('Gender Distribution')

# Top occupations: the 10 most frequent 'occupation' values
# (value_counts sorts by descending count, head(10) keeps the first ten).
users['occupation'].value_counts().head(10).plot(kind='bar', ax=axes[2])
axes[2].set_title('Top 10 Occupations')

plt.tight_layout()
# savefig does not create missing directories; make sure the output folder
# exists so a fresh checkout does not fail with FileNotFoundError.
Path('../experiments/results').mkdir(parents=True, exist_ok=True)
plt.savefig('../experiments/results/user_demographics.png')
plt.show()

# %%
# Cold start analysis: find the users that have fewer than 5 ratings and
# report how many there are.
user_rating_counts = ratings.groupby('user_id').size()
is_cold = user_rating_counts < 5
cold_users = user_rating_counts[is_cold].index
n_cold = len(cold_users)
# The percentage is taken over the rows of the users table, not over the
# distinct user_ids that appear in ratings.
cold_pct = n_cold / len(users) * 100
print(f"\nCold-start users (<5 ratings): {n_cold} ({cold_pct:.1f}%)")

# %%
# Save processed info: write the headline dataset counts to
# ../experiments/results/data_summary.csv (one row per summary key).
summary = {
    'n_users': ratings['user_id'].nunique(),
    'n_items': ratings['item_id'].nunique(),
    'n_ratings': len(ratings),
    'cold_users': len(cold_users),
}
# to_csv does not create missing directories; make sure the output folder
# exists before writing.
Path('../experiments/results').mkdir(parents=True, exist_ok=True)
pd.Series(summary).to_csv('../experiments/results/data_summary.csv')
# The message previously contained mojibake (the UTF-8 bytes of the check mark
# read as cp1252). The escape \u2713 keeps the source ASCII-only so the
# character cannot be mangled again by a re-encoding of this file.
print("\n\u2713 Data exploration complete!")

# %%