# %%
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
import pickle
import sys
sys.path.append('..')
from config import CONFIG

# %%
# Load the raw ml-100k tables. Column names are passed explicitly, so pandas
# treats every line of each file as data rather than as a header row.
rating_columns = ['user_id', 'item_id', 'rating', 'timestamp']
user_columns = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
item_columns = [
    # Descriptive metadata
    'item_id', 'title', 'release_date', 'video_release', 'imdb_url',
    # One column per genre flag
    'unknown', 'Action', 'Adventure', 'Animation', 'Children',
    'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
    'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance',
    'Sci-Fi', 'Thriller', 'War', 'Western',
]

# Tab-separated ratings
ratings = pd.read_csv(
    '../data/raw/ml-100k/u.data',
    sep='\t',
    names=rating_columns,
)

# Pipe-separated user table
users = pd.read_csv(
    '../data/raw/ml-100k/u.user',
    sep='|',
    names=user_columns,
)

# Pipe-separated item table; decoded as latin-1
items = pd.read_csv(
    '../data/raw/ml-100k/u.item',
    sep='|',
    encoding='latin-1',
    names=item_columns,
)

# %%
# Process user features: bucket ages, integer-encode the categorical columns,
# and collect the encoded columns into `user_features`.
print("Processing user features...")

# Age binning.
# Bins are left-closed ([0, 18), [18, 25), ..., [55, inf)) so that the bin
# contents agree with the labels: an 18-year-old lands in '18-25', not '<18'.
# The open-ended last edge keeps every non-negative age inside a bin instead
# of producing NaN for values at or beyond the former upper bound of 100.
users['age_bin'] = pd.cut(
    users['age'],
    bins=[0, 18, 25, 35, 45, 55, np.inf],
    right=False,
    labels=['<18', '18-25', '25-35', '35-45', '45-55', '55+'],
)

# Encode categorical features.
# One encoder per column; they are kept as separate objects because they are
# pickled later so the same mapping can be re-applied or inverted.
le_gender = LabelEncoder()
le_occupation = LabelEncoder()
le_age = LabelEncoder()

users['gender_enc'] = le_gender.fit_transform(users['gender'])
users['occupation_enc'] = le_occupation.fit_transform(users['occupation'])
# NOTE: LabelEncoder numbers classes in sorted label order, so 'age_enc' is a
# plain category id and does NOT increase with age ('<18' sorts last).
users['age_enc'] = le_age.fit_transform(users['age_bin'])

# Create user feature matrix (copy so later edits do not touch `users`)
user_features = users[['user_id', 'gender_enc', 'occupation_enc', 'age_enc']].copy()
print(f"User features shape: {user_features.shape}")

# %%
# Build the item feature table: the item id plus the selected genre columns.
print("\nProcessing item features...")

# Genre columns carried into the feature table ('unknown' is not included).
genre_cols = [
    'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime',
    'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
    'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]

# Copy so that later changes to the feature table never touch `items`.
item_features = items.loc[:, ['item_id', *genre_cols]].copy()
print(f"Item features shape: {item_features.shape}")

# %%
# Prepare train/validation/test split
print("\nSplitting data...")

# Sort by timestamp for temporal split. A stable sort keeps rows that share a
# timestamp in their original order, so the split is reproducible.
ratings_sorted = ratings.sort_values('timestamp', kind='mergesort')

# Split: 70% train, 15% validation, 15% test.
# shuffle=False is required for the split to be temporal: train_test_split
# shuffles by default, which would discard the ordering established above and
# leak future ratings into the training set. Without shuffling, the earliest
# 70% of ratings become train, the next 15% validation, and the latest 15% test.
train_data, temp_data = train_test_split(ratings_sorted, test_size=0.3, shuffle=False)
val_data, test_data = train_test_split(temp_data, test_size=0.5, shuffle=False)

print(f"Train size: {len(train_data):,}")
print(f"Validation size: {len(val_data):,}")
print(f"Test size: {len(test_data):,}")

# %%
# Simulate cold-start users in test set: flag every test rating whose user has
# fewer than `cold_start_threshold` ratings in the training data.
print("\nSimulating cold-start scenario...")

cold_start_threshold = 5

# Count training ratings per user. groupby only yields users that appear in
# train, so the counts are re-indexed over train AND test users with 0 as the
# fill value; otherwise users with no training ratings at all (the coldest
# ones) would never be flagged.
train_user_counts = train_data.groupby('user_id').size()
all_split_users = train_user_counts.index.union(test_data['user_id'].unique())
train_user_counts = train_user_counts.reindex(all_split_users, fill_value=0)
cold_users = train_user_counts[train_user_counts < cold_start_threshold].index.tolist()

# Mark cold users in test. assign() returns a new frame, which avoids writing
# a column into the object returned by the split (SettingWithCopyWarning).
test_data = test_data.assign(is_cold_user=test_data['user_id'].isin(cold_users))
print(f"Cold-start users in test: {test_data['is_cold_user'].sum()} ratings")

# %%
# Create interaction matrix for training
print("\nCreating interaction matrix...")

# Raw ids are used directly as row/column indices, so each axis needs
# max_id + 1 entries.
n_users = ratings['user_id'].max() + 1
n_items = ratings['item_id'].max() + 1

# Dense user x item matrix holding the rating value (0 where the pair has no
# training rating). Filled with a single vectorised assignment instead of a
# Python-level row loop; if a (user, item) pair occurs more than once, the last
# occurrence wins, exactly as with sequential assignment.
train_matrix = np.zeros((n_users, n_items))
train_matrix[
    train_data['user_id'].to_numpy(),
    train_data['item_id'].to_numpy(),
] = train_data['rating'].to_numpy()

print(f"Interaction matrix shape: {train_matrix.shape}")

# %%
# Save processed data: the three splits, both feature tables, the fitted
# encoders together with the matrix dimensions, and the training matrix.
import os

print("\nSaving processed data...")

# Make sure the output directory exists; to_csv / open / np.save do not
# create missing parent directories and would fail on a fresh checkout.
os.makedirs('../data/processed', exist_ok=True)

# Save dataframes
train_data.to_csv('../data/processed/train.csv', index=False)
val_data.to_csv('../data/processed/val.csv', index=False)
test_data.to_csv('../data/processed/test.csv', index=False)
user_features.to_csv('../data/processed/user_features.csv', index=False)
item_features.to_csv('../data/processed/item_features.csv', index=False)

# Save encoders and matrix dimensions so downstream code can reuse the same
# label mappings and allocate matrices of the same shape.
with open('../data/processed/encoders.pkl', 'wb') as f:
    pickle.dump({
        'le_gender': le_gender,
        'le_occupation': le_occupation,
        'le_age': le_age,
        'n_users': n_users,
        'n_items': n_items,
    }, f)

np.save('../data/processed/train_matrix.npy', train_matrix)

# The check mark was previously stored as mis-decoded UTF-8 bytes.
print("✓ Feature engineering complete!")
print("\nFiles saved to ../data/processed/")