In [23]:
#Data set: MovieLens 100k (https://grouplens.org/datasets/movielens/100k/)
import pandas as pd
ratings_df = pd.read_csv('../data/ml-100k/u.data', sep='\t', names=['user_id', 'movie_id', 'rating', 'timestamp'], header=None)


movies_df = pd.read_csv('../data/ml-100k/u.item', sep='|', names=['movie_id', 'movie_title', 'release_date', 'video_release_date', 'IMDb_URL'] + 
                        ['unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime', 'Documentary', 'Drama', 
                         'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'], 
                        header=None, encoding='latin-1')

users_df = pd.read_csv('../data/ml-100k/u.user', sep='|', names=['user_id', 'age', 'gender', 'occupation', 'zip_code'], header=None)


In [24]:
ratings_df['timestamp'] = pd.to_datetime(ratings_df['timestamp'], unit='s')
ratings_df['rating_year'] = ratings_df['timestamp'].dt.year
ratings_df['rating_month'] = ratings_df['timestamp'].dt.month
ratings_df['rating_dayofweek'] = ratings_df['timestamp'].dt.day_name()

In [25]:
movies_df['release_date'] = pd.to_datetime(movies_df['release_date'], errors='coerce')
movies_df['release_year'] = movies_df['release_date'].dt.year
movies_df['release_month'] = movies_df['release_date'].dt.month
movies_df['release_dayofweek'] = movies_df['release_date'].dt.day_name()

In [26]:
genre_cols = ['Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime', 'Documentary', 'Drama', 
              'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
movies_df = movies_df[['movie_id', 'release_year', 'release_month', 'release_dayofweek'] + genre_cols]

In [41]:
merged_df = (
    ratings_df
    .merge(movies_df, on='movie_id', how='left')
    .merge(users_df[['user_id', 'age', 'gender', 'occupation']], on='user_id', how='left')
)

In [42]:
merged_df = pd.get_dummies(merged_df, columns=['rating_dayofweek', 'release_dayofweek', 'gender', 'occupation'])

In [43]:
merged_df = merged_df.drop(columns=['timestamp'])

In [44]:
merged_df

Unnamed: 0,user_id,movie_id,rating,rating_year,rating_month,release_year,release_month,Action,Adventure,Animation,...,occupation_marketing,occupation_none,occupation_other,occupation_programmer,occupation_retired,occupation_salesman,occupation_scientist,occupation_student,occupation_technician,occupation_writer
0,196,242,3,1997,12,1997.0,1.0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,186,302,3,1998,4,1997.0,1.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,22,377,1,1997,11,1994.0,1.0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,244,51,2,1997,11,1994.0,1.0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,166,346,1,1998,2,1997.0,1.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,880,476,3,1997,11,1996.0,9.0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
99996,716,204,5,1997,11,1985.0,1.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
99997,276,1090,1,1997,9,1993.0,1.0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
99998,13,225,2,1997,12,1996.0,11.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [45]:
merged_df = merged_df.dropna()

In [46]:
test_user_ids = merged_df['user_id'].sample(5, random_state=42)
test_df = merged_df[merged_df['user_id'].isin(test_user_ids)]

In [47]:
train_df = merged_df[~merged_df['user_id'].isin(test_user_ids)]

In [48]:
from sklearn.preprocessing import StandardScaler
numeric_cols = ['age', 'rating_year', 'rating_month', 'release_year', 'release_month', 'movie_id']
scaler = StandardScaler()
train_df[numeric_cols] = scaler.fit_transform(train_df[numeric_cols])
test_df[numeric_cols] = scaler.transform(test_df[numeric_cols])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)


In [49]:
train_df

Unnamed: 0,user_id,movie_id,rating,rating_year,rating_month,release_year,release_month,Action,Adventure,Animation,...,occupation_marketing,occupation_none,occupation_other,occupation_programmer,occupation_retired,occupation_salesman,occupation_scientist,occupation_student,occupation_technician,occupation_writer
0,196,-0.554703,3,-0.939151,1.195567,0.639633,-0.525643,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,186,-0.373287,3,1.064791,-0.654627,0.639633,-0.525643,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,22,-0.146516,1,-0.939151,0.964292,0.428103,-0.525643,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,244,-1.132213,2,-0.939151,0.964292,0.428103,-0.525643,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,166,-0.240248,1,1.064791,-1.117175,0.639633,-0.525643,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,880,0.152821,3,-0.939151,0.964292,0.569123,2.036742,0,0,0,...,0,0,0,0,0,0,0,1,0,0
99996,716,-0.669600,5,-0.939151,0.964292,-0.206489,-0.525643,0,0,0,...,0,0,0,0,0,0,0,0,0,0
99997,276,2.009318,1,-0.939151,0.501744,0.357592,-0.525643,0,0,0,...,0,0,0,0,0,0,0,1,0,0
99998,13,-0.606104,2,-0.939151,1.195567,0.569123,2.677338,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [50]:
import numpy as np

unique_users = train_df['user_id'].unique()
T = len(unique_users)
n = train_df.groupby('user_id').size().max()
d = train_df.shape[1] - 2  

X = np.zeros((T, n, d))
Y = np.zeros((T, n))

for i, user_id in enumerate(unique_users):
    user_data = train_df[train_df['user_id'] == user_id]
    user_features = user_data.drop(columns=['user_id', 'rating']).to_numpy()
    user_ratings = user_data['rating'].to_numpy()
    
    X[i, :len(user_features), :] = user_features
    Y[i, :len(user_ratings)] = user_ratings

In [55]:

unique_users = test_df['user_id'].unique()
T = len(unique_users)
n = test_df.groupby('user_id').size().max()
d = test_df.shape[1] - 2  

X_test = np.zeros((T, n, d))
Y_test = np.zeros((T, n))

for i, user_id in enumerate(unique_users):
    user_data = test_df[test_df['user_id'] == user_id]
    user_features = user_data.drop(columns=['user_id', 'rating']).to_numpy()
    user_ratings = user_data['rating'].to_numpy()
    
    X_test[i, :len(user_features), :] = user_features
    Y_test[i, :len(user_ratings)] = user_ratings

In [60]:
X_test = []
Y_test = []

for user_id in unique_users:
    user_data = test_df[test_df['user_id'] == user_id]
    user_features = user_data.drop(columns=['user_id', 'rating']).to_numpy()
    user_ratings = user_data['rating'].to_numpy()
    
    X_test.append(user_features)  
    Y_test.append(user_ratings)   

In [59]:
np.save('../data/X_ml.npy', X)
np.save('../data/Y_ml.npy', Y)

In [64]:
import pickle

with open('../data/X_test.pkl', 'wb') as f:
    pickle.dump(X_test, f)
with open('../data/Y_test.pkl', 'wb') as f:
    pickle.dump(Y_test, f)