In [17]:
import pandas as pd
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error


In [18]:
train_df = pd.read_csv("../data/processed/train.csv")
test_df = pd.read_csv("../data/processed/test.csv")


In [19]:
# Pivot train data
user_item_matrix = train_df.pivot(index='userId', columns='movieId', values='rating').fillna(0)
user_item_matrix.head()


movieId,10,11,12,13,14
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,4.0,0.0,3.0,0.0,0.0
2,4.0,0.0,0.0,5.0,0.0
3,0.0,2.0,0.0,0.0,5.0
4,0.0,0.0,4.0,0.0,0.0
5,0.0,0.0,0.0,0.0,4.0


In [20]:
# TruncatedSVD approximates matrix factorization
svd = TruncatedSVD(n_components=5, random_state=42)
matrix_factors = svd.fit_transform(user_item_matrix)


In [21]:
pred_matrix = np.dot(matrix_factors, svd.components_)
pred_df = pd.DataFrame(pred_matrix, index=user_item_matrix.index, columns=user_item_matrix.columns)
pred_df.head()


movieId,10,11,12,13,14
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,4.0,-2.465352e-16,3.0,2.220446e-16,-3.1518900000000006e-17
2,4.0,-1.437898e-16,1.665335e-16,5.0,7.300116e-17
3,0.0,2.0,-2.168883e-16,1.548522e-17,5.0
4,0.0,-3.287136e-16,4.0,-2.4651900000000002e-32,-4.2025190000000005e-17
5,0.0,4.440892e-16,-4.2025190000000005e-17,5.840093e-17,4.0


In [24]:
import numpy as np
from sklearn.metrics import mean_squared_error

y_true = []
y_pred = []

for _, row in test_df.iterrows():
    user = row['userId']
    item = row['movieId']
    rating = row['rating']
    
    # Check if user and item exist in prediction matrix
    if user in pred_df.index and item in pred_df.columns:
        pred_rating = pred_df.loc[user, item]
        if not np.isnan(pred_rating):
            y_true.append(rating)
            y_pred.append(pred_rating)

# Compute RMSE safely
if len(y_true) > 0:
    mse = mean_squared_error(y_true, y_pred)  # old sklearn versions
    rmse = np.sqrt(mse)
    print(f"RMSE: {rmse:.4f}")
else:
    print("No valid predictions to compute RMSE!")




RMSE: 4.1231


In [23]:
def recommend_for_user(user_id, n=5):
    if user_id not in pred_df.index:
        return []
    user_ratings = pred_df.loc[user_id]
    already_rated = train_df[train_df['userId']==user_id]['movieId'].tolist()
    # Drop already rated items and NaN values
    recommendations = user_ratings.drop(already_rated).dropna().sort_values(ascending=False).head(n)
    return recommendations.index.tolist()

# Example
recommend_for_user(1)



[13, 14, 11]