<a href="https://colab.research.google.com/github/chinmayithumma/MLTasks/blob/main/task_4_recommendation-system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Technique: Matrix Factorization (TruncatedSVD)
# Dataset: MovieLens 100K
# No scikit-surprise (avoids numpy errors)

import os, zipfile
from urllib.request import urlretrieve
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error
from scipy.sparse import csr_matrix

# 1Ô∏è‚É£ Download MovieLens 100K dataset
url = "http://files.grouplens.org/datasets/movielens/ml-100k.zip"
if not os.path.exists("/content/ml-100k"):
    urlretrieve(url, "/content/ml-100k.zip")
    with zipfile.ZipFile("/content/ml-100k.zip", "r") as z:
        z.extractall("/content/")

# 2Ô∏è‚É£ Load ratings
cols = ['user_id','item_id','rating','timestamp']
df = pd.read_csv("/content/ml-100k/u.data", sep="\t", names=cols)
df.drop('timestamp', axis=1, inplace=True)

# 3Ô∏è‚É£ Train-test split
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# 4Ô∏è‚É£ Build user-item matrix (sparse)
user_ids = sorted(df['user_id'].unique())
item_ids = sorted(df['item_id'].unique())
user_to_idx = {u:i for i,u in enumerate(user_ids)}
item_to_idx = {m:i for i,m in enumerate(item_ids)}
n_users, n_items = len(user_ids), len(item_ids)

def make_matrix(data):
    rows = [user_to_idx[u] for u in data['user_id']]
    cols = [item_to_idx[i] for i in data['item_id']]
    vals = data['rating'].astype(float)
    return csr_matrix((vals, (rows, cols)), shape=(n_users, n_items))

train_mat = make_matrix(train_df)

# 5Ô∏è‚É£ Matrix Factorization (SVD)
svd = TruncatedSVD(n_components=50, random_state=42)
user_factors = svd.fit_transform(train_mat)
item_factors = svd.components_.T
pred_matrix = user_factors @ item_factors.T

# 6Ô∏è‚É£ Evaluate RMSE on test data
test_users = [user_to_idx[u] for u in test_df['user_id']]
test_items = [item_to_idx[i] for i in test_df['item_id']]
true_r = test_df['rating'].values
pred_r = [pred_matrix[u, i] for u,i in zip(test_users, test_items)]
rmse = np.sqrt(mean_squared_error(true_r, pred_r))
print(f"‚úÖ RMSE on test data: {rmse:.4f}")

# 7Ô∏è‚É£ Show Top-10 recommendations for a sample user
movies = pd.read_csv("/content/ml-100k/u.item", sep="|", encoding="latin-1", usecols=[0,1], names=["item_id","title"])
movies["item_id"] = movies["item_id"].astype(int)
movie_map = dict(zip(movies["item_id"], movies["title"]))

sample_user = 1
u = user_to_idx[sample_user]
user_scores = pred_matrix[u]
seen = set(train_df[train_df['user_id']==sample_user]['item_id'])
recs = [(iid, score) for iid, score in zip(item_ids, user_scores) if iid not in seen]
recs.sort(key=lambda x: x[1], reverse=True)

print(f"\nüé¨ Top-10 movie recommendations for user {sample_user}:")
for rank, (iid, score) in enumerate(recs[:10], 1):
    print(f"{rank}. {movie_map.get(iid, 'Unknown')}  (score={score:.3f})")


‚úÖ RMSE on test data: 2.8758

üé¨ Top-10 movie recommendations for user 1:
1. Heat (1995)  (score=3.278)
2. Blues Brothers, The (1980)  (score=3.035)
3. Piano, The (1993)  (score=2.823)
4. Reservoir Dogs (1992)  (score=2.718)
5. My Left Foot (1989)  (score=2.705)
6. Platoon (1986)  (score=2.589)
7. Boot, Das (1981)  (score=2.523)
8. Shine (1996)  (score=2.488)
9. Star Trek IV: The Voyage Home (1986)  (score=2.344)
10. Like Water For Chocolate (Como agua para chocolate) (1992)  (score=2.216)
