## Implement a recommender system using collaborative filtering
#### What is collaborative filtering?
It is a method used in recommender systems that suggests items based on the preferences of similar users . It analyses user interaction data (eg. ratings or purchases) to identify groups of users with similar tastses and recommends items that those groups have enjoyed.

#### How does it work?
Data Collection
Similarity calc - between users and users; between movies and movies
recommendation

Two types:
1. user based
2. item based.

#### Objective 
To predict how a user would rate a movie they have not rated yet, using other similar users' ratings. The rating will be more affected by similar users and less by dissimilar users ratings. Thus the weighted sum.


In [1]:
from json import load
import pandas as pd
from dotenv import load_dotenv
import os
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm


load_dotenv()

True

In [2]:
dataset_path = os.environ.get("MOVIES_DATASET")
# Define column names (from the data format)
columns = ['user_id', 'movie_id', 'rating', 'timestamp']

# Load training data
train_df = pd.read_csv(dataset_path + 'u1.base', sep='\t', names=columns)

# Load test data
test_df = pd.read_csv(dataset_path + 'u1.test', sep='\t', names=columns)

# Display the first few rows
print("Training Data:")
print(train_df.head())

print("\nTest Data:")
print(test_df.head())

Training Data:
   user_id  movie_id  rating  timestamp
0        1         1       5  874965758
1        1         2       3  876893171
2        1         3       4  878542960
3        1         4       3  876893119
4        1         5       3  889751712

Test Data:
   user_id  movie_id  rating  timestamp
0        1         6       5  887431973
1        1        10       3  875693118
2        1        12       5  878542960
3        1        14       5  874965706
4        1        17       3  875073198


In [3]:
# Create user-movie rating matrix (rows: users, columns: movies)
rating_matrix = train_df.pivot_table(values='rating', index='user_id', columns='movie_id')

# Check shape and sample
print("Rating Matrix Shape:", rating_matrix.shape)
print(rating_matrix.head())

Rating Matrix Shape: (943, 1650)
movie_id  1     2     3     4     5     6     7     8     9     10    ...  \
user_id                                                               ...   
1          5.0   3.0   4.0   3.0   3.0   NaN   4.0   1.0   5.0   NaN  ...   
2          4.0   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   2.0  ...   
3          NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
4          NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
5          NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   

movie_id  1673  1674  1675  1676  1677  1678  1679  1680  1681  1682  
user_id                                                               
1          NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  
2          NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  
3          NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  
4          NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  
5

In [4]:
# Create user-item matrix from train data
rating_matrix = train_df.pivot_table(values='rating', index='user_id', columns='movie_id')


In [5]:
# Users and matrix
users = rating_matrix.index.tolist()
num_users = len(users)

# Initialize similarity DataFrame
user_similarity_df = pd.DataFrame(np.zeros((num_users, num_users)), 
                                  index=users, columns=users)

# Loop over all pairs (symmetric, so we optimize a bit)
for i, u in enumerate(tqdm(users)):
    for j in range(i, num_users):
        v = users[j]
        
        # Get both users' rating vectors
        ru = rating_matrix.loc[u]
        rv = rating_matrix.loc[v]
        
        # Find common rated movies
        common = ru.notna() & rv.notna()
        
        if common.sum() == 0:
            sim = 0  # No common movies, similarity = 0
        else:
            ru_common = ru[common].values
            rv_common = rv[common].values
            # Compute cosine similarity
            numerator = np.dot(ru_common, rv_common)
            denominator = np.linalg.norm(ru_common) * np.linalg.norm(rv_common)
            sim = numerator / denominator if denominator != 0 else 0
            # Or we could have done:
            # sim = cosine_similarity([ru_common], [rv_common])[0][0]

        # Store both [u, v] and [v, u]
        user_similarity_df.loc[u, v] = sim
        user_similarity_df.loc[v, u] = sim

100%|██████████| 943/943 [01:41<00:00,  9.27it/s]


In [6]:
def predict_rating(user_id, movie_id, k=10):
    if movie_id not in rating_matrix.columns:
        return np.nan  # movie not in training set
    
    # Get similarities of current user with all other users
    sim_scores = user_similarity_df.loc[user_id]
    
    # Get users who rated this movie
    movie_ratings = rating_matrix[movie_id].dropna()
    
    # Only consider users who rated the movie
    sim_scores = sim_scores.loc[movie_ratings.index]
    
    # If no one has rated the movie, fallback
    if sim_scores.empty:
        return np.nan
    
    # Select top-k similar users
    top_k_users = sim_scores.abs().sort_values(ascending=False).head(k).index
    
    # Compute weighted average
    numer = 0
    denom = 0
    for v in top_k_users:
        sim = user_similarity_df.loc[user_id, v]
        rating = rating_matrix.loc[v, movie_id]
        numer += sim * rating
        denom += abs(sim)
    
    if denom == 0:
        return np.nan
    return numer / denom

In [7]:
from sklearn.metrics import mean_squared_error

k = 10  # Number of neighbors
predictions = []
actuals = []

for _, row in test_df.iterrows():
    user, movie, true_rating = row['user_id'], row['movie_id'], row['rating']
    pred = predict_rating(user, movie, k)
    
    if not np.isnan(pred):
        predictions.append(pred)
        actuals.append(true_rating)

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(actuals, predictions))
print(f"RMSE (k={k}): {rmse:.4f}")

RMSE (k=10): 1.0452
