In [None]:
# Importing necessary libraries

import pandas as pd  
import numpy as np  

# cosine_similarity is a function to measure similarity between vectors
from sklearn.metrics.pairwise import cosine_similarity  
from sklearn.model_selection import train_test_split  
from sklearn.metrics import mean_squared_error, mean_absolute_error  


In [2]:
# Load the two CSV inputs: the ratings table and the movie table.
# NOTE(review): these are absolute paths on the author's machine — point them
# at a local copy of the data before running this notebook elsewhere.
ratings_path = r"D:\INTERNSHIPS\RECOMMENDATION-SYSTEM\ratings.csv"
movies_path = r"D:\INTERNSHIPS\RECOMMENDATION-SYSTEM\movies.csv"
ratings_df, movies_df = pd.read_csv(ratings_path), pd.read_csv(movies_path)

# Preview the first rows of the ratings table.
ratings_df.head()


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [3]:
# Hold out 20% of the rating rows for evaluation; the fixed seed keeps the
# split reproducible across runs.
split_options = {"test_size": 0.2, "random_state": 42}
train_df, test_df = train_test_split(ratings_df, **split_options)


In [4]:
# Reshape the training ratings into a user x movie table: one row per userId,
# one column per movieId, each cell holding the rating. Pairs with no rating
# in the training split are left as NaN.
user_item_matrix = pd.pivot_table(
    train_df,
    values='rating',
    index='userId',
    columns='movieId',
)

user_item_matrix.head()


movieId,1,2,3,4,5,6,7,8,9,10,...,191005,193565,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [5]:
# Zero-filled copy of the matrix, used as the input to the similarity
# computation. The NaN-preserving `user_item_matrix` is left untouched so that
# "not rated" can still be told apart from a rating.
user_item_matrix_filled = user_item_matrix.fillna(value=0)


In [6]:
# Pairwise cosine similarity between the users' zero-filled rating rows,
# wrapped in a DataFrame labelled by userId on both axes so that similarities
# can be looked up by user id rather than by position.
user_ids = user_item_matrix.index
user_similarity = cosine_similarity(user_item_matrix_filled)
user_similarity_df = pd.DataFrame(user_similarity, index=user_ids, columns=user_ids)

user_similarity_df.head()


userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.016314,0.049021,0.165799,0.123392,0.118556,0.112563,0.142135,0.056088,0.012906,...,0.070901,0.152097,0.187324,0.067264,0.151517,0.139042,0.198771,0.232811,0.112174,0.143902
2,0.016314,1.0,0.0,0.004627,0.0,0.013391,0.029067,0.032754,0.0,0.080739,...,0.170123,0.020395,0.014415,0.0,0.0,0.019846,0.016076,0.05561,0.032404,0.07581
3,0.049021,0.0,1.0,0.0,0.00577,0.004833,0.0,0.005911,0.0,0.0,...,0.006401,0.005889,0.015344,0.0,0.012783,0.008884,0.004642,0.009433,0.0,0.031309
4,0.165799,0.004627,0.0,1.0,0.133565,0.090914,0.094497,0.050417,0.0,0.021991,...,0.075828,0.090252,0.241155,0.054366,0.081585,0.162277,0.083074,0.107276,0.02672,0.068325
5,0.123392,0.0,0.00577,0.133565,1.0,0.238812,0.071386,0.393773,0.0,0.006245,...,0.050523,0.343953,0.101064,0.159651,0.111464,0.086797,0.073278,0.09704,0.205395,0.05309


In [7]:
def predict_rating(user_id, movie_id, k=5):
    """Predict the rating ``user_id`` would give ``movie_id``.

    The ``k`` users most similar to ``user_id`` (according to the module-level
    ``user_similarity_df``) are selected, and the prediction is the
    similarity-weighted average of the ratings those neighbours gave
    ``movie_id`` in the module-level ``user_item_matrix``. Neighbours who did
    not rate the movie are ignored.

    Parameters
    ----------
    user_id
        Label of the target user in ``user_similarity_df``.
    movie_id
        Column label of the movie in ``user_item_matrix``.
    k : int, default 5
        Number of nearest neighbours to consult; ``k <= 0`` selects none.

    Returns
    -------
    float
        The predicted rating, or ``np.nan`` when no prediction can be made:
        the movie or the user is unknown, none of the selected neighbours
        rated the movie, or all neighbours who did have zero similarity.
    """
    if movie_id not in user_item_matrix.columns:
        return np.nan
    # A user missing from the similarity matrix has no neighbours; report
    # "no prediction" instead of raising KeyError.
    if user_id not in user_similarity_df.columns:
        return np.nan

    similarities = user_similarity_df[user_id]
    # Drop the target user by label. Slicing off the first sorted entry is not
    # safe: another user tied at similarity 1.0 can sort ahead of the target,
    # which would leave the target (and their own rating) in the neighbourhood.
    similarities = similarities[similarities.index != user_id]
    similar_users = similarities.sort_values(ascending=False).iloc[:max(k, 0)]

    ratings = user_item_matrix.loc[similar_users.index, movie_id].dropna()
    if ratings.empty:
        return np.nan

    weights = similar_users.loc[ratings.index]
    # np.average raises ZeroDivisionError when the weights sum to zero; zero
    # total similarity means there is no evidence to base a prediction on.
    if not weights.sum() > 0:
        return np.nan

    return float(np.average(ratings, weights=weights))


In [8]:
# Score the held-out ratings: predict every test row, then keep only the rows
# for which the model produced a prediction (NaN means "no prediction").
predictions = (
    (row['rating'], predict_rating(row['userId'], row['movieId']))
    for _, row in test_df.iterrows()
)
scored = [
    (actual, predicted)
    for actual, predicted in predictions
    if not np.isnan(predicted)
]
y_true = [actual for actual, _ in scored]
y_pred = [predicted for _, predicted in scored]

# Error metrics over the rows that received a prediction.
rmse = np.sqrt(mean_squared_error(y_true, y_pred))
mae = mean_absolute_error(y_true, y_pred)

print(f"RMSE: {rmse:.4f}")
print(f"MAE : {mae:.4f}")


RMSE: 1.0781
MAE : 0.8165


In [9]:
def recommend_movies(user_id, n=10):
    """Return the top-``n`` movie recommendations for ``user_id``.

    Every movie in the module-level ``movies_df`` that the user has not rated
    anywhere in ``ratings_df`` is scored with ``predict_rating``; movies with
    no prediction (NaN) are skipped. The remaining movies are ranked by
    predicted rating, highest first, with ties keeping their ``movies_df``
    order.

    Parameters
    ----------
    user_id
        The user to recommend for.
    n : int, default 10
        Maximum number of recommendations; ``n <= 0`` yields an empty result.

    Returns
    -------
    pandas.DataFrame
        Columns ``movieId`` and ``predicted_rating`` followed by the remaining
        ``movies_df`` columns, one row per recommended movie in ranked order.
    """
    # A set gives O(1) membership per movie; scanning `.values` for every
    # catalogue entry was linear in the number of movies the user has rated.
    already_rated = set(ratings_df.loc[ratings_df['userId'] == user_id, 'movieId'])

    recommendations = []
    for movie_id in movies_df['movieId']:
        if movie_id in already_rated:
            continue
        pred = predict_rating(user_id, movie_id)
        if not np.isnan(pred):
            recommendations.append((movie_id, pred))

    # list.sort is stable, so equally-scored movies keep catalogue order.
    recommendations.sort(key=lambda rec: rec[1], reverse=True)
    # Clamp n: a negative slice bound would otherwise mean "all but the last |n|".
    top_n = recommendations[:max(n, 0)]

    rec_df = pd.DataFrame(top_n, columns=['movieId', 'predicted_rating'])
    return rec_df.merge(movies_df, on='movieId')


In [10]:
# Demo: recommend for whichever user appears in the first row of the ratings table.
sample_user = ratings_df['userId'].iat[0]

recommendations = recommend_movies(sample_user, n=10)
recommendations


Unnamed: 0,movieId,predicted_rating,title,genres
0,1246,5.0,Dead Poets Society (1989),Drama
1,1704,5.0,Good Will Hunting (1997),Drama|Romance
2,5989,5.0,Catch Me If You Can (2002),Crime|Drama
3,348,5.0,Bullets Over Broadway (1994),Comedy
4,514,5.0,"Ref, The (1994)",Comedy
5,1059,5.0,William Shakespeare's Romeo + Juliet (1996),Drama|Romance
6,1081,5.0,Victor/Victoria (1982),Comedy|Musical|Romance
7,1193,5.0,One Flew Over the Cuckoo's Nest (1975),Drama
8,1199,5.0,Brazil (1985),Fantasy|Sci-Fi
9,1230,5.0,Annie Hall (1977),Comedy|Romance


We implemented a User-Based Collaborative Filtering recommendation system using cosine similarity. 
The system identifies users with similar rating patterns and recommends items based on weighted average ratings.