Load ratings + movies; check shapes, nulls, dtypes, duplicates.

Basic profiles:

- Ratings per user and per movie (plots), rating distribution, sparsity %.
- Time distribution of ratings (if timestamps are available).

Quick insights: top genres, most‑rated titles, long‑tail effect.

In [None]:
#Exploratory Data Analysis

import pandas as pd
import os
import random
# Project root: defaults to the original local checkout, but can be overridden
# with the STREAMING_RECS_ROOT environment variable so the notebook can run on
# another machine without editing the code.
PROJECT_ROOT = os.environ.get(
    "STREAMING_RECS_ROOT",
    r"C:\Users\Emmet\PycharmProjects\streaming-recs-project",
)
os.chdir(PROJECT_ROOT)  # the relative data paths below resolve from here

ratings = pd.read_csv('data/ml32/ratings.csv')
movies = pd.read_csv('data/ml32/movies.csv')
# Left join: every rating row is kept even if its movieId has no row in movies.csv
df = ratings.merge(movies, on='movieId', how='left')

# Sparsity = share of the (user x movie) grid that holds no rating.
n_users = df['userId'].nunique()
n_items = df['movieId'].nunique()
n_cells = n_users * n_items
# Guard against an empty frame, which would otherwise divide by zero.
sparsity = 1 - (len(df) / n_cells) if n_cells else float('nan')


#check data
import matplotlib.pyplot as plt

# Distribution of the raw rating values
fig, ax = plt.subplots(figsize=(6, 4))
df['rating'].hist(bins=10, edgecolor='black', ax=ax)
ax.set_title("Distribution of Ratings")
ax.set_xlabel("Rating")
ax.set_ylabel("Count")
plt.show()

# Top 10 most-rated movies.
# Ratings are counted per movieId (the key used to join ratings and movies)
# rather than per title alone: a title string is not guaranteed to be unique,
# and grouping on it would pool the ratings of different movies that share a name.
top_movies = (
    df.groupby(['movieId', 'title'])['rating']
      .count()
      .sort_values(ascending=False)
      .head(10)
      .reset_index(level='movieId', drop=True)  # keep only the title as bar label
)

fig, ax = plt.subplots(figsize=(8, 5))
top_movies.plot(kind='barh', ax=ax)
ax.set_title("Top 10 Most-Rated Movies")
ax.set_xlabel("Number of Ratings")
ax.invert_yaxis()  # so the top movie is at the top
plt.show()

# Number of ratings submitted by each user
ratings_per_user = df.groupby('userId')['rating'].count()

# Cap the heavy tail at the 98th percentile so the bulk of users stays readable
user_cap = ratings_per_user.quantile(0.98)
capped_user_counts = ratings_per_user.clip(upper=user_cap)

fig, ax = plt.subplots(figsize=(7, 4))
ax.hist(capped_user_counts, bins=40, edgecolor='black')
ax.set_yscale('log')
ax.set(
    title="Ratings per User (y=log, clipped at P98)",
    xlabel="Number of Ratings",
    ylabel="Number of Users (log scale)",
)
plt.show()

# Number of ratings received by each movie
ratings_per_movie = df.groupby('movieId')['rating'].count()

# Values above the 98th percentile are clipped to it before plotting
cap_value = ratings_per_movie.quantile(0.98)
capped_movie_counts = ratings_per_movie.clip(upper=cap_value)

fig, ax = plt.subplots(figsize=(6, 4))
ax.hist(capped_movie_counts, bins=40, edgecolor='black')
ax.set_yscale('log')  # log-based y-axis
ax.set_title("Distribution of Ratings per Movie (log y, clipped at P98)")
ax.set_xlabel("Number of Ratings")
ax.set_ylabel("Number of Movies (log scale)")
plt.show()



# Summary table: headline statistics of the merged ratings frame
summary_lines = (
    f"Number of users: {n_users}",
    f"Number of movies: {n_items}",
    f"Number of ratings: {len(df)}",
    f"Sparsity: {sparsity:.4f}",
)
for summary_line in summary_lines:
    print(summary_line)


Popularity recommender: Recommend the same top-N most-rated movies to every user.

Average-rating recommender: Recommend movies with the highest average rating (with a minimum rating count threshold to avoid unreliable averages).

Random recommender: Recommend randomly chosen movies, just to see the “floor” performance.

In [None]:
#Popularity Recommender

# Rank movies by the number of ratings they received.

# Work from the project directory so the relative data paths resolve
os.chdir(r"C:\Users\Emmet\PycharmProjects\streaming-recs-project")

# Load ratings and movie metadata; the left join attaches movie columns to each rating
ratings = pd.read_csv('data/ml32/ratings.csv')
movies = pd.read_csv('data/ml32/movies.csv')
df = ratings.merge(movies, on='movieId', how='left')

# Ratings per movie, most-rated first
movie_count = df.groupby('movieId')['rating'].count()
movie_count_sorted = movie_count.sort_values(ascending=False)

# Turn the counts into a frame and attach each movie's title
movie_count_df = movie_count_sorted.reset_index(name='num_ratings')
title_lookup = movies[['movieId', 'title']]
popular_movies = pd.merge(movie_count_df, title_lookup, on='movieId', how='left')

# Keep the ten most-rated movies and label the rows by rank (1-based)
top10_popular = popular_movies.head(10)
rank_labels = range(1, len(top10_popular) + 1)
top10_popular.index = rank_labels

print("Top 10 Most Popular Movies:")
print(top10_popular[['title', 'num_ratings']])



In [None]:
#Average Rating Recommender

# Rank movies by mean rating, ignoring movies with too few ratings.

# Work from the project directory so the relative data paths resolve
os.chdir(r"C:\Users\Emmet\PycharmProjects\streaming-recs-project")

# Load ratings and movie metadata and join them on movieId
ratings = pd.read_csv('data/ml32/ratings.csv')
movies = pd.read_csv('data/ml32/movies.csv')
df = ratings.merge(movies, on='movieId', how='left')

# Per-movie mean rating and number of ratings
movie_stats = (
    df.groupby('movieId')['rating']
      .agg(avg_rating='mean', num_rating='count')
      .reset_index()
)

# Minimum number of ratings a movie needs before its average is considered
min_rating = 50
has_enough_ratings = movie_stats['num_rating'] >= min_rating
filtered_movies = movie_stats.loc[has_enough_ratings]

# Best average first, then attach titles
sorted_movies = filtered_movies.sort_values(by='avg_rating', ascending=False)
title_lookup = movies[['movieId', 'title']]
avg_rating_recs = pd.merge(sorted_movies, title_lookup, on='movieId', how='left')

# Ten best-rated movies, rows labelled by rank (1-based)
top10_avg_rating = avg_rating_recs.head(10)
rank_labels = range(1, len(top10_avg_rating) + 1)
top10_avg_rating.index = rank_labels
print(top10_avg_rating[['title', 'avg_rating', 'num_rating']])


In [29]:
#BUILD RECOMMENDER

#create data frame
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import os
os.chdir(r"C:\Users\Emmet\PycharmProjects\streaming-recs-project")

# Compact dtypes for the three columns the recommender actually uses.
dtypes = {
    'userId': 'int32',
    'movieId': 'int32',
    'rating': 'float32',
}
# The timestamp column is never used here, so it is skipped at parse time
# (usecols) instead of being loaded and then dropped in place.
df = pd.read_csv(
    'data/ml32/ratings.csv',
    usecols=list(dtypes),
    dtype=dtypes,
    low_memory=False,
)
movies = pd.read_csv('data/ml32/movies.csv')

# Create the user-item rating matrix.
# To make sure the matrix isn't too big, keep only the most active users and
# the most-rated movies.
N_ACTIVE_USERS = 500
N_POPULAR_MOVIES = 500

active_users = df['userId'].value_counts().head(N_ACTIVE_USERS).index
popular_movies = df['movieId'].value_counts().head(N_POPULAR_MOVIES).index
keep_rows = df['userId'].isin(active_users) & df['movieId'].isin(popular_movies)
df_filtered = df.loc[keep_rows]

# Rows = users, columns = movies, values = rating; NaN marks "not rated".
# pivot_table aggregates with the mean by default, so a repeated
# (userId, movieId) pair would be averaged rather than raise.
user_item_matrix = df_filtered.pivot_table(
    index='userId',
    columns='movieId',
    values='rating',
)

print(user_item_matrix.shape)    # number of users and movies
print(user_item_matrix.head(3))  # preview first 3 users




(500, 500)
movieId  1       2       6       10      11      16      17      19      \
userId                                                                    
28          4.0     3.0     3.0     3.0     5.0     3.0     4.0     NaN   
188         4.0     4.0     5.0     NaN     3.0     5.0     2.5     3.5   
265         5.0     4.0     NaN     4.0     NaN     4.0     NaN     NaN   

movieId  21      25      ...  119145  122882  122886  122904  122912  134130  \
userId                   ...                                                   
28          4.0     3.0  ...     4.0     5.0     3.5     4.0     3.5     5.0   
188         4.5     2.5  ...     NaN     NaN     NaN     NaN     NaN     NaN   
265         3.5     NaN  ...     5.0     4.5     4.0     4.0     4.0     4.0   

movieId  134853  148626  152081  164179  
userId                                   
28          4.5     5.0     4.0     4.0  
188         NaN     NaN     NaN     NaN  
265         NaN     5.0     NaN     5.0  

[

In [None]:
# Compute item-to-item similarity.
#
# `user_item_matrix` is deliberately left untouched (users x movies, NaN = not
# rated): recommend_for_user looks users up by row and uses NaN to detect
# unseen movies. The zero-filled, transposed copy therefore gets its own name,
# which also makes this cell safe to re-run.
user_item_filled = user_item_matrix.fillna(0)
item_user_matrix = user_item_filled.T  # rows = movies, columns = users

# Cosine similarity between every pair of movie rating vectors
item_similarity = cosine_similarity(item_user_matrix)

# Label rows and columns with movieId so similarities can be looked up by id
item_similarity_df = pd.DataFrame(
    item_similarity,
    index=item_user_matrix.index,
    columns=item_user_matrix.index,
)



In [33]:
def get_similar_movies(movie_id, top_n=10):
    """Return the titles of the movies most similar to ``movie_id``.

    Reads the module-level ``item_similarity_df`` (similarities indexed by
    movieId) and ``movies`` (columns ``movieId`` and ``title``).

    Parameters
    ----------
    movie_id : label of the reference movie in ``item_similarity_df``.
    top_n : maximum number of similar movies to return.

    Returns
    -------
    pandas.DataFrame with a single ``title`` column, ordered from most to
    least similar. Row labels are those of ``movies``.

    Raises
    ------
    KeyError if ``movie_id`` is not present in ``item_similarity_df``.
    """
    # Similarity of every other movie to the reference one; undefined (NaN)
    # similarities are not meaningful neighbours, so they are discarded.
    similar_scores = item_similarity_df[movie_id].drop(movie_id).dropna()
    similar_ids = similar_scores.sort_values(ascending=False).head(top_n).index

    # An `isin` filter alone returns rows in the order of `movies`, which loses
    # the ranking; re-sort the matches by their similarity rank.
    rank_of = {mid: rank for rank, mid in enumerate(similar_ids)}
    matches = movies[movies['movieId'].isin(similar_ids)]
    ordered = matches.assign(_rank=matches['movieId'].map(rank_of)).sort_values('_rank')

    return ordered[['title']]

# Look up Toy Story's movieId by title, then show its five nearest neighbours
is_toy_story = movies['title'] == 'Toy Story (1995)'
movie_marker = movies.loc[is_toy_story, 'movieId'].values[0]
toy_story_neighbours = get_similar_movies(movie_marker, top_n=5)
print(toy_story_neighbours)

                        title
359     Lion King, The (1994)
3021       Toy Story 2 (1999)
4781    Monsters, Inc. (2001)
6259      Finding Nemo (2003)
8248  Incredibles, The (2004)


In [39]:
#recommend movies to a specific user

import random

def recommend_for_user(user_id, user_item_matrix, item_similarity_df, top_n=10):
    """Recommend unseen movies to one user with item-based scoring.

    Each candidate movie is scored as the sum, over the movies the user has
    rated, of ``similarity(candidate, rated) * rating``. The score is an
    unnormalised sum, not a predicted rating.

    Parameters
    ----------
    user_id : row label of the user in ``user_item_matrix``.
    user_item_matrix : DataFrame, rows = users, columns = movieIds,
        NaN where the user has not rated the movie.
    item_similarity_df : DataFrame of movie-to-movie similarities, labelled
        by movieId on both axes.
    top_n : maximum number of recommendations to return.

    Returns
    -------
    list of ``(title, score)`` tuples, highest score first. Titles are read
    from the module-level ``movies`` frame.

    Raises
    ------
    KeyError if ``user_id`` is not a row of ``user_item_matrix``.
    ValueError if the user rated a movie that is missing from the columns of
    ``item_similarity_df``.
    """
    # Movies this user has actually rated (NaN = not rated)
    user_ratings = user_item_matrix.loc[user_id].dropna()

    # Every rated movieId must be known to the similarity matrix
    if not set(user_ratings.index).issubset(item_similarity_df.columns):
        raise ValueError("Mismatch between user_item_matrix columns and similarity matrix.")

    # Candidates: movies in the similarity matrix that the user has not rated,
    # kept in the matrix's own order so ties are broken deterministically.
    rated_ids = set(user_ratings.index)
    candidate_ids = [mid for mid in item_similarity_df.index if mid not in rated_ids]
    if user_ratings.empty or not candidate_ids:
        return []

    # rows = candidate movies, columns = rated movies; one vectorised weighted
    # sum replaces the per-pair `.loc` lookups of a nested Python loop.
    similarities = item_similarity_df.loc[candidate_ids, user_ratings.index]
    weighted = similarities.mul(user_ratings, axis=1)
    # NaN similarities are skipped instead of turning the whole score into NaN;
    # a candidate with no valid similarity at all is dropped (min_count=1).
    scores = weighted.sum(axis=1, min_count=1).dropna()

    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)[:top_n]
    ranked_with_titles = [
        (movies.loc[movies['movieId'] == mid, 'title'].values[0], score)
        for mid, score in ranked
    ]

    return ranked_with_titles



# Example use: recommend for one user drawn from the filtered dataset.
# A dedicated, seeded generator makes the pick (and therefore the printed
# output) repeatable across runs without touching the global `random` state.
user_rng = random.Random(42)
user_id = user_rng.choice(user_item_matrix.index.tolist())
recommendations = recommend_for_user(user_id, user_item_matrix, item_similarity_df, top_n=5)

print(f"Top recommendations for user {user_id}:")
for title, score in recommendations:
    print(f"{title} (score: {score:.3f})")



Top recommendations for user 133464:
Toy Story (1995) (score: 227.875)
Toy Story 2 (1999) (score: 216.645)
Star Trek (2009) (score: 209.812)
X2: X-Men United (2003) (score: 201.474)
Erin Brockovich (2000) (score: 200.662)


In [47]:
# EVALUATE AND CHECK RECOMMENDATIONS

from sklearn.model_selection import train_test_split
from math import sqrt
from sklearn.metrics import mean_squared_error

# Hold out 20% of the filtered ratings for evaluation (fixed seed => repeatable split)
train_df, test_df = train_test_split(df_filtered, test_size=0.2, random_state=42)

# The first figure is the size of the training set
print(f"Train set: {len(train_df)} ratings")
print(f"Test set: {len(test_df)} ratings")

# User-item matrix built from the training ratings only, so held-out ratings
# do not take part in the similarity computation.
train_matrix = train_df.pivot_table(
    index='userId',
    columns='movieId',
    values='rating'
)

# Fill NaN with 0 for the similarity calculation
train_matrix_filled = train_matrix.fillna(0)

# Item-item cosine similarity, labelled by movieId on both axes
item_similarity = cosine_similarity(train_matrix_filled.T)
item_similarity_df = pd.DataFrame(
    item_similarity,
    index=train_matrix_filled.columns,
    columns=train_matrix_filled.columns
)

def predict_rating(user_id, movie_id, user_item_matrix, item_similarity_df):
    """Predict one user's rating for one movie with item-based CF.

    The prediction is the similarity-weighted average of the ratings the user
    has already given: ``sum(sim * rating) / sum(|sim|)``.

    Parameters
    ----------
    user_id : row label of the user in ``user_item_matrix``.
    movie_id : column label of the target movie in ``item_similarity_df``.
    user_item_matrix : DataFrame, rows = users, columns = movieIds,
        NaN where the user has not rated the movie.
    item_similarity_df : DataFrame of movie-to-movie similarities, labelled
        by movieId on both axes.

    Returns
    -------
    The predicted rating, or ``None`` when no prediction can be made: unknown
    user, unknown movie, or no rated movie with a non-zero similarity.
    """
    try:
        user_ratings = user_item_matrix.loc[user_id]
        similarities = item_similarity_df[movie_id]
    except KeyError:
        # Cold start: the user or the movie is absent from the training data
        return None

    # Only movies the user has rated can contribute
    rated = user_ratings.dropna()
    # Align similarities to the rated movies by label; movies without a
    # defined similarity are left out of both numerator and denominator.
    weights = similarities.reindex(rated.index).dropna()
    rated = rated.loc[weights.index]

    # Normalise by the absolute weights so that negative similarities (possible
    # with e.g. Pearson) cannot cancel the denominator; for non-negative
    # similarities this is identical to the plain sum.
    total_weight = weights.abs().sum()
    if total_weight == 0:
        return None  # Can't make prediction

    return (rated * weights).sum() / total_weight



predictions = []
actuals = []

# itertuples keeps userId/movieId as integers (iterrows builds one Series per
# row, which upcasts the ids to float alongside the rating) and is much faster.
for rating_row in test_df.itertuples(index=False):
    pred = predict_rating(rating_row.userId, rating_row.movieId, train_matrix, item_similarity_df)
    # Ratings that cannot be predicted are left out of the error metric
    if pred is not None:
        predictions.append(pred)
        actuals.append(rating_row.rating)

if predictions:
    rmse = sqrt(mean_squared_error(actuals, predictions))
    print(f"RMSE: {rmse:.4f}")
else:
    # mean_squared_error raises on empty input; report the situation instead
    rmse = float('nan')
    print("RMSE: n/a (no test rating could be predicted)")





Train test split: 155327 ratings
Test set: 38832 ratings
RMSE: 0.9125


In [56]:
# Pick one training user with a dedicated, seeded generator so the printed
# recommendations are repeatable across runs (the split above is seeded too).
sample_rng = random.Random(42)
sample_user = sample_rng.choice(train_matrix.index.tolist())
recs = recommend_for_user(sample_user, train_matrix, item_similarity_df, top_n=5)

print(f"Top recommendations for user {sample_user}:")
for title, score in recs:
    print(f"{title} (score: {score:.3f})")

Top recommendations for user 78213:
Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981) (score: 816.502)
Twelve Monkeys (a.k.a. 12 Monkeys) (1995) (score: 802.570)
Star Wars: Episode IV - A New Hope (1977) (score: 798.314)
Pulp Fiction (1994) (score: 796.558)
Seven (a.k.a. Se7en) (1995) (score: 795.889)
