In [2]:
import numpy as np
import pandas as pd
import os
import zipfile
import urllib.request

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Where the MovieLens "latest small" archive is fetched from, the local filename
# it is saved under, and the directory later cells read the CSV files from.
DATASET_URL = "https://files.grouplens.org/datasets/movielens/ml-latest-small.zip"
DATASET_ZIP = "ml-latest-small.zip"
DATASET_DIR = "ml-latest-small"

In [4]:
# Fetch the archive only when no local copy exists yet.
archive_missing = not os.path.exists(DATASET_ZIP)
if archive_missing:
    urllib.request.urlretrieve(DATASET_URL, DATASET_ZIP)

# Unpack into the working directory only when the dataset folder is absent.
dataset_missing = not os.path.exists(DATASET_DIR)
if dataset_missing:
    with zipfile.ZipFile(DATASET_ZIP, 'r') as archive:
        archive.extractall(".")

In [5]:
# Movie metadata table; later cells read its 'title' and 'genres' columns.
movies = pd.read_csv(f"{DATASET_DIR}/movies.csv")

In [6]:
def _genres_to_soup(genres):
    """Convert a pipe-separated genre string into lowercase, space-separated tokens.

    Hyphens are removed first, so a hyphenated genre becomes a single token.
    """
    tokens = genres.lower().replace('-', '').split('|')
    return ' '.join(tokens)


movies['content_soup'] = movies['genres'].map(_genres_to_soup)


In [7]:
# Learn a TF-IDF vocabulary from the genre soup and vectorise every movie with it.
# stop_words='english' applies scikit-learn's built-in English stop-word list.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies['content_soup'])

In [8]:
# Pairwise cosine similarity of every movie vector against every other one;
# entry [i, j] compares rows i and j of tfidf_matrix.
# NOTE(review): the result is n_movies x n_movies, so memory grows quadratically.
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [9]:
# Reverse lookup: movie title -> row index in `movies`.
# Series.drop_duplicates() would de-duplicate the *values* (the row indices,
# which are already unique) and leave repeated titles in the index, so a lookup
# for such a title would return a Series instead of a single row index.
# De-duplicate on the index instead, keeping the first occurrence of each title.
indices = pd.Series(movies.index, index=movies['title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [10]:
def get_recommendations(title, top_n=5):
    """
    Given a movie title, return the top_n most similar movies (excluding itself).

    Parameters
    ----------
    title : str
        Exact title to look up in the module-level `indices` Series.
    top_n : int, default 5
        Maximum number of neighbours to return; values below 1 yield an empty frame.

    Returns
    -------
    pandas.DataFrame or str
        The 'title' and 'genres' columns of the most similar movies, ordered by
        descending similarity (ties keep their row order in `movies`). When the
        title is unknown, a "not found" message string is returned instead.
    """
    if title not in indices:
        return f"Movie '{title}' not found in database."

    idx = indices[title]
    # A title that occurs more than once makes the lookup return a Series;
    # fall back to the first matching row instead of failing further down.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    scores = np.asarray(cosine_sim[idx], dtype=float)
    # Stable sort on the negated scores: descending similarity, ties in row order.
    ranked = np.argsort(-scores, kind='stable')
    # Exclude the query movie explicitly. Other movies can tie with it at the top
    # similarity, so it is not guaranteed to be the first item after sorting.
    ranked = ranked[ranked != idx][:max(top_n, 0)]
    return movies[['title', 'genres']].iloc[ranked]

In [11]:
# Smoke test: show the default five nearest neighbours of a known title.
print("Top 5 movies similar to 'Toy Story (1995)':")
print(get_recommendations('Toy Story (1995)'))

Top 5 movies similar to 'Toy Story (1995)':
                                               title  \
1706                                     Antz (1998)   
2355                              Toy Story 2 (1999)   
2809  Adventures of Rocky and Bullwinkle, The (2000)   
3000                Emperor's New Groove, The (2000)   
3568                           Monsters, Inc. (2001)   

                                           genres  
1706  Adventure|Animation|Children|Comedy|Fantasy  
2355  Adventure|Animation|Children|Comedy|Fantasy  
2809  Adventure|Animation|Children|Comedy|Fantasy  
3000  Adventure|Animation|Children|Comedy|Fantasy  
3568  Adventure|Animation|Children|Comedy|Fantasy  


A major limitation of a purely content-based recommendation system is that it can only suggest items similar to those a user has already interacted with, based on item features. This means it may fail to capture broader user preferences and cannot leverage the experiences or preferences of other users. As a result, users may miss out on discovering diverse or surprising content that they might actually enjoy.

## Collaborative Filtering Recommender

In [12]:
# 1. Download/load MovieLens
import os
import zipfile
import urllib.request
import pandas as pd

DATASET_URL = "https://files.grouplens.org/datasets/movielens/ml-latest-small.zip"
DATASET_ZIP = "ml-latest-small.zip"
DATASET_DIR = "ml-latest-small"

# Download the archive, then unpack it, each only if its result is not on disk yet.
have_archive = os.path.exists(DATASET_ZIP)
if not have_archive:
    urllib.request.urlretrieve(DATASET_URL, DATASET_ZIP)
have_dataset = os.path.exists(DATASET_DIR)
if not have_dataset:
    with zipfile.ZipFile(DATASET_ZIP, 'r') as archive:
        archive.extractall(".")

ratings_path = f"{DATASET_DIR}/ratings.csv"
movies_path = f"{DATASET_DIR}/movies.csv"
ratings = pd.read_csv(ratings_path)
movies = pd.read_csv(movies_path)

# 2. Create user-item matrix
# One row per user, one column per movie; user/movie pairs without a rating become 0.
ratings_wide = ratings.pivot(index='userId', columns='movieId', values='rating')
user_item_matrix = ratings_wide.fillna(0)
print("Matrix shape:", user_item_matrix.shape)

# 3. Compute SVD using numpy/scipy
from scipy.sparse.linalg import svds
import numpy as np

# Row-wise mean over the whole zero-filled matrix, so unrated movies count as 0.
rating_matrix = user_item_matrix.values
user_ratings_mean = rating_matrix.mean(axis=1)
user_mean_column = user_ratings_mean.reshape(-1, 1)
R_demeaned = rating_matrix - user_mean_column

# Truncated SVD keeping 20 singular triplets
U, sigma, VT = svds(R_demeaned, k=20)
sigma = np.diag(sigma)

# Rebuild the low-rank approximation and add each user's mean back.
predicted_ratings = U @ sigma @ VT + user_mean_column
predicted_ratings_df = pd.DataFrame(
    predicted_ratings,
    index=user_item_matrix.index,
    columns=user_item_matrix.columns,
)

#  Recommend top-5 movies for a user
def recommend_movies(pred_ratings_df, original_ratings, movies_df, user_id, n=5):
    """
    Return the n unrated movies with the highest predicted rating for a user.

    Parameters
    ----------
    pred_ratings_df : pandas.DataFrame
        Predicted ratings, indexed by user id with one column per movie id.
    original_ratings : pandas.DataFrame
        Observed ratings; its 'userId' and 'movieId' columns are read.
    movies_df : pandas.DataFrame
        Movie metadata; its 'movieId' and 'title' columns are read.
    user_id
        Row label of the user in `pred_ratings_df`.
    n : int, default 5
        Number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Columns 'movieId' and 'title', keeping `movies_df`'s index labels,
        ordered from highest to lowest predicted rating.

    Raises
    ------
    KeyError
        If `user_id` is not a row label of `pred_ratings_df`.
    """
    user_row = pred_ratings_df.loc[user_id]
    # Movies the user has already rated
    rated_movies = original_ratings.loc[original_ratings['userId'] == user_id, 'movieId'].unique()
    # Rank the remaining movies; ignore rated ids that have no prediction column.
    top = (
        user_row.drop(labels=list(rated_movies), errors='ignore')
        .sort_values(ascending=False)
        .head(n)
    )
    # The isin() filter returns rows in movies_df order, which would discard the
    # ranking; attach each movie's rank and sort on it to restore best-first order.
    rank_of = {movie_id: rank for rank, movie_id in enumerate(top.index)}
    picked = movies_df.loc[movies_df['movieId'].isin(top.index), ['movieId', 'title']]
    return (
        picked.assign(_rank=picked['movieId'].map(rank_of))
        .sort_values('_rank', kind='mergesort')
        .drop(columns='_rank')
    )

# Smoke test: five highest-predicted movies that user 1 has not rated yet.
print("\nTop 5 recommendations for user 1:")
print(recommend_movies(predicted_ratings_df, ratings, movies, user_id=1, n=5))

Matrix shape: (610, 9724)

Top 5 recommendations for user 1:
      movieId                              title
507       589  Terminator 2: Judgment Day (1991)
793      1036                    Die Hard (1988)
902      1200                      Aliens (1986)
1445     1968         Breakfast Club, The (1985)
2078     2762            Sixth Sense, The (1999)


A collaborative filtering model like SVD assumes that users who have shown similar preferences in the past will continue to have similar tastes in the future.

## Conceptual Analysis

### The Cold Start Problem in Recommender Systems

The "cold start problem" is the difficulty a recommender system faces when it has to make recommendations involving new users or new items (such as movies) for which there is little or no historical data.

#### New User Cold Start
When a new user joins the platform, the system has no information about their preferences or past ratings. This makes it challenging for collaborative filtering models, which rely on user interaction data to find similar users and make predictions.

- Content-Based Model: Handles the new user cold start better if the user provides some initial preferences (e.g., selecting favorite genres or keywords), as recommendations can be made immediately based on item attributes.
- Collaborative Filtering Model (e.g., SVD): Performs poorly in this scenario because it needs user-item interaction data to generate recommendations.

#### New Movie Cold Start
When a new movie is added, there are no user ratings or interactions for that movie.

- Content-Based Model: Handles the new movie cold start well, as it can recommend the new movie to users if its content (e.g., genres, description) matches their profiles, even without any user ratings.
- Collaborative Filtering Model: Struggles with new movies, since it can’t recommend items that have no ratings or interaction history.


In [13]:

import pickle

# Persist what the content-based recommender needs: the fitted vectoriser,
# the similarity matrix and the movie table.
with open('content_model.pkl', 'wb') as content_file:
    content_artifacts = {
        'tfidf': tfidf,
        'cosine_sim': cosine_sim,
        'movies': movies,
    }
    pickle.dump(content_artifacts, content_file)

# Persist what the collaborative-filtering recommender needs: the predicted
# rating table and the movie table.
with open('cf_model.pkl', 'wb') as cf_file:
    cf_artifacts = {
        'predicted_ratings_df': predicted_ratings_df,
        'movies': movies,
    }
    pickle.dump(cf_artifacts, cf_file)

In [14]:
from google.colab import files
# Hand the pickled content model to the browser via the google.colab helper.
# NOTE(review): cf_model.pkl is written by the previous cell but not downloaded here - confirm intent.
files.download('content_model.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>