# Recommendation Model Mini Project

In [1]:
import zipfile
import numpy as np
import pandas as pd
from urllib.request import urlretrieve
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer


In [2]:
import ssl
import certifi


def _certifi_https_context():
    """Return a default SSL context that verifies against certifi's CA bundle."""
    return ssl.create_default_context(cafile=certifi.where())


# Swap the ssl module's private default-HTTPS-context factory for the one above.
# NOTE(review): presumably needed so the urlretrieve download works on machines
# whose system CA store is missing or outdated - confirm before removing.
ssl._create_default_https_context = _certifi_https_context


In [4]:
# Fetch the MovieLens 25M archive only when a readable copy is not already on
# disk, so that Restart & Run All does not pull the archive again on each run.
# zipfile.is_zipfile returns False both for a missing file and for one that
# lacks a valid end-of-archive record, so an interrupted download is retried.
if not zipfile.is_zipfile('ml-25m.zip'):
    print('Downloading MovieLens 25M …')
    urlretrieve('https://files.grouplens.org/datasets/movielens/ml-25m.zip', 'ml-25m.zip')

# The context manager closes the archive handle as soon as extraction finishes
# (the original left the ZipFile open for the lifetime of the kernel).
with zipfile.ZipFile('ml-25m.zip', 'r') as z:
    z.extractall()
print('Done.')


Downloading MovieLens 25M …
Done.


In [5]:
# Read the two MovieLens tables extracted into ml-25m/ and report their shapes.
ratings = pd.read_csv('ml-25m/ratings.csv')  # columns: userId,movieId,rating,timestamp
movies = pd.read_csv('ml-25m/movies.csv')    # columns: movieId,title,genres
print('movies:', movies.shape, 'ratings:', ratings.shape)


movies: (62423, 3) ratings: (25000095, 4)


In [6]:
# Tunable thresholds for the dense subset (previously repeated magic numbers).
MIN_MOVIE_RATINGS = 200   # a movie needs at least this many ratings to be kept
MIN_USER_RATINGS = 200    # a user needs at least this many ratings to be kept
MAX_USERS = 3000          # keep at most this many of the most active users
MAX_MOVIES = 3000         # keep at most this many of the most rated movies

# Attach title and genres to every rating; the left join keeps all rating rows.
merged = ratings.merge(movies[['movieId','title','genres']], on='movieId', how='left')

# df_min: the full (user, title, rating) table, used by the content-based recommender.
df_min = merged[['userId','title','rating']].rename(columns={'userId':'user_id'})

# First pass: drop rarely rated movies and low-activity users.
movie_counts = merged['movieId'].value_counts()
user_counts  = merged['userId'].value_counts()
popular_movies = set(movie_counts[movie_counts >= MIN_MOVIE_RATINGS].index)
active_users   = set(user_counts[user_counts >= MIN_USER_RATINGS].index)
small = merged[merged['movieId'].isin(popular_movies) & merged['userId'].isin(active_users)]

# Second pass: cap the subset at the busiest users and movies *within* the
# first-pass result, so the user x title pivot built from it stays small.
top_users  = small['userId'].value_counts().head(MAX_USERS).index
top_movies = small['movieId'].value_counts().head(MAX_MOVIES).index
small = small[small['userId'].isin(top_users) & small['movieId'].isin(top_movies)]

# df_small: same three columns as df_min, restricted to the dense subset.
df_small = small[['userId','title','rating']].rename(columns={'userId':'user_id'})

print('df_min:', df_min.shape, 'df_small:', df_small.shape)


df_min: (25000095, 3) df_small: (3017584, 3)


In [7]:
# Content-based features: every movie is described by a TF-IDF vector built
# from its genre string ("Action|Comedy" -> "action comedy").
movies_cb = movies.assign(genres=movies['genres'].fillna('(no genres listed)'))
movies_cb = movies_cb.assign(
    genres_str=movies_cb['genres'].str.replace('|', ' ', regex=False).str.lower()
)

# The token pattern treats each run of non-whitespace characters as one token,
# so genre names containing punctuation (e.g. hyphens) are not split.
tfidf = TfidfVectorizer(token_pattern=r'[^\s]+')
tfidf_matrix = tfidf.fit_transform(movies_cb['genres_str'])

# Pairwise genre similarity between all movies.
# NOTE(review): cosine_similarity returns a dense n_movies x n_movies float64
# array by default; with the 62,423 movies loaded above that is roughly 31 GB.
# Consider scoring against the sparse tfidf_matrix instead if memory is tight.
genre_sim = cosine_similarity(tfidf_matrix)

# Positional lookups shared with content_based_recommendation. When a title
# occurs more than once, the mapping keeps the last position.
titles = movies_cb['title'].to_list()
title_to_idx = dict(zip(titles, range(len(titles))))


In [8]:
def content_based_recommendation(user_id, df, top_k=5):
    """Recommend unrated titles whose genres resemble what a user has rated.

    Each title the user rated contributes its row of the notebook-level
    ``genre_sim`` matrix to a running total, so a candidate's score is the sum
    of its genre similarities to all of the user's rated titles. The rating
    values themselves are not used, only which titles were rated.

    Relies on the notebook globals ``titles``, ``title_to_idx`` and
    ``genre_sim``.

    Parameters
    ----------
    user_id
        Value matched against ``df['user_id']``.
    df : pandas.DataFrame
        Must contain ``user_id`` and ``title`` columns.
    top_k : int, default 5
        Maximum number of titles to return.

    Returns
    -------
    list
        Up to ``top_k`` distinct titles in descending score order. Empty when
        ``top_k`` is not positive, the user has no rows in ``df``, none of the
        user's titles are in the catalogue, or no candidate has a positive
        score.
    """
    # Previously top_k=0 never satisfied the `== top_k` stop condition and the
    # function returned every positively scored title.
    if top_k <= 0:
        return []

    user_titles = df.loc[df['user_id'] == user_id, 'title']
    if user_titles.empty:
        return []
    rated = set(user_titles.tolist())

    # Rated titles that are absent from the catalogue lookup are skipped.
    rated_idx = [title_to_idx[t] for t in rated if t in title_to_idx]
    if not rated_idx:
        return []

    scores = np.zeros(len(titles), dtype=np.float64)
    for idx in rated_idx:
        scores += genre_sim[idx]
    # Push the rated movies below every genuine candidate.
    for idx in rated_idx:
        scores[idx] = -1.0

    recs = []
    emitted = set()
    for i in np.argsort(scores)[::-1]:
        # Indices come in descending score order, so once a score is not
        # positive nothing after it can qualify either.
        if scores[i] <= 0:
            break
        title = titles[i]
        # `title_to_idx` keeps one position per title, so a rated title can
        # still show up at another catalogue position; filter by name too.
        # `emitted` stops two catalogue rows with the same title from both
        # being recommended.
        if title in rated or title in emitted:
            continue
        emitted.add(title)
        recs.append(title)
        if len(recs) >= top_k:
            break
    return recs


In [9]:
# User x title rating matrix for the dense subset. pivot_table's default
# aggregation averages the ratings when a (user, title) pair occurs more than once.
pivot = pd.pivot_table(df_small, index='user_id', columns='title', values='rating')

# Missing ratings are filled with 0 only for the similarity computation;
# `pivot` itself keeps NaN so that unseen titles can still be detected later.
mat = pivot.fillna(0.0).to_numpy()
user_sim = cosine_similarity(mat)

# Map each user_id to its row position in `pivot` / `user_sim`.
uidx = dict(zip(pivot.index.tolist(), range(len(pivot.index))))


In [10]:
def collaborative_filtering_recommendation(user_id, top_neighbors=25, top_k=10):
    """Recommend titles the user has not rated, using the most similar users.

    The ``top_neighbors`` users with the highest similarity to ``user_id`` in
    the notebook-level ``user_sim`` matrix are selected. Each title the target
    user has not rated is scored by the plain (unweighted) mean of the ratings
    those neighbours gave it; neighbours who did not rate a title are ignored
    in that title's mean.

    Relies on the notebook globals ``uidx``, ``user_sim`` and ``pivot``.

    Parameters
    ----------
    user_id
        Key into ``uidx`` / label in ``pivot.index``.
    top_neighbors : int, default 25
        Number of most similar users to average over.
    top_k : int, default 10
        Maximum number of titles to return.

    Returns
    -------
    list
        Up to ``top_k`` titles in descending mean-rating order. Empty when the
        user is unknown, when ``top_neighbors`` or ``top_k`` is not positive,
        or when no neighbour rated any title the user has not rated.
    """
    if user_id not in uidx or top_neighbors <= 0 or top_k <= 0:
        return []

    i = uidx[user_id]
    sims = user_sim[i].copy()
    sims[i] = -1.0  # rank the user below everyone else
    ranked = np.argsort(sims)[::-1]
    # Drop the user explicitly as well, so they can never be their own
    # neighbour even when top_neighbors covers every row.
    neigh_idx = ranked[ranked != i][:top_neighbors]

    unseen = pivot.loc[user_id].isna()
    mean_scores = pivot.iloc[neigh_idx].mean(axis=0)

    # Titles that none of the neighbours rated have a NaN mean. sort_values
    # places them last but keeps them, so without dropna they would be returned
    # as "recommendations" whenever fewer than top_k real candidates exist.
    candidates = mean_scores[unseen].sort_values(ascending=False).dropna()
    return candidates.index.tolist()[:top_k]


In [11]:
# Spot-check both recommenders on three users drawn (with a fixed seed) from the
# dense subset, so that the collaborative model knows every sampled user.
test_users = (
    df_small['user_id']
    .drop_duplicates()
    .sample(n=3, random_state=7)
    .tolist()
)
for uid in test_users:
    content_recs = content_based_recommendation(uid, df_min)
    collab_recs = collaborative_filtering_recommendation(uid)
    print('\nUSER:', uid)
    print('Content-based:', content_recs)
    print('Collaborative:', collab_recs)



USER: 70271
Content-based: ['Once Upon a Time (2008)', 'Stunt Man, The (1980)', 'Joseph Andrews (1977)', 'Red Peony Gambler (1968)', 'Longshot (2001)']
Collaborative: ['Planet Earth (2006)', 'There Will Be Blood (2007)', 'Aguirre: The Wrath of God (Aguirre, der Zorn Gottes) (1972)', 'Fanny and Alexander (Fanny och Alexander) (1982)', 'Cook the Thief His Wife & Her Lover, The (1989)', 'Paths of Glory (1957)', 'Solaris (Solyaris) (1972)', 'Goodfellas (1990)', 'Touch of Evil (1958)', 'Manhattan (1979)']

USER: 109090
Content-based: ['Bummer (1973)', "Pusher III: I'm the Angel of Death (2005)", 'A Bloody Aria (2006)', 'Orca: The Killer Whale (1977)', 'Trigger Man (2007)']
Collaborative: ['Wallace & Gromit: The Best of Aardman Animation (1996)', 'Ran (1985)', 'Thin Man, The (1934)', 'Good, the Bad and the Ugly, The (Buono, il brutto, il cattivo, Il) (1966)', 'Lives of Others, The (Das leben der Anderen) (2006)', 'Sting, The (1973)', 'Bridge on the River Kwai, The (1957)', "Amelie (Fabuleux