In [18]:
!pip install streamlit

Collecting streamlit
  Downloading streamlit-1.45.1-py3-none-any.whl.metadata (8.9 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.45.1-py3-none-any.whl (9.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m115.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m110.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hIn

In [19]:
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [2]:
from google.colab import files
uploaded = files.upload()

Saving ratings.csv to ratings.csv
Saving movies.csv to movies.csv


In [2]:
movies = pd.read_csv("movies.csv")
ratings = pd.read_csv("ratings.csv")

In [3]:
print("Movies Data:")
display(movies.head())

print("Ratings Data:")
display(ratings.head())

Movies Data:


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


Ratings Data:


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
print("Movies Info:")
print(movies.info())
print("\nMissing Values:\n", movies.isnull().sum())

print("\nRatings Info:")
print(ratings.info())
print("\nMissing Values:\n", ratings.isnull().sum())

Movies Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB
None

Missing Values:
 movieId    0
title      0
genres     0
dtype: int64

Ratings Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB
None

Missing Values:
 userId       0
movieId      0
rating       0
timestamp    0
dtype: int64


In [5]:
movies['genres'] = movies['genres'].str.replace('|', ' ', regex=False)

tfidf = TfidfVectorizer(stop_words='english')

tfidf_matrix = tfidf.fit_transform(movies['genres'])

print("TF-IDF Matrix Shape :\n", tfidf_matrix.shape)


TF-IDF Matrix Shape :
 (9742, 23)


In [6]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

print("Cosine Similarity Matrix Shape:\n", cosine_sim.shape)


Cosine Similarity Matrix Shape:
 (9742, 9742)


In [7]:
indices = pd.Series(movies.index, index=movies['title']).drop_duplicates()

def get_recommendations(title, cosine_sim=cosine_sim):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    movie_indices = [i[0] for i in sim_scores]
    return movies['title'].iloc[movie_indices]


In [8]:
get_recommendations('Toy Story (1995)')

Unnamed: 0,title
1706,Antz (1998)
2355,Toy Story 2 (1999)
2809,"Adventures of Rocky and Bullwinkle, The (2000)"
3000,"Emperor's New Groove, The (2000)"
3568,"Monsters, Inc. (2001)"
6194,"Wild, The (2006)"
6486,Shrek the Third (2007)
6948,"Tale of Despereaux, The (2008)"
7760,Asterix and the Vikings (Astérix et les Viking...
8219,Turbo (2013)


In [9]:
R_df = ratings.pivot(index='userId', columns='movieId', values='rating').fillna(0)
R = R_df.values

In [10]:
from scipy.sparse.linalg import svds

k = 20
U, sigma, Vt = svds(R, k=k)
sigma = np.diag(sigma)


In [11]:
R_pred = np.dot(np.dot(U, sigma), Vt)
R_pred_df = pd.DataFrame(R_pred, index=R_df.index, columns=R_df.columns)


In [12]:
def recommend_movies_collaborative(user_id, R_pred_df, movies, num_recommendations=10):
    user_ratings = R_pred_df.loc[user_id]

    user_seen = ratings[ratings['userId'] == user_id]['movieId'].tolist()

    recommendations = user_ratings.drop(user_seen).sort_values(ascending=False).head(num_recommendations)

    recommended_movies = movies[movies['movieId'].isin(recommendations.index)]
    recommended_movies = recommended_movies.assign(predicted_rating=recommendations.values)

    return recommended_movies[['movieId', 'title', 'predicted_rating']]


In [13]:
user_id = 1
recommendations = recommend_movies_collaborative(user_id, R_pred_df, movies, num_recommendations=10)
print(recommendations)


      movieId                                      title  predicted_rating
31         32  Twelve Monkeys (a.k.a. 12 Monkeys) (1995)          3.931090
507       589          Terminator 2: Judgment Day (1991)          3.725029
659       858                      Godfather, The (1972)          3.334133
793      1036                            Die Hard (1988)          3.317304
902      1200                              Aliens (1986)          3.160155
958      1259                         Stand by Me (1986)          3.046347
1067     1387                                Jaws (1975)          3.020628
1445     1968                 Breakfast Club, The (1985)          2.895730
2078     2762                    Sixth Sense, The (1999)          2.874779
2195     2918            Ferris Bueller's Day Off (1986)          2.719621


In [14]:

def recommend_movies_content_based(user_id, ratings, cosine_sim_matrix, movies, top_n=10):
    user_ratings = ratings[ratings['userId'] == user_id]


    sim_scores = np.zeros(cosine_sim_matrix.shape[0])

    for _, row in user_ratings.iterrows():
        movie_idx = movies[movies['movieId'] == row['movieId']].index[0]
        sim_scores += cosine_sim_matrix[movie_idx] * row['rating']

    sim_scores /= user_ratings['rating'].sum()

    rated_movie_indices = user_ratings['movieId'].apply(lambda x: movies[movies['movieId'] == x].index[0])

    sim_scores[rated_movie_indices] = 0

    recommended_indices = np.argsort(sim_scores)[::-1][:top_n]

    recommended_movies = movies.iloc[recommended_indices][['movieId', 'title']].copy()
    recommended_movies['content_score'] = sim_scores[recommended_indices]

    return recommended_movies


In [15]:
def hybrid_recommendations(user_id, ratings, cosine_sim_matrix, movies, R_pred_df, top_n=10, alpha=0.6):
    content_recs = recommend_movies_content_based(user_id, ratings, cosine_sim_matrix, movies, top_n=top_n*2)

    collab_recs = recommend_movies_collaborative(user_id, R_pred_df, movies, num_recommendations=top_n*2)


    content_scores = content_recs['content_score'].values
    content_scores = (content_scores - content_scores.min()) / (content_scores.max() - content_scores.min())
    content_recs['norm_content_score'] = content_scores

    collab_scores = collab_recs['predicted_rating'].values
    collab_scores = (collab_scores - collab_scores.min()) / (collab_scores.max() - collab_scores.min())
    collab_recs['norm_collab_score'] = collab_scores

    combined = pd.merge(content_recs, collab_recs, on=['movieId', 'title'], how='outer')

    combined['norm_content_score'] = combined['norm_content_score'].fillna(0)
    combined['norm_collab_score'] = combined['norm_collab_score'].fillna(0)

    combined['hybrid_score'] = alpha * combined['norm_collab_score'] + (1 - alpha) * combined['norm_content_score']

    combined = combined.sort_values('hybrid_score', ascending=False).head(top_n)

    return combined[['movieId', 'title', 'hybrid_score']]

In [16]:
hybrid_recs = hybrid_recommendations(1, ratings, cosine_sim, movies, R_pred_df, top_n=10, alpha=0.6)
print(hybrid_recs)

    movieId                                      title  hybrid_score
0        32  Twelve Monkeys (a.k.a. 12 Monkeys) (1995)      0.600000
2       541                        Blade Runner (1982)      0.519725
38   117646      Dragonheart 2: A New Beginning (2000)      0.400000
4       589          Terminator 2: Judgment Day (1991)      0.367446
5       858                      Godfather, The (1972)      0.360890
6       924               2001: A Space Odyssey (1968)      0.299670
34    55116                  Hunting Party, The (2007)      0.294147
28     5657                           Flashback (1990)      0.290761
30     6990             The Great Train Robbery (1978)      0.290761
7      1036                            Die Hard (1988)      0.255334
