In [7]:
# Install required libraries
!pip install scikit-learn pandas numpy matplotlib seaborn -q

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix
import warnings
warnings.filterwarnings('ignore')

# ============================================================================
# 1. LOAD MOVIELENS 100K DATASET
# ============================================================================

# Download and extract the dataset
!wget http://files.grouplens.org/datasets/movielens/ml-100k.zip
!unzip ml-100k.zip

# Load the datasets
movies_cols = ['movie_id', 'title', 'release_date', 'video_release_date', 'IMDb_URL', 'unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
movies = pd.read_csv('ml-100k/u.item', sep='|', names=movies_cols, encoding='latin-1')

ratings_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=ratings_cols)

# For content-based filtering, we'll use a simplified genre list and the title
movies['genre'] = movies.apply(lambda row: ' '.join([col for col in movies_cols[6:] if row[col] == 1]), axis=1)
movies = movies[['movie_id', 'title', 'genre']]


class CollaborativeFiltering:
    def __init__(self, ratings_df):
        self.ratings_df = ratings_df
        self.user_item_matrix = None
        self.user_similarity = None

    def create_user_item_matrix(self):
        self.user_item_matrix = self.ratings_df.pivot_table(index='user_id', columns='movie_id', values='rating').fillna(0)
        return self.user_item_matrix

    def compute_similarity(self):
        self.user_similarity = cosine_similarity(self.user_item_matrix)
        self.user_similarity = pd.DataFrame(self.user_similarity, index=self.user_item_matrix.index, columns=self.user_item_matrix.index)
        return self.user_similarity

    def recommend(self, user_id, n_recommendations=5):
        if user_id not in self.user_item_matrix.index:
            return []
        similar_users = self.user_similarity[user_id].sort_values(ascending=False)[1:11]
        user_movies = set(self.ratings_df[self.ratings_df['user_id'] == user_id]['movie_id'])
        recommendations = {}
        for similar_user_id, similarity_score in similar_users.items():
            similar_user_movies = self.ratings_df[self.ratings_df['user_id'] == similar_user_id]
            for _, row in similar_user_movies.iterrows():
                if row['movie_id'] not in user_movies:
                    if row['movie_id'] not in recommendations:
                        recommendations[row['movie_id']] = 0
                    recommendations[row['movie_id']] += similarity_score * row['rating']
        recommended_movies = sorted(recommendations.items(), key=lambda x: x[1], reverse=True)[:n_recommendations]
        return [movie_id for movie_id, _ in recommended_movies]

class ContentBasedFiltering:
    def __init__(self, movies_df):
        self.movies_df = movies_df
        self.tfidf_matrix = None
        self.cosine_sim = None

    def create_feature_matrix(self):
        self.movies_df['features'] = self.movies_df['genre'] + ' ' + self.movies_df['title']
        tfidf = TfidfVectorizer(stop_words='english')
        self.tfidf_matrix = tfidf.fit_transform(self.movies_df['features'])
        self.cosine_sim = cosine_similarity(self.tfidf_matrix, self.tfidf_matrix)
        return self.cosine_sim

    def recommend(self, movie_title, n_recommendations=5):
        if movie_title not in self.movies_df['title'].values:
            return []
        idx = self.movies_df[self.movies_df['title'] == movie_title].index[0]
        sim_scores = list(enumerate(self.cosine_sim[idx]))
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:n_recommendations+1]
        movie_indices = [i[0] for i in sim_scores]
        return self.movies_df.iloc[movie_indices]['title'].tolist()

class MatrixFactorization:
    def __init__(self, ratings_df, n_components=10):
        self.ratings_df = ratings_df
        self.n_components = n_components
        self.user_item_matrix = None
        self.svd = None
        self.predicted_ratings = None

    def train(self):
        self.user_item_matrix = self.ratings_df.pivot_table(index='user_id', columns='movie_id', values='rating').fillna(0)
        self.svd = TruncatedSVD(n_components=self.n_components, random_state=42)
        matrix_transformed = self.svd.fit_transform(self.user_item_matrix)
        self.predicted_ratings = np.dot(matrix_transformed, self.svd.components_)
        self.predicted_ratings = pd.DataFrame(self.predicted_ratings, index=self.user_item_matrix.index, columns=self.user_item_matrix.columns)
        return self.predicted_ratings

    def recommend(self, user_id, n_recommendations=5):
        if user_id not in self.predicted_ratings.index:
            return []
        user_ratings = self.predicted_ratings.loc[user_id]
        rated_movies = set(self.ratings_df[self.ratings_df['user_id'] == user_id]['movie_id'])
        recommendations = user_ratings.drop(rated_movies, errors='ignore').sort_values(ascending=False)
        return recommendations.head(n_recommendations).index.tolist()

class HybridRecommender:
    def __init__(self, cf_recommender, cb_recommender, weight_cf=0.6):
        self.cf_recommender = cf_recommender
        self.cb_recommender = cb_recommender
        self.weight_cf = weight_cf
        self.weight_cb = 1 - weight_cf

    def recommend(self, user_id, user_favorite_movie, n_recommendations=5):
        cf_recs = self.cf_recommender.recommend(user_id, n_recommendations * 2)
        cb_recs_titles = self.cb_recommender.recommend(user_favorite_movie, n_recommendations * 2)
        cb_movie_ids = movies[movies['title'].isin(cb_recs_titles)]['movie_id'].tolist()

        recommendations = {}
        for i, movie_id in enumerate(cf_recs):
            recommendations[movie_id] = self.weight_cf * (len(cf_recs) - i)

        for i, movie_id in enumerate(cb_movie_ids):
            if movie_id in recommendations:
                recommendations[movie_id] += self.weight_cb * (len(cb_movie_ids) - i)
            else:
                recommendations[movie_id] = self.weight_cb * (len(cb_movie_ids) - i)

        sorted_recs = sorted(recommendations.items(), key=lambda x: x[1], reverse=True)[:n_recommendations]
        return [movie_id for movie_id, _ in sorted_recs]

# Initialize and train the recommenders with the MovieLens data
cf = CollaborativeFiltering(ratings)
cf.create_user_item_matrix()
cf.compute_similarity()

cb = ContentBasedFiltering(movies)
cb.create_feature_matrix()

mf = MatrixFactorization(ratings, n_components=5)
mf.train()

hybrid = HybridRecommender(cf, cb, weight_cf=0.6)

# Example recommendations
test_user = 1
test_movie = 'Toy Story (1995)' # Using a movie title from the MovieLens dataset

cf_recommendations = cf.recommend(test_user, n_recommendations=5)
cb_recommendations = cb.recommend(test_movie, n_recommendations=5)
mf_recommendations = mf.recommend(test_user, n_recommendations=5)
hybrid_recommendations = hybrid.recommend(test_user, test_movie, n_recommendations=5)

print('Collaborative Filtering Recommendations:', cf_recommendations)
print('Content-Based Recommendations:', cb_recommendations)
print('Matrix Factorization Recommendations:', mf_recommendations)
print('Hybrid Recommendations:', hybrid_recommendations)

--2025-11-05 12:52:37--  http://files.grouplens.org/datasets/movielens/ml-100k.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.96.204
Connecting to files.grouplens.org (files.grouplens.org)|128.101.96.204|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://files.grouplens.org/datasets/movielens/ml-100k.zip [following]
--2025-11-05 12:52:38--  https://files.grouplens.org/datasets/movielens/ml-100k.zip
Connecting to files.grouplens.org (files.grouplens.org)|128.101.96.204|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4924029 (4.7M) [application/zip]
Saving to: ‘ml-100k.zip’


2025-11-05 12:52:38 (9.04 MB/s) - ‘ml-100k.zip’ saved [4924029/4924029]

Archive:  ml-100k.zip
   creating: ml-100k/
  inflating: ml-100k/allbut.pl       
  inflating: ml-100k/mku.sh          
  inflating: ml-100k/README          
  inflating: ml-100k/u.data          
  inflating: ml-100k/u.genre         
  inflating: ml-100

In [8]:
# Evaluate Collaborative Filtering
cf_recommendations = cf.recommend(test_user, n_recommendations=5)
print(f'Collaborative Filtering Recommendations for user {test_user}: {cf_recommendations}')

Collaborative Filtering Recommendations for user 1: [np.int64(318), np.int64(474), np.int64(655), np.int64(423), np.int64(403)]


In [9]:
# Evaluate Content-Based Filtering
cb_recommendations = cb.recommend(test_movie, n_recommendations=5)
print(f'Content-Based Recommendations for movie "{test_movie}": {cb_recommendations}')

Content-Based Recommendations for movie "Toy Story (1995)": ["Pyromaniac's Love Story, A (1995)", 'Balto (1995)', 'Goofy Movie, A (1995)', 'NeverEnding Story III, The (1994)', 'Pocahontas (1995)']


In [10]:
# Evaluate Matrix Factorization
mf_recommendations = mf.recommend(test_user, n_recommendations=5)
print(f'Matrix Factorization Recommendations for user {test_user}: {mf_recommendations}')

Matrix Factorization Recommendations for user 1: [475, 357, 273, 318, 276]


In [11]:
# Evaluate Hybrid Recommender
hybrid_recommendations = hybrid.recommend(test_user, test_movie, n_recommendations=5)
print(f'Hybrid Recommendations for user {test_user} based on favorite movie "{test_movie}": {hybrid_recommendations}')

Hybrid Recommendations for user 1 based on favorite movie "Toy Story (1995)": [np.int64(318), np.int64(474), np.int64(655), np.int64(423), 95]
