# Movie Recommendation System
This notebook demonstrates how to build a hybrid movie recommendation system using collaborative filtering and content-based filtering techniques.

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from surprise import SVD, Dataset, Reader, accuracy
from surprise.model_selection import train_test_split


## Load and Preview Data

In [None]:
# Read the input CSVs from the working directory.
# `low_memory=False` makes pandas parse movies_metadata.csv in a single pass
# instead of chunk-by-chunk.
ratings = pd.read_csv('ratings.csv')
movies = pd.read_csv('movies_metadata.csv', low_memory=False)
keywords, credits = (
    pd.read_csv(csv_name) for csv_name in ('keywords.csv', 'credits.csv')
)

# Show the first few rating rows as the cell output.
ratings.head()


## Data Cleaning

In [None]:
# Coerce the metadata `id` column to integers. Values that cannot be parsed as
# numbers become NaN (errors='coerce') and those rows are dropped.
movies = (
    movies
    .assign(id=pd.to_numeric(movies['id'], errors='coerce'))
    .dropna(subset=['id'])
    .astype({'id': int})
    # A repeated id would duplicate every matching rating row in the merge below.
    .drop_duplicates(subset='id')
    .loc[:, ['id', 'title', 'overview']]
    # Dropping rows leaves gaps in the index. Later cells use index values as
    # row positions into the similarity matrix, so keep labels == positions.
    .reset_index(drop=True)
)

# Build new frames instead of mutating in place so the cell can be re-run safely.
ratings = ratings.assign(movieId=ratings['movieId'].astype(int))

# Inner join: keep only ratings whose movieId has a metadata row, then drop
# rows without an overview.
# NOTE(review): this assumes ratings.movieId and movies.id use the same id
# space -- confirm against the dataset documentation.
merged = (
    pd.merge(ratings, movies, left_on='movieId', right_on='id')
    .dropna(subset=['overview'])
)

merged.head()


## Collaborative Filtering with SVD

In [None]:
# Fixed seed so the train/test split and the SVD factor initialisation -- and
# therefore the reported RMSE -- are the same on every Restart & Run All.
SEED = 42

# Ratings are declared to lie on a 0.5-5.0 scale.
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# Hold out 20% of the ratings for evaluation.
trainset, testset = train_test_split(data, test_size=0.2, random_state=SEED)

model = SVD(random_state=SEED)
model.fit(trainset)
predictions = model.test(testset)

# accuracy.rmse prints the score itself unless verbose=False; silence it so
# the value is reported only once.
rmse = accuracy.rmse(predictions, verbose=False)
print(f"RMSE: {rmse}")


## Content-Based Filtering with TF-IDF

In [None]:
# Vectorise movie overviews with TF-IDF, ignoring English stop words.
# Missing overviews become empty strings (all-zero TF-IDF rows).
tfidf = TfidfVectorizer(stop_words='english')
movies = movies.assign(overview=movies['overview'].fillna(''))
tfidf_matrix = tfidf.fit_transform(movies['overview'])

# Pairwise cosine similarity between all overviews. The result is a dense
# (n_movies x n_movies) array, so memory grows quadratically with the catalogue.
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Title -> row position lookup. Positions (not index labels) are stored because
# `cosine_sim` and `.iloc` are positional. Series.drop_duplicates() would
# dedupe the values, which are already unique, so dedupe on the title index
# instead and keep the first movie for each title.
indices = pd.Series(np.arange(len(movies)), index=movies['title'])
indices = indices[indices.index.notna() & ~indices.index.duplicated(keep='first')]

def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles of the movies whose overviews are most similar to `title`.

    Parameters
    ----------
    title : str
        Movie title to look up in the module-level `indices` mapping.
    cosine_sim : array-like
        Square similarity matrix whose rows/columns follow the row order of
        `movies`.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        Titles of the `top_n` most similar movies, most similar first.

    Raises
    ------
    KeyError
        If `title` is not present in `indices`.
    """
    idx = indices[title]
    # A title shared by several movies yields a Series; use the first match.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    scores = np.asarray(cosine_sim[idx])
    # Stable descending sort: ties keep their original row order.
    ranked = np.argsort(-scores, kind='stable')
    # Drop the query movie explicitly rather than assuming it sorts first;
    # a movie with an empty overview has similarity 0 even with itself.
    ranked = ranked[ranked != idx][:top_n]
    return movies['title'].iloc[ranked]

get_recommendations('The Dark Knight')


## Hybrid Recommendation System

In [None]:
def hybrid_recommendation(user_id, title, model, cosine_sim=cosine_sim,
                          n_candidates=25, top_n=10, cf_weight=0.7,
                          rating_scale=(0.5, 5.0)):
    """Recommend movies by blending content similarity with predicted ratings.

    The `n_candidates` movies most similar to `title` are taken from
    `cosine_sim`, then re-ranked by a weighted sum of the model's predicted
    rating for `user_id` and the content similarity.

    Parameters
    ----------
    user_id : raw user id passed to `model.predict`.
    title : str
        Seed movie title, looked up in the module-level `indices` mapping.
    model : fitted estimator exposing `predict(uid, iid).est`.
    cosine_sim : array-like
        Square similarity matrix whose rows/columns follow the row order of
        `movies`.
    n_candidates : int, default 25
        Number of content-similar movies to re-rank.
    top_n : int, default 10
        Number of titles to return.
    cf_weight : float, default 0.7
        Weight of the predicted rating; the similarity gets `1 - cf_weight`.
    rating_scale : tuple(float, float), default (0.5, 5.0)
        (min, max) rating used to rescale predictions to [0, 1]; the default
        matches the scale given to `Reader` in this notebook.

    Returns
    -------
    list of str
        Up to `top_n` titles, highest blended score first.

    Raises
    ------
    KeyError
        If `title` is not present in `indices`.
    """
    idx = indices[title]
    # A title shared by several movies yields a Series; use the first match.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    sims = np.asarray(cosine_sim[idx])
    ranked = np.argsort(-sims, kind='stable')
    # Exclude the seed movie explicitly instead of assuming it sorts first.
    candidates = ranked[ranked != idx][:n_candidates]

    low, high = rating_scale
    hybrid_scores = []
    for pos in candidates:
        movie_id = int(movies['id'].iloc[pos])
        est = model.predict(user_id, movie_id).est
        # Rescale the predicted rating to [0, 1] so both terms share a scale;
        # otherwise the rating term swamps the similarity term.
        est_scaled = (est - low) / (high - low)
        # Read the similarity at the candidate's own row position; looking it
        # up again by id would use an index label as a position.
        score = cf_weight * est_scaled + (1 - cf_weight) * sims[pos]
        hybrid_scores.append((movies['title'].iloc[pos], score))

    hybrid_scores.sort(key=lambda pair: pair[1], reverse=True)
    return [movie_title for movie_title, _ in hybrid_scores[:top_n]]

hybrid_recommendation(1, 'The Dark Knight', model)


## Visualizations

In [None]:
# Histogram of all user ratings (20 bins) with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(ratings['rating'], bins=20, kde=True, color='steelblue', ax=ax)
ax.set_title('User Rating Distribution')
ax.set_xlabel('Rating')
ax.set_ylabel('Frequency')
plt.show()


In [None]:
# Ten movieIds with the most ratings, in descending order of rating count.
popular_movies = ratings.groupby('movieId').size().sort_values(ascending=False).head(10)

# Look titles up by id so each count stays attached to its own movie.
# Filtering `movies` with isin() returns rows in `movies` order (and possibly
# fewer or more than ten), which misaligns the counts or raises on assignment.
title_by_id = (
    movies.dropna(subset=['title'])
    .drop_duplicates(subset='id')
    .set_index('id')['title']
)
popular_movie_titles = pd.DataFrame({
    'id': popular_movies.index,
    # Fall back to the raw id when a rated movie has no metadata row.
    'title': [title_by_id.get(mid, f'movieId {mid}') for mid in popular_movies.index],
    'count': popular_movies.values,
})

fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(x='count', y='title', data=popular_movie_titles, palette='muted', ax=ax)
ax.set_title('Top 10 Most Rated Movies')
ax.set_xlabel('Rating Count')
ax.set_ylabel('Movie Title')
plt.show()
