In [2]:
#!pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━[0m [32m112.6/154.4 kB[0m [31m4.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp311-cp311-linux_x86_64.whl size=2505173 sha256=92cf3dcdce475fef123b66c11e0201c2fdbd3a7e21de8dc3d796972d2591c177
  Stored in directory: /root/.cache/pip/wheels/2a/8f/6

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, Model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, precision_score
from sklearn.preprocessing import MinMaxScaler
from surprise import SVD, Dataset, Reader
import ast
import torch
import pickle
import os
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity


  from .autonotebook import tqdm as notebook_tqdm


In [3]:

# Define project path
PROJECT_PATH = r"C:\Users\SOHAM\Movie Recommendation System"
DATA_PATH = os.path.join(PROJECT_PATH, "data")
MODELS_PATH = os.path.join(PROJECT_PATH, "models")

MOVIES_PATH = os.path.join(DATA_PATH, "movies_metadata.csv")
RATINGS_PATH = os.path.join(DATA_PATH, "ratings.csv")
CREDITS_PATH = os.path.join(DATA_PATH, "credits.csv")
KEYWORDS_PATH = os.path.join(DATA_PATH, "keywords.csv")

#  Load datasets 
movies = pd.read_csv(MOVIES_PATH, low_memory=False)
ratings = pd.read_csv(RATINGS_PATH)
credits = pd.read_csv(CREDITS_PATH)
keywords = pd.read_csv(KEYWORDS_PATH)

# Preprocessing

In [6]:
# Preprocess movies_metadata.csv
movies = movies[['id', 'title', 'genres', 'release_date', 'popularity']].copy()
movies['id'] = pd.to_numeric(movies['id'], errors='coerce')
movies['popularity'] = pd.to_numeric(movies['popularity'], errors='coerce')
movies = movies.dropna(subset=['id', 'popularity'])

# Process genres
movies['genres'] = movies['genres'].fillna('[]').apply(
    lambda x: [i['name'] for i in ast.literal_eval(x)] if isinstance(x, str) and x.startswith('[') else []
)

# Extract release year
movies['release_date'] = pd.to_datetime(movies['release_date'], errors='coerce')
movies['release_year'] = movies['release_date'].dt.year

# Ensure consistent data type for 'id'
movies['id'] = pd.to_numeric(movies['id'], errors='coerce')
credits['id'] = pd.to_numeric(credits['id'], errors='coerce')
keywords['id'] = pd.to_numeric(keywords['id'], errors='coerce')

# Drop rows with NaN values in 'id'
movies = movies.dropna(subset=['id']).reset_index(drop=True)
credits = credits.dropna(subset=['id']).reset_index(drop=True)
keywords = keywords.dropna(subset=['id']).reset_index(drop=True)

  movies['release_date'] = pd.to_datetime(movies['release_date'], errors='coerce')


In [7]:
# Process credits.csv
def extract_top_cast(cast_data, top_n=3):
    try:
        cast = ast.literal_eval(cast_data)
        return ', '.join([c['name'] for c in cast[:top_n]])
    except:
        return ''  # Return empty string to avoid KeyError

def extract_directors(crew_data):
    try:
        crew = ast.literal_eval(crew_data)
        return ', '.join([c['name'] for c in crew if c['job'] == 'Director'])
    except:
        return ''  # Return empty string to avoid KeyError

credits['top_cast'] = credits['cast'].apply(lambda x: extract_top_cast(x, top_n=3))
credits['director'] = credits['crew'].apply(extract_directors)
credits = credits[['id', 'top_cast', 'director']]

In [8]:
# Process keywords.csv
def extract_keywords(keywords_data, top_n=10):
    try:
        if pd.isna(keywords_data) or keywords_data == '':
            return ''
        keywords_list = ast.literal_eval(keywords_data)
        return ', '.join([k['name'] for k in keywords_list[:top_n]])
    except:
        return ''

keywords['keywords_str'] = keywords['keywords'].apply(lambda x: extract_keywords(x, top_n=10))
keywords = keywords[['id', 'keywords_str']]


In [9]:
# Merge datasets
movies = movies.merge(credits, left_on='id', right_on='id', how='left')
movies = movies.merge(keywords, left_on='id', right_on='id', how='left')

# Replace NaN values with empty strings
movies = movies.fillna('')

# Create combined features for embeddings
movies['genres_str'] = movies['genres'].apply(lambda x: ', '.join(x) if isinstance(x, list) else '')
movies['combined_features'] = (
    movies['genres_str'] + ', ' +
    movies['top_cast'] + ', ' +
    movies['director'] + ', ' +
    movies['keywords_str']
)

# Process ratings.csv
ratings = ratings.drop_duplicates().dropna()
ratings['userId'] = ratings['userId'].astype(int)
ratings['movieId'] = ratings['movieId'].astype(int)

# Filter for valid movieIds
valid_movie_ids = movies['id'].unique()
ratings = ratings[ratings['movieId'].isin(valid_movie_ids)]

# Merge with ratings
movies_ratings = ratings.merge(
    movies[['id', 'title', 'genres_str', 'popularity', 'release_year', 'combined_features']],
    left_on='movieId',
    right_on='id',
    how='inner'
)

# Final columns for training
movies_ratings = movies_ratings[['userId', 'movieId', 'rating', 'title', 'genres_str', 'popularity', 'release_year', 'combined_features']].copy()


In [10]:
movies_ratings.head()

Unnamed: 0,userId,movieId,rating,title,genres_str,popularity,release_year,combined_features
0,1,110,1.0,Three Colors: Red,"Drama, Mystery, Romance",7.832755,1994.0,"Drama, Mystery, Romance, Irène Jacob, Jean-Lou..."
1,1,147,4.5,The 400 Blows,Drama,7.268688,1959.0,"Drama, Jean-Pierre Léaud, Claire Maurier, Albe..."
2,1,858,5.0,Sleepless in Seattle,"Comedy, Drama, Romance",10.234919,1993.0,"Comedy, Drama, Romance, Tom Hanks, Meg Ryan, B..."
3,1,1246,5.0,Rocky Balboa,Drama,11.697604,2006.0,"Drama, Sylvester Stallone, Burt Young, Milo Ve..."
4,1,1968,4.0,Fools Rush In,"Drama, Comedy, Romance",6.285574,1997.0,"Drama, Comedy, Romance, Matthew Perry, Salma H..."


In [11]:
# Reduce dataset size for efficient training
# movies_ratings = movies_ratings.sample(n=80000, random_state=42).reset_index(drop=True)

# Normalize ratings
scaler = MinMaxScaler()
movies_ratings['rating_scaled'] = scaler.fit_transform(movies_ratings[['rating']])

# Map user and movie IDs
user_ids = movies_ratings['userId'].unique()
movie_ids = movies_ratings['movieId'].unique()

user_to_index = {user: i for i, user in enumerate(user_ids)}
movie_to_index = {movie: i for i, movie in enumerate(movie_ids)}

movies_ratings['user_idx'] = movies_ratings['userId'].map(user_to_index)
movies_ratings['movie_idx'] = movies_ratings['movieId'].map(movie_to_index)


In [12]:
# Load Hugging Face model
MODEL_PATH = os.path.join(MODELS_PATH, "sentence-transformers_all-MiniLM-L6-v2")

if os.path.exists(MODEL_PATH):
    print("✅ Loading Hugging Face Model from Drive...")
    model = AutoModel.from_pretrained(MODEL_PATH)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
else:
    print("🚀 Downloading Hugging Face Model...")
    MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
    model = AutoModel.from_pretrained(MODEL_NAME)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model.save_pretrained(MODEL_PATH)
    tokenizer.save_pretrained(MODEL_PATH)
    print(f"✅ Model saved at: {MODEL_PATH}")



✅ Loading Hugging Face Model from Drive...


In [13]:
# Generate embeddings in batches
def generate_embeddings_batch(texts, batch_size=128):
    embeddings = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]
        inputs = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True)
        with torch.no_grad():
            outputs = model(**inputs)
        batch_embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
        embeddings.extend(batch_embeddings)
    return embeddings

movies['embeddings'] = generate_embeddings_batch(movies['combined_features'].tolist())

# Save embeddings
MOVIES_EMBEDDINGS_PATH = os.path.join(MODELS_PATH, "movies_with_embeddings.csv")
movies.to_csv(MOVIES_EMBEDDINGS_PATH, index=False)


In [14]:

# Prepare movie embeddings matrix for similarity computation
movie_embeddings_matrix = np.vstack(movies['embeddings'].values)

# Define recommendation functions
def recommend_movies(query, top_n=5):
    query_embedding = generate_embeddings_batch([query])[0].reshape(1, -1)
    similarities = cosine_similarity(query_embedding, movie_embeddings_matrix)[0]
    movies['similarity'] = similarities
    recommendations = movies.sort_values(by='similarity', ascending=False).head(top_n)
    return recommendations[['title', 'similarity']]


In [15]:
def recommend_movies_svd(user_id, movies_ratings, top_n=5):
    user_rated_movies = movies_ratings[movies_ratings['userId'] == user_id]['movieId'].tolist()
    all_movie_ids = movies_ratings['movieId'].unique()
    unrated_movies = [movie for movie in all_movie_ids if movie not in user_rated_movies]
    predictions = [(movie, svd.predict(user_id, movie).est) for movie in unrated_movies]
    top_predictions = sorted(predictions, key=lambda x: x[1], reverse=True)[:top_n]
    recommended_movies = pd.DataFrame({
        'movieId': [movie for movie, _ in top_predictions],
        'predicted_rating': [rating for _, rating in top_predictions]
    })
    recommended_movies = recommended_movies.merge(
        movies_ratings[['movieId', 'title', 'genres_str', 'release_year']].drop_duplicates(),
        on='movieId'
    )
    return recommended_movies[['movieId', 'title', 'predicted_rating', 'release_year']].head(top_n)


In [16]:
def hybrid_recommendation(user_id, genre, movies_ratings, top_n=5):
    svd_recommendations = recommend_movies_svd(user_id, movies_ratings, top_n * 2)
    genre_movies = movies_ratings[movies_ratings['genres_str'].str.contains(genre, case=False)]
    genre_movies = genre_movies.drop_duplicates(subset='movieId')
    top_genre_movies = genre_movies.sort_values(by='popularity', ascending=False).head(top_n * 2)
    combined = pd.merge(svd_recommendations, top_genre_movies[['title', 'popularity']], on='title', how='outer')
    combined['predicted_rating'] = combined['predicted_rating'].fillna(0)
    combined['popularity'] = combined['popularity'].fillna(0)
    combined['hybrid_score'] = combined['predicted_rating'] * 0.7 + combined['popularity'] * 0.3
    return combined.sort_values(by='hybrid_score', ascending=False).head(top_n)


# Model Train

In [17]:
# Train Neural Network Model
embedding_dim = 20
user_input = layers.Input(shape=(1,), name='user_input')
user_embedding = layers.Embedding(input_dim=len(user_ids), output_dim=embedding_dim)(user_input)
user_vector = layers.Flatten()(user_embedding)

movie_input = layers.Input(shape=(1,), name='movie_input')
movie_embedding = layers.Embedding(input_dim=len(movie_ids), output_dim=embedding_dim)(movie_input)
movie_vector = layers.Flatten()(movie_embedding)

concatenated = layers.Concatenate()([user_vector, movie_vector])
dense_layer_1 = layers.Dense(256, activation='relu')(concatenated)
dropout_1 = layers.Dropout(0.3)(dense_layer_1)
output = layers.Dense(1, activation='linear')(dropout_1)

nn_model = Model(inputs=[user_input, movie_input], outputs=output)
nn_model.compile(optimizer='adam', loss='mse', metrics=['mae'])

train, test = train_test_split(movies_ratings, test_size=0.2, random_state=42)
early_stopping = tf.keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True)

nn_model.fit([train['user_idx'], train['movie_idx']], train['rating_scaled'],
             validation_data=([test['user_idx'], test['movie_idx']], test['rating_scaled']),
             epochs=15, batch_size=256, callbacks=[early_stopping])

Epoch 1/15
[1m35781/35781[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1536s[0m 43ms/step - loss: 0.0435 - mae: 0.1603 - val_loss: 0.0365 - val_mae: 0.1458
Epoch 2/15
[1m35781/35781[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1565s[0m 44ms/step - loss: 0.0349 - mae: 0.1425 - val_loss: 0.0352 - val_mae: 0.1440
Epoch 3/15
[1m35781/35781[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1622s[0m 45ms/step - loss: 0.0323 - mae: 0.1361 - val_loss: 0.0343 - val_mae: 0.1408
Epoch 4/15
[1m35781/35781[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1618s[0m 45ms/step - loss: 0.0305 - mae: 0.1316 - val_loss: 0.0343 - val_mae: 0.1403
Epoch 5/15
[1m35781/35781[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1622s[0m 45ms/step - loss: 0.0292 - mae: 0.1282 - val_loss: 0.0340 - val_mae: 0.1401
Epoch 6/15
[1m35781/35781[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1505s[0m 42ms/step - loss: 0.0281 - mae: 0.1254 - val_loss: 0.0341 - val_mae: 0.1407
Epoch 7/15
[1m35781/35781[0m [32m━━━━

<keras.src.callbacks.history.History at 0x1e5143f89a0>

In [18]:
# Save Neural Network Model
NN_MODEL_PATH = os.path.join(MODELS_PATH, "movie_recommendation_model.keras")
nn_model.save(NN_MODEL_PATH)


In [19]:
# Train and Save SVD Model
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(movies_ratings[['userId', 'movieId', 'rating']], reader)
svd = SVD()
trainset = data.build_full_trainset()
svd.fit(trainset)


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1e513c183d0>

In [20]:
# Save the SVD Model
SVD_MODEL_PATH = os.path.join(MODELS_PATH, "svd_model.pkl")
with open(SVD_MODEL_PATH, 'wb') as f:
    pickle.dump(svd, f)

print("SVD Model Saved Successfully!")

SVD Model Saved Successfully!


In [21]:
#################################################################################################################################
#  Evaluation and Metrics - Nueral Network
#################################################################################################################################

# Evaluate Neural Network Model
nn_predictions = nn_model.predict([test['user_idx'], test['movie_idx']])
nn_predictions_rescaled = scaler.inverse_transform(nn_predictions)
nn_predictions_rounded = np.clip(np.round(nn_predictions_rescaled * 2) / 2, 0.5, 5.0)
true_ratings = test['rating']

mse_nn = mean_squared_error(true_ratings, nn_predictions_rescaled)
mae_nn = mean_absolute_error(true_ratings, nn_predictions_rescaled)
accuracy_nn = np.mean(np.abs(nn_predictions_rounded.flatten() - true_ratings) <= 0.5)
binary_true = (true_ratings >= 4.0).astype(int)
binary_predicted = (nn_predictions_rounded.flatten() >= 4.0).astype(int)
precision_nn = precision_score(binary_true, binary_predicted, zero_division=1)

print(f"✅ Neural Network Model Evaluation:")
print(f"📌 MSE: {mse_nn:.4f}, MAE: {mae_nn:.4f}, Accuracy: {accuracy_nn:.4f}, Precision: {precision_nn:.4f}")


[1m71562/71562[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m137s[0m 2ms/step
✅ Neural Network Model Evaluation:
📌 MSE: 0.6895, MAE: 0.6303, Accuracy: 0.6790, Precision: 0.7741


In [22]:
#################################################################################################################################
#  Evaluation and Metrics - SVD
#################################################################################################################################

# Evaluate SVD Model
svd_predictions = [svd.predict(uid, mid).est for uid, mid in zip(test['userId'], test['movieId'])]
mse_svd = mean_squared_error(true_ratings, svd_predictions)
mae_svd = mean_absolute_error(true_ratings, svd_predictions)
svd_predictions_rounded = np.clip(np.round(np.array(svd_predictions) * 2) / 2, 0.5, 5.0)
accuracy_svd = np.mean(np.abs(svd_predictions_rounded - true_ratings) <= 0.5)
binary_predicted_svd = (svd_predictions_rounded >= 4.0).astype(int)
precision_svd = precision_score(binary_true, binary_predicted_svd, zero_division=1)

print(f"✅ SVD Model Evaluation:")
print(f"📌 MSE: {mse_svd:.4f}, MAE: {mae_svd:.4f}, Accuracy: {accuracy_svd:.4f}, Precision: {precision_svd:.4f}")


✅ SVD Model Evaluation:
📌 MSE: 0.4640, MAE: 0.5234, Accuracy: 0.7576, Precision: 0.8459


In [23]:
# Test Recommendations
test_user_id = user_ids[5]
print("\n🚀 SVD Recommendations:")
print(recommend_movies_svd(test_user_id, movies_ratings, top_n=5))

print("\n🚀 Hybrid Recommendations:")
print(hybrid_recommendation(test_user_id, 'Thriller', movies_ratings, top_n=5))


🚀 SVD Recommendations:
   movieId                             title  predicted_rating release_year
0      318          The Million Dollar Hotel          4.517190       2000.0
1    44555  A Woman, a Gun and a Noodle Shop          4.418758       2009.0
2      527                Once Were Warriors          4.331689       1994.0
3     2324                       Local Color          4.299826       2006.0
4    42783                   Kissin' Cousins          4.292615       1964.0

🚀 Hybrid Recommendations:
    movieId                           title  predicted_rating release_year  \
12      NaN                    Pulp Fiction               0.0          NaN   
14      NaN                 The Dark Knight               0.0          NaN   
2       NaN                    Blade Runner               0.0          NaN   
3       NaN  Dawn of the Planet of the Apes               0.0          NaN   
11      NaN                          Psycho               0.0          NaN   

    popularity  hybrid_s

In [24]:
query = "A space adventure with aliens"
print("\n🚀 Query-Based Recommendations:")
print(recommend_movies(query, top_n=5))


🚀 Query-Based Recommendations:
                         title  similarity
21502       The Second Arrival    0.591429
8122   AVP: Alien vs. Predator    0.568284
16810                     Paul    0.564755
16901          Mars Needs Moms    0.563919
44154   Species: The Awakening    0.561503
