In [29]:
#Import Libraries
import pandas as pd
import numpy as np
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity

In [30]:
#Load and Merge Dataset
ratings = pd.read_csv('ratings.csv')
movies = pd.read_csv('movies.csv')

df = pd.merge(ratings, movies, on='movieId')

In [31]:
#Encode Users and Movies
user_encoder = LabelEncoder()
movie_encoder = LabelEncoder()
df_small = df.copy()
df['user_idx'] = user_encoder.fit_transform(df['userId'])
df['movie_idx'] = movie_encoder.fit_transform(df['title'])

In [32]:
#Create Sparse Matrix
ratings_sparse = csr_matrix((df['rating'], (df['user_idx'], df['movie_idx'])))

In [33]:
#Apply SVD
n_components = 100
svd = TruncatedSVD(n_components=n_components, random_state=42)

user_factors = svd.fit_transform(ratings_sparse)
movie_factors = svd.components_.T # Transpose for later dot-product

In [34]:
import difflib

# Function to Recommend Similar Movies Based on Input Movie Name
def recommend_similar_movies(movie_name, movie_encoder, movie_factors, top_n=10):
    # Use fuzzy matching to get closest match
    all_titles = movie_encoder.classes_  # all known movie titles from encoder
    close_matches = difflib.get_close_matches(movie_name, all_titles, n=1, cutoff=0.5)

    if not close_matches:
        print(f"No close match found for '{movie_name}' in the dataset.")
        return pd.DataFrame()

    best_match = close_matches[0]
    print(f"Closest match found: '{best_match}'")

    try:
        movie_idx = movie_encoder.transform([best_match])[0]
    except:
        print(f"Matched movie '{best_match}' could not be encoded.")
        return pd.DataFrame()

    # Vector of the input movie
    input_vector = movie_factors[movie_idx].reshape(1, -1)

    # Cosine similarity with all movies
    similarities = cosine_similarity(input_vector, movie_factors).flatten()

    # Get top-N similar movie indices (excluding the input movie)
    similar_indices = similarities.argsort()[::-1][1:top_n + 1]
    similar_titles = movie_encoder.inverse_transform(similar_indices)
    similar_scores = similarities[similar_indices]

    # Return results
    return pd.DataFrame({
        'title': similar_titles,
        'similarity_score': similar_scores
    })


In [35]:
# Evaluate Model (Optional but good for validation)
from sklearn.model_selection import train_test_split

# Filter top users to reduce sparsity
top_users = df['userId'].value_counts().head(500).index
df_small = df[df['userId'].isin(top_users)]

# Re-encode after filtering
df_small.loc[:, 'user_idx'] = user_encoder.fit_transform(df_small['userId'])
df_small.loc[:, 'movie_idx'] = movie_encoder.fit_transform(df_small['title'])

# Train-test split
train_df, test_df = train_test_split(df_small, test_size=0.2, random_state=42)

# Train matrix
train_matrix = csr_matrix(
    (train_df['rating'], (train_df['user_idx'], train_df['movie_idx'])),
    shape=(df_small['user_idx'].nunique(), df_small['movie_idx'].nunique())
)

# Final SVD for recommendation based on all ratings
final_matrix = csr_matrix((df['rating'], (df['user_idx'], df['movie_idx'])))
svd_final = TruncatedSVD(n_components=100, random_state=42)
user_factors = svd_final.fit_transform(final_matrix)
movie_factors = svd_final.components_.T  # ✅ Now movie_factors is defined again

# Predict ratings for test set
predictions, actuals = [], []
for _, row in test_df.iterrows():
    u, m = row['user_idx'], row['movie_idx']
    if u < user_factors_eval.shape[0] and m < movie_factors_eval.shape[0]:
        pred = np.dot(user_factors_eval[u], movie_factors_eval[m])
        predictions.append(pred)
        actuals.append(row['rating'])

# Evaluation metrics
rmse = np.sqrt(mean_squared_error(actuals, predictions))
mae = mean_absolute_error(actuals, predictions)
print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")

RMSE: 3.0758
MAE: 2.8555


In [36]:
input_movie = input("Enter a movie title: ")
recommendations = recommend_similar_movies(input_movie, movie_encoder, movie_factors, top_n=10)
print("\nTop similar movies:")
print(recommendations)

Enter a movie title:  Toy Story


Closest match found: 'Toy Story (1995)'

Top similar movies:
                                               title  similarity_score
0                                  Ghost Town (2008)          0.508477
1                     Capturing the Friedmans (2003)          0.494994
2                                House Arrest (1996)          0.465743
3            Scooby-Doo 2: Monsters Unleashed (2004)          0.448779
4                            House Bunny, The (2008)          0.442754
5                                    Stroszek (1977)          0.442634
6                       Children of the Night (1991)          0.442085
7                                   September (1987)          0.438661
8          Escape from the Planet of the Apes (1971)          0.427087
9  Return to Snowy River (a.k.a. The Man From Sno...          0.426393
