In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np


In [2]:
data = pd.read_csv("movies.csv")

In [3]:
df = pd.DataFrame(data)

In [5]:
print(df.columns)

Index(['movieId', 'title', 'genres'], dtype='object')


In [6]:
# Removing Duplicates
data.drop_duplicates(inplace=True)

In [20]:
# Understanding the data for cleaning and pre-processing
genres = df['genres'].unique()

# print("Genres:")
# print(genres)

In [14]:
all_genres = df['genres'].str.split('|').explode().unique()

# Print the unique genres
print("Distinct Genres:")
print(all_genres)

Distinct Genres:
['Adventure' 'Animation' 'Children' 'Comedy' 'Fantasy' 'Romance' 'Drama'
 'Action' 'Crime' 'Thriller' 'Horror' 'Mystery' 'Sci-Fi' 'IMAX' 'War'
 'Musical' 'Documentary' 'Western' 'Film-Noir' '(no genres listed)']


In [15]:
# Will be deleting rows with no genre since other approaches do not work for predciting genre from title name
processed_data = df.drop(df[df['genres'] == '(no genres listed)'].index)

In [17]:
all_genres = processed_data['genres'].str.split('|').explode().unique()

# Print the unique genres
print("Distinct Genres:")
print(all_genres)

Distinct Genres:
['Adventure' 'Animation' 'Children' 'Comedy' 'Fantasy' 'Romance' 'Drama'
 'Action' 'Crime' 'Thriller' 'Horror' 'Mystery' 'Sci-Fi' 'IMAX' 'War'
 'Musical' 'Documentary' 'Western' 'Film-Noir']


In [36]:
# Convert the data into a matrix format
user_item_matrix = pd.get_dummies(processed_data['title']).groupby(processed_data['movieId']).max().reset_index()

# Set 'movieId' as the index (optional)
user_item_matrix.set_index('movieId', inplace=True)

# Fill missing values (NaN) with zeros (optional)
user_item_matrix = user_item_matrix.fillna(0)

# Print the resulting user-item matrix
# print(user_item_matrix)

In [37]:
# Split the dataset into training and testing sets
train_processed_data, test_processed_data = train_test_split(processed_data, test_size=0.2, random_state=42)

# Print the sizes of the training and testing sets
print(f"Training set size: {len(train_processed_data)}")
print(f"Testing set size: {len(test_processed_data)}")

Training set size: 8257
Testing set size: 2065


In [40]:
item_item_matrix = user_item_matrix.T

In [41]:
item_similarity = cosine_similarity(item_item_matrix)

In [47]:
item_similarity_df = pd.DataFrame(item_similarity, index=item_item_matrix.index, columns=item_item_matrix.index)

# Function to recommend movies for a user based on similarity
def recommend_movies(user_item_matrix, item_similarity, user_interactions):
    recommended_movies = []
    
    for movie in user_interactions:
        similar_movies = item_similarity_df[movie].sort_values(ascending=False)
        recommended_movies.extend(similar_movies.index)
    
    recommended_movies = list(set(recommended_movies) - set(user_interactions))
    return recommended_movies

# Example: Recommend movies for a user based on their interactions
user_interactions = [1, 2]
recommended_movies = recommend_movies(user_item_matrix, item_similarity, user_interactions)
print("Recommended movies:", recommended_movies)

Recommended movies: [98304, 3, 4, 5, 6, 7, 8, 32777, 10, 9, 11, 32781, 12, 32783, 65552, 13, 14, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 65567, 31, 32, 33, 34, 35, 36, 39, 40, 65577, 42, 41, 43, 44, 45, 46, 47, 65585, 48, 49, 50, 53, 52, 54, 55, 32825, 98361, 57, 65596, 58, 59, 63, 64, 65601, 65, 66, 68, 98373, 69, 70, 71, 72, 73, 75, 32844, 15, 74, 76, 78, 16, 79, 32851, 80, 32853, 86, 87, 88, 89, 90, 17, 92, 93, 32862, 65631, 94, 97, 95, 96, 100, 101, 102, 103, 104, 105, 65642, 32875, 107, 108, 110, 111, 112, 113, 32882, 114, 115, 116, 117, 118, 121, 122, 123, 32892, 65660, 124, 125, 126, 129, 32898, 131, 132, 135, 137, 138, 140, 141, 144, 145, 65682, 146, 147, 65685, 149, 150, 151, 152, 154, 153, 155, 156, 157, 158, 159, 160, 161, 163, 164, 165, 166, 162, 168, 169, 170, 171, 172, 173, 174, 32943, 175, 176, 177, 178, 179, 180, 181, 182, 183, 185, 32954, 98491, 131258, 184, 186, 187, 188, 190, 191, 65731, 193, 194, 32966, 198, 195, 196, 199, 200, 65740, 201, 202, 207, 208, 209

In [51]:
# Create a DataFrame for item similarity
item_similarity_df = pd.DataFrame(item_similarity, index=item_item_matrix.index, columns=item_item_matrix.index)



In [58]:
genres_matrix = df['genres'].str.get_dummies(sep='|')

# Calculate movie similarity based on genres (cosine similarity)
movie_similarity = cosine_similarity(genres_matrix, genres_matrix)

def recommend_movies_by_genre(movie_id):
    # Get similarity score
    similar_movies = list(enumerate(movie_similarity[movie_id]))
    
    # Sort movies by similarity (descending)
    similar_movies = sorted(similar_movies, key=lambda x: x[1], reverse=True)
    
    # Get top recommendations (excluding the input movie itself)
    top_recommendations = similar_movies[1:6]  # Get the top 5 recommendations
    
    # Get the movie IDs of recommended movies
    recommended_movie_ids = [movie[0] for movie in top_recommendations]
    
    return recommended_movie_ids

movie_id = 1  
recommended_movies = recommend_movies_by_genre(movie_id)

recommended_movie_titles = df.loc[df['movieId'].isin(recommended_movies)]['title']

print("Recommended movies for Movie ID", movie_id, ":")
print(recommended_movie_titles)

Recommended movies for Movie ID 1 :
51                                  Georgia (1995)
100     Rumble in the Bronx (Hont faan kui) (1995)
1284                            Money Talks (1997)
Name: title, dtype: object
