In [3]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
movies = pd.read_csv("movies.csv")  # contains movieId, title, genres

print("Sample movies data:")
movies.head()

Sample movies data:


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [7]:
# One-hot encode genres
# Split genres into separate columns
genres_split = movies['genres'].str.get_dummies(sep='|')
print(genres_split)

# Combine movieId + title + genre indicators
movies_content = pd.concat([movies[['movieId', 'title']], \
                            genres_split], axis=1)

print("\nMovies with one-hot encoded genres:")
print(movies_content.head())

      (no genres listed)  Action  Adventure  Animation  Children  Comedy  \
0                      0       0          1          1         1       1   
1                      0       0          1          0         1       0   
2                      0       0          0          0         0       1   
3                      0       0          0          0         0       1   
4                      0       0          0          0         0       1   
...                  ...     ...        ...        ...       ...     ...   
9737                   0       1          0          1         0       1   
9738                   0       0          0          1         0       1   
9739                   0       0          0          0         0       0   
9740                   0       1          0          1         0       0   
9741                   0       0          0          0         0       1   

      Crime  Documentary  Drama  Fantasy  Film-Noir  Horror  IMAX  Musical  \
0        

In [9]:
# Compute cosine similarity on genre vectors 
similarity_matrix = cosine_similarity(genres_split)

# Convert to DataFrame for readability
similarity_df = pd.DataFrame(similarity_matrix, 
                             index=movies['title'], 
                             columns=movies['title'])
similarity_df

title,Toy Story (1995),Jumanji (1995),Grumpier Old Men (1995),Waiting to Exhale (1995),Father of the Bride Part II (1995),Heat (1995),Sabrina (1995),Tom and Huck (1995),Sudden Death (1995),GoldenEye (1995),...,Gintama: The Movie (2010),anohana: The Flower We Saw That Day - The Movie (2013),Silver Spoon (2014),Love Live! The School Idol Movie (2015),Jon Stewart Has Left the Building (2015),Black Butler: Book of the Atlantic (2017),No Game No Life: Zero (2017),Flint (2017),Bungo Stray Dogs: Dead Apple (2018),Andrew Dice Clay: Dice Rules (1991)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Toy Story (1995),1.000000,0.774597,0.316228,0.258199,0.447214,0.000000,0.316228,0.632456,0.000000,0.258199,...,0.447214,0.316228,0.316228,0.447214,0.0,0.670820,0.774597,0.00000,0.316228,0.447214
Jumanji (1995),0.774597,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.816497,0.000000,0.333333,...,0.000000,0.000000,0.000000,0.000000,0.0,0.288675,0.333333,0.00000,0.000000,0.000000
Grumpier Old Men (1995),0.316228,0.000000,1.000000,0.816497,0.707107,0.000000,1.000000,0.000000,0.000000,0.000000,...,0.353553,0.000000,0.500000,0.000000,0.0,0.353553,0.408248,0.00000,0.000000,0.707107
Waiting to Exhale (1995),0.258199,0.000000,0.816497,1.000000,0.577350,0.000000,0.816497,0.000000,0.000000,0.000000,...,0.288675,0.408248,0.816497,0.000000,0.0,0.288675,0.333333,0.57735,0.000000,0.577350
Father of the Bride Part II (1995),0.447214,0.000000,0.707107,0.577350,1.000000,0.000000,0.707107,0.000000,0.000000,0.000000,...,0.500000,0.000000,0.707107,0.000000,0.0,0.500000,0.577350,0.00000,0.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Black Butler: Book of the Atlantic (2017),0.670820,0.288675,0.353553,0.288675,0.500000,0.288675,0.353553,0.000000,0.500000,0.288675,...,0.750000,0.353553,0.353553,0.500000,0.0,1.000000,0.866025,0.00000,0.707107,0.500000
No Game No Life: Zero (2017),0.774597,0.333333,0.408248,0.333333,0.577350,0.000000,0.408248,0.000000,0.000000,0.000000,...,0.577350,0.408248,0.408248,0.577350,0.0,0.866025,1.000000,0.00000,0.408248,0.577350
Flint (2017),0.000000,0.000000,0.000000,0.577350,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.707107,0.707107,0.000000,0.0,0.000000,0.000000,1.00000,0.000000,0.000000
Bungo Stray Dogs: Dead Apple (2018),0.316228,0.000000,0.000000,0.000000,0.000000,0.408248,0.000000,0.000000,0.707107,0.408248,...,0.707107,0.500000,0.000000,0.707107,0.0,0.707107,0.408248,0.00000,1.000000,0.000000


In [11]:
#  Recommend movies similar to a given title 
def recommend_movies(movie_title, top_n=5):
    if movie_title not in similarity_df.index:
        print(f"{movie_title} not found in dataset.")
        return []
    # Get similarity scores
    sim_scores = similarity_df[movie_title].sort_values(
                                                ascending=False)
    # Exclude itself and pick top-N
    recommended = sim_scores.iloc[1:top_n+1]
    return recommended

In [13]:
# Example: Recommend movies similar to "Toy Story (1995)"
print("\nRecommendations for 'Toy Story (1995)':")
print(recommend_movies("Toy Story (1995)", top_n=5))


Recommendations for 'Toy Story (1995)':
title
Adventures of Rocky and Bullwinkle, The (2000)    1.0
Emperor's New Groove, The (2000)                  1.0
Monsters, Inc. (2001)                             1.0
Tale of Despereaux, The (2008)                    1.0
Wild, The (2006)                                  1.0
Name: Toy Story (1995), dtype: float64
