In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Reading ratings file
# Ignore the timestamp column

# Reading movies file
df = pd.read_csv('movies_2019.csv',usecols=['name', 'genre'])

In [3]:
# Break up the big genre string into a string array
df['genre'] = df['genre'].str.split('|')
# Convert genres to string value
df['genre'] = df['genre'].fillna("").astype('str')
df

Unnamed: 0,genre,name
0,"['Drama, Sci-Fi, Thriller ']",Glass
1,"['Action, Adventure, Family ']",The Kid Who Would Be King
2,"['Action, Crime, Drama ']",Miss Bala
3,"['Animation, Action, Adventure ']",The Lego Movie 2: The Second Part
4,"['Comedy, Fantasy, Romance ']",What Men Want
5,"['Action, Adventure, Sci-Fi ']",Alita: Battle Angel
6,"['Biography, Comedy, Drama ']",Fighting with My Family
7,"['Comedy, Fantasy, Romance ']",Isn't It Romantic
8,"['Drama, Horror, Mystery ']",Happy Death Day 2U
9,"['Animation, Action, Adventure ']",How to Train Your Dragon: The Hidden World


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(df['genre'])
tfidf_matrix.shape

(103, 70)

In [6]:
from sklearn.metrics.pairwise import linear_kernel
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
cosine_sim[:4, :4]

array([[1.        , 0.        , 0.0607877 , 0.        ],
       [0.        , 1.        , 0.09284477, 0.35346896],
       [0.0607877 , 0.09284477, 1.        , 0.09674378],
       [0.        , 0.35346896, 0.09674378, 1.        ]])

In [7]:
# Build a 1-dimensional array with movie titles
titles = df['name']
indices = pd.Series(df.index, index=df['name'])

# Function that get movie recommendations based on the cosine similarity score of movie genres
def genre_recommendations(title):
#     newtitle = title + '\xa0'
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:21]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [9]:
genre_recommendations('Captain Marvel').head(20)

11                                    Captain Marvel
27                                 Avengers: Endgame
40                                      Dark Phoenix
48                         Spider-Man: Far from Home
78                             Terminator: Dark Fate
98                                   Lucy in the Sky
36                                        Brightburn
70                                        Gemini Man
68                                Doom: Annihilation
0                                              Glass
20                                           Shazam!
30                         Pokémon Detective Pikachu
42                       Men in Black: International
52             Fast & Furious Presents: Hobbs & Shaw
82                                  Charlie's Angels
90    Untitled Jumanji: Welcome to the Jungle Sequel
3                  The Lego Movie 2: The Second Part
9         How to Train Your Dragon: The Hidden World
94                                 Spies in Di