In [1]:
import pandas as pd

films = pd.read_csv('ml-latest-small/movies.csv')

print(films.head())

   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  


In [2]:
genres = films['genres'].str.split('|')

genres

0       [Adventure, Animation, Children, Comedy, Fantasy]
1                          [Adventure, Children, Fantasy]
2                                       [Comedy, Romance]
3                                [Comedy, Drama, Romance]
4                                                [Comedy]
                              ...                        
9737                 [Action, Animation, Comedy, Fantasy]
9738                         [Animation, Comedy, Fantasy]
9739                                              [Drama]
9740                                  [Action, Animation]
9741                                             [Comedy]
Name: genres, Length: 9742, dtype: object

In [3]:
movies = films.copy()
movies.loc[:, 'genres'] = genres

print(movies.head())

   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                              genres  
0  [Adventure, Animation, Children, Comedy, Fantasy]  
1                     [Adventure, Children, Fantasy]  
2                                  [Comedy, Romance]  
3                           [Comedy, Drama, Romance]  
4                                           [Comedy]  


In [4]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()

df = movies.copy()

df_binarized = mlb.fit_transform(df['genres'])

df_binarized

array([[0, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], shape=(9742, 20))

In [5]:
list(mlb.classes_)

['(no genres listed)',
 'Action',
 'Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'IMAX',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western']

In [6]:
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine

cosine_similarities = 1-pairwise_distances(df_binarized, metric='cosine')

print(cosine_similarities[:5, :5])

[[1.         0.77459667 0.31622777 0.25819889 0.4472136 ]
 [0.77459667 1.         0.         0.         0.        ]
 [0.31622777 0.         1.         0.81649658 0.70710678]
 [0.25819889 0.         0.81649658 1.         0.57735027]
 [0.4472136  0.         0.70710678 0.57735027 1.        ]]


In [7]:
movie = df[df['title'].str.contains('age of ultron', case=False, na=False)].index[0]
movie

cosine_similarities[8686, 9714]

np.float64(0.0)

In [8]:
def movie_recommender(movie_name):
    df.reset_index()
    movie = df[df['title'].str.contains(movie_name.title(), case=False, na=False)].index[0]
    similar_movies = cosine_similarities[movie].argsort()[::-1][1:6]
    recommendation = df.iloc[similar_movies]['title']
    return recommendation

movie_recommender(input())

6727         Street Kings (2008)
2475          Boiler Room (2000)
9374       The Accountant (2016)
9375             Imperium (2016)
6659    Cassandra's Dream (2007)
Name: title, dtype: object