In [37]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [38]:
# Load the movie catalogue from disk and preview its first five rows
# (the preview shows the movieId, title and genres columns).
movies_df = pd.read_csv('data/movies.csv')
movies_df.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [39]:
# Load the user ratings, reading only the columns that are used downstream.
# The CSV's `timestamp` column is skipped at read time: previously it was
# date-parsed during the read and then dropped on the very next line, which
# was wasted work.
ratings_df = pd.read_csv('data/ratings.csv', usecols=['userId', 'movieId', 'rating'])
ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,1,16,4.0
1,1,24,1.5
2,1,32,4.0
3,1,47,4.0
4,1,50,4.0


In [40]:
# Join each rating to its movie's metadata on movieId. The right join keeps
# every row of movies_df, so a movie without any rating still appears (its
# rating fields are left missing). ratings_df has no timestamp column at this
# point, so nothing has to be dropped here.
combined_df = ratings_df.merge(movies_df, how='right', on='movieId')

In [41]:
# Collect the individual genre labels found in the 'genres' column.
# genres_list: one list of labels per distinct '|'-separated genres string.
genres_list = [combo.split('|') for combo in combined_df['genres'].unique()]

# unique_genres: every label exactly once, in order of first appearance
# (dict keys preserve insertion order).
unique_genres = list(dict.fromkeys(
    label for labels in genres_list for label in labels
))

In [42]:
# Plain Python list with each movie's raw genres string, in movies_df row order.
all_genres = list(movies_df['genres'])

In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build one TF-IDF row per movie from its '|'-separated genres string.
#
# token_pattern=r'[^|]+' makes every run of characters between pipes a single
# token, so each genre label becomes exactly one feature. The vectorizer's
# default word pattern only keeps runs of two or more word characters, which
# breaks any label containing a hyphen or a space into several unrelated
# tokens; an English stop-word list could additionally throw parts of such a
# label away. Genre labels are categorical values, not prose, so no stop-word
# filtering is applied.
tfidf_model = TfidfVectorizer(analyzer='word', token_pattern=r'[^|]+')
tfidf_values = tfidf_model.fit_transform(movies_df['genres'])

In [44]:
from sklearn.metrics.pairwise import linear_kernel

# Dot product between every pair of rows of tfidf_values; with the second
# argument omitted, linear_kernel compares the matrix against itself.
# With TfidfVectorizer's default L2 row normalisation these dot products are
# the cosine similarities between movies.
cosine_sim = linear_kernel(tfidf_values)

In [45]:
# Titles copied into a NumPy array so that a sequence of row positions can
# index it directly.
movie_titles = movies_df['title'].to_numpy(copy=True)
def recommend_movie(movie_title, top_n=10, titles=None, similarity=None):
    """Return the titles of the movies most similar to ``movie_title``.

    Parameters
    ----------
    movie_title : str
        Exact title to look up, e.g. ``'Jumanji (1995)'``.
    top_n : int, default 10
        Maximum number of recommendations to return.
    titles : array-like of str, optional
        Titles aligned with the rows/columns of ``similarity``. Defaults to
        the notebook-level ``movie_titles`` array.
    similarity : 2-D array-like, optional
        Square matrix of pairwise similarity scores. Defaults to the
        notebook-level ``cosine_sim`` matrix.

    Returns
    -------
    numpy.ndarray
        Up to ``top_n`` titles ordered from most to least similar. Movies
        with equal scores keep their original row order. The queried movie
        itself is never part of the result.

    Raises
    ------
    ValueError
        If ``movie_title`` does not occur in ``titles``.
    """
    titles = movie_titles if titles is None else np.asarray(titles)
    sim_matrix = cosine_sim if similarity is None else np.asarray(similarity)

    matches = np.flatnonzero(titles == movie_title)
    if matches.size == 0:
        # Previously an unknown title surfaced as an UnboundLocalError.
        raise ValueError(f"Movie title not found: {movie_title!r}")
    # If a title occurs more than once, the last occurrence is used, matching
    # the behaviour of the earlier full scan over all titles.
    index = matches[-1]

    scores = np.asarray(sim_matrix[index])
    # Stable sort on the negated scores: descending similarity, ties in row order.
    ranked = np.argsort(-scores, kind='stable')
    # Remove the queried movie by position instead of assuming it sorts first;
    # a movie with an identical score can legitimately rank ahead of it.
    ranked = ranked[ranked != index][:top_n]
    return titles[ranked]

In [46]:
# Example query: look up the recommendations for Jumanji, then print them.
jumanji_recommendations = recommend_movie('Jumanji (1995)')
print(f"Because you liked Jumanji, we recommend the following movies: \n{jumanji_recommendations}")

Because you liked Jumanji, we recommend the following movies: 
['Indian in the Cupboard, The (1995)' 'NeverEnding Story III, The (1994)'
 'Escape to Witch Mountain (1975)'
 "Darby O'Gill and the Little People (1959)" 'Return to Oz (1985)'
 'NeverEnding Story, The (1984)'
 'NeverEnding Story II: The Next Chapter, The (1990)'
 'Santa Claus: The Movie (1985)'
 "Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)"
 'Magic in the Water (1995)']


In [47]:
# matrix = ratings_df.to_numpy()
# matrix = matrix.astype(float)
# matrix

array([[1.00000e+00, 1.60000e+01, 4.00000e+00],
       [1.00000e+00, 2.40000e+01, 1.50000e+00],
       [1.00000e+00, 3.20000e+01, 4.00000e+00],
       ...,
       [6.68000e+02, 1.43385e+05, 4.00000e+00],
       [6.68000e+02, 1.44976e+05, 2.50000e+00],
       [6.68000e+02, 1.48626e+05, 4.50000e+00]])

In [56]:
# Features are the (userId, movieId) pair of each rating; the target is the
# rating itself, kept as a one-column DataFrame.
X = ratings_df.loc[:, ['userId', 'movieId']]
y = ratings_df.loc[:, ['rating']]
# Print both frames, one after the other.
print(X, y, sep='\n')

        userId  movieId
0            1       16
1            1       24
2            1       32
3            1       47
4            1       50
...        ...      ...
105334     668   142488
105335     668   142507
105336     668   143385
105337     668   144976
105338     668   148626

[105339 rows x 2 columns]
        rating
0          4.0
1          1.5
2          4.0
3          4.0
4          4.0
...        ...
105334     4.0
105335     3.5
105336     4.0
105337     2.5
105338     4.5

[105339 rows x 1 columns]


In [57]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

In [58]:
# Split the ratings into train and test sets (scikit-learn's default holds out
# 25% for testing). random_state is fixed so the shuffle, and every result
# derived from this split, is identical on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)