### Load Data

In [1]:
import pandas as pd

In [2]:
links = pd.read_csv('/Users/gabrielwarner/Downloads/ml-latest-small/links.csv')
links

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0
...,...,...,...
9737,193581,5476944,432131.0
9738,193583,5914996,445030.0
9739,193585,6397426,479308.0
9740,193587,8391976,483455.0


In [3]:
movies = pd.read_csv('/Users/gabrielwarner/Downloads/ml-latest-small/movies.csv')

In [4]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
ratings = pd.read_csv('/Users/gabrielwarner/Downloads/ml-latest-small/ratings.csv')


In [6]:
tags = pd.read_csv('/Users/gabrielwarner/Downloads/ml-latest-small/tags.csv')

In [7]:
ratings = ratings.drop('timestamp', axis=1)

In [8]:
ratings.shape

(100836, 3)

In [9]:
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


### Merge Data Frames

In [10]:
new = pd.merge(links, movies, on='movieId')

In [11]:
new_2 = pd.merge(new, ratings, on='movieId')

In [12]:
new_3 = pd.merge(new_2, tags, on='movieId')

### Preprocessing

In [13]:
# Check for null values
new_3.isnull().sum()

movieId      0
imdbId       0
tmdbId       0
title        0
genres       0
userId_x     0
rating       0
userId_y     0
tag          0
timestamp    0
dtype: int64

In [14]:
# how many unique values in movieId column
new_3['movieId'].nunique()

1554

In [15]:
# replace | with , in the genres column
new_3['genres'] = new_3['genres'].str.replace('|',' , ')

In [16]:
#df = new_3.drop_duplicates(subset=['movieId'])

In [17]:
new_3

Unnamed: 0,movieId,imdbId,tmdbId,title,genres,userId_x,rating,userId_y,tag,timestamp
0,1,114709,862.0,Toy Story (1995),"Adventure , Animation , Children , Comedy , Fa...",1,4.0,336,pixar,1139045764
1,1,114709,862.0,Toy Story (1995),"Adventure , Animation , Children , Comedy , Fa...",1,4.0,474,pixar,1137206825
2,1,114709,862.0,Toy Story (1995),"Adventure , Animation , Children , Comedy , Fa...",1,4.0,567,fun,1525286013
3,1,114709,862.0,Toy Story (1995),"Adventure , Animation , Children , Comedy , Fa...",5,4.0,336,pixar,1139045764
4,1,114709,862.0,Toy Story (1995),"Adventure , Animation , Children , Comedy , Fa...",5,4.0,474,pixar,1137206825
...,...,...,...,...,...,...,...,...,...,...
233208,187595,3778644,348350.0,Solo: A Star Wars Story (2018),"Action , Adventure , Children , Sci-Fi",586,5.0,62,star wars,1528934552
233209,193565,1636780,71172.0,Gintama: The Movie (2010),"Action , Animation , Comedy , Sci-Fi",184,3.5,184,anime,1537098582
233210,193565,1636780,71172.0,Gintama: The Movie (2010),"Action , Animation , Comedy , Sci-Fi",184,3.5,184,comedy,1537098587
233211,193565,1636780,71172.0,Gintama: The Movie (2010),"Action , Animation , Comedy , Sci-Fi",184,3.5,184,gintama,1537098603


In [18]:
# convert df to a csv file
new_3.to_csv('/Users/gabrielwarner/Data-Science/Projects/phase_4/movies_2.csv')

### Explode genres column

In [19]:
# split genre column based on ,
new_3['new_genres'] = new_3['genres'].str.split(',', 10)

In [20]:
# explode new_genres
df = new_3.explode('new_genres')

In [21]:
# drop genres from df
df_2 = df.drop(columns=['genres'])

In [22]:
df_2.head()

Unnamed: 0,movieId,imdbId,tmdbId,title,userId_x,rating,userId_y,tag,timestamp,new_genres
0,1,114709,862.0,Toy Story (1995),1,4.0,336,pixar,1139045764,Adventure
0,1,114709,862.0,Toy Story (1995),1,4.0,336,pixar,1139045764,Animation
0,1,114709,862.0,Toy Story (1995),1,4.0,336,pixar,1139045764,Children
0,1,114709,862.0,Toy Story (1995),1,4.0,336,pixar,1139045764,Comedy
0,1,114709,862.0,Toy Story (1995),1,4.0,336,pixar,1139045764,Fantasy


### OHE title

In [23]:
one_hot = pd.get_dummies(df_2['title'])

In [24]:
df = df_2.drop('title',axis = 1)

In [25]:
df = pd.concat([df, one_hot], axis=1)

In [26]:
df.shape

(765271, 1563)

In [27]:
one_hot.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 765271 entries, 0 to 233212
Columns: 1554 entries, (500) Days of Summer (2009) to eXistenZ (1999)
dtypes: uint8(1554)
memory usage: 1.1 GB


### OHE tag

In [28]:
one_hot_tag = pd.get_dummies(df['tag'])

In [29]:
df = df.drop('tag',axis = 1)

In [30]:
df = pd.concat([df, one_hot_tag], axis=1)

In [31]:
one_hot_tag.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 765271 entries, 0 to 233212
Columns: 1584 entries, "artsy" to zombies
dtypes: uint8(1584)
memory usage: 1.1 GB


### OHE new_genres

In [32]:
one_hot_genre = pd.get_dummies(df['new_genres'])

In [33]:
df =df.drop('new_genres',axis = 1)

In [34]:
df = pd.concat([df, one_hot_genre], axis=1)

### Change Data Frames to suprise DataFrame

In [37]:
import surprise

In [38]:
from surprise.model_selection import train_test_split
# from sklearn.model_selection import train_test_split

In [39]:
ratings.rating.describe()

count    100836.000000
mean          3.501557
std           1.042529
min           0.500000
25%           3.000000
50%           3.500000
75%           4.000000
max           5.000000
Name: rating, dtype: float64

In [40]:
ratings.shape

(100836, 3)

In [41]:
# Reader is used to parse a file containing ratings
reader = surprise.Reader(rating_scale = (4., 5.))

In [42]:
# convert ratings to a surprise data frame
data = surprise.Dataset.load_from_df(ratings, reader)

In [43]:
# Train test split to get a trainset and testset (note that there is no X_train, y_train, X_test, y_test)
trainset, testset = train_test_split(data, test_size=0.2)

### KNNS Basic model

In [44]:
from surprise.prediction_algorithms import knns
from surprise.similarities import cosine, msd, pearson
from surprise import accuracy

In [45]:
# Initiate knn model and fir to trainset
basic = knns.KNNBasic()
basic.fit(trainset)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7fbfc3411460>

In [46]:
# make predictions wih the model
predictions = basic.test(testset)

In [47]:
# Get the RMSE score
print(accuracy.rmse(predictions))

RMSE: 1.1582
1.1582144530999001


### KNN means model

In [48]:
sim_pearson = {'name':'pearson', 'user_based':False}
knn_means = knns.KNNWithMeans(sim_options=sim_pearson)
knn_means.fit(trainset)
predictions = knn_means.test(testset)
print(accuracy.rmse(predictions))

Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 1.1574
1.1573620205145436


### Suprise with grid search

In [49]:
from sklearn.neighbors import KNeighborsClassifier

In [50]:
from surprise.prediction_algorithms import SVD
from surprise.model_selection import GridSearchCV

# create parameter grid 
param_grid = {'n_factors':[20, 50, 75, 100],'n_epochs': [5, 6, 7, 8, 9, 10, 11], 'lr_all': [0.002, .003, .004, 0.005],
               'reg_all': [.02, 0.4, .5, 0.6]}
# initiate model
gs_model = GridSearchCV(SVD,param_grid=param_grid,n_jobs = -1,joblib_verbose=5)
# fit model to data
gs_model.fit(data)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    9.5s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   25.1s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:   50.1s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 866 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 1136 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done 1442 tasks      | elapsed:  6.9min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:  9.4min
[Parallel(n_jobs=-1)]: Done 2162 tasks      | elapsed: 12.6min
[Parallel(n_jobs=-1)]: Done 2240 out of 2240 | elapsed: 13.4min finished


In [51]:
# initiate model with best parameters
svd = SVD(n_factors=100, n_epochs=10, lr_all=0.005, reg_all=0.4)
# fit model to trainset
svd.fit(trainset)
# make predictions with test set to get accuracy
predictions = svd.test(testset)
print(accuracy.rmse(predictions))

RMSE: 1.1551
1.1550838067282345


### KNN with GridSearchCV

In [52]:
from surprise import KNNWithMeans
from surprise.model_selection import GridSearchCV

In [53]:

sim_options = {
    "name": ["msd", "cosine"],
    "min_support": [2, 3, 4, 5],
    "user_based": [False, True],
}

param_grid = {"sim_options": sim_options}

gs_1 = GridSearchCV(KNNWithMeans, param_grid, measures=["rmse", "mae"], cv=3)
gs_1.fit(data)

print(gs_1.best_score["rmse"])
print(gs_1.best_params["rmse"])

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

### Grid Search SVD

In [54]:
# param_grid = {
        #"n_epochs": [5, 10, 15, 20, 30, 40, 50, 100],
        # "lr_all": [0.001, 0.002, 0.005],
       # "reg_all": [0.02, 0.08, 0.4, 0.6]
#}

# smaller grid for testing
param_grid = {
    "n_epochs": [15, 20, 25],
    "lr_all": [0.002, 0.005],
    "reg_all": [0.02]
}
gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], refit=True, cv=5)

gs.fit(data)

training_parameters = gs.best_params["rmse"]

print("BEST RMSE: \t", gs.best_score["rmse"])
print("BEST MAE: \t", gs.best_score["mae"])
print("BEST params: \t", gs.best_params["rmse"])

BEST RMSE: 	 1.1493754390232718
BEST MAE: 	 0.837759346097022
BEST params: 	 {'n_epochs': 25, 'lr_all': 0.002, 'reg_all': 0.02}


# Make the predictions

In [55]:
def get_movie_id(movie_title):
    
    # Filter Movies dataframe for movie_title
    movie_title_df = movies[['title', 'movieId']]
    # Get corresponding id from row
    #movie_id = movie_title_df.query('title==movie_title')['movieId']
  #  movie_id = movie_title_df['movieId']
    movie_id = movies[movies["title"] == movie_title]['movieId'].values[0]

    # Return movie_id of movie_title
    return movie_id


In [56]:
get_movie_id("Waiting to Exhale (1995)")

4

In [57]:
t = pd.DataFrame([[1, 5, 4]], columns=['u', 'i', 'rating'])
t

Unnamed: 0,u,i,rating
0,1,5,4


# Get recomended movies

In [58]:
info = {'u':[1],
       'i':[5],
       'rating':[4]}
data = surprise.Dataset.load_from_df(t, reader)

In [59]:
gs(data)

TypeError: 'GridSearchCV' object is not callable

In [None]:
def model_function(u, i, rating):
#  user ranks 5 movies
    info = {'u':[u],
       'i':[i],
       'rating':[rating]}
# convert to suprise df
    data = surprise.Dataset.load_from_df(info, reader)
    print(data)
# run through model
    gs(data)


In [None]:
model_function(1,5,4)

In [None]:
movies_id = get_movie_id('Father of the Bride Part II (1995)')
# pass movie_id and a user_id to model to make recs
gs.test(testset)
gs.predict()