### Load Data

In [1]:
import pandas as pd

In [2]:
links = pd.read_csv('/Users/gabrielwarner/Downloads/ml-latest-small/links.csv')

In [3]:
movies = pd.read_csv('/Users/gabrielwarner/Downloads/ml-latest-small/movies.csv')

In [4]:
ratings = pd.read_csv('/Users/gabrielwarner/Downloads/ml-latest-small/ratings.csv')

In [5]:
tags = pd.read_csv('/Users/gabrielwarner/Downloads/ml-latest-small/tags.csv')

In [6]:
ratings = ratings.head().drop('timestamp', axis=1)

In [7]:
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


### Merge Data Frames

In [8]:
new = pd.merge(links, movies)

In [9]:
new_2 = pd.merge(new, ratings)

In [10]:
# Drop movieId and timestamp
tags_2 = tags.drop(columns=['movieId', 'timestamp'])

In [11]:
frames = [new_2, tags_2]
new_3 = pd.concat(frames)
#new_3 = pd.merge(new_2, tags)


In [12]:
#df_pre = pd.merge(new_2, tags_2)

### Preprocessing

In [13]:
# Check for null values
new_3.isnull().sum()

movieId    3683
imdbId     3683
tmdbId     3683
title      3683
genres     3683
userId        0
rating     3683
tag           5
dtype: int64

In [14]:
df = new_3.fillna('')
#df = df.apply(lambda x: pd.Series(x.dropna().values))
df

Unnamed: 0,movieId,imdbId,tmdbId,title,genres,userId,rating,tag
0,1,114709,862,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4,
1,3,113228,15602,Grumpier Old Men (1995),Comedy|Romance,1,4,
2,6,113277,949,Heat (1995),Action|Crime|Thriller,1,4,
3,47,114369,807,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,1,5,
4,50,114814,629,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,1,5,
...,...,...,...,...,...,...,...,...
3678,,,,,,606,,for katie
3679,,,,,,606,,austere
3680,,,,,,610,,gun fu
3681,,,,,,610,,heroic bloodshed


In [15]:
# Drop null values
#df = df_pre.dropna()

In [16]:
# Check null values
df.isnull().sum()

movieId    0
imdbId     0
tmdbId     0
title      0
genres     0
userId     0
rating     0
tag        0
dtype: int64

In [17]:
# replace | with , in the genres column
df['genres'] = df['genres'].str.replace('|',' , ')

In [18]:
df = df.drop_duplicates(subset=['movieId'])

In [19]:
# convert df to a csv file
df.to_csv('/Users/gabrielwarner/Data-Science/Projects/phase_4/movies_2.csv')

### Explode genres column

In [20]:
df['new_genres'] = df['genres'].str.split(',', 10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['new_genres'] = df['genres'].str.split(',', 10)


In [21]:
df = df.explode('new_genres')

In [22]:
# drop genres from df
df = df.drop(columns=['genres'])

In [23]:
df

Unnamed: 0,movieId,imdbId,tmdbId,title,userId,rating,tag,new_genres
0,1.0,114709.0,862.0,Toy Story (1995),1,4.0,,Adventure
0,1.0,114709.0,862.0,Toy Story (1995),1,4.0,,Animation
0,1.0,114709.0,862.0,Toy Story (1995),1,4.0,,Children
0,1.0,114709.0,862.0,Toy Story (1995),1,4.0,,Comedy
0,1.0,114709.0,862.0,Toy Story (1995),1,4.0,,Fantasy
1,3.0,113228.0,15602.0,Grumpier Old Men (1995),1,4.0,,Comedy
1,3.0,113228.0,15602.0,Grumpier Old Men (1995),1,4.0,,Romance
2,6.0,113277.0,949.0,Heat (1995),1,4.0,,Action
2,6.0,113277.0,949.0,Heat (1995),1,4.0,,Crime
2,6.0,113277.0,949.0,Heat (1995),1,4.0,,Thriller


### OHE title

In [24]:
one_hot = pd.get_dummies(df['title'])

In [25]:
df = df.drop('title',axis = 1)

In [26]:
df = pd.concat([df, one_hot], axis=1)

In [27]:
df.shape

(16, 13)

In [28]:
one_hot.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16 entries, 0 to 0
Data columns (total 6 columns):
 #   Column                       Non-Null Count  Dtype
---  ------                       --------------  -----
 0                                16 non-null     uint8
 1   Grumpier Old Men (1995)      16 non-null     uint8
 2   Heat (1995)                  16 non-null     uint8
 3   Seven (a.k.a. Se7en) (1995)  16 non-null     uint8
 4   Toy Story (1995)             16 non-null     uint8
 5   Usual Suspects, The (1995)   16 non-null     uint8
dtypes: uint8(6)
memory usage: 224.0 bytes


### OHE tag

In [29]:
one_hot_tag = pd.get_dummies(df['tag'])

In [30]:
df = df.drop('tag',axis = 1)

In [31]:
df = pd.concat([df, one_hot_tag], axis=1)

In [32]:
one_hot_tag.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16 entries, 0 to 0
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0           16 non-null     uint8
 1   funny   16 non-null     uint8
dtypes: uint8(2)
memory usage: 160.0 bytes


### OHE new_genres

In [33]:
one_hot_genre = pd.get_dummies(df['new_genres'])

In [34]:
df =df.drop('new_genres',axis = 1)

In [35]:
df = pd.concat([df, one_hot_genre], axis=1)

### Base Line Model

In [36]:
from sklearn.neighbors import NearestNeighbors

In [37]:
#knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)
#knn.fit(df)

### Suprise

In [38]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16 entries, 0 to 0
Data columns (total 27 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   movieId                      16 non-null     object
 1   imdbId                       16 non-null     object
 2   tmdbId                       16 non-null     object
 3   userId                       16 non-null     int64 
 4   rating                       16 non-null     object
 5                                16 non-null     uint8 
 6   Grumpier Old Men (1995)      16 non-null     uint8 
 7   Heat (1995)                  16 non-null     uint8 
 8   Seven (a.k.a. Se7en) (1995)  16 non-null     uint8 
 9   Toy Story (1995)             16 non-null     uint8 
 10  Usual Suspects, The (1995)   16 non-null     uint8 
 11                               16 non-null     uint8 
 12  funny                        16 non-null     uint8 
 13                               16 non-nu

### Change Data Frames to suprise DataFrame

In [39]:
import surprise

In [40]:
from surprise.model_selection import train_test_split
# from sklearn.model_selection import train_test_split

In [41]:
ratings.rating.describe()

count    5.000000
mean     4.400000
std      0.547723
min      4.000000
25%      4.000000
50%      4.000000
75%      5.000000
max      5.000000
Name: rating, dtype: float64

In [42]:
reader = surprise.Reader(rating_scale = (4., 5.))

In [43]:
tags

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200
...,...,...,...,...
3678,606,7382,for katie,1171234019
3679,606,7936,austere,1173392334
3680,610,3265,gun fu,1493843984
3681,610,3265,heroic bloodshed,1493843978


In [44]:
ratings

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [45]:
data = surprise.Dataset.load_from_df(ratings, reader)

In [46]:
df.head()

Unnamed: 0,movieId,imdbId,tmdbId,userId,rating,Unnamed: 6,Grumpier Old Men (1995),Heat (1995),Seven (a.k.a. Se7en) (1995),Toy Story (1995),...,Crime,Fantasy,Mystery,Romance,Thriller,Action,Adventure,Comedy,Crime.1,Mystery.1
0,1,114709,862,1,4,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
0,1,114709,862,1,4,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
0,1,114709,862,1,4,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
0,1,114709,862,1,4,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
0,1,114709,862,1,4,0,0,0,0,1,...,0,1,0,0,0,0,0,0,0,0


In [47]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [48]:
# from sklearn.model_selection import train_test_split

In [49]:
trainset, testset = train_test_split(data, test_size=0.2)

In [50]:
#trainset.head()

In [51]:
from surprise.prediction_algorithms import knns
from surprise.similarities import cosine, msd, pearson
from surprise import accuracy

In [52]:
basic = knns.KNNBasic()
basic.fit(trainset)


Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7fe2ef6b6310>

In [53]:
predictions = basic.test(testset)

In [54]:
print(accuracy.rmse(predictions))

RMSE: 0.5000
0.5


In [55]:
sim_pearson = {'name':'pearson', 'user_based':False}
knn_means = knns.KNNWithMeans(sim_options=sim_pearson)
knn_means.fit(trainset)
predictions = knn_means.test(testset)
print(accuracy.rmse(predictions))

Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 0.5000
0.5


### Matrix Factorization

In [56]:
from surprise.prediction_algorithms import SVD
from surprise.model_selection import GridSearchCV

### Suprise with grid search

In [57]:
from sklearn.neighbors import KNeighborsClassifier

In [58]:
param_grid = {'n_factors':[20, 50, 75, 100],'n_epochs': [5, 10], 'lr_all': [0.002, 0.005],
               'reg_all': [0.4, 0.6]}

In [59]:
from surprise.prediction_algorithms import SVD
from surprise.model_selection import GridSearchCV

param_grid = {'n_factors':[20, 50, 75, 100],'n_epochs': [5, 6, 7, 8, 9, 10, 11], 'lr_all': [0.002, .003, .004, 0.005],
               'reg_all': [.02, 0.4, .5, 0.6]}
gs_model = GridSearchCV(SVD,param_grid=param_grid,n_jobs = -1,joblib_verbose=5)
gs_model.fit(data)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 111 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 1960 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 2240 out of 2240 | elapsed:    1.0s finished


In [60]:
svd = SVD(n_factors=100, n_epochs=10, lr_all=0.005, reg_all=0.4)
svd.fit(trainset)
predictions = svd.test(testset)
print(accuracy.rmse(predictions))

RMSE: 0.5097
0.5097069973823878


In [61]:
#dictionaryObject = df.to_dict()

In [62]:
#dictionaryObject

In [63]:
#DF = pd.DataFrame()
#for key in dictionaryObject.keys():
    #df = pd.DataFrame(columns=['User', 'Item', 'Rating'])
    #df['Rating'] = pd.Series(dictionaryObject[key])
    #df['Item'] = pd.DataFrame(df.index)
    #df['User'] = key

    #DF = pd.concat([DF, df], axis = 0)

#DF = DF.reset_index(drop=True)

In [64]:
#DF.info()

### Graph Lab Model

In [68]:
# Make a train-test split
train_data, validate_data = df.random_split(0.8)

AttributeError: 'DataFrame' object has no attribute 'random_split'

<surprise.trainset.Trainset at 0x7fe2ef6ae460>