In [36]:
import json
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from sklearn.preprocessing import LabelEncoder
from ast import literal_eval
from nltk.stem.snowball import SnowballStemmer
from surprise import Reader, Dataset, SVD
import joblib

In [37]:
# load dataset
ratings = pd.read_csv('./sources/tmdb_ratings.csv')
# isolate columns
x_cols = [c for c in ratings.columns]
ratings.head()

Unnamed: 0,userId,rating,tmdbId
0,1,2.5,9909.0
1,7,3.0,9909.0
2,31,4.0,9909.0
3,32,4.0,9909.0
4,36,3.0,9909.0


In [40]:
# fill missing values
train_mode = dict(ratings.mode().iloc[0])
ratings = ratings.fillna(train_mode)
print(train_mode)

{'userId': 547.0, 'rating': 4.0, 'tmdbId': 13.0}


In [41]:
# convert categoricals
encoders = {}
for column in ['userId', 'rating', 'tmdbId']:
    categorical_convert = LabelEncoder()
    ratings[column] = categorical_convert.fit_transform(ratings[column])
    encoders[column] = categorical_convert
print(ratings)

        userId  rating  tmdbId
0            0       4    2553
1            6       5    2553
2           30       7    2553
3           31       7    2553
4           35       5    2553
...        ...     ...     ...
99999      663       4    6507
100000     663       6    5431
100001     664       5    8292
100002     664       1    5821
100003     667       1    6612

[100004 rows x 3 columns]


In [42]:
# Collaborative filtering
reader = Reader()
data = Dataset.load_from_df(ratings[['userId', 'rating', 'tmdbId']], reader)
svd = SVD()
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f17c8d9bd60>

In [43]:
# test predict with (userId, tmdbId, etc)
svd.predict(1, 302, 3)

Prediction(uid=1, iid=302, r_ui=3, est=5, details={'was_impossible': False})