# Imports

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
#from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity, pairwise_distances
from surprise import Reader, Dataset, SVD, SVDpp, NormalPredictor, accuracy
from surprise.model_selection import cross_validate, GridSearchCV, KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.ensemble import RandomForestRegressor

In [3]:
# Load the raw ratings table; the path is relative to the notebook's directory.
df_ratings = pd.read_csv(filepath_or_buffer='../data/raw/ratings.csv')
# Preview the first five rows as the cell output.
df_ratings.head(n=5)

Unnamed: 0,user_id,movieID,rating
0,1264,2363,3.5
1,213,8368,2.5
2,593,64032,3.0
3,609,54995,4.0
4,1590,5005,4.0


# Base Models

In [4]:
# Rating scale of the data, as required by Surprise when loading a DataFrame.
reader = Reader(rating_scale=(0.5, 5))

# Hold out 1% of the ratings as the final test set. The fixed seed makes the
# split itself reproducible across kernel restarts.
train_df, test_data = train_test_split(df_ratings, test_size=0.01, random_state=42)

# Keep the pandas frame available as `train_df` and give the Surprise wrapper
# its own object, instead of overwriting a DataFrame with a Dataset under the
# same name (a Dataset cannot be indexed like a DataFrame).
train_data = Dataset.load_from_df(train_df[['user_id', 'movieID', 'rating']], reader)
trainset_base = train_data.build_full_trainset()

# True ratings of the held-out rows (one-column DataFrame) used as RMSE targets.
y_test = test_data[['rating']]

## SVD

In [5]:
%%time
svd_base = SVD(verbose=True)
svd_base.fit(trainset_base)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f9510072460>

In [7]:
%%time
y_test_svd_base = []
    
for index, row in test_data.iterrows():
    pred = svd_base.predict(row.user_id, row.movieID)
    y_test_svd_base.append(pred.est)
    
print('SVD RMSE on Testdata:', sqrt(mean_squared_error(y_test, y_test_svd_base)))

SVD RMSE on Testdata: 0.7654960695459692


## SVD++

In [10]:
%%time
algo_svdpp_base = SVDpp(n_factors=160, n_epochs=10, lr_all=0.005, reg_all=0.1, verbose=True)
algo_svdpp_base.fit(trainset_base)

 processing epoch 0
 processing epoch 1
 processing epoch 2
 processing epoch 3
 processing epoch 4
 processing epoch 5
 processing epoch 6
 processing epoch 7
 processing epoch 8
 processing epoch 9


<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x7f95106dba00>

In [11]:
%%time
y_test_svdpp_base = []
    
for index, row in test_data.iterrows():
    pred = algo_svdpp_base.predict(row.user_id, row.movieID)
    y_test_svdpp_base.append(pred.est)

print('SVD++ RMSE on Testdata:', sqrt(mean_squared_error(y_test, y_test_svdpp_base)))

SVD++ RMSE on Testdata: 0.8041141264755429
CPU times: user 8.63 s, sys: 105 ms, total: 8.73 s
Wall time: 8.81 s


# Stacking: Random Forest Meta-Model

In [13]:
%%time
kf = KFold(n_splits=3)
svd = SVD(verbose=True)
algo_svdpp = SVDpp(n_epochs=10, lr_all=0.005, reg_all=0.1, verbose=True)
y_pred_svd = []
y_pred_svdpp = []
y_test_svd = []
y_test_svdpp = []
for trainset, testset in kf.split(train_data):
    y_test_svd = []
    y_test_svdpp = []
    svd.fit(trainset)
    algo_svdpp.fit(trainset)
    predictions_svd = svd.test(testset)
    predictions_svdpp = algo_svdpp.test(testset)
    for pred in predictions_svd:
        y_pred_svd.append(pred.est)
    for pred in predictions_svdpp:
        y_pred_svdpp.append(pred.est)
    for index, row in test_data.iterrows():
        pred = svd.predict(row.user_id, row.movieID)
        y_test_svd.append(pred.est)
    print('SVD RMSE on Testdata:', sqrt(mean_squared_error(y_test, y_test_svd)))
    for index, row in test_data.iterrows():
        pred = algo_svdpp.predict(row.user_id, row.movieID)
        y_test_svdpp.append(pred.est)
    print('SVD++ RMSE on Testdata:', sqrt(mean_squared_error(y_test, y_test_svdpp)))
    

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
 processing epoch 0
 processing epoch 1
 processing epoch 2
 processing epoch 3
 processing epoch 4
 processing epoch 5
 processing epoch 6
 processing epoch 7
 processing epoch 8
 processing epoch 9
SVD RMSE on Testdata: 0.7944648549032107
SVD++ RMSE on Testdata: 0.7944648549032107
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch

In [23]:
%%time
# stack base prediction data
meta_data_train = {'stacked_input1': y_test_svd, 'stacked_input2': y_test_svdpp}
meta_data_train = pd.DataFrame(meta_data_train)
meta_data_test = {'stacked_input1_test': y_test_svd_base, 'stacked_input1_test': y_test_svdpp_base}
meta_data_test = pd.DataFrame(meta_data_test)
y_meta = train_data['rating']

TypeError: 'DatasetAutoFolds' object is not subscriptable

In [22]:
%%time
forest = RandomForestRegressor()
forest.fit(meta_data_train, y_meta)
y_pred = forest.predict(meta_data_test)
print('RMSE on Meta Model:', sqrt(mean_squared_error(y_test, y_pred)))

NameError: name 'y_meta' is not defined