In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
from surprise import Reader, Dataset, SVD, SVDpp, evaluate, KNNBasic, CoClustering

In [3]:
# Directory the raw input files are read from. There is no trailing slash, so
# the loading code prepends '/' to each file name.
RAW_DATA_PATH = '../data/raw'
# Presumably the directory for intermediate artifacts (note the trailing slash,
# unlike RAW_DATA_PATH). NOTE(review): not referenced in the visible cells.
INTERIM_DATA_PATH = '../data/interim/'

In [4]:
# Load the raw tables. Column names are passed through `names`, so no line of
# the files is treated as a header.
#
# users.txt and movies.txt use the multi-character '::' separator. pandas' C
# parser only supports single-character separators, so the Python engine is
# requested explicitly instead of relying on the implicit fallback (which
# emits a ParserWarning).
users_df = pd.read_csv(RAW_DATA_PATH + '/users.txt', sep='::', engine='python',
                       names=['user_id', 'gender', 'age', 'occupation', 'zip_code'])
movies_df = pd.read_csv(RAW_DATA_PATH + '/movies.txt', sep='::', engine='python',
                        names=['movie_id', 'title', 'genre'])

# The rating files are plain comma-separated. The test file has no 'ratings'
# column: that is the value predicted later in the notebook.
train = pd.read_csv(RAW_DATA_PATH + '/training.txt', sep=',',
                    names=['user_id', 'movie_id', 'ratings', 'timestamp'])
test = pd.read_csv(RAW_DATA_PATH + '/testing.txt', sep=',',
                   names=['user_id', 'movie_id', 'timestamp'])

In [5]:
# Derive extra movie columns from the raw 'title' and 'genre' strings.
# Assumes every title ends with ' (YYYY)' -- TODO confirm against the data.
def _year_of(raw_title):
    # The four characters just before the final character of the title.
    return raw_title[-5:-1]


def _strip_year(raw_title):
    # Drop the last seven characters of the title.
    return raw_title[0:-7]


def _split_genres(joined):
    # Pipe-separated genre string -> list of genre names.
    return joined.split('|')


raw_titles = movies_df['title']
movies_df['year'] = raw_titles.map(_year_of)        # kept as a string
movies_df['title'] = raw_titles.map(_strip_year)
movies_df['genre'] = movies_df['genre'].map(_split_genres)
movies_df['count_genre'] = movies_df['genre'].map(len)

In [6]:
# Attach the movie metadata to both rating tables. A left join keeps every
# rating row even when its movie_id has no match in movies_df.
train = pd.merge(train, movies_df, how='left', on='movie_id')
test = pd.merge(test, movies_df, how='left', on='movie_id')

In [7]:
# Non-null row count per movie for every other column, with movie_id turned
# back into a regular column for display.
counts_by_movie = train.groupby('movie_id').count()
counts_by_movie.reset_index()

Unnamed: 0,movie_id,user_id,ratings,timestamp,title,genre,year,count_genre
0,1,1478,1478,1478,1478,1478,1478,1478
1,2,484,484,484,484,484,484,484
2,3,320,320,320,320,320,320,320
3,4,121,121,121,121,121,121,121
4,5,207,207,207,207,207,207,207
5,6,652,652,652,652,652,652,652
6,7,319,319,319,319,319,319,319
7,8,58,58,58,58,58,58,58
8,9,76,76,76,76,76,76,76
9,10,623,623,623,623,623,623,623


In [8]:
# Wrap the (user, item, rating) triples in a Surprise dataset. The Reader is
# created with the library defaults.
rating_columns = ['user_id', 'movie_id', 'ratings']
reader = Reader()
df = Dataset.load_from_df(train[rating_columns], reader)

# Pre-split into 5 folds for the evaluate() calls below.
# NOTE(review): Dataset.split is deprecated in later Surprise releases --
# verify against the installed version.
df.split(n_folds=5)

In [9]:
# Model used for the final submission. Despite the variable name this is
# SVD++ (SVDpp), not plain SVD.
svd = SVDpp(
    random_state=1,
    n_epochs=25,
    reg_all=0.4,
    lr_all=0.007,
)
# Cross-validated evaluation of this model is currently disabled:
# evaluate(svd, df, measures=['RMSE', 'MAE'])

In [10]:
from surprise.model_selection import GridSearchCV

# Hyper-parameter grid for SVD++: 2 x 3 x 2 = 12 combinations.
svd_param_grid = dict(
    n_epochs=[20, 25],
    lr_all=[0.007, 0.009, 0.01],
    reg_all=[0.4, 0.6],
)

# 5-fold cross-validation per combination, spread over 5 parallel jobs.
svdpp_gs = GridSearchCV(
    SVDpp,
    svd_param_grid,
    measures=['rmse', 'mae'],
    cv=5,
    n_jobs=5,
)
svdpp_gs.fit(df)

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/christianwbsn/.local/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3267, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-10-b456ed2d18e8>", line 7, in <module>
    svdpp_gs.fit(df)
  File "/home/christianwbsn/.local/lib/python3.6/site-packages/surprise/model_selection/search.py", line 90, in fit
    verbose=self.joblib_verbose)(delayed_list)
  File "/home/christianwbsn/.local/lib/python3.6/site-packages/joblib/parallel.py", line 994, in __call__
    self.retrieve()
  File "/home/christianwbsn/.local/lib/python3.6/site-packages/joblib/parallel.py", line 897, in retrieve
    self._output.extend(job.get(timeout=self.timeout))
  File "/home/christianwbsn/.local/lib/python3.6/site-packages/joblib/_parallel_backends.py", line 515, in wrap_future_result
    return future.result(timeout=timeout)
  File "/home/christianwbsn/.local/lib/python3.6/site-packages/joblib/externals/loky/_bas

KeyboardInterrupt: 

In [None]:
# Report the grid-search outcome. `best_score` and `best_params` are both
# dicts keyed by measure name; the score is the error value, while the params
# entry is the hyper-parameter combination that achieved it.
print('Best RMSE =', svdpp_gs.best_score['rmse'])
print('Best RMSE params =', svdpp_gs.best_params['rmse'])
print('Best MAE =', svdpp_gs.best_score['mae'])
print('Best MAE params =', svdpp_gs.best_params['mae'])

In [None]:
# Co-clustering baseline with 4 user clusters and 4 item clusters, seeded for
# repeatability, evaluated on the folds of `df`.
coc = CoClustering(
    n_cltr_u=4,
    n_cltr_i=4,
    random_state=1,
)
evaluate(coc, df, measures=['RMSE', 'MAE'])

In [None]:
import os
from surprise import SVD, NMF
from surprise import evaluate, print_perf
from surprise import KNNBasic

# NOTE(review): evaluate() and print_perf() are deprecated in later Surprise
# releases in favour of surprise.model_selection.cross_validate -- verify
# against the installed version before upgrading.


def evaluate_and_report(title, algo, measures=('RMSE', 'MAE')):
    """Print a heading, evaluate `algo` on the folds of `df`, and print the result.

    Parameters
    ----------
    title : str
        Heading printed before the performance table.
    algo : surprise algorithm instance
        The algorithm to evaluate.
    measures : sequence of str
        Accuracy measures passed to ``evaluate``.

    Returns
    -------
    The performance object returned by ``evaluate``.
    """
    print('')
    print(title)
    perf = evaluate(algo, df, measures=list(measures))
    print_perf(perf)
    return perf


CF_SUFFIX = 'Based Collaborative Filtering algorithm result'

# --- Matrix factorisation models and KNN with different similarities --------
# Each heading now matches the algorithm that is actually run: the NMF section
# previously ran KNNBasic, and the plain "User Based" section previously ran
# the item-based variant (user_based=False).
baseline_experiments = [
    ('---------------SVD result-------------', SVD()),
    ('---------------PMF result--------------', SVD(biased=False)),
    ('----------------NMF result--------------', NMF()),
    ('User ' + CF_SUFFIX,
     KNNBasic(sim_options={'user_based': True})),
    ('Item ' + CF_SUFFIX,
     KNNBasic(sim_options={'user_based': False})),
    ('MSD----User ' + CF_SUFFIX,
     KNNBasic(sim_options={'name': 'MSD', 'user_based': True})),
    ('cosin----User ' + CF_SUFFIX,
     KNNBasic(sim_options={'name': 'cosine', 'user_based': True})),
    ('Person sim----User ' + CF_SUFFIX,
     KNNBasic(sim_options={'name': 'pearson', 'user_based': True})),
]

for title, algo in baseline_experiments:
    perf = evaluate_and_report(title, algo)

# --- Effect of neighbourhood size (MSD similarity, RMSE only) ---------------
# `k` now follows the heading; every one of these runs previously used k=10
# regardless of the neighbour count that was printed.
for k in (10, 15, 25, 30):
    algo = KNNBasic(k=k, sim_options={'name': 'MSD', 'user_based': True})
    perf = evaluate_and_report(
        '{}--Neighboors--User '.format(k) + CF_SUFFIX, algo, measures=['RMSE'])

    algo = KNNBasic(k=k, sim_options={'name': 'MSD', 'user_based': False})
    perf = evaluate_and_report(
        '{}---Neighboors---Item '.format(k) + CF_SUFFIX, algo, measures=['RMSE'])

In [10]:
# Refit the chosen model on every available rating (no hold-out fold).
trainset = df.build_full_trainset()
svd.fit(trainset=trainset)
# coc.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x7f24d158c198>

In [11]:
# Submission ids are the test frame's row labels shifted by one.
ids = [row_label + 1 for row_label in test.index]

# Estimated rating (`.est`) for each (user, movie) pair of the test frame.
pred_svd = [
    svd.predict(user, movie).est
    for user, movie in zip(test['user_id'], test['movie_id'])
]

# Co-clustering predictions are currently disabled; the list stays empty.
pred_coc = []
#     pred_coc.append(coc.predict(row.user_id, row.movie_id).est)

In [12]:
# Two-column submission frame: the id and the predicted rating.
submission_columns = dict(ID=ids, Predicted=pred_svd)
submission = pd.DataFrame(submission_columns)

In [None]:
# Copy predictions from a 'Predicted_SVD' column when the frame has one.
# The visible notebook builds `submission` with 'ID' and 'Predicted' only, so
# an unconditional lookup raises KeyError on a fresh run; the guard makes this
# cell a no-op in that case.
if 'Predicted_SVD' in submission.columns:
    submission['Predicted'] = submission['Predicted_SVD']

In [13]:
# Display the submission frame as a visual check before it is written out.
submission

Unnamed: 0,ID,Predicted
0,1,3.111218
1,2,2.766018
2,3,3.481931
3,4,3.695421
4,5,4.154551
5,6,3.745864
6,7,3.893176
7,8,3.365047
8,9,3.147800
9,10,2.345799


In [None]:
# Remove per-model prediction columns if they are present.
# errors='ignore' keeps the cell from raising KeyError when the frame only
# has 'ID' and 'Predicted', which is how the visible notebook builds it.
submission = submission.drop(['Predicted_SVD', 'Predicted_COC'], axis=1,
                             errors='ignore')

In [14]:
# Write the submission without the DataFrame index column.
submission_path = '../data/submission/submission_15.csv'
submission.to_csv(submission_path, index=False)