# Surprise Library

In [4]:
import numpy as np
import pandas as pd

from datetime import date

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

## Get Sample

In [2]:
## Note: csv_file is not included in the repo

def get_sample(csv_file, N, seed):
    np.random.seed(seed)
    ratings = pd.read_csv(csv_file)
    
    #keep only ratings from the most recent 5 years： 2010-2015
    ratings['year'] = ratings['timestamp'].apply(lambda x: date.fromtimestamp(x).year)
    ratings = ratings[ratings['year']>=date.fromisoformat('2010-01-01').year]
    ratings.drop(['timestamp','year'],axis=1,inplace=True) #'timestamp' and 'year' no longer useful 
    
    #keep only movies with at least 20 ratings
    m_counts = ratings.groupby('movieId').size()
    m_index = m_counts[m_counts>=20].index
    ratings = ratings[ratings['movieId'].isin(m_index)]
    
    #keep only users who have rated at least 20 movies - consistent with original data
    u_counts = ratings.groupby('userId').size()
    u_index = u_counts[u_counts>=20].index
    ratings = ratings[ratings['userId'].isin(u_index)]
    
    #sample out N users
    users = np.unique(ratings['userId'])
    u_index_sample = np.random.choice(users, N)
    sample = ratings[ratings['userId'].isin(u_index_sample)]
    
    #keep a percentage of ratings for each user: 30% -- lower variance
    pct = pd.DataFrame(columns=['userId','movieId','rating'])
    u_N, u_rating_count = np.unique(sample['userId'], return_counts=True)
    
    for i in range(len(u_N)):
        u_id = u_N[i]
        u_sample = sample[sample['userId']==u_id]
        u_sample = u_sample[u_sample['movieId'].isin(np.random.choice(np.unique(u_sample['movieId']),
                                                                      int(u_rating_count[i]*0.3)))]
        pct = pct.append(u_sample)
    
    sample = pct
    
#     #plot data
#     users_s, u_count_s = np.unique(sample['userId'], return_counts=True)
#     plt.plot(np.arange(1, len(users_s)+1), sorted(sample.groupby('userId').size(), reverse=True))
#     plt.xlabel('number of users')
#     plt.ylabel('number of rated movies')
#     plt.show()
    
    print('number of unique users:', len(np.unique(sample['userId'])))
    print('number of unique movies:', len(np.unique(sample['movieId'])))
    
    return sample


In [3]:
def get_train_test(sample_data, split, seed):
    np.random.seed(seed)
    sample = sample_data
    
    train, test = train_test_split(sample, test_size=split)
    
    #check
    print('number of users in train:',len(np.unique(train['userId'])))
    print('number of movies in train:',len(np.unique(train['movieId'])))
    print('number of users in test:', len(np.unique(test['userId'])))
    print('number of movies in test:',len(np.unique(test['movieId'])))
    
    return train, test
    

In [5]:
sample = get_sample('ratings.csv', 1000, 1)

number of unique users: 974
number of unique movies: 5619


## Package

In [None]:
# !pip install numpy
# !pip install scikit-surprise

In [19]:
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise import Reader

In [21]:
#define parameter
reader = Reader(rating_scale=(1, 5))

data = Dataset.load_from_df(sample[['userId','movieId','rating']], reader)

train, test = train_test_split(data, test_size=.3)

In [22]:
#Baseline
from surprise import BaselineOnly

algo = BaselineOnly()
algo.fit(train)
predictions = algo.test(test)
accuracy.rmse(predictions)

Estimating biases using als...
RMSE: 0.8700


0.8700131439977484

In [23]:
#KNN Basic
from surprise import KNNBasic

algo = KNNBasic()
algo.fit(train)
predictions = algo.test(test)
accuracy.rmse(predictions)

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.0148


1.014822991493837

In [24]:
#KNN adjusted for Z-Score
from surprise import KNNWithZScore

algo = KNNWithZScore()
algo.fit(train)
predictions = algo.test(test)
accuracy.rmse(predictions)

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9356


0.9356376753748737

In [25]:
#KNN Adjusted for Baseline
from surprise import KNNBaseline

algo = KNNBaseline()
algo.fit(train)
predictions = algo.test(test)
accuracy.rmse(predictions)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9213


0.921254895302728