In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, name) for name in filenames]
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# pip install surprise

In [None]:
from datetime import datetime
import seaborn as sns
import os
import random
import matplotlib.pyplot as plt

from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error

import xgboost as xgb
from surprise import Reader, Dataset, BaselineOnly, KNNBaseline, SVD, SVDpp
from surprise.model_selection import GridSearchCV

### Data pre-processing

In [None]:
%%time
# Build a single CSV from the raw rating files (once), then load it into a DataFrame.
# NOTE: this could probably be replaced with pd.read_csv on the raw files plus
# dropping NaNs / the "<movie_id>:" rows.

CSV_PATH = 'data.csv'

files = ['../input/netflix-prize-data/combined_data_1.txt',
         '../input/netflix-prize-data/combined_data_2.txt',
#          '../input/netflix-prize-data/combined_data_3.txt',  # read in half of data only for speedup
#          '../input/netflix-prize-data/combined_data_4.txt'
        ]

# In the raw files a line ending in ':' carries a movie id; every other line is a
# rating record for that movie. Drop the id lines and prepend the id to each record.
# The CSV is only rebuilt when it is missing, so delete it after changing `files`.
if not os.path.isfile(CSV_PATH):
    tmp_path = CSV_PATH + '.tmp'
    movie_id = None
    with open(tmp_path, mode='w') as data:
        for file in files:
            print("Opening file: {}".format(file))
            with open(file) as f:
                for line in f:
                    line = line.strip()
                    if line.endswith(':'):
                        movie_id = line.replace(':', '')
                    elif line:
                        data.write(movie_id + ',' + line + '\n')
    # Publish the file only once it is complete, so an interrupted run is not
    # mistaken for a finished cache on the next execution.
    os.replace(tmp_path, CSV_PATH)

# Read all data into a pd dataframe
df = pd.read_csv(CSV_PATH, names=['movie_id', 'user_id','rating','date'])
print(df.nunique())
df

In [None]:
# Summary statistics of the rating column.
df["rating"].describe()

In [None]:
df.isnull().sum()

## Checking for duplicated rows

In [None]:
# Flag rows that repeat an earlier (movie_id, user_id, rating) combination and count them.
duplication = df.duplicated(subset=["movie_id", "user_id", "rating"])
print("Number of duplication rows: {}".format(duplication.sum()))

In [None]:
# Dataset size summary. The number of ratings is the number of rows
# (summing the `rating` column would add up the star values instead).
print("Total number of movie ratings: "+str(len(df)))
print("Total number of users: "+str(df["user_id"].nunique()))
print("Total number of movies: " +str(df["movie_id"].nunique()))

## Exploratory Data Analysis

In [None]:
def labels(number):
    """Format a raw count as a string in millions, e.g. 2000000 -> '2.0M'."""
    in_millions = number / 1000000
    return str(in_millions) + "M"

# Bar chart of how many times each rating value was given (y axis shown in millions).
fig, ax = plt.subplots(figsize=(12,8))
sns.countplot(x="rating", data=df, ax=ax)
ax.set_yticklabels([labels(tick) for tick in ax.get_yticks()])

ax.tick_params(labelsize=15)
ax.set_title("Rating Distribution", fontsize=20)
ax.set_xlabel("Rating", fontsize=20)
ax.set_ylabel("Number of Rating(Millions)", fontsize=20)
plt.show()

In [None]:
# Convert the `date` column to datetimes in place. With errors='coerce',
# values that cannot be parsed become NaT instead of raising.
df["date"] = pd.to_datetime(df["date"], errors='coerce')
df.head(5)

In [None]:
# Line chart of the number of ratings submitted in each calendar month.
plt.figure(figsize=(10,8))
monthly_counts = df.resample("M", on = "date")["rating"].count()
ax = monthly_counts.plot()

ax.set_yticklabels([labels(tick) for tick in ax.get_yticks()])
ax.set_title("Rating per Month", fontsize=20)
ax.set_xlabel("date", fontsize=20)
ax.set_ylabel("Rating per Month(millions)", fontsize=20)
ax.tick_params(labelsize=15)
plt.show()

## Rating Analysis by User

In [None]:
# Number of ratings submitted by each user, most active users first.
ratings_per_user = df.groupby("user_id")["rating"].count()
user_giving_rate = ratings_per_user.sort_values(ascending=False)
user_giving_rate.head()

In [None]:
# Kernel density estimate of ratings-per-user: plain density on the left,
# cumulative density on the right.
figs, axes = plt.subplots(nrows=1, ncols=2, figsize=(14,7))

for panel, panel_title, is_cumulative in zip(axes, ("PDF", "CDF"), (False, True)):
    # `fill` is the current name of the option formerly called `shade`.
    sns.kdeplot(user_giving_rate.values, fill=True, cumulative=is_cumulative, ax=panel)
    panel.set_title(panel_title, fontsize=20)
    panel.set_xlabel("Rating by User", fontsize=15)
    panel.tick_params(labelsize=15)

# tight_layout computes the subplot spacing itself, so no manual
# subplots_adjust call is made before it.
plt.tight_layout()
plt.show()

In [None]:
user_giving_rate.describe()

In [None]:
# Ratings-per-user at every percentile step of 0.01 from 0 up to 1;
# the result is indexed by the (float) quantile level.
quantiles = user_giving_rate.quantile(np.arange(0, 1.01, 0.01 ))

In [None]:
# Quantile curve of ratings-per-user, with markers every 5th and every 25th percentile.
fig = plt.figure(figsize=(10,6))

axes = fig.add_axes([0.1, 0.1, 1, 1])
axes.set_title("Quantile of Rating/User", fontsize=20)
axes.set_xlabel("Quantile", fontsize=20)
axes.set_ylabel("Rating Per User", fontsize=20)
axes.plot(quantiles)

axes.scatter(x=quantiles.index[::5], y=quantiles.values[::5], c="blue", s=70,
             label="quantiles with 0.05 intervals")

axes.scatter(x=quantiles.index[::25], y=quantiles.values[::25], c="red", s=70,
             label="quantiles with 0.25 intervals")

axes.legend(loc='upper left', fontsize=20)

for x, y in zip(quantiles.index[::25], quantiles.values[::25]):
    # The annotation text is passed positionally: its keyword was renamed from
    # `s` to `text` in matplotlib, so the positional form works across versions.
    axes.annotate('({}, {})'.format(x, y), xy=(x, y), fontweight='bold',
                  fontsize=16, xytext=(x-0.05, y+180))

axes.tick_params(labelsize=15)

In [None]:
quantiles[::5]

In [None]:
# Count users on each side of the 75th percentile of ratings-per-user.
# The threshold is computed from the data instead of being hard-coded, so it
# stays correct when a different subset of the raw files is loaded.
ratings_p75 = user_giving_rate.quantile(0.75)
print("Total number of users at or below the 75th percentile ({}): ".format(ratings_p75)
      + str((user_giving_rate.values <= ratings_p75).sum()))
print("Total number of users above the 75th percentile ({}): ".format(ratings_p75)
      + str((user_giving_rate.values > ratings_p75).sum()))

## Movie Rating Analysis

In [None]:
# Number of ratings received by each movie, most-rated movies first.
ratings_per_movie = df.groupby("movie_id")["rating"].count()
movies_rating = ratings_per_movie.sort_values(ascending=False)
movies_rating.head(5)

In [None]:
# Rating counts per movie, plotted in descending order; x tick labels are hidden.
fig = plt.figure(figsize=(12, 8))
axes = fig.add_axes([0.1, 0.1, 1, 1])
axes.set_title("Rating per Movies", fontsize=20)
axes.set_xlabel("Movie", fontsize=20)
axes.set_ylabel("Rating Count", fontsize=20)
axes.plot(movies_rating.values)
axes.tick_params(labelsize=15)
axes.set_xticklabels([])
plt.show()

In [None]:
# Add the weekday name of each rating's date as a new column.
# Uses the `.dt` accessor, so `date` must already hold datetimes.
df["DayOfWeek"] = df.date.dt.day_name()

In [None]:
df.head(5)

## Rating Analysis by Day of Week

In [None]:
def labels(number):
    """Format a raw count as a string in millions, e.g. 2000000 -> '2.0M'.

    Note: a function with the same name and behaviour is already defined
    earlier in this notebook; this definition replaces it when the cell runs.
    """
    in_millions = number / 1000000
    return str(in_millions) + 'M'

# Bar chart of how many times each rating value was given (y axis shown in millions).
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(x='rating', data=df, ax=ax)
ax.set_yticklabels([labels(tick) for tick in ax.get_yticks()])

ax.set_title("Distribution of Rating", fontsize=20)
ax.set_xlabel("Rating", fontsize=20)
ax.set_ylabel("Rating Count(M)", fontsize=20)
plt.show()

In [None]:
# Number of ratings submitted on each day of the week.
fig = plt.figure(figsize=(12, 8))

axes = sns.countplot(x="DayOfWeek", data = df)
axes.set_title("Day of Week VS Number of Rating", fontsize = 20)
axes.set_xlabel("Day of Week", fontsize=20)
axes.set_ylabel("Number of Rating", fontsize=20)
# Build the tick labels from THIS plot's ticks (`axes`); `ax` belongs to the
# figure drawn by an earlier cell and has a different scale.
axes.set_yticklabels([labels(num) for num in axes.get_yticks()])

plt.show()

In [None]:
# Box plot of the rating values given on each day of the week.
fig = plt.figure(figsize = (12, 8))

axes = sns.boxplot(x="DayOfWeek",y="rating", data=df)
# The y variable is the rating value itself, not a count, so label it as such.
axes.set_title("Day of Week VS Rating", fontsize=20)
axes.set_xlabel("Day of Week", fontsize=20)
axes.set_ylabel("Rating", fontsize=20)

plt.show()

In [None]:
# Mean rating for each weekday name.
ratings_by_weekday = df.groupby("DayOfWeek")["rating"]
avg_rating_dayofweek = ratings_by_weekday.mean()

print("Average Rating on Day of Week", avg_rating_dayofweek, sep="\n")

# Creating USER-ITEM sparse matrix

In [None]:
# Build the rating matrix (rows indexed by user_id, columns by movie_id) from
# the full DataFrame, or reload it from disk if a previous run saved it.
startTime = datetime.now()
print("Creating USER_ITEM sparse matrix for train Data")
if not os.path.isfile("SparseData.npz"):
    print("We are creating sparse data")
    rating_triplets = (df.rating, (df.user_id, df.movie_id))
    SparseData = sparse.csr_matrix(rating_triplets)
    print("Creation done. Shape of sparse matrix = "+str(SparseData.shape))
    print("Saving it into disk for furthur usage.")
    sparse.save_npz("SparseData.npz", SparseData)
    print("Done\n")
else:
    print("Sparse Data is already present in your disk, no need to create further. Loading Sparse Matrix")
    SparseData = sparse.load_npz("SparseData.npz")
    print("Shape of Train Sparse matrix = "+str(SparseData.shape))

print(datetime.now() - startTime)

In [None]:
# Percentage of cells in the rating matrix that hold no rating.
rows, cols = SparseData.shape
ele = SparseData.count_nonzero()
filled_fraction = ele/(rows*cols)

print("Sparse of Data: "+str((1-(filled_fraction))*100))

### Global Average Rating

In [None]:
# Mean of all stored (non-zero) ratings in the matrix.
rating_total = SparseData.sum()
rating_count = SparseData.count_nonzero()
global_avg_rating = rating_total/rating_count
global_avg_rating

### Average Rating

In [None]:
def getAverageRatings(sparseMatrix, if_user):
    """Return the mean non-zero value of every row or column of a sparse matrix.

    Parameters
    ----------
    sparseMatrix : scipy sparse matrix of ratings.
    if_user : bool
        True averages along each row, False along each column.

    Returns
    -------
    dict mapping row (or column) index -> average of its non-zero entries.
    Rows/columns without any non-zero entry are left out.
    """
    axis = 1 if if_user else 0
    totals = sparseMatrix.sum(axis=axis).A1
    counts = (sparseMatrix != 0).sum(axis=axis).A1

    n_rows, n_cols = sparseMatrix.shape
    size = n_rows if if_user else n_cols

    averages = {}
    for idx in range(size):
        if counts[idx] != 0:
            averages[idx] = totals[idx] / counts[idx]
    return averages

In [None]:
# Average rating given by the user stored at row 25 of the matrix.
user_average_lookup = getAverageRatings(SparseData, True)
avg_rating_user = user_average_lookup[25]
avg_rating_user

In [None]:
# Average rating received by the movie stored at column 4500 of the matrix.
movie_average_lookup = getAverageRatings(SparseData, False)
avg_rating_movie = movie_average_lookup[4500]
avg_rating_movie

## Computing User-User Similarity Matrix

In [None]:
# Print the first 100 distinct row indices (users) that have at least one rating.
row_index, col_index = SparseData.nonzero()
rows = np.unique(row_index)
for user_row in rows[:100].tolist():
    print(user_row)

### Top 100 similarity scores for each of the first 100 users listed above

In [None]:
def getUser_UserSimilarity(sparseMatrix, top = 100):
    """Compute the highest cosine similarities for the first `top` users.

    For each of the first `top` rows that contain at least one rating, the
    cosine similarity against every row of `sparseMatrix` is computed and the
    `top` largest values are stored in ascending order. The row's similarity
    with itself is included. A plot of the per-user and cumulative computation
    time is shown as a side effect.

    Parameters
    ----------
    sparseMatrix : scipy sparse matrix with one row per user.
    top : int
        Both the number of users processed and the number of similarity
        values kept per user.

    Returns
    -------
    numpy.ndarray indexed by the user's row index in `sparseMatrix`. Its shape
    is (largest processed row index + 1, min(top, number of rows)); rows of
    users that were not processed stay zero. The buffer is sized from the
    data, so arbitrary user ids and values of `top` are supported.
    """
    startTimestamp20 = datetime.now()

    row_index, _ = sparseMatrix.nonzero()
    selected_rows = np.unique(row_index)[:top]

    n_kept = min(top, sparseMatrix.shape[0])
    n_rows = int(selected_rows.max()) + 1 if selected_rows.size else 0
    similarMatrix = np.zeros((n_rows, n_kept))

    timeTaken = []
    for howManyDone, row in enumerate(selected_rows, start=1):
        startTimestamp = datetime.now().timestamp()
        sim = cosine_similarity(sparseMatrix.getrow(row), sparseMatrix).ravel()
        # Largest n_kept similarities, ascending. The slice start is computed
        # explicitly because sim[-0:] would return the whole array.
        similarMatrix[row] = np.sort(sim)[sim.size - n_kept:]
        timeTaken.append(datetime.now().timestamp() - startTimestamp)
        if howManyDone % 20 == 0:
            print("Time elapsed for {} users = {}sec".format(howManyDone, (datetime.now() - startTimestamp20)))

    if timeTaken:
        print("Average Time taken to compute similarity matrix for 1 user = "+str(sum(timeTaken)/len(timeTaken))+"seconds")

    fig = plt.figure(figsize = (12,8))
    plt.plot(timeTaken, label = 'Time Taken For Each User')
    plt.plot(np.cumsum(timeTaken), label='Cumulative Time')
    plt.legend(loc='upper left', fontsize = 15)
    plt.xlabel('Users', fontsize = 20)
    plt.ylabel('Time(Seconds)', fontsize = 20)
    plt.tick_params(labelsize = 15)
    plt.show()

    return similarMatrix

In [None]:
# Similarity scores for the first 100 users (also draws the timing plot).
simMatrix = getUser_UserSimilarity(SparseData, 100)

## Computing Movie-Movie Similarity Matrix

In [None]:
# Build the movie-movie cosine similarity matrix, or reload it if a previous
# run already saved it.
start = datetime.now()

# One name is used for the existence check, the save and the load, so the
# cached file is actually found on later runs.
MOVIE_SIMILARITY_PATH = "movie_similarity.npz"

if not os.path.isfile(MOVIE_SIMILARITY_PATH):
    print("Movie-Movie Similarity file does not exist in your disk. Creating Movie-Movie Similarity Matrix...")

    # Transpose so that movies are the rows being compared.
    movie_similarity = cosine_similarity(SparseData.T, dense_output = False)
    print("Done")
    print("Dimension of Matrix = {}".format(movie_similarity.shape))
    print("Storing the Movie Similarity matrix on disk for further usage")
    sparse.save_npz(MOVIE_SIMILARITY_PATH, movie_similarity)
else:
    print("File exists in the disk. Loading the file...")
    movie_similarity = sparse.load_npz(MOVIE_SIMILARITY_PATH)
    print("Dimension of Matrix = {}".format(movie_similarity.shape))

print(datetime.now() - start)

## Checking the top 10 most similar movies

In [None]:
# Every index that appears as a row or a column of a non-zero similarity.
movie_id = np.unique(movie_similarity.nonzero())

# For each movie: other movie indices ordered from most to least similar.
# Position 0 of the ordering is skipped and the slice keeps positions 1..99.
simMovie_dict = {
    movie: np.argsort(-(movie_similarity[movie]).toarray().ravel())[1:100]
    for movie in movie_id
}

In [None]:
# Load the title lookup table, indexed by movie_id, with columns
# year_of_release and movie_title.
# NOTE(review): fields are split on every comma; titles that themselves contain
# a comma may not parse into exactly three fields — TODO confirm against the file.
# NOTE(review): the file is decoded as ISO-8859-2 — confirm this is the dataset's encoding.
movie_title = pd.read_csv('../input/netflix-prize-data/movie_titles.csv', sep=",",
                         header=None, names=["movie_id", "year_of_release", "movie_title"], index_col="movie_id", encoding="iso8859_2")

In [None]:
movie_title.head(10)

### Similar movies to: Pressure

In [None]:
def describe_movie(movie):
    """Print the title, rating count and number of similar movies for one movie id."""
    # Look the title up by column label; integer indexing on a label-indexed
    # row is deprecated positional access in recent pandas.
    title = movie_title.loc[movie, "movie_title"]
    print("Name of the movie: " +str(title))
    print("Number of ratings by users for movie {} is {}".format(title,
                                                                SparseData[:,movie].getnnz()))
    print("Number of similar movies to {} is {}".format(title, movie_similarity[movie].count_nonzero()))


# `movieID` keeps the last id (8000) after the loop; later cells rely on it.
for movieID in (177, 8000):
    describe_movie(movieID)

In [None]:
# Similarity values of `movieID` against every movie, largest first; the
# leading entry is dropped.
all_similar = sorted(movie_similarity[movieID].toarray().ravel(), reverse=True)[1:]

# Keep exactly the 100 largest remaining values, matching the name and the
# "Top 100" plot label (a slice of [:101] would keep 101).
similar_100 = all_similar[:100]

In [None]:
# Sorted similarity values for the selected movie: full curve and its head.
sim_fig, sim_ax = plt.subplots(figsize = (10, 8))
sim_ax.plot(all_similar, label = "All Similar")
sim_ax.plot(similar_100, label = "Top 100 Similar Movies")
sim_ax.set_title("Similar Movies to Pressure", fontsize = 25)
sim_ax.set_ylabel("Cosine Similarity Values", fontsize = 20)
sim_ax.legend(fontsize = 20)

plt.show()

## Top 10 Similar Movies to: Pressure

In [None]:
movie_title.loc[simMovie_dict[movieID][:10]]

# Machine Learning Model

Take only a smaller dataset: the 2,000 most-rated movies and the 10,000 most active users. The new dataframe is shown below.


In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [None]:
# Restrict the data to the 10,000 users with the most ratings and the 2,000
# movies with the most ratings.
top_user = df.groupby('user_id')['rating'].count().sort_values(ascending=False)[:10000]
top_movie = df.groupby('movie_id')['rating'].count().sort_values(ascending=False)[:2000]

# A boolean filter keeps the original rows, order and index. Joining the two
# count Series with the same suffix would produce two columns both named
# `rating_r`. `.copy()` makes the column assignments below act on an
# independent frame rather than on a slice of `df`.
keep_rows = df['user_id'].isin(top_user.index) & df['movie_id'].isin(top_movie.index)
new_df = df.loc[keep_rows].copy()

# Dense 0..n-1 codes for users and movies.
user_enc = LabelEncoder()
new_df['user'] = user_enc.fit_transform(new_df['user_id'].values)

movie_enc = LabelEncoder()
new_df['movie'] = movie_enc.fit_transform(new_df['movie_id'].values)

movies = new_df['movie'].nunique()
users = new_df['user'].nunique()

new_df
                                        

In [None]:
def _load_or_create_pickle(pickle_path, row_slice):
    """Return the frame stored at `pickle_path` with a fresh 0..n-1 index.

    If the pickle does not exist yet it is first written from
    `new_df.iloc[row_slice]`. An existing pickle is reused as-is.
    """
    if not os.path.isfile(pickle_path):
        new_df.iloc[row_slice].to_pickle(pickle_path)
    return pd.read_pickle(pickle_path).reset_index(drop = True)


# Positional 80/20 split: the first 80% of rows go to train, the rest to test.
split_at = int(new_df.shape[0]*0.80)
Train_Data = _load_or_create_pickle("TrainData.pkl", slice(None, split_at))
Test_Data = _load_or_create_pickle("TestData.pkl", slice(split_at, None))

In [None]:
Train_Data.head()

In [None]:
def get_sample_sparse_matrix(sparseMatrix, n_users, n_movies, save_path=None):
    """Randomly sample users and movies and keep only the ratings between them.

    Parameters
    ----------
    sparseMatrix : scipy sparse matrix (rows are treated as users, columns as movies).
    n_users, n_movies : int
        How many distinct users / movies to draw without replacement. Each is
        capped at the number actually present in `sparseMatrix`.
    save_path : str, optional
        Where to save the sample (.npz). When omitted, the module-level
        variable `path` is used, which is how existing callers pass it.

    Returns
    -------
    scipy.sparse.csr_matrix of shape
    (largest sampled user index + 1, largest sampled movie index + 1).

    Raises
    ------
    ValueError if there is nothing to sample from.

    Notes
    -----
    Re-seeds NumPy's global random generator with 15 so the sample is repeatable.
    """
    startTime = datetime.now()
    target_path = path if save_path is None else save_path

    users, movies, ratings = sparse.find(sparseMatrix)
    uniq_users = np.unique(users)
    uniq_movies = np.unique(movies)

    # Sampling without replacement cannot draw more items than exist.
    n_users = min(n_users, len(uniq_users))
    n_movies = min(n_movies, len(uniq_movies))
    if n_users <= 0 or n_movies <= 0:
        raise ValueError("Cannot sample from a matrix without users or movies")

    np.random.seed(15)   # same sample on every call
    userS = np.random.choice(uniq_users, n_users, replace = False)
    movieS = np.random.choice(uniq_movies, n_movies, replace = False)
    mask = np.logical_and(np.isin(users, userS), np.isin(movies, movieS))
    sparse_sample = sparse.csr_matrix((ratings[mask], (users[mask], movies[mask])),
                                      shape = (int(userS.max())+1, int(movieS.max())+1))
    print("Sparse Matrix creation done. Saving it for later use.")
    sparse.save_npz(target_path, sparse_sample)
    print("Done")
    print("Shape of Sparse Sampled Matrix = "+str(sparse_sample.shape))

    # Time spent in this call (measured from this function's own start).
    print(datetime.now() - startTime)
    return sparse_sample

In [None]:
# Build the rating matrix for the training split (rows indexed by user_id,
# columns by movie_id), or reload it from disk if it was saved before.
startTime = datetime.now()
print("Creating USER_ITEM sparse matrix for train Data")
if not os.path.isfile("TrainUISparseData.npz"):
    print("We are creating sparse data")
    train_triplets = (Train_Data.rating, (Train_Data.user_id, Train_Data.movie_id))
    TrainUISparseData = sparse.csr_matrix(train_triplets)
    print("Creation done. Shape of sparse matrix = "+str(TrainUISparseData.shape))
    print("Saving it into disk for furthur usage.")
    sparse.save_npz("TrainUISparseData.npz", TrainUISparseData)
    print("Done\n")
else:
    print("Sparse Data is already present in your disk, no need to create further. Loading Sparse Matrix")
    TrainUISparseData = sparse.load_npz("TrainUISparseData.npz")
    print("Shape of Train Sparse matrix = "+str(TrainUISparseData.shape))

print(datetime.now() - startTime)

In [None]:
# Sampled training matrix (4000 users x 400 movies): load it if it was saved
# before, otherwise create it. `path` is also read by get_sample_sparse_matrix.
path = "TrainUISparseData_Sample.npz"
if os.path.isfile(path):
    print("File is already present in the disk. Loading the file...")
    train_sample_sparse = sparse.load_npz(path)
    print("File loading done.")
    print("Shape of Train Sample Sparse Matrix = "+str(train_sample_sparse.shape))
else:
    print("Sample sparse matrix is not present in the disk. We are creating it...")
    train_sample_sparse = get_sample_sparse_matrix(TrainUISparseData, 4000, 400)

In [None]:
# Build the rating matrix for the test split (rows indexed by user_id,
# columns by movie_id), or reload it from disk if it was saved before.
startTime = datetime.now()
print("Creating USER_ITEM sparse matrix for test Data")
if not os.path.isfile("TestUISparseData.npz"):
    print("We are creating sparse data")
    test_triplets = (Test_Data.rating, (Test_Data.user_id, Test_Data.movie_id))
    TestUISparseData = sparse.csr_matrix(test_triplets)
    print("Creation done. Shape of sparse matrix = "+str(TestUISparseData.shape))
    print("Saving it into disk for furthur usage.")
    sparse.save_npz("TestUISparseData.npz", TestUISparseData)
    print("Done\n")
else:
    print("Sparse Data is already present in your disk, no need to create further. Loading Sparse Matrix")
    TestUISparseData = sparse.load_npz("TestUISparseData.npz")
    print("Shape of Test Sparse matrix = "+str(TestUISparseData.shape))

print(datetime.now() - startTime)

In [None]:
# Sampled test matrix (4000 users x 400 movies): create it if it is not on
# disk yet, otherwise load it. `path` is also read by get_sample_sparse_matrix.
path = "TestUISparseData_Sample.npz"
if not os.path.isfile(path):
    print("Sample sparse matrix is not present in the disk. We are creating it...")
    test_sample_sparse = get_sample_sparse_matrix(TestUISparseData, 4000, 400)
else:
    print("File is already present in the disk. Loading the file...")
    test_sample_sparse = sparse.load_npz(path)
    print("File loading done.")
    # This is the test sample, so the message names it as such.
    print("Shape of Test Sample Sparse Matrix = "+str(test_sample_sparse.shape))

In [None]:
# Number of stored ratings in each sampled matrix.
train_sample_count = train_sample_sparse.count_nonzero()
test_sample_count = test_sample_sparse.count_nonzero()
print("No of ratings in Our Sampled train matrix is : " + str(train_sample_count))
print("No of ratings in Our Sampled test matrix is : " + str(test_sample_count))

In [None]:
# Ratings are on a 1-5 scale.
reader = Reader(rating_scale=(1,5))

# Fit on the training split only. Building the trainset from the whole of
# `new_df` would let the models see the rows held out in `Test_Data`.
data = Dataset.load_from_df(Train_Data[['user_id', 'movie_id', 'rating']], reader)

trainset = data.build_full_trainset()

In [None]:
# (user_id, movie_id, rating) triples of the held-out split. Using every row of
# `new_df` here would make the "test" scores a repeat of the training data.
testset = list(zip(Test_Data["user_id"].values, Test_Data["movie_id"].values, Test_Data["rating"].values))

In [None]:
testset[:10]

In [None]:
# Result containers: one summary row per model, plus the per-model
# train / test result dictionaries keyed by model name.
error_table = pd.DataFrame(columns = ["Model", "Train RMSE", "Train MAPE", "Test RMSE", "Test MAPE"])
model_train_evaluation, model_test_evaluation = {}, {}

In [None]:
def make_table(model_name, rsme_train, mape_train, rsme_test, mape_test):
    """Append one model's scores as a new row of the global `error_table`.

    The global frame is replaced by a new frame with a fresh 0..n-1 index.
    Uses `pd.concat`, because `DataFrame.append` no longer exists in
    pandas 2.x.
    """
    global error_table
    columns = ["Model", "Train RMSE", "Train MAPE", "Test RMSE", "Test MAPE"]
    new_row = pd.DataFrame([[model_name, rsme_train, mape_train, rsme_test, mape_test]],
                           columns = columns)
    if error_table.empty:
        # Nothing to concatenate with yet; the first row becomes the table.
        error_table = new_row
    else:
        error_table = pd.concat([error_table, new_row], ignore_index=True)

## Utility Functions for Surprise Models

In [None]:
def error_matrics(y_true, y_pred):
    """Return (RMSE, MAPE in percent) between true and predicted values.

    Parameters
    ----------
    y_true, y_pred : array-likes of equal length. `y_true` must not contain
        zeros, since MAPE divides by it.

    Returns
    -------
    (rmse, mape) : tuple of floats.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    residual = y_true - y_pred
    # The value is bound to the same name that is returned.
    rmse = np.sqrt(np.mean(residual ** 2))
    mape = np.mean(np.abs(residual / y_true)) * 100
    return rmse, mape

In [None]:
def get_ratings(predictions):
    """Split prediction objects into (actual, predicted) NumPy arrays.

    Each element of `predictions` must expose `r_ui` (the true rating) and
    `est` (the estimated rating).
    """
    observed = []
    for prediction in predictions:
        observed.append(prediction.r_ui)

    estimated = []
    for prediction in predictions:
        estimated.append(prediction.est)

    return np.array(observed), np.array(estimated)

In [None]:
def get_error(predictions):
    """Return (RMSE, MAPE in percent) for a list of prediction objects."""
    y_true, y_hat = get_ratings(predictions)
    mse = mean_squared_error(y_true, y_hat)
    relative_errors = abs((y_true - y_hat)/y_true)
    return np.sqrt(mse), np.mean(relative_errors)*100

In [None]:
# Seed both Python's and NumPy's global random generators for repeatable runs.
my_seed = 15
for seed_fn in (random.seed, np.random.seed):
    seed_fn(my_seed)

def run_surprise(algo, trainset, testset, model_name):
    """Fit `algo`, score it on the train and test sets, and log the results.

    Prints RMSE / MAPE for both sets and the elapsed time, and adds a row for
    `model_name` to the global error table through `make_table`.

    Returns
    -------
    (train, test) : two dicts with keys "RMSE", "MAPE" and "Prediction"
        (the array of estimated ratings).
    """
    startTime = datetime.now()
    rule = "-"*50

    def _score(samples):
        """Predict `samples`, print the two metrics and return the result dict."""
        predictions = algo.test(samples)
        _, estimated = get_ratings(predictions)
        rmse, mape = get_error(predictions)
        print("RMSE = {}".format(rmse))
        print("MAPE = {}".format(mape))
        print(rule)
        return {"RMSE": rmse, "MAPE": mape, "Prediction": estimated}

    algo.fit(trainset)

    print(rule)
    print("TRAIN DATA")
    train = _score(trainset.build_testset())

    print("TEST DATA")
    test = _score(testset)

    print("Time Taken = "+str(datetime.now() - startTime))

    make_table(model_name, train["RMSE"], train["MAPE"], test["RMSE"], test["MAPE"])

    return train, test

In [None]:
# Feature matrix (user_id, movie_id pairs) and target vector (ratings) as NumPy arrays.
feature_columns = ['user_id', 'movie_id']
X = new_df[feature_columns].values
y = new_df['rating'].values

In [None]:
# Random 80/20 split of X / y.
# NOTE(review): x_train, x_test, y_train and y_test are not referenced again in
# the visible part of this notebook — confirm whether this split is still needed.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
# --- Baseline-only model ---
# Option reference: "Baselines estimates configuration" in
# https://surprise.readthedocs.io/en/stable/prediction_algorithms.html#baseline-estimates-configuration
bsl_options = dict(method="sgd", learning_rate=0.01, n_epochs=25)
algo = BaselineOnly(bsl_options=bsl_options)

train_result, test_result = run_surprise(algo, trainset, testset, "BaselineOnly")
model_train_evaluation["BaselineOnly"] = train_result
model_test_evaluation["BaselineOnly"] = test_result

# --- User-based KNN with baseline ---
sim_options = dict(name='pearson_baseline', user_based=True, min_support=2, shrinkage=60)
bsl_options = dict(method='sgd')
algo = KNNBaseline(k = 10, sim_options = sim_options, bsl_options=bsl_options)

train_result, test_result = run_surprise(algo, trainset, testset, "KNNBaseline_User")
model_train_evaluation["KNNBaseline_User"] = train_result
model_test_evaluation["KNNBaseline_User"] = test_result

In [None]:
# --- SVD with 5 latent factors and bias terms ---
svd_params = dict(n_factors = 5, biased=True, verbose=True)
algo = SVD(**svd_params)

train_result, test_result = run_surprise(algo, trainset, testset, "SVD")
model_train_evaluation["SVD"] = train_result
model_test_evaluation["SVD"] = test_result

In [None]:
# --- SVD++ with 10 latent factors ---
svdpp_params = dict(n_factors = 10, lr_all = 0.006, verbose=True)
algo = SVDpp(**svdpp_params)

train_result, test_result = run_surprise(algo, trainset, testset, "SVDpp")
model_train_evaluation["SVDpp"] = train_result
model_test_evaluation["SVDpp"] = test_result

In [None]:
# --- Item-based KNN with baseline ---
# `user_based` is False so that similarities are computed between movies,
# which is what the "KNNBaseline_Item" label describes. The model is fitted
# once; fitting it a second time would only repeat the work and add a second
# identical row to the results table.
sim_options = {'name':'pearson_baseline', 'user_based':False, 'min_support':2, 'shrinkage':10}

bsl_options = {'method': 'sgd'}

algo = KNNBaseline(k = 1, sim_options = sim_options, bsl_options=bsl_options)

train_result, test_result = run_surprise(algo, trainset, testset, "KNNBaseline_Item")

model_train_evaluation["KNNBaseline_Item"] = train_result
model_test_evaluation["KNNBaseline_Item"] = test_result