# Notebook demonstrating the Implemented Bellkor Algorithm

Date: 07/01/2018

In [1]:
import datetime
import logging.config
import os
import time

import numpy as np
import pandas as pd

In [2]:
# Console logging for the demo: INFO and above, timestamps to the second
log_settings = {
    "level": logging.INFO,
    "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    "datefmt": "%Y-%m-%d %H:%M:%S",
}
logging.basicConfig(**log_settings)
logger = logging.getLogger("BellKorDemo")

In [3]:
# Location of the MovieLens 20M (ml-20m) dataset.
#    For this demo the .csv files were downloaded and placed next to this
#    Notebook, so the current working directory is used as the data folder.
data_location = os.getcwd()

In [4]:
# Get all the .csv files in 'data_location'.
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# make the listing identical from run to run and machine to machine.
datasets = sorted(f for f in os.listdir(data_location) if f.endswith(".csv"))
print("Files in the data location: ", datasets)

Files in the data location:  ['tags.csv', 'links.csv', 'ratings.csv', 'movies.csv']


#### Load each of the Datasets into Scope

In [5]:
# Build the path with os.path.join rather than hand-written '/' concatenation
tags = pd.read_csv(os.path.join(data_location, "tags.csv"))

In [6]:
tags.head(n=3)

Unnamed: 0,userId,movieId,tag,timestamp
0,18,4141,Mark Waters,1240597180
1,65,208,dark hero,1368150078
2,65,353,dark hero,1368150079


In [7]:
# Build the path with os.path.join rather than hand-written '/' concatenation
links = pd.read_csv(os.path.join(data_location, "links.csv"))

In [8]:
links.head(n=3)

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0


In [9]:
# Build the path with os.path.join rather than hand-written '/' concatenation
ratings = pd.read_csv(os.path.join(data_location, "ratings.csv"))

In [10]:
ratings.head(n=3)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819


In [11]:
# Build the path with os.path.join rather than hand-written '/' concatenation
movies = pd.read_csv(os.path.join(data_location, "movies.csv"))

In [12]:
movies.head(n=3)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance


## Prepare the data for the Bellkor Algorithm

For this demo I'm not splitting the data into the usual Train / Validation / Test sets you'd expect in an ML workflow, because the aim is to demonstrate the algorithm itself rather than to evaluate a model. The preparation steps are:

    1.  Find the Global Mean
    2.  Number of Users
    3.  Number of Movies
    4.  Start Time, End Time
    5.  Get the Average time mapping per User


In [13]:
ratings.columns

Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')

##### 1. Finding the Global Mean

In [14]:
# Mean over every rating in the dataset; passed to the model later as global_mean
global_mean = ratings["rating"].mean()
# Message fixed: the unit is "stars", not "starts"
print(f"Global Mean Rating: {global_mean} stars")

Global Mean Rating: 3.5255285642993797 starts


##### 2. Number of Users

In [15]:
# Raw user ids present in the ratings table, each listed once
distinct_users = ratings.loc[:, "userId"].unique()
user_count = len(distinct_users)
print(f"Number of Users found: {user_count}")

Number of Users found: 138493


In [16]:
# Lookup table from raw userId to a dense position starting at 0
user_positions = np.arange(user_count)
user_mapping = dict(zip(distinct_users, user_positions, strict=False))

In [17]:
# Vectorised dictionary lookup: Series.map resolves each raw userId through
# user_mapping without invoking a Python lambda once per row (the ratings
# table has ~20M rows, so the per-row .apply was needlessly slow).
ratings.loc[:, "UserId"] = ratings.loc[:, "userId"].map(user_mapping)

##### 3.  Number of Movies

In [18]:
# Raw movie ids present in the ratings table, each listed once
distinct_movies = ratings.loc[:, "movieId"].unique()
movie_count = len(distinct_movies)
print(f"Number of Movies found: {movie_count}")

Number of Movies found: 26744


In [19]:
# Lookup table from raw movieId to a dense position starting at 0
movie_positions = np.arange(movie_count)
movie_mapping = dict(zip(distinct_movies, movie_positions, strict=False))

In [20]:
# Vectorised dictionary lookup: Series.map resolves each raw movieId through
# movie_mapping without invoking a Python lambda once per row (the ratings
# table has ~20M rows, so the per-row .apply was needlessly slow).
ratings.loc[:, "MovieId"] = ratings.loc[:, "movieId"].map(movie_mapping)

##### 4.  Start Time, End Time

In [21]:
# Earliest and latest rating timestamps in the dataset
rating_timestamps = ratings["timestamp"]
start_time, end_time = rating_timestamps.min(), rating_timestamps.max()
print(f"Start Timestamp {start_time}, End Timestamp {end_time}")

Start Timestamp 789652004, End Timestamp 1427784002


In [22]:
# Human-readable form of the first rating time. fromtimestamp() without a tz
# argument renders it in this machine's local timezone.
print(datetime.datetime.fromtimestamp(start_time))

1995-01-09 11:46:44


In [23]:
# Human-readable form of the last rating time. fromtimestamp() without a tz
# argument renders it in this machine's local timezone.
print(datetime.datetime.fromtimestamp(end_time))

2015-03-31 07:40:02


In [24]:
# Floor the earliest rating time to midnight UTC of the same day.
# Plain integer arithmetic on the epoch seconds keeps the value independent of
# the machine's local timezone and DST rules, which the previous
# datetime.fromtimestamp()/time.mktime() round trip depended on.
adjusted_start_day = int(start_time) - int(start_time) % 86400

In [25]:
# Exclusive upper bound: midnight UTC at the end of the day that contains the
# last rating. Computed from the epoch seconds directly so the value does not
# depend on the machine's local timezone; a local-time midnight plus 86400 s is
# also not a true day boundary on days with a DST change.
adjusted_end_day = int(end_time) - int(end_time) % 86400 + 86400

##### 5.  Get the Average time mapping per User

In [26]:
# Mean rating timestamp per re-mapped user, returned as a flat two-column frame
average_df = ratings.groupby("UserId", as_index=False)["timestamp"].mean()

In [27]:
average_df.head(n=3)

Unnamed: 0,UserId,timestamp
0,0,1107833000.0
1,1,974820800.0
2,2,945006300.0


In [28]:
# Dictionary form of the table above: UserId -> mean rating timestamp
average_times = average_df.set_index("UserId")["timestamp"].to_dict()

# View Dataset

Re-mapped UserId and MovieId; Date as a date without the time component, expressed as a timestamp

In [29]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000263 entries, 0 to 20000262
Data columns (total 6 columns):
userId       int64
movieId      int64
rating       float64
timestamp    int64
UserId       int64
MovieId      int64
dtypes: float64(1), int64(5)
memory usage: 915.5 MB


In [30]:
ratings.describe()

Unnamed: 0,userId,movieId,rating,timestamp,UserId,MovieId
count,20000260.0,20000260.0,20000260.0,20000260.0,20000260.0,20000260.0
mean,69045.87,9041.567,3.525529,1100918000.0,69044.87,1777.707
std,40038.63,19789.48,1.051989,162169400.0,40038.63,2286.671
min,1.0,1.0,0.5,789652000.0,0.0,0.0
25%,34395.0,902.0,3.0,966797700.0,34394.0,376.0
50%,69141.0,2167.0,3.5,1103556000.0,69140.0,1063.0
75%,103637.0,4770.0,4.0,1225642000.0,103636.0,2225.0
max,138493.0,131262.0,5.0,1427784000.0,138492.0,26743.0


In [31]:
# Per column: True when the column holds no null / NaN values
ratings.notna().all()

userId       True
movieId      True
rating       True
timestamp    True
UserId       True
MovieId      True
dtype: bool

In [32]:
ratings.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp,UserId,MovieId
0,1,2,3.5,1112486027,0,0
1,1,29,3.5,1112484676,0,1
2,1,32,3.5,1112484819,0,2
3,1,47,3.5,1112484727,0,3
4,1,50,3.5,1112484580,0,4


In [33]:
ratings.tail(n=5)

Unnamed: 0,userId,movieId,rating,timestamp,UserId,MovieId
20000258,138493,68954,4.5,1258126920,138492,1814
20000259,138493,69526,4.5,1259865108,138492,1037
20000260,138493,69644,3.0,1260209457,138492,3950
20000261,138493,70286,5.0,1258126944,138492,1818
20000262,138493,71619,2.5,1255811136,138492,4010


# Bellkor Algorithm

    1. Initialisation
    2. Prepare dataset (done)
    3. Train Model
    4. Basic stats on the Parameters
    5. Prediction
    6. Save results

##### 1. Initialisation

In [34]:
from Bellkor.Algorithm import BellkorAlgorithm

In [35]:
print(BellkorAlgorithm.__doc__)

 Implementation of the Original Bellkor Algorithm as presented here:

            Link: https://netflixprize.com/assets/GrandPrize2009_BPC_BellKor.pdf

    


In [36]:
print(BellkorAlgorithm.__init__.__doc__)

None


In [37]:
# Time window handed to the model, built from the day boundaries computed earlier
time_window = {"Start": adjusted_start_day, "End": adjusted_end_day}
calibrator = BellkorAlgorithm(
    n_users=user_count,
    n_items=movie_count,
    global_mean=global_mean,
    time_setting=time_window,
)

2018-01-07 20:03:26 - Bellkor.Utils.Decorators.timeit - INFO - Method: __init__ - 3.133376359939575 seconds


##### 2. Prepare dataset

expected columns: [Index, TimePeriod, User, Item, BaseRating]

In [38]:
# Expose the frame's row labels as an ordinary "Index" column
ratings["Index"] = ratings.index

In [39]:
ratings.head(n=3)

Unnamed: 0,userId,movieId,rating,timestamp,UserId,MovieId,Index
0,1,2,3.5,1112486027,0,0,0
1,1,29,3.5,1112484676,0,1,1
2,1,32,3.5,1112484819,0,2,2


In [40]:
# Feature matrix in the column order the model expects:
# [Index, TimePeriod, User, Item, BaseRating].
# DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() is its
# replacement. Because the selection mixes integer and float columns, the
# resulting array is float.
X = ratings.loc[:, ["Index", "timestamp", "UserId", "MovieId", "rating"]].to_numpy()

In [41]:
# Peek at the first three rows of the feature matrix
print(X[:3])

[[0.00000000e+00 1.11248603e+09 0.00000000e+00 0.00000000e+00
  3.50000000e+00]
 [1.00000000e+00 1.11248468e+09 0.00000000e+00 1.00000000e+00
  3.50000000e+00]
 [2.00000000e+00 1.11248482e+09 0.00000000e+00 2.00000000e+00
  3.50000000e+00]]


##### 3. Train the Model

In [42]:
# The small sample_size and iterations keep the demo fast; increase both when
# running this properly.
cost, error = calibrator.train(x=X, average_times=average_times, sample_size=10, iterations=10)

2018-01-07 20:03:29 - BellKor.BellkorAlgorithm.train - INFO - Running Epoch: 1
2018-01-07 20:03:29 - BellKor.BellkorAlgorithm.train - INFO - Epoch: 0 took: 0.0035970211029052734s
2018-01-07 20:03:29 - BellKor.BellkorAlgorithm.train - INFO - Running Epoch: 2
2018-01-07 20:03:29 - BellKor.BellkorAlgorithm.train - INFO - Epoch: 1 took: 0.0024919509887695312s
2018-01-07 20:03:29 - BellKor.BellkorAlgorithm.train - INFO - Running Epoch: 3
2018-01-07 20:03:29 - BellKor.BellkorAlgorithm.train - INFO - Epoch: 2 took: 0.0022182464599609375s
2018-01-07 20:03:29 - BellKor.BellkorAlgorithm.train - INFO - Running Epoch: 4
2018-01-07 20:03:29 - BellKor.BellkorAlgorithm.train - INFO - Epoch: 3 took: 0.0029098987579345703s
2018-01-07 20:03:29 - BellKor.BellkorAlgorithm.train - INFO - Running Epoch: 5
2018-01-07 20:03:29 - BellKor.BellkorAlgorithm.train - INFO - Epoch: 4 took: 0.0027086734771728516s
2018-01-07 20:03:29 - BellKor.BellkorAlgorithm.train - INFO - Running Epoch: 6
2018-01-07 20:03:29 - Bell

# Review the Parameters

###### 4. Basic stats on the Parameters

In [43]:
# TODO

###### 5. Prediction

In [44]:
# Score only the first 1000 rows of the feature matrix
preds = calibrator.predict(x=X[:1000], average_times=average_times)

2018-01-07 20:03:29 - BellKor.BellkorAlgorithm.predict - INFO - Prediction Time took: 0.06701946258544922s
2018-01-07 20:03:29 - Bellkor.Utils.Decorators.timeit - INFO - Method: predict - 0.06832075119018555 seconds


In [45]:
# Wrap the raw prediction array in a labelled frame and preview it
prediction_columns = ["Index", "Prediction"]
predictions = pd.DataFrame(preds, columns=prediction_columns)
predictions.head(3)

Unnamed: 0,Index,Prediction
0,0.0,3.676115
1,1.0,3.675946
2,2.0,3.676028


###### 6. Save results

In [46]:
# Persist the trained parameters to a pickle file named after file_name
calibrator.pickle_parameters(file_name="DEMO_OUTPUT")

2018-01-07 20:03:29 - BellKor.BellkorAlgorithm.pickle_parameters - INFO - Save the Parameters @ /home/dan/PycharmProjects/BellkorAlgorithm/resources/MODEL_PARAMS/DEMO_OUTPUT.pickle
2018-01-07 20:03:29 - Bellkor.Utils.Decorators.timeit - INFO - Method: pickle_parameters - 0.0007581710815429688 seconds


'/home/dan/PycharmProjects/BellkorAlgorithm/resources/MODEL_PARAMS/DEMO_OUTPUT.pickle'

# Concluding Remarks

In [47]:
# TODO