In [1]:
%load_ext autoreload
%autoreload 2

In [5]:
import sys
sys.path.append("..")

In [7]:
import surprise as sur
from surprise import SVD
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split
from surprise.model_selection import cross_validate
import pandas as pd
import numpy as np
from collections import defaultdict
import item_item as ii  # in ../

In [8]:
import date_library as dlib  # in ../

In [9]:
def dates_year(df, year):
    """Return the rows of ``df`` whose ``'date'`` lies in calendar year ``year``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'date'`` column of ``'YYYY-MM-DD'`` strings; rows are
        selected by lexicographic comparison against the year's first and last day.
    year : str or int
        Four-digit year, e.g. ``'2016'`` or ``2016``.

    Returns
    -------
    pandas.DataFrame
        The subset of ``df`` with ``'<year>-01-01' <= date <= '<year>-12-31'``.
    """
    # Accept integer years as well as strings (string concatenation used to fail on ints).
    year = str(year)
    first = f"{year}-01-01"
    last = f"{year}-12-31"
    in_year = (df['date'] >= first) & (df['date'] <= last)
    return df[in_year]

## Convert items and users to integers ranging from [0,n] and [0,m]

In [10]:
# Load the raw member/destination records (presumably one row per flight -- TODO confirm).
df = pd.read_csv("member_d.csv")
# Columns are renamed positionally; the CSV's own header names are not visible here.
df.columns = ['userID', 'itemID', 'flight_date', 'family_size']
max_rating = 5
#df['rating'] = df['rating'].clip(lower=0., upper=max_rating)
# Implicit feedback: every recorded (user, item) row gets the constant rating 1.
df['rating'] = 1
reader = Reader(rating_scale=[1, max_rating])   # Scale is declared as 1..max_rating although all ratings are 1
# Only the three columns Surprise expects (user, item, rating) are passed on.
data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader)

FileNotFoundError: [Errno 2] No such file or directory: 'member_d.csv'

In [9]:
# Raw flight timestamps, converted below to calendar-date strings.
dates = df.flight_date.values

# The date routines round-trip correctly in both directions.
# Open question: why is the latest date Dec. 2022? That is unexpected.
print("dates.min: ", dlib.timestampToDateTimePTY(dates.min()))
print("dates.max: ", dlib.timestampToDateTimePTY(dates.max()))

# timestampToDateTimePTY returns a (date, time) pair; keep the date part only.
new_dates = [dlib.timestampToDateTimePTY(timestamp)[0] for timestamp in dates]

# Attach the date strings and order the records chronologically.
df['date'] = new_dates
df = df.sort_values('flight_date')
df;

dates.min:  ('2015-02-04', '05:00')
dates.max:  ('2022-12-30', '05:00')


In [10]:
# One sub-frame per calendar year, keyed by the year as a string.
years = [str(y) for y in range(2015, 2023)]
df_years = {year: dates_year(df, year) for year in years}
for year in years:
    print(f"year: {year}, nb records: {df_years[year].shape}")

year: 2015, nb records: (31703, 6)
year: 2016, nb records: (119873, 6)
year: 2017, nb records: (145362, 6)
year: 2018, nb records: (175231, 6)
year: 2019, nb records: (204299, 6)
year: 2020, nb records: (61658, 6)
year: 2021, nb records: (88174, 6)
year: 2022, nb records: (23191, 6)


In [11]:
# For each year keep one row per distinct (userID, itemID) pair.
dff = {}
for year in years:
    pair_counts = df_years[year].groupby(['userID', 'itemID']).size()
    # The trip count is overwritten with 1: the number of trips to a
    # destination is not taken into account (YET).
    dff[year] = pair_counts.reset_index(name='rating').assign(rating=1)

In [44]:
# Write each year's (userID, itemID, rating) table to members_<year>.csv, without the index.
for year in years:
    yearly_pairs = dff[year]
    print("members_" + year)
    print(f"{year}: {len(yearly_pairs)} rows")
    yearly_pairs.to_csv(f"members_{year}.csv", index=False)

members_2015
2015: 21605 rows
members_2016
2016: 62991 rows
members_2017
2017: 74658 rows
members_2018
2018: 88419 rows
members_2019
2019: 100967 rows
members_2020
2020: 45284 rows
members_2021
2021: 53372 rows
members_2022
2022: 19854 rows


In [48]:
# All ratings equal 1, so the declared scale collapses to the single value 1.
reader = Reader(rating_scale=[1, 1])

# One Surprise Dataset per year, built from that year's distinct (user, item) pairs.
data_training = {}
for year in years:
    ratings_frame = dff[year][['userID', 'itemID', 'rating']]
    dataset = Dataset.load_from_df(ratings_frame, reader)
    data_training[year] = dataset
    print(f"Raw ratings: {len(dataset.df)}")

Raw ratings: 21605
Raw ratings: 62991
Raw ratings: 74658
Raw ratings: 88419
Raw ratings: 100967
Raw ratings: 45284
Raw ratings: 53372
Raw ratings: 19854


In [51]:
# d[year]: full (no hold-out) trainset for that year.
# Prints the number of users and items that have at least one rating.
d = {}
for year in years:
    full_trainset = data_training[year].build_full_trainset()
    d[year] = full_trainset
    print(f"{year}: {len(full_trainset.ur)}, {len(full_trainset.ir)}")

2015: 11313, 74
2016: 23562, 76
2017: 28011, 75
2018: 32883, 80
2019: 37718, 81
2020: 30444, 80
2021: 31138, 74
2022: 15621, 75


In [14]:
# Do all members have a destination? I do not think so.
# Report any user whose group is empty, then print the number of users in 2017.
g = dff['2017'].groupby('userID')
user_groups = g.groups
for _user, row_labels in user_groups.items():
    if len(row_labels) == 0:
        print("no items")
print("nb users: ", len(user_groups))

nb users:  28011


In [15]:
# Number of ratings per yearly trainset;
# matches the total number of destinations computed below.
for year in years:
    nb_ratings = sum(1 for _ in d[year].all_ratings())
    print(f"Year: {year}, {nb_ratings}")

Year: 2015, 21605
Year: 2016, 62991
Year: 2017, 74658
Year: 2018, 88419
Year: 2019, 100967
Year: 2020, 45284
Year: 2021, 53372
Year: 2022, 19854


In [16]:
# Count the number of items with no destinations. Should be zero.
# The check is switched off by the constant `if False:` guard; change it to
# True to run ii.count_zero_items on every yearly trainset.
if False:
    for year in years:
        print("==> ", year)
        ii.count_zero_items(d[year], verbose=False)

In [17]:
# Size to store user-item data in full arrays:
# number of cells (n_users * n_items) of a dense matrix, summed over all years.
# The result is only accumulated in `total`; both print statements are commented out.
total = 0
for year in years:
    sz = d[year].n_users * d[year].n_items
    total += sz
    # print("year: ",  year, ", ", d[year].n_users * d[year].n_items)
# print("total: ", total)

### Construct user-item matrix, with each row normalized to 1

In [18]:
%%time
# Timing run only: the return values of user_item_row_norms are discarded here
# (they are captured in r_norms / r_norms_inv_sq in a later cell).
for year in years:
    ii.user_item_row_norms(d[year])

CPU times: user 240 ms, sys: 3.82 ms, total: 244 ms
Wall time: 243 ms


In [19]:
%%time
# Timing run only: the return values of user_item_col_norms are discarded here
# (they are captured in c_norms / c_norms_inv in a later cell).
for year in years:
    ii.user_item_col_norms(d[year])

CPU times: user 608 µs, sys: 0 ns, total: 608 µs
Wall time: 609 µs


# Construct user-item matrix as a numpy array

In [20]:
%%time
# Per-year results of the two ii norm helpers, keyed by year string.
# Each helper returns a pair; judging by the names, the second element is an
# inverse (squared for rows) of the first -- TODO confirm against item_item.py.
r_norms = {}; r_norms_inv_sq = {}
c_norms = {}; c_norms_inv = {}
for year in years:
    r_norms[year], r_norms_inv_sq[year] = ii.user_item_row_norms(d[year])
    c_norms[year], c_norms_inv[year] = ii.user_item_col_norms(d[year])

CPU times: user 245 ms, sys: 0 ns, total: 245 ms
Wall time: 245 ms


In [21]:
%%time
# Per-year containers, keyed by year string:
simil_matrix = {}                  # ii.sim_matrix of the plain user-item matrix
user_item = {}                     # user-item matrix returned by ii.user_item_matrix
user_item_row_normalized = {}      # user-item matrix after ii.user_item_matrix_row_normalized
simil_matrix_row_normalized = {}   # ii.sim_matrix of the row-normalized matrix

for year in years:
    print(year)
    user_item[year] = ii.user_item_matrix(d[year])
    simil_matrix[year] = ii.sim_matrix(user_item[year])

    # Same similarity computation, applied after row normalization with r_norms[year].
    user_item_row_normalized[year] = ii.user_item_matrix_row_normalized(user_item[year], r_norms[year])
    simil_matrix_row_normalized[year] = ii.sim_matrix(user_item_row_normalized[year])

2015
2016
2017
2018
2019
2020
2021
2022
CPU times: user 12.6 s, sys: 199 ms, total: 12.8 s
Wall time: 898 ms


## Select recommendations
The plan is to select users at random and provide N top recommendations.  This will require translating items recommended back to destinations.  I will have to figure out how to evaluate the quality of the recommendations.  I will probably use the Hit Rate (HR) metric since it works and is simple to implement. 

## Destinations traveled the following year
* Store destinations travelled by members in 2017
* Compare these destinations with the predictions made on the basis of 2016

* Future plans: Add attributes to the trips: 
    * country of origin
    * gender
    * family size
    * cost of trip
    * month flown
    * trip duration
    * type of trip (can probably be determined from family size and frequency)
    
* Note: should we take into account travel of members going to many places per month? Is it likely that the recommender will have an effect? 

In [22]:
# data from 2017 (one year past the training data): stored in dtest (created by Surprise)
d['2016'].all_users(), d['2017'].all_users()

(range(0, 23562), range(0, 28011))

In [23]:
# members[year]: result of ii.get_raw_users for that year. It is used as a set
# in a later cell (``.intersection``), so it is presumably the set of raw user ids.
members = {}
for year in years:
    members[year] = ii.get_raw_users(d, year)
    # print(f"Year: {year}, nb members: {len(members[year])}")

In [24]:
# Intersect the two sets to find the members who flew in both 2016 and 2017,
# i.e. the 2017 members for whom predictions based on 2016 exist.
users_common = members['2016'].intersection(members['2017'])
len(users_common)

20535

In [25]:
ii.predictions(members, d,  simil_matrix, train_year='2017', test_year='2018', verbose=False)

Percentage with a correct prediction 2018 based on 2017: 7174/24653: 0.2909990670506632


In [26]:
years4 = years[0:5]
years4

['2015', '2016', '2017', '2018', '2019']

In [27]:
# Evaluate every (train_year, later test_year) pair with the plain similarity matrix.
for i1, train_year in enumerate(years4):
    for test_year in years4[i1 + 1:]:
        ii.predictions(members, d, simil_matrix, train_year=train_year, test_year=test_year, verbose=False)

Percentage with a correct prediction 2016 based on 2015: 3503/10152: 0.34505516154452326
Percentage with a correct prediction 2017 based on 2015: 3451/9938: 0.34725296840410547
Percentage with a correct prediction 2018 based on 2015: 3527/9971: 0.35372580483401866
Percentage with a correct prediction 2019 based on 2015: 3488/9938: 0.3509760515194204
Percentage with a correct prediction 2017 based on 2016: 6117/20535: 0.297881665449233
Percentage with a correct prediction 2018 based on 2016: 6722/20705: 0.32465588022216857
Percentage with a correct prediction 2019 based on 2016: 6913/20674: 0.334381348553739
Percentage with a correct prediction 2018 based on 2017: 7174/24653: 0.2909990670506632
Percentage with a correct prediction 2019 based on 2017: 7555/24640: 0.3066152597402597
Percentage with a correct prediction 2019 based on 2018: 8299/29044: 0.28573887894229444


In [30]:
# Predict the past from the future: train on the later year, test on the earlier one.
# Will the results be similar?
for i1, earlier_year in enumerate(years4):
    for later_year in years4[i1 + 1:]:
        ii.predictions(members, d, simil_matrix, train_year=later_year, test_year=earlier_year, verbose=False)

Percentage with a correct prediction 2015 based on 2016: 1967/10152: 0.19375492513790385
Percentage with a correct prediction 2015 based on 2017: 1982/9938: 0.19943650633930368
Percentage with a correct prediction 2015 based on 2018: 2040/9971: 0.2045933206298265
Percentage with a correct prediction 2015 based on 2019: 2098/9938: 0.21110887502515596
Percentage with a correct prediction 2016 based on 2017: 5587/20535: 0.27207207207207207
Percentage with a correct prediction 2016 based on 2018: 5916/20705: 0.2857280850036223
Percentage with a correct prediction 2016 based on 2019: 6021/20674: 0.291235368095192
Percentage with a correct prediction 2017 based on 2018: 6577/24653: 0.266782947308644
Percentage with a correct prediction 2017 based on 2019: 6950/24640: 0.2820616883116883
Percentage with a correct prediction 2018 based on 2019: 7693/29044: 0.26487398429968323


In [31]:
# Row normalization worsened the first four results but slightly improved the rest.
# It helped when train and test years are one year apart, and made things worse
# when they are more than one year apart.
for i1, train_year in enumerate(years4):
    for test_year in years4[i1 + 1:]:
        ii.predictions(members, d, simil_matrix_row_normalized, train_year=train_year, test_year=test_year, verbose=False)

Percentage with a correct prediction 2016 based on 2015: 3507/10152: 0.34544917257683216
Percentage with a correct prediction 2017 based on 2015: 3431/9938: 0.34524049104447574
Percentage with a correct prediction 2018 based on 2015: 3516/9971: 0.35262260555611274
Percentage with a correct prediction 2019 based on 2015: 3484/9938: 0.35057355604749446
Percentage with a correct prediction 2017 based on 2016: 6208/20535: 0.3023131239347456
Percentage with a correct prediction 2018 based on 2016: 6850/20705: 0.330837961844965
Percentage with a correct prediction 2019 based on 2016: 7017/20674: 0.33941182161168615
Percentage with a correct prediction 2018 based on 2017: 7258/24653: 0.29440636028069606
Percentage with a correct prediction 2019 based on 2017: 7696/24640: 0.31233766233766236
Percentage with a correct prediction 2019 based on 2018: 8355/29044: 0.2876669880181793


## Normalize rows of Model matrix before computing similarities

## Use asymmetric similarities based on probability modeling

---

In [32]:
def calc(data):
    """Collect, for every item index, the set of users that rated it.

    ``data`` must expose ``n_items`` and a mapping ``ir`` from item index to an
    iterable of ``(user, rating)`` pairs (presumably a Surprise trainset).

    Returns a ``defaultdict(set)`` mapping item index -> set of users. Items
    without any rating get no key.
    """
    item_users = defaultdict(set)
    for item_id in range(data.n_items):
        for uid, _rating in data.ir[item_id]:
            item_users[item_id].add(uid)
    return item_users

In [33]:
def prob(data, dct):
    """Build two co-occurrence similarity matrices from per-item user sets.

    Parameters
    ----------
    data : object with an ``n_items`` attribute
        Gives the number of items, i.e. the size of the returned matrices.
    dct : mapping of item index -> set of users
        As returned by ``calc``. Items missing from ``dct`` are treated as
        having no users. ``dct`` is not modified.

    Returns
    -------
    (sim1, sim2) : tuple of ``(n_items, n_items)`` numpy arrays
        With ``n_ij`` the number of users shared by items ``i`` and ``j`` and
        ``n_i`` the number of users of item ``i``:
        ``sim1[i, j] = n_ij / n_j`` (asymmetric) and
        ``sim2[i, j] = n_ij / (n_i * n_j)`` (symmetric).
        Entries stay 0 where the two items share no user.
    """
    n_items = data.n_items
    sim1 = np.zeros([n_items, n_items])
    sim2 = np.zeros([n_items, n_items])

    # Read with .get() so that a defaultdict passed by the caller does not
    # grow an empty entry for every item without users.
    no_users = frozenset()
    user_sets = [dct.get(item, no_users) for item in range(n_items)]
    sizes = [len(users) for users in user_sets]  # hoisted out of the double loop

    for item1 in range(n_items):
        for item2 in range(item1, n_items):
            # The intersection is symmetric: compute it once per unordered pair.
            nbij = len(user_sets[item1].intersection(user_sets[item2]))
            if nbij > 0:
                sim1[item1, item2] = nbij / sizes[item2]
                sim1[item2, item1] = nbij / sizes[item1]
                symmetric = nbij / (sizes[item1] * sizes[item2])
                sim2[item1, item2] = symmetric
                sim2[item2, item1] = symmetric
    return sim1, sim2

In [34]:
# Per-year similarity matrices from prob():
#   sim2a[year] <- first matrix returned, sim2b[year] <- second matrix returned.
sim2a, sim2b = {}, {}
for year in years:
    dct = calc(d[year])
    first_matrix, second_matrix = prob(d[year], dct)
    sim2a[year] = first_matrix
    sim2b[year] = second_matrix

In [35]:
# The columns of sim2a['2017'] / sim2b['2017'] are constant. 
# AS A RESULT, the prediction algorithm produces the same results
# for both matrices. 

In [36]:
# The results are slightly better than the symmetric version!
# Same year-pair sweep as before, using the sim2a matrices.
for i1, train_year in enumerate(years4):
    for test_year in years4[i1 + 1:]:
        ii.predictions(members, d, sim2a, train_year=train_year, test_year=test_year, verbose=False)

Percentage with a correct prediction 2016 based on 2015: 3652/10152: 0.35973207249802996
Percentage with a correct prediction 2017 based on 2015: 3652/9938: 0.367478365868384
Percentage with a correct prediction 2018 based on 2015: 3780/9971: 0.379099388225855
Percentage with a correct prediction 2019 based on 2015: 3821/9938: 0.384483799557255
Percentage with a correct prediction 2017 based on 2016: 6108/20535: 0.29744338933528125
Percentage with a correct prediction 2018 based on 2016: 6774/20705: 0.3271673508814296
Percentage with a correct prediction 2019 based on 2016: 7102/20674: 0.343523265937893
Percentage with a correct prediction 2018 based on 2017: 7145/24653: 0.289822739626009
Percentage with a correct prediction 2019 based on 2017: 7657/24640: 0.31075487012987013
Percentage with a correct prediction 2019 based on 2018: 8290/29044: 0.2854290042693844


In [37]:
# Very poor results.
# Same year-pair sweep as before, using the sim2b matrices.
for i1, train_year in enumerate(years4):
    for test_year in years4[i1 + 1:]:
        ii.predictions(members, d, sim2b, train_year=train_year, test_year=test_year, verbose=False)

Percentage with a correct prediction 2016 based on 2015: 888/10152: 0.08747044917257683
Percentage with a correct prediction 2017 based on 2015: 854/9938: 0.08593278325618837
Percentage with a correct prediction 2018 based on 2015: 812/9971: 0.08143616487814663
Percentage with a correct prediction 2019 based on 2015: 785/9938: 0.07898973636546588
Percentage with a correct prediction 2017 based on 2016: 1713/20535: 0.08341855368882396
Percentage with a correct prediction 2018 based on 2016: 1786/20705: 0.08625935764308137
Percentage with a correct prediction 2019 based on 2016: 1840/20674: 0.0890006771790655
Percentage with a correct prediction 2018 based on 2017: 2354/24653: 0.09548533647020646
Percentage with a correct prediction 2019 based on 2017: 2375/24640: 0.09638798701298701
Percentage with a correct prediction 2019 based on 2018: 2318/29044: 0.07980994353394849


In [181]:
# implement the version with the power of alpha in denominator. It is hard to believe that I will do better. 
# TRY IT OUT. 

---

# Some data exploration

In [None]:
# Reload the raw records for the exploration section; this rebinds `df`.
df = pd.read_csv("member_d.csv")
# NOTE(review): the first load cell of this notebook names the fourth column
# 'family_size'; here the same column is named 'rating' -- confirm which is right.
df.columns = ['userID', 'itemID', 'flight_date', 'rating']
max_rating = 5
# Clamp the values of the 'rating' column to the range [0, max_rating].
df['rating'] = df['rating'].clip(lower=0., upper=max_rating)

## Number of trips to each destination over the full time period (2015-2021)

In [None]:
dfg = df.groupby('itemID')
dfg.size().sort_values(ascending=False)

## Number of trips taken to pairs of destinations
* How to compute this efficiently? 

In [None]:
dfg = df.groupby('userID')
dfgg = dfg.groups


In [None]:
from collections import defaultdict
import numpy as np

In [None]:
df.columns

In [None]:
# All distinct destinations (itemID values) present in df.
# Built without a `for d in ...` loop: that loop variable used to overwrite the
# notebook-level `d` (the dict of yearly trainsets used by earlier cells).
destinations = list(set(df['itemID'].values))
print(destinations)

In [None]:
dfg = df.groupby('userID')
dfgg = dfg.groups  # maps each userID to the index labels of that user's rows

# users[user][destination] == 1 for every destination the user flew to.
# The inner mapping stays a defaultdict(int), so a destination that was not
# visited reads as 0.
users = defaultdict(int)
# Iterating the GroupBy yields (key, sub-frame) pairs directly. This avoids one
# get_group() call per user and no longer rebinds the notebook-level `d`.
for user_id, user_trips in dfg:
    users[user_id] = defaultdict(int, dict.fromkeys(user_trips.itemID, 1))

In [None]:
dfd = df.groupby('itemID')
dfdd = dfd.groups  # maps each itemID to the index labels of that item's rows

# items[destination][user] == 1 for every user who flew to the destination.
items = defaultdict(int)
# Iterate the GroupBy instead of calling df.iloc[dfdd[k]]: `groups` holds index
# *labels*, so positional .iloc was only correct while df kept its default
# RangeIndex. This form also no longer rebinds the notebook-level `d`.
for item_id, item_trips in dfd:
    items[item_id] = defaultdict(int, dict.fromkeys(item_trips.userID, 1))

In [None]:
# Column norms of the binary user-item matrix R, used by the symmetric similarity.
sim = defaultdict(float)   # sim[item1, item2]; filled in by a later cell
norm = defaultdict(float)
for item_id, item_users in items.items():
    # Entries are 0/1, so the L2-norm of a column is sqrt(number of users of the item).
    norm[item_id] = np.sqrt(len(item_users))

In [None]:
# Row norms of the user-item matrix R.
# Objective: give more weight to users who go to fewer destinations.
# A user going to all destinations should not contribute to the recommendations.
row_norm = defaultdict(float)
for user_id, visited in users.items():
    # Entries are 0/1, so the L2-norm of a row is sqrt(number of entries stored for the user).
    row_norm[user_id] = np.sqrt(len(visited))

In [None]:
# row_norm

In [None]:
%%time
# For each item, store the users that purchased it, weighted by 1 / row_norm[user].
# NOT USED
# NOTE(review): the inner loop variable `d` rebinds the notebook-level `d`
# (the dict of yearly trainsets) -- rerun the trainset cell before reusing it.
dfd = df.groupby('itemID')
dfdd = dfd.groups

# make rows of R unit length
items_norm = defaultdict(int)
for ix, k in enumerate(dfdd.keys()):  # k: an itemID; dfdd[k]: index labels of its rows
    g = dfd.get_group(k)  # takes longer, easier to read
    items_norm[k] = defaultdict(int)
    for d in g.userID:
        items_norm[k][d] = 1 / row_norm[d]

In [None]:
items.keys()

* Next cell takes a long time (50k users and 80 destinations). I am working with dictionaries, which might explain the slowness. 
* However, there is strong sparsity, so the slowness is overdone. 
* Another reason might be … (TODO: this sentence was left unfinished)

In [None]:
destinations;

In [None]:
destinations1 = destinations[0:10]

In [None]:
%%time 
# Symmetric version

# Rebinding `sim` below replaces any previous value, so the former
# `try: del sim / except: pass` (a bare except that hid every error) is not needed.
sim = defaultdict(float)
# NOTE: `items` is rebound from the item->users dict to a list of destinations;
# later cells iterate over this list.
items = destinations1

# sim[(d1, d2)] = (number of users who visited both) / (norm[d1] * norm[d2])
for d1 in items:
    print("d1: ", d1)
    for d2 in items:
        # .get() reads without inserting: indexing a defaultdict (v[d1]) added a
        # zero entry for every destination to every user's dict, which changed
        # len(v) and therefore any later recomputation of row_norm.
        co_visits = sum(v.get(d1, 0) * v.get(d2, 0) for v in users.values())
        sim[(d1, d2)] = co_visits / (norm[d1] * norm[d2])

for d1 in items:
    sim[(d1, d1)] = 0.   # Remove diagonal elements

In [None]:
%%time
# Symmetric version (with row-normalization)

# Rebinding `sim_row_norm` below replaces any previous value, so the former
# `try: del sim_row_norm / except: pass` (a bare except) is not needed.
sim_row_norm = defaultdict(float)
# NOTE: `items` is rebound to the list of destinations; later cells iterate over it.
items = destinations1

# Same as the symmetric similarity, but each user's contribution is divided by
# row_norm[user]**2.
for d1 in items:
    print("d1: ", d1)
    for d2 in items:
        # .get() reads without inserting zero entries into each user's defaultdict.
        # The per-user division roughly doubles execution time.
        weighted_co_visits = sum(
            v.get(d1, 0) * v.get(d2, 0) / row_norm[k]**2
            for k, v in users.items()
        )
        sim_row_norm[(d1, d2)] = weighted_co_visits / (norm[d1] * norm[d2])

for d1 in items:
    sim_row_norm[(d1, d1)] = 0.   # Remove diagonal elements

In [None]:
# Twenty most similar destination pairs, highest similarity first.
l = sorted(sim.items(), key=lambda pair_score: -pair_score[1])
l[:20]

In [None]:
# Twenty most similar destination pairs under row normalization, highest first.
l = sorted(sim_row_norm.items(), key=lambda pair_score: -pair_score[1])
l[:20]

In [None]:
for i in items:
    print(i, sim[('GEO',i)])

In [None]:
for i in items:
    print(i, sim_row_norm[('GEO',i)])

In [None]:
display(sim[('POA','GUA')])
display(sim[('GUA','POA')])
display(sim_row_norm[('POA','GUA')])
display(sim_row_norm[('GUA','POA')])

## Initial Test (see Surprise Documentation and example)

In [None]:
# Reload the raw records for the Surprise example below; this rebinds `df`.
df = pd.read_csv("member_d.csv")
# NOTE(review): the first load cell of this notebook names the fourth column
# 'family_size'; here the same column is named 'rating' -- confirm which is right.
df.columns = ['userID', 'itemID', 'flight_date', 'rating']
max_rating = 5
# Clamp the values of the 'rating' column to the range [0, max_rating].
df['rating'] = df['rating'].clip(lower=0., upper=max_rating)

In [None]:
reader = Reader(rating_scale=[1, 5])
data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader)

In [None]:
# Sample a random trainset and testset; the test set holds 25% of the ratings.
# random_state pins the shuffle so that a Restart & Run All reproduces the same
# split (and therefore comparable metrics).
trainset, testset = train_test_split(data, test_size=.25, random_state=42)

In [None]:
# We'll use the famous SVD algorithm.
algo = sur.SVD()

In [None]:
# Train the algorithm on the trainset, and predict ratings for the testset
algo.fit(trainset)

In [None]:
%%time
predictions = algo.test(testset)

In [None]:
%%time
# Then compute RMSE
sur.accuracy.rmse(predictions)

## Sort the dataframe by increasing flight_date

In [None]:
df_sorted = df.sort_values('flight_date')

In [None]:
df_sorted

### Convert to regular dates

In [None]:
import date_library as dlib

In [None]:
date = '2022-03-01'
d1 = dlib.dateTimePTYToTimestamp(date)
d2 = dlib.timestampToDateTimePTY(d1)
print("d1: ", d1)
print("d2: ", d2)

In [None]:

# Repeat of the date conversion done near the top of the notebook, on the reloaded df.
# NOTE(review): the loop below assigns to `d`, which rebinds the notebook-level
# `d` (the dict of yearly trainsets) if that cell was run earlier.
dates = df.flight_date.values
print(dates)
print(dates.min())
# Date routines work properly in both directions
# it is not clear why the max date is Dec. 2022? That should not be!
print("dates.min: ", dlib.timestampToDateTimePTY(dates.min()))
print("dates.max: ", dlib.timestampToDateTimePTY(dates.max()))

# timestampToDateTimePTY returns a (date, time) pair; element [0] is the date string.
new_dates = []
for date in dates:
    d = dlib.timestampToDateTimePTY(date)[0]
    new_dates.append(d)

#new_dates.sort()
# new_dates

In [None]:
# Choose dates in 2016
df['date'] = new_dates
len(new_dates)
df = df.sort_values('flight_date')
df;

In [None]:
# Records flown in 2016. Uses the shared dates_year helper instead of repeating
# the date bounds by hand (same rows for 'YYYY-MM-DD' date strings).
dates_2016 = dates_year(df, '2016')

In [None]:
reader = Reader(rating_scale=[1, 5])

# Temporal split: train on the 2016 records, test on the 2017 records.
# NOTE(review): df_years comes from an earlier cell and was built from an earlier
# `df`; it is not refreshed by the reload above -- confirm this is intended.
df_train = df_years['2016']
df_test = df_years['2017']
# NOTE(review): `data_training` was a dict of per-year datasets earlier in the
# notebook; it is rebound here to a single Dataset.
data_training = Dataset.load_from_df(df_train[['userID', 'itemID', 'rating']], reader)
data_testing = Dataset.load_from_df(df_test[['userID', 'itemID', 'rating']], reader)

In [None]:
# Keep 90% of the 2016 data as the trainset and 90% of the 2017 data as the testset;
# the remainders (`x`, `y`) are by-products of using train_test_split.
# random_state pins both shuffles so reruns give the same split.
trainset, x = train_test_split(data_training, test_size=.1, random_state=42)
y, testset = train_test_split(data_testing, test_size=.9, random_state=42)
# testset is a list (each element is 3 elements)

In [None]:
%%time 

# We'll use the famous SVD algorithm.
algo = sur.SVD()

# Train the algorithm on the trainset, and predict ratings for the testset
algo.fit(trainset)

In [None]:
# Remove from the test set all (user, item) combinations already contained in the
# training set. This cell collects the training triples
# (raw user id, raw item id, rating).
user_d_train = set()
uid = trainset.to_raw_uid
iid = trainset.to_raw_iid
for el in trainset.all_ratings():
    # all_ratings() yields inner ids; convert them back to raw ids.
    user_d_train.add((uid(el[0]), iid(el[1]), el[2]))
user_d_list = list(user_d_train)
# Two separate statements: the former `print(...), print(...)` built a tuple
# that the notebook then displayed as a spurious `(None, None)` output.
print(len(user_d_train))
print(user_d_list[0:3])

In [None]:
# Split the test set by whether the (user, item) pair was already seen in training.
# Membership is tested on the pair only: comparing whole (user, item, rating)
# triples let a seen pair with a different rating slip into `new_test`.
train_pairs = {(user, item) for user, item, _rating in user_d_train}

new_test = set()
trained_test = set()
for el in testset:
    if (el[0], el[1]) in train_pairs:
        trained_test.add(el)
    else:
        new_test.add(el)
print("new test set: ", len(new_test))
print("trained test set: ", len(trained_test))
print("full test set: ", len(testset))

In [None]:
# Construct a list of predictions for 100 user ids with all destinations
# Remove all those already in the test set. The remainder will not have 
# the city chosen. I wonder what the predictions are. 
# Question: I am not sure whether the recommender should predict zero for the trips not taken. 
#    After all, who knows whether the member would have taken the trip if given the opportunity? 

In [None]:
# Collect the raw ids of all destinations in the training set.
destinations = set()
for inner_iid in trainset.all_items():
    destinations.add(trainset.to_raw_iid(inner_iid))
destinations = list(destinations)
# Previously printed the undefined name `dest`, which raised NameError.
print(len(destinations), destinations)

In [None]:
a = []
a.append((2,3,4))

In [None]:
# Build an artificial test set: every (user, destination) combination, with
# rating 0., for the users of the first 10 predictions.
# Relies on `predictions` from an earlier algo.test(...) cell and on
# `destinations` from the cell above.
# NOTE(review): `users` is rebound here (it was the user->destinations dict in
# the exploration section) and ends up as a list; the loop variable `d` also
# rebinds the notebook-level `d`.
users = set()
test_not_visited = []
for i in range(10):
    users.add(predictions[i].uid)
users = list(users)
for user in users:
    for d in destinations:
        pred = (user, d, 0.)
        test_not_visited.append(pred)

In [None]:
%%time 
# NOTE(review): this first result is overwritten further down by the
# algo.test(test_not_visited) call, so only that last run is scored.
predictions = algo.test(new_test)  # 48% (FCP)
# predictions = algo.test(testset)  # 43% (FCP)
# predictions = algo.test(trained_test) # 66%  (FCP)

# I find that all the ratings are unity (more or less). That is because the
# vast majority of ratings is unity, so the simplest approach is to make everything
# unity.

# CONCLUSION: I need more variability in the ratings. Or I should have some elements with
# zero values. Alternatively, I must add metadata to both the cities and the members.
predictions = algo.test(test_not_visited)  # 48% (FCP)



# Then compute RMSE and FCP on the predictions for test_not_visited
sur.accuracy.rmse(predictions)
sur.accuracy.fcp(predictions)

In [None]:
# Construct a test set with 10 members and look at all 
predictions

In [None]:
for p in predictions:
    if p[0] in users:
    # if abs(p[3] < 1.5):
        print(p[0], p[2], p[3])

In [None]:
trainset.n_users, y.n_users

In [None]:
# Compare the number of users in the test set against the number in the trainset.
# I find more users in the test set. However, that means that some users in the test set
# are not in the train set. I assume that these users are simply not considered by Surprise.
# A dedicated set is built here: `users` is a list at this point in the notebook,
# so the former `users.add(...)` raised AttributeError.
test_users = {t[0] for t in testset}
print(len(test_users), trainset.n_users)

## When considering time
* The training set is all records from 2016
* The testing set is all records from 2017

I assumed that a crude temporal estimation would improve the ratings. However, RMSE ratings
went from 0.15 to 0.12 (decrease in accuracy). Note that random guessing is NOT 50% (it is less than 50%) since we are likely dealing with uneven classes and the choice is not binary.
Random choice should give 1/80 = 1.25%. So we are much better than random.

In [None]:
trainset.n_ratings

In [None]:
len(testset)