In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import json

%matplotlib inline

# Races data

In [2]:
# Load the cleaned race features CSV, indexed by its `race` column.
df_races = pd.read_csv(
    './../data/clean/races_features.csv',
    index_col='race',
)
df_races.shape[0]

159

# Results data

In [3]:
from utils.get_processed_data import get_results_df

# Build the per-result table from the races table. Judging by the cell output,
# the helper also prints how many single results, individuals and races remain
# after its own filtering; the filtering rules themselves live in
# utils.get_processed_data and are not visible from this notebook.
df_results = get_results_df(df_races)

Number of single results: 2000403
Number of individuals: 643055
Number of remaining single results: 945102
Number of remaining individuals: 399472
Number of races in results df: 156


In [8]:
# Preview the first rows only: the helper above reports ~945k remaining results,
# so rendering the whole frame adds nothing over a short sample.
df_results.head(10)

Unnamed: 0,id,athlete,division,rankdiv,rankgender,rankoverall,swim,t1,bike,t2,...,racename,bib,country,date,fulldata,score,swim_score,run_score,bike_score,gender
0,1518207,u0,M75-79,1,1073,1209,2440,678,11150,469,...,IRONMAN 70.3 Xiamen,167,USA,2016-11-13,0,98.4,100.0,86.1,100.0,M
1,1518208,u1,MPRO,1,1,1,1569,209,7416,142,...,IRONMAN 70.3 Xiamen,4,AUS,2016-11-13,0,99.7,100.0,97.7,100.0,M
2,1518209,u2,M75-79,2,1141,1301,3539,679,13127,461,...,IRONMAN 70.3 Xiamen,1779,JPN,2016-11-13,0,97.1,77.2,100.0,90.7,M
3,1518210,u3,M65-69,1,181,202,1895,377,9944,243,...,IRONMAN 70.3 Xiamen,133,AUS,2016-11-13,0,99.9,100.0,100.0,99.7,M
4,1518211,u4,M80-84,1,1281,1475,3135,784,13528,726,...,IRONMAN 70.3 Xiamen,1780,JPN,2016-11-13,0,100.0,100.0,100.0,100.0,M
5,1518212,u5,MPRO,2,2,2,1715,183,7589,135,...,IRONMAN 70.3 Xiamen,1,GBR,2016-11-13,0,99.3,94.6,100.0,98.8,M
6,1518213,u6,M55-59,1,134,152,2150,350,9711,282,...,IRONMAN 70.3 Xiamen,1737,ESP,2016-11-13,0,99.6,100.0,98.3,99.0,M
7,1518214,u7,M18-24,1,67,77,2240,233,9166,174,...,IRONMAN 70.3 Xiamen,163,CHE,2016-11-13,0,99.0,85.6,98.9,98.0,M
8,1518215,u8,M65-69,2,270,302,2662,389,9964,262,...,IRONMAN 70.3 Xiamen,1778,CHN,2016-11-13,0,97.3,79.0,98.8,99.6,M
9,1518216,u9,FPRO,1,1,8,1953,210,8415,157,...,IRONMAN 70.3 Xiamen,33,ZAF,2016-11-13,0,100.0,100.0,100.0,100.0,F


# Filter results dataset to keep only prolific racers
#### To prevent the cold-start problem, keep only athletes who have taken part in at least several different races.

In [4]:
from utils.get_processed_data import get_athletes_races_count

# One row per athlete; per the output below the columns are `athlete`,
# `n_different_races` and `n_races`. How the helper derives the two counts is
# defined in utils.get_processed_data (not shown in this notebook).
athlete_habits = get_athletes_races_count(df_results)
athlete_habits.head()

Unnamed: 0,athlete,n_different_races,n_races
0,u0,5,6
1,u1,16,25
2,u10,30,38
3,u100,5,6
4,u1000,1,1


In [5]:
# Minimum number of different races an athlete must have done to be kept.
min_unique_race_count = 4

# Upper bound on the total number of results per athlete: ids above it are
# probably several people with similar names rather than one athlete.
max_count_races = 100

has_enough_distinct_races = athlete_habits['n_different_races'] >= min_unique_race_count
has_plausible_race_total = athlete_habits['n_races'] <= max_count_races
valid_athletes = athlete_habits.loc[has_enough_distinct_races & has_plausible_race_total]

# Keep only the results of valid athletes, then attach their race counts.
is_result_of_valid_athlete = df_results['athlete'].isin(valid_athletes['athlete'])
df_results_filtered = (
    df_results
    .loc[is_result_of_valid_athlete]
    .merge(valid_athletes, on="athlete", how="left")
)

# Anonymize entrants: ids are re-issued as u0, u1, ... in order of first
# appearance in the filtered results.
user_hash = {
    original_id: f'u{position}'
    for position, original_id in enumerate(df_results_filtered.athlete.unique())
}
df_results_filtered.loc[:, 'athlete'] = df_results_filtered.athlete.map(
    lambda original_id: user_hash[original_id]
)

print("Number of valid individuals:", len(valid_athletes))
print("Number of remaining single results:", len(df_results_filtered))
print("Number of races present in filtered results:", len(df_results_filtered.race.unique()))

Number of valid individuals: 45799
Number of remaining single results: 329329
Number of races present in filtered results: 156


### Update races df to only keep races present in filtered results df

In [6]:
# Restrict the races table to the races still present in the filtered results.
races_in_filtered_results = df_results_filtered.race.unique()
df_races_for_model = df_races.loc[races_in_filtered_results]
print("Number of remaining races:", len(df_races_for_model))

Number of remaining races: 156


# Create dataframe with race count per athlete

In [7]:
def _most_frequent_value_per_athlete(results, column):
    """Return a Series (indexed by athlete, named `column`) holding, for each
    athlete, the value of `column` that appears in most of their results."""
    return (
        results
        .groupby(['athlete', column])
        .size()
        .reset_index(name='n')
        .pivot(index='athlete', columns=column, values='n')
        .idxmax(axis=1)
        .rename(column)
    )


# Number of participations of each athlete in each race.
results_races_count = (
    df_results_filtered
    .groupby(['athlete', 'race'])
    .size()
    .reset_index(name='count')
)

# Add demographics info for users (if we want to filter later)
gender = _most_frequent_value_per_athlete(df_results_filtered, 'gender')
country = _most_frequent_value_per_athlete(df_results_filtered, 'country')

results_races_count = (
    results_races_count
    .merge(gender, on='athlete', how='left')
    .merge(country, on='athlete', how='left')
)

results_races_count.head()

Unnamed: 0,athlete,race,count,gender,country
0,u0,cozumel,1,M,USA
1,u0,liuzhou70.3,1,M,USA
2,u0,mardelplata,1,M,USA
3,u0,newzealand,1,M,USA
4,u0,xiamen70.3,2,M,USA


# Recommender class

In [8]:
class BaseRecommender:
    """Bundle a fitted model with its interaction matrix and item metadata.

    Parameters
    ----------
    model : object
        Fitted model; subclasses decide which of its methods they call.
    matrix : object
        Interaction matrix associated with the model (stored as-is).
    items_info : pandas.DataFrame
        Item metadata indexed by item identifier.
    name : str
        Display name of the recommender.
    """

    def __init__(self, model, matrix, items_info, name='Model'):
        self.model = model
        self.matrix = matrix
        self.name = name
        self.items_info = items_info
        # Same table with a positional 0..n-1 index, so an integer item code
        # can be turned back into the corresponding row.
        self.items_info_reset = items_info.reset_index()


class ALSRecommender(BaseRecommender):
    """Item-to-item recommender built on a model exposing `similar_items`."""

    def recommend(self, target, n=10, filterByField=False, valueToMatch=False):
        """Rank the known items by similarity to `target`.

        Parameters
        ----------
        target : label
            Identifier of the reference item; must be in `items_info.index`.
        n : int
            Accepted but not used: the model is always asked for as many
            neighbours as there are items and nothing is truncated.
        filterByField : str or False
            Column of the result to filter on; falsy disables filtering.
        valueToMatch : object
            Value `filterByField` must equal for a row to be kept.

        Returns
        -------
        pandas.DataFrame
            One row per neighbour, in the order returned by the model, with
            the `race` id, the model's score in `distance`, and the item
            metadata merged in.
        """
        item_count = len(self.items_info)
        target_position = self.items_info.index.get_loc(target)
        neighbours = self.model.similar_items(target_position, item_count)

        ranked_rows = []
        for code, distance in neighbours:
            ranked_rows.append([self.items_info_reset.loc[code, 'race'], distance])
        ranking = pd.DataFrame(ranked_rows, columns=['race', 'distance'])

        ranking_with_info = ranking.merge(self.items_info, on='race', how='left')
        if not filterByField:
            return ranking_with_info
        return ranking_with_info.loc[ranking_with_info[filterByField] == valueToMatch]


# Alternating Least Squares (ALS)

The data we have are implicit data: data gathered from users' behaviour, with no ratings or specific actions needed. It could be what items a user purchased, how many times they played a song or watched a movie, how long they’ve spent reading a specific article, etc. The upside is that we have a lot more of this data; the downside is that it’s noisier and not always apparent what it means. Here, the implicit signal is the number of times an athlete took part in a given race.

The implicit-feedback variant of ALS used here was developed specifically for this kind of data.

## Prepare data

In [11]:
import scipy.sparse as sparse

# Work on a copy so the per-athlete race counts table is left untouched.
full_set = results_races_count.copy()

# Races as rows, athletes as columns; a cell holds the number of participations
# (0 where the athlete never did that race).
full_set_pivot = (
    full_set
    .pivot(index='race', columns='athlete', values='count')
    .fillna(0)
)
sparse_item_user = sparse.csr_matrix(full_set_pivot.to_numpy())

print("Matrix size:", sparse_item_user.shape)

Matrix size: (156, 45799)


### Sparsity of the matrix

In [12]:
# Share of (race, athlete) cells with no recorded participation.
n_races_in_matrix, n_athletes_in_matrix = sparse_item_user.shape
matrix_size = n_races_in_matrix * n_athletes_in_matrix  # number of possible interactions
num_raced = sparse_item_user.nonzero()[0].size  # number of observed interactions
sparsity = 1 - num_raced / matrix_size
sparsity

0.9639170265166466

In [14]:
# Use the local build of the implicit library (compiled from the latest GitHub
# code) for the evaluation helpers and the models used in the grid searches.
from implicit_local.evaluation import train_test_split, mean_average_precision_at_k, ndcg_at_k
from implicit_local.als import AlternatingLeastSquares
from implicit_local.nearest_neighbours import CosineRecommender

# Hold out part of the interactions for evaluation (train_percentage=0.8).
# NOTE(review): no seed is passed, so the split presumably changes on every run
# and will not match the split behind the cached CSV results -- confirm whether
# this build of train_test_split accepts a random seed.
train, test = train_test_split(sparse_item_user, train_percentage=0.8)

# Grid search

In [15]:
# Set to True to re-run the grid searches below. When False, the cells below
# load previously saved results from the `validation-results/` CSV files
# instead of refitting any model.
runTheGridSearch = False

In [16]:
from sklearn.model_selection import ParameterGrid

# Coarse hyperparameter grid:
#   confidence_factor   - multiplier applied to the raw participation counts
#   als__factors        - number of latent factors of the ALS model
#   als__regularization - regularization strength of the ALS model
param_grid = {
    'confidence_factor': [1, 5, 10, 20, 30, 40],
    'als__factors': [5, 8, 10, 13, 17, 21, 25, 30],
    'als__regularization': [1e-1, 1e-2, 1e-3, 1e-4]
}

params_combinations = list(ParameterGrid(param_grid))
print(f'{len(params_combinations)} combinations of hyperparameters will be tested')

if runTheGridSearch:
    # The transposed evaluation matrices do not depend on the hyperparameters:
    # build them once instead of converting them for every metric of every
    # combination.
    train_user_item = train.T.tocsr()
    test_user_item = test.T.tocsr()

    als_evaluation = {}

    for i, params in enumerate(params_combinations):
        train_conf = (train * params['confidence_factor']).astype('double')
        model = AlternatingLeastSquares(
            factors=params['als__factors'],
            regularization=params['als__regularization'],
            iterations=50,
        )
        model.fit(train_conf, show_progress=False)

        als_evaluation[i] = {
            'map@5': mean_average_precision_at_k(model, train_user_item, test_user_item, K=5, num_threads=4, show_progress=False),
            'map@10': mean_average_precision_at_k(model, train_user_item, test_user_item, K=10, num_threads=4, show_progress=False),
            'ndcg@5': ndcg_at_k(model, train_user_item, test_user_item, K=5, num_threads=4, show_progress=False),
            'ndcg@10': ndcg_at_k(model, train_user_item, test_user_item, K=10, num_threads=4, show_progress=False),
        }

    # One record per combination: the hyperparameters followed by the metrics.
    evaluation_results = pd.DataFrame([
        {
            'confidence': params['confidence_factor'],
            'n_factors': params['als__factors'],
            'regularization': params['als__regularization'],
            **als_evaluation[i],
        }
        for i, params in enumerate(params_combinations)
    ], columns=['confidence', 'n_factors', 'regularization', 'map@5', 'map@10', 'ndcg@5', 'ndcg@10'])

    evaluation_results.to_csv('validation-results/als_hyperparameters_tuning.csv', index=False)

else:
    # Reuse the results of a previous run of the search.
    evaluation_results = pd.read_csv('validation-results/als_hyperparameters_tuning.csv')

192 combinations of hyperparameters will be tested


In [17]:
# Coarse-grid configuration(s) reaching the highest MAP@5.
evaluation_results.loc[lambda results: results['map@5'] == results['map@5'].max()]

Unnamed: 0,confidence,n_factors,regularization,map@5,map@10,ndcg@5,ndcg@10
103,5,17,0.01,0.229828,0.257501,0.292178,0.350453


In [18]:
# Coarse-grid configuration(s) reaching the highest NDCG@5.
evaluation_results.loc[lambda results: results['ndcg@5'] == results['ndcg@5'].max()]

Unnamed: 0,confidence,n_factors,regularization,map@5,map@10,ndcg@5,ndcg@10
103,5,17,0.01,0.229828,0.257501,0.292178,0.350453


In [19]:
# let's refine: search a finer grid centred on the best coarse-grid
# configuration (best by MAP@5)

if runTheGridSearch:
    best_params = evaluation_results.loc[evaluation_results['map@5'] == evaluation_results['map@5'].max()]
    best_confidence = best_params['confidence'].values[0]
    best_n_factors = best_params['n_factors'].values[0]

    param_grid_refined = {
        'confidence_factor': np.arange(best_confidence - 2, best_confidence + 3.5, 0.5),
        'als__factors': np.arange(best_n_factors - 3, best_n_factors + 4, 1),
        'als__regularization': [1e-2]
    }

    params_combinations_refined = list(ParameterGrid(param_grid_refined))
    print(f'{len(params_combinations_refined)} combinations of hyperparameters will be tested')

    # The transposed evaluation matrices do not depend on the hyperparameters:
    # build them once instead of converting them for every metric of every
    # combination.
    train_user_item = train.T.tocsr()
    test_user_item = test.T.tocsr()

    als_evaluation_refined = {}

    for i, params in enumerate(params_combinations_refined):
        train_conf = (train * params['confidence_factor']).astype('double')
        model = AlternatingLeastSquares(
            factors=params['als__factors'],
            regularization=params['als__regularization'],
            iterations=50,
        )
        model.fit(train_conf, show_progress=False)

        als_evaluation_refined[i] = {
            'map@5': mean_average_precision_at_k(model, train_user_item, test_user_item, K=5, num_threads=4, show_progress=False),
            'map@10': mean_average_precision_at_k(model, train_user_item, test_user_item, K=10, num_threads=4, show_progress=False),
            'ndcg@5': ndcg_at_k(model, train_user_item, test_user_item, K=5, num_threads=4, show_progress=False),
            'ndcg@10': ndcg_at_k(model, train_user_item, test_user_item, K=10, num_threads=4, show_progress=False),
        }

    # One record per combination: the hyperparameters followed by the metrics.
    evaluation_results_refined = pd.DataFrame([
        {
            'confidence': params['confidence_factor'],
            'n_factors': params['als__factors'],
            'regularization': params['als__regularization'],
            **als_evaluation_refined[i],
        }
        for i, params in enumerate(params_combinations_refined)
    ], columns=['confidence', 'n_factors', 'regularization', 'map@5', 'map@10', 'ndcg@5', 'ndcg@10'])

    evaluation_results_refined.to_csv('validation-results/als_hyperparameters_tuning-refined.csv', index=False)

else:
    # Reuse the results of a previous run of the refined search.
    evaluation_results_refined = pd.read_csv('validation-results/als_hyperparameters_tuning-refined.csv')

In [20]:
# Refined-grid configuration(s) reaching the highest MAP@5.
evaluation_results_refined.loc[lambda results: results['map@5'] == results['map@5'].max()]

Unnamed: 0,confidence,n_factors,regularization,map@5,map@10,ndcg@5,ndcg@10
48,5.0,18,0.01,0.23045,0.257516,0.292607,0.349533


# Save the model

### Sparse matrix

In [21]:
filename_base = './../flask_app/nostrappdamus/model/data'

# save the matrix to disk
sparse.save_npz(f'{filename_base}/als_sparse_matrix.npz', sparse_item_user)

# save order of races used: matrix row position -> race id, in the row order of
# the saved matrix (the rows of full_set_pivot). The mapping is built before
# the file is opened so that a failure here cannot leave an empty or truncated
# JSON file behind. Note that JSON turns the integer positions into string keys.
race_hash = df_races_for_model.loc[full_set_pivot.index].reset_index()['race'].to_dict()

with open(f'{filename_base}/als_hash.json', 'w') as f:
    json.dump(race_hash, f)

In [22]:
# For each ranking metric, collect the refined-grid row(s) that maximise it;
# a configuration that wins several metrics appears several times.
ranking_metrics = ['map@5', 'map@10', 'ndcg@5', 'ndcg@10']

best_rows_per_metric = []
for metric in ranking_metrics:
    reaches_best_score = evaluation_results_refined[metric] == evaluation_results_refined[metric].max()
    best_rows_per_metric.append(evaluation_results_refined.loc[reaches_best_score])

best_params_df = pd.concat(best_rows_per_metric)

best_params_df

Unnamed: 0,confidence,n_factors,regularization,map@5,map@10,ndcg@5,ndcg@10
48,5.0,18,0.01,0.23045,0.257516,0.292607,0.349533
25,4.5,16,0.01,0.229829,0.257877,0.291642,0.35052
48,5.0,18,0.01,0.23045,0.257516,0.292607,0.349533
28,6.0,16,0.01,0.229563,0.257642,0.29183,0.350947


In [23]:
from scipy.stats import mode

# Is there an index that gives the best result for two or more of the metrics?
# The most frequent index is taken with pandas instead of
# `scipy.stats.mode(...)[0][0]`: that double indexing only works on SciPy
# versions where `mode` returns arrays and fails once it returns scalars.
# When several indices are equally frequent the smallest one is kept.
best_idx = best_params_df.index.to_series().mode().iloc[0]
best_params = evaluation_results_refined.loc[best_idx]
best_params

confidence         5.000000
n_factors         18.000000
regularization     0.010000
map@5              0.230450
map@10             0.257516
ndcg@5             0.292607
ndcg@10            0.349533
Name: 48, dtype: float64

In [31]:
import pickle

# need to import global implicit for the model that gets saved.
# It is imported under an alias so that the name `AlternatingLeastSquares`
# keeps pointing at the local build used by the grid-search cells; without the
# alias, any grid search run after this cell would silently switch library.
from implicit.als import AlternatingLeastSquares as InstalledAlternatingLeastSquares

# fit the model on all the data using those params
full_set_conf = (sparse_item_user * best_params['confidence']).astype('double')

als_final = InstalledAlternatingLeastSquares(
    factors=int(best_params['n_factors']),
    regularization=best_params['regularization'],
    iterations=50,
)
als_final.fit(full_set_conf, show_progress=False)

# save the model to disk (context manager so the file is flushed and closed)
with open(f"{filename_base}/als_model.sav", 'wb') as model_file:
    pickle.dump(als_final, model_file)

# NOTE: als_final was fitted on the full matrix, which includes the held-out
# `test` interactions, so this MAP@5 is optimistic and is not comparable with
# the grid-search scores (those models were fitted on `train` only).
mean_average_precision_at_k(als_final, train.T.tocsr(), test.T.tocsr(), K=5, num_threads=4)

# # some time later...

# # load the model from disk (only unpickle files you produced yourself)
# with open(f"{filename_base}/als_model.sav", 'rb') as model_file:
#     loaded_model = pickle.load(model_file)
# loaded_matrix = sparse.load_npz(f"{filename_base}/als_sparse_matrix.npz")
# with open(f"{filename_base}/als_hash.json", "r") as f:
#     hash_code_to_race = json.load(f)  # keys come back as strings


100%|██████████| 45799/45799 [00:03<00:00, 12984.72it/s]


0.4909675054755087

In [33]:
# Baseline for comparison: CosineRecommender fitted on the full item-user
# matrix and scored with MAP@5 on the same train/test split as above.
cosine = CosineRecommender(K=50, num_threads=4)
cosine.fit(sparse_item_user, show_progress=False)

cosine_train_user_item = train.T.tocsr()
cosine_test_user_item = test.T.tocsr()
mean_average_precision_at_k(cosine, cosine_train_user_item, cosine_test_user_item, K=5, num_threads=4)

100%|██████████| 45799/45799 [00:01<00:00, 44277.37it/s]


0.2710724630735964

# Binarizing the race relevance

In [19]:
# Binarize the interactions: 1 if the athlete ever did the race, 0 otherwise.
full_set_pivot_binarized = full_set_pivot.apply(np.sign)

sparse_item_user_binarized = sparse.csr_matrix(full_set_pivot_binarized.values)

train_binarized, test_binarized = train_test_split(sparse_item_user_binarized, train_percentage=0.8)

if runTheGridSearch:
    # The transposed evaluation matrices do not depend on the hyperparameters:
    # build them once instead of converting them for every metric of every
    # combination.
    train_binarized_user_item = train_binarized.T.tocsr()
    test_binarized_user_item = test_binarized.T.tocsr()

    als_evaluation_binarized = {}

    # Same coarse grid as for the count-based matrix.
    for i, params in enumerate(params_combinations):
        train_conf = (train_binarized * params['confidence_factor']).astype('double')
        model = AlternatingLeastSquares(
            factors=params['als__factors'],
            regularization=params['als__regularization'],
            iterations=50,
        )
        model.fit(train_conf, show_progress=False)

        als_evaluation_binarized[i] = {
            'map@5': mean_average_precision_at_k(model, train_binarized_user_item, test_binarized_user_item, K=5, num_threads=4, show_progress=False),
            'map@10': mean_average_precision_at_k(model, train_binarized_user_item, test_binarized_user_item, K=10, num_threads=4, show_progress=False),
            'ndcg@5': ndcg_at_k(model, train_binarized_user_item, test_binarized_user_item, K=5, num_threads=4, show_progress=False),
            'ndcg@10': ndcg_at_k(model, train_binarized_user_item, test_binarized_user_item, K=10, num_threads=4, show_progress=False),
        }

    # One record per combination: the hyperparameters followed by the metrics.
    evaluation_results_binarized = pd.DataFrame([
        {
            'confidence': params['confidence_factor'],
            'n_factors': params['als__factors'],
            'regularization': params['als__regularization'],
            **als_evaluation_binarized[i],
        }
        for i, params in enumerate(params_combinations)
    ], columns=['confidence', 'n_factors', 'regularization', 'map@5', 'map@10', 'ndcg@5', 'ndcg@10'])

    evaluation_results_binarized.to_csv('validation-results/als_binarized_hyperparameters_tuning.csv', index=False)
else:
    # Reuse the results of a previous run of the binarized search.
    evaluation_results_binarized = pd.read_csv('validation-results/als_binarized_hyperparameters_tuning.csv')

In [20]:
# Best binarized configuration(s) by MAP@5. The mask is built from the
# binarized results themselves: masking with `evaluation_results` instead
# returns the row at the position of the count-based optimum, which is not
# necessarily the best binarized row.
evaluation_results_binarized.loc[
    evaluation_results_binarized['map@5'] == evaluation_results_binarized['map@5'].max()
]

Unnamed: 0,confidence,n_factors,regularization,map@5,map@10,ndcg@5,ndcg@10
103,5,17,0.01,0.22544,0.252478,0.286371,0.344047
