In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
import requests
from more_itertools import pairwise
from lightfm import LightFM
import os

os.environ['MKL_NUM_THREADS'] = '1'



In [2]:
#загрузка датасетов
b_games = pd.read_feather('../datasets/bgg_boardgames_top_2000.feather')
ratings = pd.read_feather('../datasets/bgg_ratings_top_2000.feather')

===== Board Games =====
====================

boardgame_id - board game id <br>
title - title of board game <br>
year_published - year of publication of the board game <br>
minplayers - minimum number of players per game <br>
maxplayers - maximum number of players per game <br>
minplaytime - minimum playing time per game <br>
maxplaytime - maximum playing time per game <br>
age - lower age limit for playing <br>
users_rated - number of users who rated a game <br>
average_rating - average rating <br>
bayes_average_rating - bayes average rating <br>
median - median rating <br>
stddev - standard deviation of rating <br>
owned - number of users who have a game <br>
trading - number of users who selling a game <br>
wishing - number of users who want to get a game <br>
num_of_comments - number of comments <br>
num_of_weights - number of scores for weight <br>
average_weight - average weight of game <br>
ranks - game ranks <br>
main_publisher - main publisher <br>
description - description of the game <br>
publishers - all publishers <br>
honors - all honors <br>
expansions - all expansions <br>
accessories - all accessories <br>
artists - all artists <br>
mechanics - used mechanics <br>
category - category ща пфьуы <br>
designers - all designers <br>
graphic_designers - all graphic designers <br>
subdomains - subdomains of categories <br>
implementations - all implementations <br>
suggested_numplayers - proposed number of players <br>
podcast_episodes - all podcast episodes <br>
comments - some comments <br>
marketplace_history - marketplace history <br>
thumbnail_link - thumbnail link <br>
image_link - image link <br>

===== Ratings =====
================

nickname - user's nickname <br>
title - title of the game <br>
boardgame_id - id of the board game <br>
rating - given rating <br>
num_of_plays - number of games played by the user <br>
comment - comment fot rating <br>
own - flag whether the user owns the game <br>
prevowned - flag of whether the user has previously owned the game <br>
fortrade - flag whether the user is selling the game <br>
want - flag whether the user want the game <br>
wanttoplay - flag whether the user want to play the game <br>
wanttobuy - flag whether the user want to buy the game <br>
wishlist - flag whether the user have the game in wishlist <br>
preordered - flag whether the user preordered the game <br>
last_modified - date of last modification <br>

In [3]:
#убираем столбцы с недостаточным количеством значений и которые не хотим использовать в обучении
b_games_modificated = b_games.drop(['graphic_designers', 
                                  'podcast_episodes', 
                                  'comments', 
                                  'marketplace_history', 
                                  'thumbnail_link', 
                                  'image_link',
                                  'title',
                                  'ranks',
                                  'description',
                                  'honors',
                                  'expansions', 
                                  'accessories', 
                                  'artists', 
                                  'designers', 
                                  'subdomains', 
                                  'implementations', 
                                  'suggested_numplayers',
                                  'category',
                                  'mechanics',
                                  'publishers',
                                  'main_publisher'],
                                  axis = 1)

In [4]:
#некоторые столбцы с численным типом скорее относятся к категориальному, поэтому преобразуем их в тип object
b_games_modificated.loc[:,['boardgame_id', 'year_published', 'minplayers', 'minplaytime', 'maxplaytime', 'age']] = \
b_games_modificated.loc[:,['boardgame_id', 'year_published', 'minplayers', 'minplaytime', 'maxplaytime', 'age']].apply(lambda x: x.astype('object'))


#преобразуем столбец с датой в формат datetime
ratings['last_modified'] = pd.to_datetime(ratings['last_modified'])

In [5]:
#выделяем вещественные и категориальные фичи

ratings.set_index(['nickname', 'boardgame_id'], inplace=True)
b_games_modificated.set_index('boardgame_id', inplace=True)

numeric_columns_rate = ratings.select_dtypes(exclude='object').columns
categorical_columns_rate = ratings.select_dtypes(include='object').columns

numeric_columns_board = b_games_modificated.select_dtypes(exclude='object').columns
categorical_columns_board = b_games_modificated.select_dtypes(include='object').columns

  return Index(sequences[0], name=names)


In [6]:
#заменим пропуски в вещественных признаках медианой, а в категориальных - самым популярным классом


for col in numeric_columns_rate:
    ratings[col] = ratings[col].fillna(ratings[col].median())
for col in categorical_columns_rate:
    ratings[col] = ratings[col].fillna(ratings[col].mode().iloc[0])

    
for col in numeric_columns_board:
    b_games_modificated[col] = b_games_modificated[col].fillna(b_games_modificated[col].median())
for col in categorical_columns_board:
    b_games_modificated[col] = b_games_modificated[col].fillna(b_games_modificated[col].mode().iloc[0])

In [7]:
#меняем название столбцов для упрощения вычислений
ratings.reset_index(inplace=True)
ratings.rename(columns={'nickname': 'user_id','boardgame_id':'item_id'}, inplace=True)

In [8]:
#Для наших данных выбрем 60 последних дней и будем тестировать на них последовательно (1 test fold - 1 день).

class TimeRangeSplit():

    def __init__(self, 
                 start_date, 
                 end_date=None, 
                 freq='D', 
                 periods=None, 
                 tz=None, 
                 normalize=False, 
                 inclusive='both', 
                 train_min_date=None,
                 filter_cold_users=True, 
                 filter_cold_items=True, 
                 filter_already_seen=True):
        
        self.start_date = start_date
        if end_date is None and periods is None:
            raise ValueError("Either 'end_date' or 'periods' must be non-zero, not both at the same time.")

        self.end_date = end_date
        self.freq = freq
        self.periods = periods
        self.tz = tz
        self.normalize = normalize
        self.inclusive = inclusive
        self.train_min_date = pd.to_datetime(train_min_date, errors='raise')
        self.filter_cold_users = filter_cold_users
        self.filter_cold_items = filter_cold_items
        self.filter_already_seen = filter_already_seen

        self.date_range = pd.date_range(
            start=start_date, 
            end=end_date, 
            freq=freq, 
            periods=periods, 
            tz=tz, 
            normalize=normalize, 
            inclusive=inclusive)

        self.max_n_splits = max(0, len(self.date_range) - 1)
        if self.max_n_splits == 0:
            raise ValueError("Provided parametrs set an empty date range.") 

    def split(self, 
              df, 
              user_column='user_id',
              item_column='item_id',
              datetime_column='date',
              fold_stats=False):
        df_datetime = df[datetime_column]
        if self.train_min_date is not None:
            train_min_mask = df_datetime >= self.train_min_date
        else:
            train_min_mask = df_datetime.notnull()

        date_range = self.date_range[(self.date_range >= df_datetime.min()) & 
                                     (self.date_range <= df_datetime.max())]

        for start, end in pairwise(date_range):
            fold_info = {
                'Start date': start,
                'End date': end
            }
            train_mask = train_min_mask & (df_datetime < start)
            train_idx = df.index[train_mask]
            if fold_stats:
                fold_info['Train'] = len(train_idx)

            test_mask = (df_datetime >= start) & (df_datetime < end)
            test_idx = df.index[test_mask]
            
            if self.filter_cold_users:
                new = np.setdiff1d(
                    df.loc[test_idx, user_column].unique(), 
                    df.loc[train_idx, user_column].unique())
                new_idx = df.index[test_mask & df[user_column].isin(new)]
                test_idx = np.setdiff1d(test_idx, new_idx)
                test_mask = df.index.isin(test_idx)
                if fold_stats:
                    fold_info['New users'] = len(new)
                    fold_info['New users interactions'] = len(new_idx)

            if self.filter_cold_items:
                new = np.setdiff1d(
                    df.loc[test_idx, item_column].unique(), 
                    df.loc[train_idx, item_column].unique())
                new_idx = df.index[test_mask & df[item_column].isin(new)]
                test_idx = np.setdiff1d(test_idx, new_idx)
                test_mask = df.index.isin(test_idx)
                if fold_stats:
                    fold_info['New items'] = len(new)
                    fold_info['New items interactions'] = len(new_idx)

            if self.filter_already_seen:
                user_item = [user_column, item_column]
                train_pairs = df.loc[train_idx, user_item].set_index(user_item).index
                test_pairs = df.loc[test_idx, user_item].set_index(user_item).index
                intersection = train_pairs.intersection(test_pairs)
                test_idx = test_idx[~test_pairs.isin(intersection)]
                # test_mask = rd.df.index.isin(test_idx)
                if fold_stats:
                    fold_info['Known interactions'] = len(intersection)

            if fold_stats:
                fold_info['Test'] = len(test_idx)

            yield (train_idx, test_idx, fold_info)

    def get_n_splits(self, df, datetime_column='date'):
        df_datetime = df[datetime_column]
        if self.train_min_date is not None:
            df_datetime = df_datetime[df_datetime >= self.train_min_date]

        date_range = self.date_range[(self.date_range >= df_datetime.min()) & 
                                     (self.date_range <= df_datetime.max())]

        return max(0, len(date_range) - 1)

In [9]:
last_date = ratings['last_modified'].max().normalize() 
folds = 60
start_date = last_date - pd.Timedelta(days=folds)
start_date, last_date

(Timestamp('2022-12-08 00:00:00'), Timestamp('2023-02-06 00:00:00'))

In [10]:
cv = TimeRangeSplit(start_date=start_date, periods=folds+1)

cv.max_n_splits, cv.get_n_splits(ratings, datetime_column='last_modified')

(60, 60)

In [11]:
folds_with_stats = list(cv.split(
    ratings, 
    user_column='user_id',
    item_column='item_id',
    datetime_column='last_modified',
    fold_stats=True
))

folds_info_with_stats = pd.DataFrame([info for _, _, info in folds_with_stats])

In [12]:
fold_dates = [(info['Start date'], info['End date']) for _, _, info in folds_with_stats]

In [13]:
#разбиваем на тест и трейн
train_idx, test_idx, info = folds_with_stats[0]

train = ratings.loc[train_idx]
test = ratings.loc[test_idx]
train.shape, test.shape


((988378, 15), (469, 15))

In [14]:
#делаем словарь "user-порядковый номер"
users_inv_mapping = dict(enumerate(ratings['user_id'].unique()))
users_mapping = {v: k for k, v in users_inv_mapping.items()}

#такой же словарь "item-порядковый номер айтема"
items_inv_mapping = dict(enumerate(b_games['boardgame_id'].unique()))
items_mapping = {v: k for k, v in items_inv_mapping.items()}

item_titles = pd.Series(b_games['title'].values, index=b_games['boardgame_id']).to_dict()

In [15]:
b_games_modificated.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1999 entries, 77423 to 91
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   year_published        1999 non-null   int64  
 1   minplayers            1999 non-null   int64  
 2   maxplayers            1999 non-null   Int16  
 3   minplaytime           1999 non-null   int64  
 4   maxplaytime           1999 non-null   int64  
 5   age                   1999 non-null   int64  
 6   users_rated           1999 non-null   Int32  
 7   average_rating        1999 non-null   float32
 8   bayes_average_rating  1999 non-null   float32
 9   median                1999 non-null   float32
 10  stddev                1999 non-null   float32
 11  owned                 1999 non-null   Int32  
 12  trading               1999 non-null   Int16  
 13  wishing               1999 non-null   Int16  
 14  num_of_comments       1999 non-null   Int16  
 15  num_of_weights     

In [16]:
b_games_modificated = b_games_modificated.loc[:, [
                                                 'year_published',
                                                 'minplayers',
                                                 'maxplayers',
                                                 'minplaytime',
                                                 'maxplaytime',
                                                 'age',
                                                 'users_rated',
                                                 'average_rating',
                                                 'bayes_average_rating',]]
b_games_modificated['users_rated'] = b_games_modificated['users_rated'].astype('float16')

b_games_modificated = pd.get_dummies(b_games_modificated, drop_first=True, columns=['year_published',
                                                 'minplayers',
                                                 'maxplayers',
                                                 'minplaytime',
                                                 'maxplaytime',
                                                 'age',])

In [17]:
b_games_modificated.reset_index(inplace=True)

In [18]:
#делаем разреженную матрицу с нашими items
b_games_matrix = csr_matrix(b_games_modificated.drop('boardgame_id', axis=1).values)
b_games_matrix

<1999x196 sparse matrix of type '<class 'numpy.float32'>'
	with 17365 stored elements in Compressed Sparse Row format>

In [19]:
#делаем матрицу user-item и NaN заменим на нули
user_interaction_train = pd.pivot_table(train, index='user_id', columns='item_id', values='rating').fillna(0)
user_interaction_test = pd.pivot_table(test, index='user_id', columns='item_id', values='rating').fillna(0)

In [20]:
# проеобразуем в csr matrix нашу матрицу взаимодействий пользователя
train = csr_matrix(user_interaction_train.values)
test = csr_matrix(user_interaction_test.values)


In [21]:

model = LightFM(loss='warp',
                random_state=2016)

model = model.fit(train,
                  epochs=10,
                  verbose=False)


In [22]:
def generate_lightfm_recs_mapper(model, item_ids, N, user_mapping, item_inv_mapping, num_threads=4):
    def _recs_mapper(user):
        user_id = user_mapping[user]
        recs = model.predict(user_id, item_ids, num_threads=num_threads)

        top_cols = np.argpartition(recs, -np.arange(N))[-N:][::-1]
        
        final_recs = [item_inv_mapping[item] for item in top_cols]

        return final_recs[:N]
    return _recs_mapper

In [23]:
top_N=5
all_cols = list(items_mapping.values())

mapper = generate_lightfm_recs_mapper(
    model, 
    item_ids=all_cols, 
    N=top_N,
    user_mapping=users_mapping,
    item_inv_mapping=items_inv_mapping,
    num_threads=4
)

In [24]:
recs = pd.DataFrame({'user_id': ratings['user_id'].unique()})
recs['item_id'] = recs['user_id'].map(mapper)
recs = recs.explode('item_id')
recs['rank'] = recs.groupby('user_id').cumcount() + 1

In [25]:
recs

Unnamed: 0,user_id,item_id,rank
0,happyjosiah,194594,1
0,happyjosiah,165986,2
0,happyjosiah,57390,3
0,happyjosiah,229491,4
0,happyjosiah,37111,5
...,...,...,...
1631,Michiel,351913,1
1631,Michiel,314491,2
1631,Michiel,192074,3
1631,Michiel,126100,4


In [26]:
#функция, которая считает метрики
def compute_metrics(df_true, df_pred, top_N):
    result = {}
    test_recs = df_true.set_index(['user_id', 'item_id']).join(df_pred.set_index(['user_id', 'item_id']))
    test_recs = test_recs.sort_values(by=['user_id', 'rank'])

    test_recs['users_item_count'] = test_recs.groupby(level='user_id')['rank'].transform(np.size)
    test_recs['reciprocal_rank'] = (1 / test_recs['rank']).fillna(0)
    test_recs['cumulative_rank'] = test_recs.groupby(level='user_id').cumcount() + 1
    test_recs['cumulative_rank'] = test_recs['cumulative_rank'] / test_recs['rank']
    
    users_count = test_recs.index.get_level_values('user_id').nunique()
    for k in range(1, top_N + 1):
        hit_k = f'hit@{k}'
        test_recs[hit_k] = test_recs['rank'] <= k
        result[f'Precision@{k}'] = (test_recs[hit_k] / k).sum() / users_count
        result[f'Recall@{k}'] = (test_recs[hit_k] / test_recs['users_item_count']).sum() / users_count

    result[f'MAP@{top_N}'] = (test_recs["cumulative_rank"] / test_recs["users_item_count"]).sum() / users_count
    result[f'MRR'] = test_recs.groupby(level='user_id')['reciprocal_rank'].max().mean()
    return pd.Series(result)

In [27]:
epochs = [i for i in range(1, 22, 5)]

In [28]:
validation_results = pd.DataFrame()
top_N=5
for train_idx, test_idx, info in folds_with_stats:
    #print(f"test range - from {info['Start date']} to {info['End date']}")
    train = ratings.loc[train_idx]
    test = ratings.loc[test_idx]
    #print(f'train shape - {train.shape}, test shape - {test.shape}')
    
    train_mat = csr_matrix(pd.pivot_table(train, index='user_id', columns='item_id', values='rating').fillna(0).values)
    test_mat = csr_matrix(pd.pivot_table(test, index='user_id', columns='item_id', values='rating').fillna(0).values)

    for epoch in epochs:
        
        model = LightFM(loss='warp')

        model = model.fit(train_mat,
                          epochs=epoch,
                          verbose=False)

        mapper = generate_lightfm_recs_mapper(model, item_ids=all_cols, N=top_N, user_mapping=users_mapping, item_inv_mapping=items_inv_mapping)
        
        recs = pd.DataFrame({
                    'user_id': test['user_id'].unique()
                })
        recs['item_id'] = recs['user_id'].map(mapper)
        recs = recs.explode('item_id')
        recs['rank'] = recs.groupby('user_id').cumcount() + 1
        metrics = compute_metrics(test, recs, top_N)

        fold_result = pd.Series(metrics)
        fold_result.at['Date'] = info['Start date']
        fold_result.at['argv'] = f'epochs={epoch}'
        fold_result.at['Model'] = model.__class__.__name__

        validation_results = pd.concat([validation_results, fold_result], axis = 1, ignore_index=True)

In [31]:
validation_results.T.groupby(['Model', 'argv']).agg({
    'MRR': ['mean', 'std', 'min', 'max'],
    'MAP@5': ['mean', 'std', 'min', 'max'],
    'Recall@5': ['mean', 'std', 'min', 'max'],
})

Unnamed: 0_level_0,Unnamed: 1_level_0,MRR,MRR,MRR,MRR,MAP@5,MAP@5,MAP@5,MAP@5,Recall@5,Recall@5,Recall@5,Recall@5
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,min,max,mean,std,min,max,mean,std,min,max
Model,argv,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
LightFM,epochs=1,0.001702,0.00349,0.0,0.020833,0.000744,0.002784,0.0,0.020833,0.002578,0.010929,0.0,0.083333
LightFM,epochs=11,0.002858,0.004419,0.0,0.017014,0.001487,0.002799,0.0,0.012887,0.003504,0.006532,0.0,0.038462
LightFM,epochs=16,0.002478,0.004273,0.0,0.015957,0.001147,0.002731,0.0,0.012887,0.002867,0.006379,0.0,0.038462
LightFM,epochs=21,0.003635,0.005532,0.0,0.020619,0.001509,0.003149,0.0,0.015464,0.003271,0.006681,0.0,0.038462
LightFM,epochs=6,0.002842,0.00476,0.0,0.020492,0.001073,0.002183,0.0,0.008772,0.002728,0.005234,0.0,0.023256
