In [1]:
import os
from collections import defaultdict

import pandas as pd
from sklearn.model_selection import train_test_split

from catboost import CatBoostRanker, Pool

from ya_cup_2022.df import CSVDataFrameHandler, FMPSampler
from ya_cup_2022.df.dto import GeneratorOptions

In [2]:
ARTISTS_FN = 'track_artists.csv'
TRAIN_FN = '_train_300k'
FMP_WD = 'xxl'
CUTOFF = 200
MAX_SKIP = 3
N_PREV_LIKES = 10
N_FAV_ARTISTS = 5
SAMPLE_SIZE = 200
EXTENDED = True

MAX_USERS = 10000
MAX_ROWS = MAX_USERS * SAMPLE_SIZE
MAX_CHUNKS = 1

EXT_PREFIX = 'ext' if EXTENDED else ''
DF_FN = (
    f'df_'
    f'{TRAIN_FN}_'
    f'{FMP_WD}_'
    f'{CUTOFF}_'
    f'{MAX_SKIP}_'
    f'{EXT_PREFIX}_'
    f'{N_PREV_LIKES}_'
    f'{N_FAV_ARTISTS}_'
    f'{SAMPLE_SIZE}'
)

MODEL_FN = f'catboost_rnk__{DF_FN}'
DF_FN += '.csv'

WD = os.path.join('D:', 'education', 'yaintern', 'yandex_cup_2022', 'data')
WD

'D:education\\yaintern\\yandex_cup_2022\\data'

In [3]:
sampler = FMPSampler(
    os.path.join(WD, 'fm', FMP_WD),
    cutoff=CUTOFF,
    max_skip=MAX_SKIP,
    extended=EXTENDED,
)

df = CSVDataFrameHandler(
    filename=os.path.join(WD, DF_FN),
    options=GeneratorOptions(
        n_prev_likes=N_PREV_LIKES,
        n_fav_artists=N_FAV_ARTISTS,
        sample_size=SAMPLE_SIZE),
    add_features=sampler.add_features,
    converter=os.path.join(WD, ARTISTS_FN),
).load_chunked(MAX_ROWS, max_chunks=MAX_CHUNKS)

df.head()

Loading tracks/artists from file D:education\yaintern\yandex_cup_2022\data\track_artists.csv
Total # of tracks: 483275
Total # of artists: 56134


Unnamed: 0,user_id,prev_like_0,prev_like_1,prev_like_2,prev_like_3,prev_like_4,prev_like_5,prev_like_6,prev_like_7,prev_like_8,...,cur_track_id,cur_artist_id,sampler_rank,f,f_0,f_1,f_2,f_3,like,rank
0,0,466482,98655,405936,178706,218582,67565,199956,208051,40199,...,452615,15780,1.0,0.012936,0.012936,0.0,0.0,0.0,0,0.995
1,0,466482,98655,405936,178706,218582,67565,199956,208051,40199,...,128093,3471,0.995,0.010321,0.002156,0.0059,0.001027,0.001238,0,0.99
2,0,466482,98655,405936,178706,218582,67565,199956,208051,40199,...,447786,15780,0.99,0.010192,0.010192,0.0,0.0,0.0,0,0.985
3,0,466482,98655,405936,178706,218582,67565,199956,208051,40199,...,44094,36803,0.985,0.005441,0.002156,0.002771,0.000514,0.0,0,0.98
4,0,466482,98655,405936,178706,218582,67565,199956,208051,40199,...,369944,36803,0.98,0.005193,0.00294,0.00152,0.000321,0.000413,0,0.975


In [4]:
cat_features = list()
for i, t in enumerate(df.dtypes):
    if isinstance(t, pd.api.types.CategoricalDtype):
        cat_features.append(df.columns[i])
print(cat_features)

['prev_like_0', 'prev_like_1', 'prev_like_2', 'prev_like_3', 'prev_like_4', 'prev_like_5', 'prev_like_6', 'prev_like_7', 'prev_like_8', 'prev_like_9', 'prev_artist_0', 'prev_artist_1', 'prev_artist_2', 'prev_artist_3', 'prev_artist_4', 'prev_artist_5', 'prev_artist_6', 'prev_artist_7', 'prev_artist_8', 'prev_artist_9', 'fav_artist_0', 'fav_artist_1', 'fav_artist_2', 'fav_artist_3', 'fav_artist_4', 'cur_track_id', 'cur_artist_id']


In [8]:
for col in filter(lambda c: c.startswith('fav_artist_'), cat_features):
    aid = col.split('_')[-1]
    df[f'fav_artist_match_{aid}'] = (df['cur_artist_id'] == df[col]).astype(int)

0          0
1          0
2          0
3          0
4          0
          ..
1865608    0
1865609    0
1865610    0
1865611    0
1865612    0
Name: fav_artist_match_0, Length: 1865613, dtype: int32

In [6]:
def create_pool(df, users, cat_features):
    X = df[df['user_id'].isin(users)].reset_index(drop=True)
    
    pairs = list()
    X['i'] = X.index.values
    likes_ids = X[X['like'] == 1].index.values
    tracks_ids = X[X['like'] != 1].groupby('user_id').agg({'i': list})['i'].values
    for i, js in zip(likes_ids, tracks_ids):
        for j in js:
            pairs.append((i, j))
    
    return Pool(
        data=X.drop(['user_id', 'like', 'i'], axis=1),
        label=X['like'],
        group_id=X['user_id'],
        cat_features=cat_features,
        pairs=pairs,
    )

In [7]:
unique_users = list(df['user_id'].unique())
u_train, u_valid = train_test_split(
    unique_users, 
    test_size=0.2, 
    random_state=42,
)

train_data = create_pool(df, u_train, cat_features)
valid_data = create_pool(df, u_valid, cat_features)

KeyboardInterrupt: 

In [None]:
model = CatBoostRanker(
    loss_function='PairLogitPairwise',
    custom_metric='MRR:top=100',
    eval_metric='MRR:top=100',
    metric_period=15,
    per_float_feature_quantization=[
        '32:border_count=1024',
        '33:border_count=1024',
        '34:border_count=1024',
        '35:border_count=1024',
        '36:border_count=1024',
    ],
    early_stopping_rounds=100,
    use_best_model=True,
    random_state=42, 
    task_type='GPU', 
    devices='0',
)

model.fit(
    train_data, 
    eval_set=valid_data,
    plot=False,
)