In [1]:
%env MKL_NUM_THREADS=1

import os
import numpy as np
from scipy import sparse
from sklearn.feature_extraction import text
import implicit

from ya_cup_2022.scoring import mrr_score

env: MKL_NUM_THREADS=1


In [2]:
TRAIN_FN = 'train'
# TEST_FN = '_test_xxl'
# ANS_FN = '_ans_xxl'
TEST_FN = 'test'
ANS_FN = None

# MODEL_CLS = implicit.als.AlternatingLeastSquares
# MODEL_PARAMS = {'factors': 1000, 'alpha': 100, 'regularization': 10}
# # MODEL_FN = '_als_xxl_f1000_a100_r10'
# # PRED_FN = '_pred_xxl_als_f1000_a100_r10'
# MODEL_FN = 'als_f1000_a100_r10'
# PRED_FN = 'pred_als_f1000_a100_r10'

MODEL_CLS = implicit.bpr.BayesianPersonalizedRanking
MODEL_PARAMS = {'factors': 150, 'learning_rate': 0.01, 'regularization': 0.000001}
MODEL_FN = 'bpr_f150_l01_r000001'
PRED_FN = 'pred_bpr_f150_l01_r000001'

N = 100

WD = os.path.join('D:', 'education', 'yaintern', 'yandex_cup_2022', 'data')
WD

'D:education\\yaintern\\yandex_cup_2022\\data'

In [3]:
def vectorize(frain_file, test_file):
    vectorizer = text.CountVectorizer(lowercase=False, token_pattern=r'\b\d+\b')
    
    with open(frain_file, 'rt') as f:
        users = [line.strip() for line in f.readlines()]
    
    test_users_offset = len(users)
    
    with open(test_file, 'rt') as f:
        users.extend(line.strip() for line in f.readlines())
    
    X = vectorizer.fit_transform(users)
    return X, vectorizer, test_users_offset

In [4]:
X, vectorizer, test_offset = vectorize(
    os.path.join(WD, TRAIN_FN),
    os.path.join(WD, TEST_FN),
)

X

<1449998x483275 sparse matrix of type '<class 'numpy.int64'>'
	with 117450834 stored elements in Compressed Sparse Row format>

In [5]:
ind_2_track = dict()
for track, ind in vectorizer.vocabulary_.items():
    ind_2_track[ind] = track

In [6]:
model_path = os.path.join(WD, MODEL_FN)
if os.path.exists(model_path):
    model = MODEL_CLS.load(model_path)
else:
    model = MODEL_CLS(random_state=42, **MODEL_PARAMS)
    model.fit(X)
#     model.save(model_path)

  0%|          | 0/100 [00:00<?, ?it/s]

In [7]:
user_ids = list(range(test_offset, X.shape[0]))
recs = model.recommend(
    user_ids,
    X[test_offset:, :],
    filter_already_liked_items=True,
    N=N,
)

y_pred = [[ind_2_track[i] for i in rec] for rec in recs[0]]

In [8]:
if ANS_FN is not None:
    with open(os.path.join(WD, ANS_FN), 'rt') as f:
        y_true = [line.strip() for line in f.readlines()]
        print(mrr_score(y_true, y_pred))

In [9]:
with open(os.path.join(WD, PRED_FN), 'wt') as f:
    for tracks in y_pred:
        f.write(f'{" ".join(tracks)}\n')