#Imports and data prepairing

In [None]:
import numpy as np
import pandas as pd
import warnings
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix
from sklearn.metrics import ndcg_score
from sklearn.neighbors import NearestNeighbors
from scipy.spatial.distance import correlation
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler

warnings.filterwarnings('ignore')

In [None]:
from google.colab import drive
drive.mount('/drive')
path = "/drive/MyDrive/DS/RecSys"

!mkdir data
!cp -a /$path/. data

Mounted at /drive


In [None]:
!7z e "data/train.csv.7z"


7-Zip [64] 16.02 : Copyright (c) 1999-2016 Igor Pavlov : 2016-05-21
p7zip Version 16.02 (locale=en_US.UTF-8,Utf16=on,HugeFiles=on,64 bits,2 CPUs Intel(R) Xeon(R) CPU @ 2.00GHz (50653),ASM,AES-NI)

Scanning the drive for archives:
  0M Scan         1 file, 106420688 bytes (102 MiB)

Extracting archive: data/train.csv.7z
--
Path = data/train.csv.7z
Type = 7z
Physical Size = 106420688
Headers Size = 122
Method = LZMA2:24
Solid = -
Blocks = 1

  0%      2% - train.csv                  4% - train.csv                  6% - train.csv                  9% - train.csv                 12% - train.csv                 14% - train.csv                 17% - train.csv                 18% - train.csv                 21% - tr

In [None]:
data = pd.read_csv("train.csv")
data.head()

Unnamed: 0,msno,song_id,source_system_tab,source_screen_name,source_type,target
0,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,explore,Explore,online-playlist,1
1,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=,my library,Local playlist more,local-playlist,1
2,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=,my library,Local playlist more,local-playlist,1
3,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=,my library,Local playlist more,local-playlist,1
4,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=,explore,Explore,online-playlist,1


#Исследование Моделей

Попробуем 3 метод(описан в другом ноутбуке). Если коротко, то представим для каждого пользователя релевантные айтемы, как последовательность слов. И будем предсказывать подходящие по смыслу слова.

In [None]:
data = data[['msno', 'song_id', 'target']]
data.head()

Unnamed: 0,msno,song_id,target
0,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,1
1,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=,1
2,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=,1
3,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=,1
4,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=,1


Закодируем все Id юзеров и песен и заменим на инлексы их айдишнки:

In [None]:
data = data.dropna()

users_arr = data["msno"].nunique()
songs_arr = data["song_id"].nunique()

user_to_idx = dict(zip(np.unique(data["msno"]), list(range(users_arr))))
song_to_idx = dict(zip(np.unique(data["song_id"]), list(range(songs_arr))))
idx_to_user = dict(zip(list(range(users_arr)), np.unique(data["msno"])))
idx_to_song = dict(zip(list(range(songs_arr)), np.unique(data["song_id"])))

data["msno"] = data["msno"].apply(lambda x: user_to_idx[x])
data["song_id"] = data["song_id"].apply(lambda x: song_to_idx[x])

data.head()

Unnamed: 0,msno,song_id,target
0,8158,74679,1
1,17259,223479,1
2,17259,120758,1
3,17259,23707,1
4,8158,33308,1


Теперь получиv train и test части:

In [None]:
data_train, data_test = train_test_split(data, test_size=0.2, shuffle=True)
data_train.shape, data_test.shape

((5901934, 3), (1475484, 3))

Представим их в формате юзер: последовательность релевантных/нерелевантных айтемов.

In [None]:
data_train_relevant = data_train[data_train['target'] == 1].groupby('msno')['song_id'].apply(list).reset_index()
data_train_relevant = pd.DataFrame({'msno': data_train_relevant['msno'], 'relevant': data_train_relevant['song_id']})
data_train_relevant = data_train_relevant.sort_values(by='msno')

data_train_relevant.head()

Unnamed: 0,msno,relevant
0,0,"[312530, 109275, 205905, 295587, 174513, 81189..."
1,1,"[215276, 336313, 319161, 158168, 119518, 17625..."
2,2,"[158356, 256953, 351749, 89514, 335402, 29948,..."
3,3,[91646]
4,4,"[132449, 56825, 140667, 275289, 146647, 26036,..."


In [None]:
data_train_unrelevant = data_train[data_train['target'] == 0].groupby('msno')['song_id'].apply(list).reset_index()
data_train_unrelevant = pd.DataFrame({'msno': data_train_unrelevant['msno'], 'unrelevant': data_train_unrelevant['song_id']})
data_train_unrelevant = data_train_unrelevant.sort_values(by='msno')

data_train_unrelevant.head()

Unnamed: 0,msno,unrelevant
0,0,"[206365, 349588, 357716, 294389, 249803, 10554..."
1,1,"[113756, 261188, 351616, 224523, 222716, 19963..."
2,2,"[213167, 231584, 12998, 154990, 230572, 283231..."
3,3,"[6483, 168333, 255003, 167303, 147886, 73313, ..."
4,4,"[44174, 182691, 359654, 195103, 95767, 245192,..."


Также подготовим в test треки с положительным взаимодействием:

In [None]:
data_test = data_test[data_test['target'] == 1].groupby('msno')['song_id'].apply(list).reset_index()
data_test = pd.DataFrame({'msno': data_test['msno'], 'relevant': data_test['song_id']})
data_test = data_test.sort_values(by='msno')

data_test.head()

Unnamed: 0,msno,relevant
0,0,"[130636, 175398, 359499, 269359, 339311, 32455..."
1,1,"[81614, 326162, 83292, 5174, 312782, 50398, 25..."
2,2,"[29697, 217320, 192060, 8218, 106432, 227529, ..."
3,4,"[6504, 225377, 5627, 339623, 313171, 332099, 8..."
4,6,"[30692, 154323, 194004, 222795, 316739, 146487..."


Теперь создадим массив с положительными и отрицательными взаимодействиями:

In [None]:
song_groups = list(data_train_relevant['relevant']) + list(data_train_unrelevant['unrelevant'])
len(song_groups)

56896

Загрузим модель. Вначале попробуем подавать позитивные и негативные сэмплы.

In [None]:
from gensim.models import Word2Vec
import datetime
import logging
logging.basicConfig()
logging.getLogger().setLevel(logging.INFO)

max_window = 99999999

start = datetime.datetime.now()
model = Word2Vec(song_groups, epochs=5, window=max_window, sg=1, min_count=10, hs=0, negative=5)
print("Time passed: " + str(datetime.datetime.now()-start))

INFO:gensim.models.word2vec:collecting all words and their counts
INFO:gensim.models.word2vec:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #10000, processed 1103837 words, keeping 119045 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #20000, processed 2204439 words, keeping 171840 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #30000, processed 3287311 words, keeping 218805 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #40000, processed 4240874 words, keeping 264358 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #50000, processed 5220200 words, keeping 301808 word types
INFO:gensim.models.word2vec:collected 324602 word types from a corpus of 5901934 raw words and 56896 sentences
INFO:gensim.models.word2vec:Creating a fresh vocabulary
INFO:gensim.utils:Word2Vec lifecycle event {'msg': 'effective_min_count=10 retains 53230 unique words (16.40% of original 324602

Time passed: 2:22:44.904501


Сохраним обученную модель:

In [None]:
model.save('item2vec_pos_and_neg_samples.model')

INFO:gensim.utils:Word2Vec lifecycle event {'fname_or_handle': 'item2vec_pos_and_neg_samples.model', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2024-05-01T12:48:17.403335', 'gensim': '4.3.2', 'python': '3.10.12 (main, Nov 20 2023, 15:14:05) [GCC 11.4.0]', 'platform': 'Linux-6.1.58+-x86_64-with-glibc2.35', 'event': 'saving'}
INFO:gensim.utils:not storing attribute cum_table
INFO:gensim.utils:saved item2vec_pos_and_neg_samples.model


Определим метрику:

In [None]:
def calc_ndcg(recomendations, relevant, k=20):
    if len(recomendations) < len(relevant):
        recomendations = np.concatenate((recomendations, np.zeros(len(relevant) - len(recomendations))))
    if len(relevant) < len(recomendations):
        relevant = np.concatenate((relevant, np.zeros(len(recomendations) - len(relevant))))

    return ndcg_score([relevant], [recomendations], k = 20)

Построим рекомендации. Для этого возьмем каждый релевантный для пользователя трек и вернем похожие на него треки.

In [None]:
def recommend_for_user(user_idx, model_emb, train, N_neighb=20):
    if not(user_idx in train['msno'].values):
        return np.asarray([])

    train_relevant = train[train['msno'] == user_idx]['relevant']
    if train_relevant.empty:
        return np.asarray([])

    recommendations = {}
    for song_idx in train_relevant.iloc[0]:
        if song_idx in model.wv.key_to_index.keys():
            nearest_for_song = model_emb.wv.most_similar(positive=[song_idx], topn=20)
            for ind, dist in nearest_for_song:
                if ind not in recommendations or recommendations[ind] > dist:
                    recommendations[ind] = dist

    sorted_recommendations = sorted(recommendations.items(), key=lambda x: x[1])
    top_recommendations = dict(sorted_recommendations[:N_neighb])

    return np.asarray(list(top_recommendations.keys()))

In [None]:
def predict_for_all_users(users, train, model_emb):
    ncdg_values = np.asarray([])

    for index, row in users.iterrows():
        user_idx = int(row['msno'])
        recomendations = recommend_for_user(user_idx, model_emb, train)
        relevant = np.asarray(row['relevant'])

        if (len(relevant) != 0) and (len(recomendations) != 0):
            ndcg_user = calc_ndcg(recomendations, relevant, 20)
            ncdg_values = np.concatenate((ncdg_values, np.asarray([ndcg_user])))

    return ncdg_values

Посмотрим на результат. Так как подсчет на всез пользователях будет занимать примерно 5 часов на T4 GPU, то возьмем случаную выборку и протестируем на ней:

In [None]:
ncdg_values = predict_for_all_users(data_test.sample(2400), data_train_relevant, model)

print('max ndcg@20 score: ', ncdg_values.max())
print('min ndcg@20 score: ', ncdg_values.min())
print('mean ndcg@20 score: ', ncdg_values.sum()/len(ncdg_values))

max ndcg@20 score:  1.0
min ndcg@20 score:  0.2276702486969526
mean ndcg@20 score:  0.5935382149986984


Это лишний раз подтверждает предположние, что допольнительная информация о треках и пользователях помогает значительно увеличить метрику NDCG@20. Так как в этом случае скор сопоставимый с остальными методами, которые используют только информацию положительных и негативных взаимодействий.

также приложу таблицу сравнений усредненных по юзерам значений NDCG@20(остальные модели во втором ноутбуке):


| model_name    | XGBRanker | kNN (cosine) | kNN (euclidean) | Implicit ALS | Word2Vec + cossim |
|---------------|-----------|--------------|-----------------|--------------|-------------------|
| NDCG@20 score | 0.8203    | 0.6157       | 0.6545          | 0.5715       | 0.5935            |