In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
# train is ordered chronologically
train = pd.read_parquet('data/train.parquet.gzip')
train.dtypes

user_id      int32
item_id      int32
timespent     int8
reaction      int8
dtype: object

In [3]:
# timespent: how long the user lingered on the item, in minutes (from 0 to 60)
# reaction: (1) - like, (-1) - dislike
train

Unnamed: 0,user_id,item_id,timespent,reaction
0,707536,67950,0,0
1,707536,151002,0,0
2,707536,134736,0,0
3,707536,196151,0,0
4,707536,94182,0,0
...,...,...,...,...
144440010,849764,80910,0,0
144440011,993316,132328,0,0
144440012,993316,186701,0,0
144440013,666981,81857,0,0


In [6]:
# Row and column counts of the interaction log.
train.shape

(144440015, 4)

In [4]:
# Number of interaction rows per user, most active users first.
train.user_id.value_counts()

675423    1018
682064     979
675869     958
101627     955
745655     954
          ... 
253866       7
952907       7
523767       7
377770       6
529279       6
Name: user_id, Length: 1000183, dtype: int64

In [5]:
# items_meta holds, for every item_id, its author (source_id) and a content embedding
items_meta = pd.read_parquet('data/items_meta.parquet.gzip')
items_meta

Unnamed: 0,item_id,source_id,embeddings
0,0,7340,"[0.10458118, 0.047880154, 0.030944156, -0.0351..."
1,1,6284,"[0.035625108, -0.039264094, -0.03310334, -0.04..."
2,2,12766,"[0.08418761, 0.006732465, -0.0037112322, -0.02..."
3,3,14734,"[0.049901545, 0.039079394, -0.03890682, -0.053..."
4,4,22557,"[0.09303163, 0.023448057, 0.0029488814, -0.017..."
...,...,...,...
227601,227601,19043,"[0.06742832, -0.08209568, -0.04407321, 0.00838..."
227602,227602,9384,"[0.07055114, -0.007334651, -0.0032477665, 0.00..."
227603,227603,24152,"[0.13771634, 0.023559753, 0.012204557, -0.0361..."
227604,227604,20249,"[0.04954276, -0.00674311, -0.040121585, -0.024..."


In [6]:
# candidates holds the item_ids of the fresh candidates from which test predictions must be chosen
candidates_df = pd.read_parquet('data/fresh_candidates.parquet.gzip')
candidates_df

Unnamed: 0,item_id
0,0
1,2
2,5
3,6
4,7
...,...
99995,227588
99996,227591
99997,227602
99998,227603


In [7]:
# Users for whom a prediction is produced further down the notebook.
test = pd.read_parquet('data/test.parquet.gzip')
test

Unnamed: 0,user_id
0,7
1,8
2,9
3,11
4,18
...,...
199995,1000160
199996,1000165
199997,1000166
199998,1000168


In [8]:
class Baseline:
    """Non-personalized recommender: every user receives the same top-N candidates.

    Candidates are ranked by the mean ``timespent`` their item received in the
    interaction log passed to :meth:`fit`.
    """

    def __init__(self, n_candidates):
        """Remember how many items each prediction should contain.

        Args:
            n_candidates: maximum number of item ids returned by :meth:`predict`.
        """
        self.n_candidates = n_candidates

    def fit(self, interactions, candidates):
        """Rank ``candidates`` by mean ``timespent`` and keep the best ones.

        Args:
            interactions: DataFrame with at least ``item_id`` and ``timespent``
                columns.
            candidates: array-like of item ids eligible for recommendation.

        The popularity lookup is done by item id *label*, not by position, so
        it stays correct when ``interactions`` does not contain every id in
        ``0..max(item_id)``. Candidates with no interactions are scored 0.
        """
        candidates = np.asarray(candidates)
        # Keep the Series (indexed by item_id) instead of its bare values:
        # positional indexing with item ids is only valid for gap-free ids.
        popularity = interactions.groupby('item_id')['timespent'].mean()
        candidates_popularity = (
            popularity.reindex(candidates).fillna(0.0).to_numpy()
        )
        # Stable sort keeps the original candidate order among ties, which
        # makes the resulting ranking deterministic.
        most_popular_idxs = (-candidates_popularity).argsort(kind='stable')[:self.n_candidates]
        self.impersonal_prediction = candidates[most_popular_idxs]

    def predict(self, user_id):
        """Return the shared top-N item ids; ``user_id`` is accepted but unused.

        :meth:`fit` must have been called first.
        """
        return self.impersonal_prediction

In [9]:
# Fit the non-personalized baseline once, then collect its top-20 list for each test user.
baseline = Baseline(20)
baseline.fit(train, candidates_df.item_id.values)
predictions = [baseline.predict(user_id) for user_id in tqdm(test.user_id)]

100%|██████████████████████████████| 200000/200000 [00:00<00:00, 2292633.39it/s]


In [10]:
# Attach the per-user prediction lists as a new column (this mutates `test` in place).
test['predictions'] = predictions
test

Unnamed: 0,user_id,predictions
0,7,"[4628, 103927, 146586, 18584, 75560, 44269, 58..."
1,8,"[4628, 103927, 146586, 18584, 75560, 44269, 58..."
2,9,"[4628, 103927, 146586, 18584, 75560, 44269, 58..."
3,11,"[4628, 103927, 146586, 18584, 75560, 44269, 58..."
4,18,"[4628, 103927, 146586, 18584, 75560, 44269, 58..."
...,...,...
199995,1000160,"[4628, 103927, 146586, 18584, 75560, 44269, 58..."
199996,1000165,"[4628, 103927, 146586, 18584, 75560, 44269, 58..."
199997,1000166,"[4628, 103927, 146586, 18584, 75560, 44269, 58..."
199998,1000168,"[4628, 103927, 146586, 18584, 75560, 44269, 58..."


In [11]:
# Write the test frame (user_id + predictions) to a gzip-compressed parquet file.
test.to_parquet('data/sample_submission.parquet.gzip', compression='gzip')