In [12]:
import os
import numpy as np 
import pandas as pd 
from itertools import islice, cycle
import os

In [13]:
# Load every CSV file found in ./data into a notebook-level DataFrame named
# after the file, e.g. ./data/interactions.csv -> `interactions`.
files = os.listdir('./data')
# Keep only real .csv entries: blindly stripping the last four characters of
# other entries (hidden files, checkpoints, other formats) would produce a
# path that does not exist and crash read_csv.
names = [os.path.splitext(file)[0] for file in files if file.endswith('.csv')]
for name in names:
    # globals() is the explicit notebook namespace; writing through locals()
    # only works when the code happens to run at module scope.
    globals()[name] = pd.read_csv(os.path.join('./data', '{}.csv'.format(name)))

In [14]:
class PopularRecommender():
    """Recommend the most frequently seen items from a recent time window.

    Parameters
    ----------
    max_K : int
        Maximum number of popular items remembered by ``fit``.
    days : int
        Length of the look-back window, in days, counted back from the
        latest date present in the training frame.
    item_column : str
        Name of the column holding item identifiers.
    dt_column : str
        Name of the column holding the interaction datetimes.
    """

    def __init__(self, max_K=100, days=30, item_column='item_id', dt_column='date'):
        self.item_column = item_column
        self.dt_column = dt_column
        self.days = days
        self.max_K = max_K
        # Empty until fit() is called.
        self.recommendations = []

    def fit(self, df):
        """Store the ``max_K`` most frequent items of the recent window.

        The window starts ``days`` days before midnight of the latest value in
        ``dt_column``; only rows strictly later than that cutoff are counted.
        """
        timestamps = df[self.dt_column]
        cutoff = timestamps.max().normalize() - pd.DateOffset(days=self.days)
        recent_items = df.loc[timestamps > cutoff, self.item_column]
        popularity = recent_items.value_counts()
        self.recommendations = popularity.head(self.max_K).index.values

    def recommend(self, users=None, N=10):
        """Return the top ``N`` stored items.

        With ``users`` omitted, the top-N slice itself is returned. Otherwise
        the result is a list with one entry per user, every entry being that
        same top-N slice.
        """
        top_n = self.recommendations[:N]
        if users is not None:
            # Every user receives the same (shared) top-N object.
            return [top_n] * len(users)
        return top_n

In [15]:
# Convert the start_date column to pandas datetime values
# (PopularRecommender.fit calls .max().normalize() on this column).
interactions['start_date'] = pd.to_datetime(interactions['start_date'])

In [16]:
# Fit the popularity model: count items over the last 25 days of
# interactions, using start_date as the datetime column.
pop = PopularRecommender(days=25, dt_column='start_date')
pop.fit(interactions)

In [17]:
# Build a frame with one row per unique Id from the sample submission
pred_pop = pd.DataFrame({'user_id': sample_submission['Id'].unique()})

# Predict with the popularity model for all ids at once
# (every user gets the same top-10 list)
pred_pop['item_id'] = pop.recommend(pred_pop['user_id'], N=10)
pred_pop.head()

Unnamed: 0,user_id,item_id
0,10001,"[283713, 184549, 276903, 168963, 357309, 14317..."
1,10002,"[283713, 184549, 276903, 168963, 357309, 14317..."
2,100152,"[283713, 184549, 276903, 168963, 357309, 14317..."
3,100197,"[283713, 184549, 276903, 168963, 357309, 14317..."
4,100284,"[283713, 184549, 276903, 168963, 357309, 14317..."


In [18]:
# Unfold the recommendation lists into a long (one row per user-item pair)
# format; the rank is assigned in the next step.
# rank = relevance of the recommendation for the user
# NOTE: explode repeats the original index label for every unfolded row.
pred_pop = pred_pop.explode('item_id')
pred_pop.head()

Unnamed: 0,user_id,item_id
0,10001,283713
0,10001,184549
0,10001,276903
0,10001,168963
0,10001,357309


In [19]:
# Rank = 1-based position of the row within its user's group, in current row order
pred_pop['rank'] = pred_pop.groupby('user_id').cumcount() + 1
pred_pop.head()

Unnamed: 0,user_id,item_id,rank
0,10001,283713,1
0,10001,184549,2
0,10001,276903,3
0,10001,168963,4
0,10001,357309,5


In [20]:
def make_submission(df, id_col='user_id', predicted_col='item_id'):
    """Collapse long-format recommendations into a submission table.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per (id, predicted item) pair. Rows of the same id are joined
        in the order they appear in ``df``. The input is not modified.
    id_col : str
        Column holding the identifier to group by.
    predicted_col : str
        Column holding the predicted item for each row.

    Returns
    -------
    pandas.DataFrame
        Columns ``Id`` and ``Predicted``: one row per id (as a string, sorted
        by the group key) with its items joined by single spaces.
    """
    # Cast via astype on a fresh selection instead of `.loc[:, col] = ...`:
    # writing strings into an integer column in place is deprecated in pandas
    # (FutureWarning today, an error in a future release).
    as_text = df[[id_col, predicted_col]].astype(str)
    submission = (
        as_text
        .groupby(id_col, as_index=False)
        .agg({predicted_col: ' '.join})
        .rename(columns={id_col: 'Id', predicted_col: 'Predicted'})
    )
    return submission[['Id', 'Predicted']]

In [21]:
# Collapse the long-format predictions into one row per Id with space-separated items
submission_pop = make_submission(pred_pop)
submission_pop.head()

Unnamed: 0,Id,Predicted
0,10001,283713 184549 276903 168963 357309 143175 3852...
1,10002,283713 184549 276903 168963 357309 143175 3852...
2,100152,283713 184549 276903 168963 357309 143175 3852...
3,100197,283713 184549 276903 168963 357309 143175 3852...
4,100284,283713 184549 276903 168963 357309 143175 3852...


In [22]:
# Write the submission to submission_pop.csv without the index column
submission_pop.to_csv('submission_pop.csv', index=False)