# Imports

In [135]:
from datetime import timedelta
from itertools import product
from tqdm.notebook import tqdm
import pandas as pd
import numpy as np
from rectools import Columns
from rectools.models import PopularModel, PopularInCategoryModel
from rectools.metrics import calc_metrics, MAP
from rectools.dataset import Dataset, Interactions
from rectools.model_selection import TimeRangeSplitter
from models.popular_among_n_unique import PopularAmongNUniqueUsers

# Loading data

In [2]:
# Read the three input CSV tables from the data/ directory.
users = pd.read_csv("data/users.csv")
items = pd.read_csv("data/items.csv")
interactions = pd.read_csv("data/interactions.csv")

In [3]:
# Submission template: its user_id column is iterated when predictions are assembled,
# and its item_id column is overwritten with the predicted lists before saving.
sample_submission = pd.read_csv("data/sample_submission.csv")

In [4]:
# Display the submission template (pandas elides the middle rows in the rendered table).
sample_submission

Unnamed: 0,user_id,item_id
0,3,"[9728, 15297, 10440, 14488, 13865, 12192, 341,..."
1,11,"[9728, 15297, 10440, 14488, 13865, 12192, 341,..."
2,29,"[9728, 15297, 10440, 14488, 13865, 12192, 341,..."
3,30,"[9728, 15297, 10440, 14488, 13865, 12192, 341,..."
4,33,"[9728, 15297, 10440, 14488, 13865, 12192, 341,..."
...,...,...
193108,1097527,"[9728, 15297, 10440, 14488, 13865, 12192, 341,..."
193109,1097537,"[9728, 15297, 10440, 14488, 13865, 12192, 341,..."
193110,1097538,"[9728, 15297, 10440, 14488, 13865, 12192, 341,..."
193111,1097544,"[9728, 15297, 10440, 14488, 13865, 12192, 341,..."


# Data preprocessing

In [5]:
# Map raw column names onto the names rectools expects
# (Columns.Item / Columns.Datetime / Columns.Weight).
# `DataFrame.rename` ignores mapping keys that are not present in the frame.
# The result is rebound instead of using `inplace=True`, so the cell produces a new
# frame from its input rather than mutating the loaded one.
interactions = interactions.rename(
    columns={
        'track_id': Columns.Item,
        'last_watch_dt': Columns.Datetime,
        'total_dur': Columns.Weight,
    }
)

In [6]:
# Bucket release years into (up to) 10 quantile bins; every bucket is labelled with its
# left edge, so the feature value is "the first year of the bucket".
_, bins = pd.qcut(items["release_year"], 10, retbins=True)
labels = bins[:-1]

year_feature = pd.DataFrame(
    {
        Columns.Item: items[Columns.Item],
        # include_lowest=True is required: pd.cut intervals are right-closed by default,
        # so the minimum year (== bins[0]) would otherwise fall outside the first bin
        # and turn into NaN, even though qcut itself had included it.
        "value": pd.cut(
            items["release_year"],
            bins=bins,
            labels=labels,
            include_lowest=True,
        ),
        "feature": "release_year",
    }
)
year_feature.head()

Unnamed: 0,item_id,value,feature
0,10711,1983.0,release_year
1,2508,2012.0,release_year
2,10716,2009.0,release_year
3,7868,2014.0,release_year
4,16268,1897.0,release_year


In [7]:
# Turn the comma-separated `genres` string into a list per item, stored in a new `genre` column.
items['genre'] = items['genres'].str.split(",")

In [8]:
# Long-format genre feature: one row per (item, genre) pair.
# The genre lists come from splitting on a bare ",", so any space that follows a comma in
# the source string would stay glued to the value and create duplicate categories
# (" drama" vs "drama"); stripping is a no-op when no such whitespace exists.
genre_feature = (
    items[[Columns.Item, "genre"]]
    .explode("genre")
    .rename(columns={"genre": "value"})
    .assign(
        value=lambda exploded: exploded["value"].str.strip(),
        feature="genre",
    )
)
genre_feature.head()

Unnamed: 0,item_id,value,feature
0,10711,драмы,genre
0,10711,зарубежные,genre
0,10711,детективы,genre
0,10711,мелодрамы,genre
1,2508,зарубежные,genre


In [9]:
# Stack both long-format feature tables and keep only items that occur in the interactions.
item_feat = (
    pd.concat([genre_feature, year_feature])
    .loc[lambda feats: feats[Columns.Item].isin(interactions[Columns.Item])]
)

In [10]:
# Dataset over all interactions with genre / release_year as categorical item features.
# Note: the name `dataset` is rebound inside the cross-validation loop and rebuilt again
# before the submission step.
dataset = Dataset.construct(
    interactions_df=interactions,
    user_features_df=None,
    item_features_df=item_feat,
    cat_item_features=['genre', 'release_year']
)

In [37]:
# Cross-validation layout: `n_folds` windows, each `n_units` long, measured in `unit`.
n_folds, n_units, unit = 3, 7, "D"

# The reference end date is pinned by hand; to derive it from the data instead use
# interactions[Columns.Datetime].max().normalize().
last_date = pd.to_datetime('15-08-2021', format='%d-%m-%Y')

periods = n_folds + 1  # n_folds windows are delimited by n_folds + 1 boundaries
freq = f"{n_units}{unit}"
span = n_folds * n_units - 1
start_date = last_date - pd.Timedelta(span, unit=unit)

print(f"""
    start_date: {start_date}
    last_date: {last_date}
    periods: {periods}
    freq: {freq}
""")

# Fold boundaries handed to the splitter.
date_range = pd.date_range(start=start_date, periods=periods, freq=freq, tz=last_date.tz)


    start_date: 2021-07-26 00:00:00
    last_date: 2021-08-15 00:00:00
    periods: 4
    freq: 7D



In [120]:
# Wrap the raw frame in the rectools Interactions container and create the
# time-based splitter driven by the precomputed `date_range`.
interactions_ = Interactions(interactions)
splitter = TimeRangeSplitter(date_range=date_range)

# Hyperparameter tuning and fitting a simple Popular model

In [121]:
# Search space for PopularModel: popularity kind x period (a number of days, or None).
model_parameters = dict(
    popularity=["n_users", "n_interactions", "sum_weight", "mean_weight"],
    period=[None, *range(1, 22)],
)

In [122]:
# Cartesian product of all parameter values, in the key order of `model_parameters`.
parameter_grid = tuple(product(*model_parameters.values()))

In [123]:
folds = splitter.split(interactions_)

# Metrics evaluated for every fitted model.
metrics = {'MAP@10': MAP(k=10)}

# Accumulators filled by the cross-validation loop:
# `results` gets one (model_name, map) entry per fold and parameter set,
# `models` maps a model name to a fitted model object.
results = dict(model_name=[], map=[])
models = dict()

In [124]:
# Time-based cross-validation over the whole parameter grid.
# For every fold: build a dataset from the train part, fit each PopularModel variant,
# recommend 10 items to the fold's test users and record MAP@10.
for train_ids, test_ids, _ in tqdm(folds, total=splitter.get_n_splits(interactions_)):
    df_train = interactions_.df.iloc[train_ids]
    df_test = interactions_.df.iloc[test_ids][Columns.UserItem]
    test_users = np.unique(df_test[Columns.User])

    # Restrict item features to items that are present in the train part.
    item_feat_train = item_feat[item_feat[Columns.Item].isin(df_train[Columns.Item])]

    # Keyword arguments are required here: the third positional parameter of
    # Dataset.construct is `cat_user_features`, not `item_features_df`, so passing the
    # feature frame positionally silently dropped the item features.
    dataset = Dataset.construct(
        interactions_df=df_train,
        user_features_df=None,
        item_features_df=item_feat_train,
        cat_item_features=['genre', 'release_year'],
    )

    for popularity, period_days in tqdm(parameter_grid):
        model_name = f"{popularity}_{period_days}"
        # None means "no period argument"; otherwise the period is a number of days.
        period = None if period_days is None else pd.Timedelta(days=period_days)

        model = PopularModel(popularity=popularity, period=period)
        model.fit(dataset)

        recos = model.recommend(
            users=test_users,
            dataset=dataset,
            k=10,
            filter_viewed=True,
        )

        metric_values = calc_metrics(
            metrics,
            reco=recos,
            interactions=df_test,
            prev_interactions=df_train,
        )

        results['model_name'].append(model_name)
        results['map'].append(metric_values["MAP@10"])

        # Overwritten on every fold: after the loop this holds the models fitted on the
        # last fold's train data only, which are tied to that fold's dataset.
        models[model_name] = model

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

In [128]:
# One row per (fold, parameter set) with its MAP@10 value.
results_df = pd.DataFrame(results)

In [129]:
# Average the metric over folds for each parameter set, best configuration first.
(
    results_df
    .groupby("model_name")
    .mean()
    .sort_values(by='map', ascending=False)
)

Unnamed: 0_level_0,map
model_name,Unnamed: 1_level_1
n_users_5,0.096757
n_interactions_5,0.096757
n_interactions_4,0.096695
n_users_4,0.096695
n_interactions_3,0.096656
...,...
mean_weight_5,0.000106
mean_weight_6,0.000102
mean_weight_21,0.000098
mean_weight_2,0.000095


In [136]:
# Rebuild the dataset on all interactions: the cross-validation loop rebound `dataset`
# to a dataset built from the last fold's train part.
dataset = Dataset.construct(
    interactions_df=interactions,
    user_features_df=None,
    item_features_df=item_feat,
    cat_item_features=['genre', 'release_year']
)

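Модель с параметрами popularity="n_users", period=5 дней даёт наилучшее среднее значение MAP@10 по фолдам (popularity="n_interactions" с тем же периодом даёт такой же результат).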

In [152]:
# Only users known to the dataset's user id map can be passed to the model.
known_user_mask = sample_submission['user_id'].isin(dataset.user_id_map.external_ids)
submission_users = sample_submission.loc[known_user_mask, 'user_id']

In [155]:
# Refit the winning configuration (popularity="n_users", period of 5 days) on the full
# dataset before recommending. The object stored in models['n_users_5'] was fitted inside
# the cross-validation loop on the last fold's train dataset, so it neither saw the most
# recent interactions nor shares internal id mappings with the `dataset` used here.
best_model = PopularModel(popularity="n_users", period=pd.Timedelta(days=5))
best_model.fit(dataset)

recs = best_model.recommend(
    users=submission_users,
    dataset=dataset,
    k=10,
    filter_viewed=True,
)

In [177]:
# Fallback list used for users that receive no recommendations from the model.
default = [
    9728, 15297, 10440, 14488, 13865,
    12192, 341, 4151, 3734, 512,
]
# Per-user predictions, filled in submission order.
preds = list()

In [178]:
# Collect each user's recommended items once (row order inside a user is preserved),
# instead of scanning the whole `recs` frame for every submission user.
recs_by_user = recs.groupby('user_id', sort=False)['item_id'].agg(list).to_dict()

# Rebinding `preds` keeps the cell idempotent on re-run, and no longer reuses the name
# `items`, which would overwrite the items DataFrame loaded at the top of the notebook.
preds = [
    recs_by_user.get(user_id, default)  # users without recommendations get the fallback
    for user_id in tqdm(sample_submission['user_id'], total=len(sample_submission))
]

assert len(preds) == len(sample_submission)

  0%|          | 0/193113 [00:00<?, ?it/s]

In [180]:
# Replace the template's item_id column with the per-user prediction lists
# (`preds` is built in the same order as sample_submission['user_id']).
sample_submission['item_id'] = preds

In [184]:
# Write the submission without the row index.
# NOTE(review): to_csv does not create missing directories — `submissions/` must already exist.
sample_submission.to_csv("submissions/popular_model.csv", index=False)