# Imports

In [1]:
from typing import List
import warnings
warnings.simplefilter('ignore')
from datetime import timedelta
from itertools import product
from tqdm.notebook import tqdm
import pandas as pd
import numpy as np
from statistics import mode
from rectools import Columns
from rectools.models import PopularModel, PopularInCategoryModel
from rectools.metrics import calc_metrics, MAP
from rectools.dataset import Dataset, Interactions
from rectools.model_selection import TimeRangeSplitter

# Loading data

In [3]:
interactions = pd.read_csv("data/interactions.csv")
items = pd.read_csv("data/items.csv")
users = pd.read_csv("data/users.csv") 

In [6]:
sample_submission = pd.read_csv("data/sample_submission.csv")

In [7]:
sample_submission

Unnamed: 0,user_id,item_id
0,3,"[9728, 15297, 10440, 14488, 13865, 12192, 341,..."
1,11,"[9728, 15297, 10440, 14488, 13865, 12192, 341,..."
2,29,"[9728, 15297, 10440, 14488, 13865, 12192, 341,..."
3,30,"[9728, 15297, 10440, 14488, 13865, 12192, 341,..."
4,33,"[9728, 15297, 10440, 14488, 13865, 12192, 341,..."
...,...,...
193108,1097527,"[9728, 15297, 10440, 14488, 13865, 12192, 341,..."
193109,1097537,"[9728, 15297, 10440, 14488, 13865, 12192, 341,..."
193110,1097538,"[9728, 15297, 10440, 14488, 13865, 12192, 341,..."
193111,1097544,"[9728, 15297, 10440, 14488, 13865, 12192, 341,..."


# Data preprocessing

In [8]:
interactions.rename(
    columns={
        'track_id' : Columns.Item,
        'last_watch_dt' : Columns.Datetime,
        'total_dur' : Columns.Weight
    },
    inplace=True
)

In [9]:
_, bins = pd.qcut(items["release_year"], 10, retbins=True)
labels = bins[:-1]

year_feature = pd.DataFrame(
    {
        Columns.Item: items[Columns.Item],
        "value": pd.cut(items["release_year"], bins=bins, labels=bins[:-1]),
        "feature": "release_year",
    }
)
year_feature.head()

Unnamed: 0,item_id,value,feature
0,10711,1983.0,release_year
1,2508,2012.0,release_year
2,10716,2009.0,release_year
3,7868,2014.0,release_year
4,16268,1897.0,release_year


In [10]:
items['genre'] = items['genres'].str.split(",")

In [11]:
genre_feature = items[[Columns.Item, "genre"]].explode("genre")
genre_feature.columns = [Columns.Item, "value"]
genre_feature["feature"] = "genre"
genre_feature.head()

Unnamed: 0,item_id,value,feature
0,10711,драмы,genre
0,10711,зарубежные,genre
0,10711,детективы,genre
0,10711,мелодрамы,genre
1,2508,зарубежные,genre


In [12]:
item_feat = pd.concat([genre_feature, year_feature])
item_feat = item_feat[item_feat[Columns.Item].isin(interactions[Columns.Item])]

In [13]:
items['director'] = items['directors'].str.split(",")
director_feature = items[[Columns.Item, "director"]].explode("director")
director_feature.columns = [Columns.Item, "value"]
director_feature["feature"] = "director"
director_feature.tail()

Unnamed: 0,item_id,value,feature
15960,10632,Амир Камдин,director
15960,10632,Эрик Эгер,director
15961,4538,Марк О’Коннор,director
15961,4538,Конор МакМахон,director
15962,3206,Михаил Миронов,director


In [14]:
item_feat = pd.concat([genre_feature, year_feature, director_feature])
item_feat = item_feat[item_feat[Columns.Item].isin(interactions[Columns.Item])]

In [15]:
item_feat

Unnamed: 0,item_id,value,feature
0,10711,драмы,genre
0,10711,зарубежные,genre
0,10711,детективы,genre
0,10711,мелодрамы,genre
1,2508,зарубежные,genre
...,...,...,...
15960,10632,Амир Камдин,director
15960,10632,Эрик Эгер,director
15961,4538,Марк О’Коннор,director
15961,4538,Конор МакМахон,director


In [16]:
n_folds = 3
unit = "D"
n_units = 7
# last_date = interactions[Columns.Datetime].max().normalize()
last_date = pd.to_datetime('15-08-2021', format='%d-%m-%Y')


start_date = last_date - pd.Timedelta(n_folds * n_units - 1, unit=unit)
periods = n_folds + 1
freq = f"{n_units}{unit}"
print(f"""
    start_date: {start_date}
    last_date: {last_date}
    periods: {periods}
    freq: {freq}
""")

date_range = pd.date_range(start=start_date, periods=periods, freq=freq, tz=last_date.tz)


    start_date: 2021-07-26 00:00:00
    last_date: 2021-08-15 00:00:00
    periods: 4
    freq: 7D



In [17]:
splitter = TimeRangeSplitter(date_range=date_range)
interactions_ = Interactions(interactions)

In [18]:
interactions

Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
0,176549,9506,2021-05-11,4250.0,72.0
1,699317,1659,2021-05-29,8317.0,100.0
2,656683,7107,2021-05-09,10.0,0.0
3,864613,7638,2021-07-05,14483.0,100.0
4,964868,9506,2021-04-30,6725.0,100.0
...,...,...,...,...,...
5476246,648596,12225,2021-08-13,76.0,0.0
5476247,546862,9673,2021-04-13,2308.0,49.0
5476248,697262,15297,2021-08-20,18307.0,63.0
5476249,384202,16197,2021-04-19,6203.0,100.0


In [58]:
whole_dataset = Dataset.construct(
    interactions_df=interactions
)

whole_user_item_matrix = whole_dataset.get_user_item_matrix()

# Hypeparameter tuning and fitting simple Popular model

In [121]:
model_parameters = {
    "popularity": ["n_users", "n_interactions", "sum_weight", "mean_weight"], 
    "period" : [None] + [i for i in range(1, 22)]
}

In [122]:
parameter_grid = tuple(product(*list(model_parameters.values())))

In [123]:
folds = splitter.split(interactions_)

metrics = {
    'MAP@10' : MAP(k=10), 
}

results = {
    "model_name" : [],
    "map" : []
}

models = {}

In [124]:
for i, (train_ids, test_ids, _) in tqdm(enumerate(folds), total=splitter.get_n_splits(interactions_)):
    df_train = interactions_.df.iloc[train_ids]
    item_feat_train = item_feat[item_feat[Columns.Item].isin(df_train[Columns.Item])]

    dataset = Dataset.construct(
        df_train,
        None,
        item_feat_train,
        cat_item_features=['genre', 'release_year']
    )

    df_test = interactions_.df.iloc[test_ids][Columns.UserItem]
    test_users = np.unique(df_test[Columns.User])
    
    for params in tqdm(parameter_grid):
        model = PopularModel(popularity=params[0], period=pd.to_timedelta(params[1], unit='D'))
        model.fit(dataset)

        recos = model.recommend(
            users = test_users,
            dataset=dataset,
            k=10,
            filter_viewed=True
        )t

        metric_values = calc_metrics(
            metrics,
            reco=recos,
            interactions=df_test,
            prev_interactions=df_train
        )
        
        results['model_name'].append(f"{params[0]}_{params[1]}")
        results['map'].append(metric_values["MAP@10"])
        
        models[f"{params[0]}_{params[1]}"] = model

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

In [128]:
results_df = pd.DataFrame(data=results)

In [129]:
results_df.groupby("model_name").mean().sort_values('map', ascending=False)

Unnamed: 0_level_0,map
model_name,Unnamed: 1_level_1
n_users_5,0.096757
n_interactions_5,0.096757
n_interactions_4,0.096695
n_users_4,0.096695
n_interactions_3,0.096656
...,...
mean_weight_5,0.000106
mean_weight_6,0.000102
mean_weight_21,0.000098
mean_weight_2,0.000095


In [136]:
dataset = Dataset.construct(
    interactions_df=interactions,
    user_features_df=None,
    item_features_df=item_feat,
    cat_item_features=['genre', 'release_year']
)

Модель с параметрами n_users, 5 дает наилучший прирост в метрике

In [88]:
submission_users = sample_submission[sample_submission['user_id'].isin(dataset.user_id_map.external_ids)]['user_id']

In [155]:
recs = models['n_users_5'].recommend(submission_users, dataset, 10, filter_viewed=True)

In [162]:
preds = []
default = [9728, 15297, 10440, 14488, 13865, 12192, 341, 4151, 3734, 512]

In [178]:
for i, user_id in tqdm(enumerate(sample_submission['user_id']), total=len(sample_submission)):
    items = list(recs[recs.user_id == user_id]['item_id'])
    
    if items: 
        preds.append(items)
    else:
        preds.append(default)

  0%|          | 0/193113 [00:00<?, ?it/s]

In [180]:
sample_submission['item_id'] = preds

In [184]:
sample_submission.to_csv("submissions/popular_model.csv", index=False)

# Hypeparameter tuning and fitting Popular among N percent of users model

In [61]:
def get_top_items_covered_users(matrix, n_users=1000) -> np.ndarray:
    
        assert matrix.format == 'csr'

        item_set = []
        covered_users = np.zeros(matrix.shape[0], dtype=bool)
        while covered_users.sum() < n_users: 
            top_item = mode(matrix[~covered_users].indices)
            item_set.append(top_item)
            covered_users += np.maximum.reduceat(matrix.indices==top_item, 
                                                 matrix.indptr[:-1], 
                                                 dtype=bool) 
        return item_set

In [62]:
model_parameters = {
    "n_users" : [1000, 10000, 45000, 100000, 250000, 500000, 650000], 
    "popularity": ["n_users", "n_interactions", "sum_weight", "mean_weight"], 
}

In [85]:
parameter_grid = list(product(*list(model_parameters.values())))

In [81]:
metrics = {
    'MAP@10' : MAP(k=10), 
}

results = {
    "model_name" : [],
    "map" : []
}

models = {}

for params in tqdm(parameter_grid):
    top_items = get_top_items_covered_users(whole_user_item_matrix, params[0])
    folds = splitter.split(interactions_)
    
    for i, (train_ids, test_ids, _) in enumerate(folds):
        df_train = interactions_.df.iloc[train_ids]
        item_feat_train = item_feat[item_feat[Columns.Item].isin(df_train[Columns.Item])]
        item_feat_train = item_feat_train[item_feat_train[Columns.Item].isin(top_items)]

        dataset = Dataset.construct(
            df_train,
            None,
            item_feat_train,
            cat_item_features=['genre', 'release_year']
        )

        df_test = interactions_.df.iloc[test_ids][Columns.UserItem]
        test_users = np.unique(df_test[Columns.User])

        model = PopularModel(popularity=params[1])
        model.fit(dataset)

        recos = model.recommend(
            dataset=dataset,
            users = test_users,
            k=10,
            filter_viewed=True
        )

        metric_values = calc_metrics(
            metrics,
            reco=recos,
            interactions=df_test,
            prev_interactions=df_train
        )

        results['model_name'].append(f"{params[0]}_{params[1]}")
        results['map'].append(metric_values["MAP@10"])

        models[f"{params[0]}_{params[1]}"] = model

  0%|          | 0/28 [00:00<?, ?it/s]

In [86]:
results_df = pd.DataFrame(data=results)
results_df

Unnamed: 0,model_name,map
0,1000_n_users,9.040341e-02
1,1000_n_users,8.960530e-02
2,1000_n_users,8.260707e-02
3,1000_n_interactions,9.040341e-02
4,1000_n_interactions,8.960530e-02
...,...,...
79,650000_sum_weight,8.421890e-02
80,650000_sum_weight,7.647100e-02
81,650000_mean_weight,3.512099e-06
82,650000_mean_weight,4.583231e-07


In [89]:
submission_users = sample_submission[sample_submission['user_id'].isin(dataset.user_id_map.external_ids)]['user_id']

In [91]:
results_df.groupby("model_name").mean().sort_values('map', ascending=False).head()

Unnamed: 0_level_0,map
model_name,Unnamed: 1_level_1
250000_n_users,0.087539
45000_n_interactions,0.087539
100000_n_users,0.087539
650000_n_users,0.087539
650000_n_interactions,0.087539


In [92]:
recs = models['250000_n_users'].recommend(submission_users, dataset, 10, filter_viewed=True)

In [93]:
preds = []
default = [9728, 15297, 10440, 14488, 13865, 12192, 341, 4151, 3734, 512]

In [94]:
for i, user_id in tqdm(enumerate(sample_submission['user_id']), total=len(sample_submission)):
    items = list(recs[recs.user_id == user_id]['item_id'])
    
    if items: 
        preds.append(items)
    else:
        preds.append(default)

  0%|          | 0/193113 [00:00<?, ?it/s]

In [95]:
sample_submission['item_id'] = preds

In [96]:
sample_submission.to_csv("submissions/popular_among_n_unique_users_model.csv", index=False)

# Hyperparameter tuning and fitting Most Polular in category models on item features

In [152]:
model_parameters = {
    "period" : [None] + [i for i in range(1, 10)],
    "category_feature" : ['genre', 'release_year', 'director'],
    "n_categories" : [3,4,5]
}

In [153]:
parameter_grid = list(product(*list(model_parameters.values())))

In [154]:
folds = splitter.split(interactions_)

metrics = {
    'MAP@10' : MAP(k=10), 
}

results = {
    "model_name" : [],
    "map" : []
}

models = {}

for i, (train_ids, test_ids, _) in tqdm(enumerate(folds), total=splitter.get_n_splits(interactions_)):
    df_train = interactions_.df.iloc[train_ids]
    item_feat_train = item_feat[item_feat[Columns.Item].isin(df_train[Columns.Item])]

    dataset = Dataset.construct(
        df_train,
        None,
        item_features_df=item_feat_train,
        cat_item_features=['genre', 'release_year', 'director']
    )

    df_test = interactions_.df.iloc[test_ids][Columns.UserItem]
    test_users = np.unique(df_test[Columns.User])
    
    for params in tqdm(parameter_grid):
        model = PopularInCategoryModel(period=pd.to_timedelta(params[0], unit='D'),
                                       category_feature=params[1],
                                       n_categories=params[2])
        model.fit(dataset)

        recos = model.recommend(
            users = test_users,
            dataset=dataset,
            k=10,
            filter_viewed=True
        )

        metric_values = calc_metrics(
            metrics,
            reco=recos,
            interactions=df_test,
            prev_interactions=df_train
        )
        
        results['model_name'].append(f"{params[0]}_{params[1]}_{params[2]}")
        results['map'].append(metric_values["MAP@10"])
        
        models[f"{params[0]}_{params[1]}_{params[2]}"] = model

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/90 [00:00<?, ?it/s]

  0%|          | 0/90 [00:00<?, ?it/s]

  0%|          | 0/90 [00:00<?, ?it/s]

In [156]:
results_df = pd.DataFrame(data=results)

In [159]:
results_df.groupby("model_name").mean().sort_values('map', ascending=False)

Unnamed: 0_level_0,map
model_name,Unnamed: 1_level_1
7_release_year_4,0.075586
3_genre_5,0.075541
4_genre_5,0.075461
5_genre_5,0.075409
8_release_year_4,0.075399
...,...
3_director_3,0.064431
2_director_3,0.064430
5_director_3,0.063816
4_director_3,0.063816


In [165]:
dataset = Dataset.construct(
        interactions,
        None,
        item_features_df=item_feat,
        cat_item_features=['genre', 'release_year', 'director']
    )

In [172]:
recs = models['7_release_year_4'].recommend(submission_users, dataset, 10, filter_viewed=True)

In [174]:
preds = []

for i, user_id in tqdm(enumerate(sample_submission['user_id']), total=len(sample_submission)):
    items = list(recs[recs.user_id == user_id]['item_id'])
    
    if items: 
        preds.append(items)
    else:
        preds.append(default)

  0%|          | 0/193113 [00:00<?, ?it/s]

In [175]:
sample_submission['item_id'] = preds

In [176]:
sample_submission.to_csv("submissions/popular_in_category_items.csv", index=False)