## Main points
* Solution should be reasonably simple because the contest is only 24 hours long 
* Metric is based on the prediction of clicked pictures one week ahead, so clicks are the most important information
* More recent information is more important
* Only pictures that were shown to a user could be clicked, so picture popularity is important
* Metric is MAPK@100
* Link https://contest.yandex.ru/contest/12899/problems (Russian)

## Plan
* Build a classic recommender system based on users' click history
* Only use recent days of historical data
* Take into consideration projected picture popularity

## Magic constants
### ALS recommender system:

In [1]:
# Number of latent factors for ALS (als_baseline also uses it as the iteration count)
factors_count = 100

# Only clicks from this many most recent days are fed to ALS
trail_days = 14

# How many top-scored candidates ALS returns per user
output_candidates_count = 2000

# Clicks at most this many days old are treated as "fresh" and get extra weight
last_days = 1

# Extra weight of a fresh click: its weight becomes 1 + last_days_weight
last_days_weight = 4

## Popular pictures prediction model:

In [2]:
import lightgbm

lightgbm.__version__

'2.1.2'

In [3]:
# Gradient-boosted regressor used to predict a picture's clicks in the coming week.
# Only the seed is set; every other hyper-parameter is left at the library default.
popularity_model = lightgbm.LGBMRegressor(seed=0)
# Exponent applied to (1 + predicted popularity) when re-weighting recommender scores.
heuristic_alpha = 0.2

In [4]:
import datetime
import tqdm
import pandas as pd
from scipy.sparse import coo_matrix

import implicit
implicit.__version__

'0.3.8'

In [5]:
# Users for whom recommendations must be produced (a `user_id` column is read later).
test_users = pd.read_csv('Blitz/test_users.csv')
# Click log; `day` is parsed to datetime so day differences can be computed.
# Later cells read the `day`, `user_id` and `picture_id` columns.
data = pd.read_csv('Blitz/train_clicks.csv', parse_dates=['day'])

## Split off the last 7 days to compute click targets that mimic the test set


In [6]:
# Everything up to and including the split date is history; the days after it
# form a held-out "target week" that imitates the hidden test period.
split_date = datetime.datetime(2019, 3, 17)

# `train` is copied because a `delta_days` column is added to it afterwards;
# `target_week` is only read from, so a plain selection is enough.
train = data[data.day <= split_date].copy()
target_week = data[data.day > split_date]

# Number of distinct days in each part (displayed as the cell output).
train.day.nunique(), target_week.day.nunique()

(45, 7)

In [7]:
# `delta_days` is the age of a click in days, counted from the newest day in the
# frame: 1 for the last day, 2 for the day before it, and so on.
# `.dt.days` is the vectorised form of the former `.apply(lambda d: d.days)`.
last_date = train.day.max()
train.loc[:, 'delta_days'] = 1 + (last_date - train.day).dt.days

# Same column for the full history, measured from its own last day.
last_date = data.day.max()
data.loc[:, 'delta_days'] = 1 + (last_date - data.day).dt.days

In [8]:
def picture_features(data, days=range(1, 3)):
    """Count clicks per picture for each of the requested recent days.

    Args:
        data: click log with ``picture_id``, ``user_id`` and ``delta_days``
            columns. In this notebook ``delta_days == 1`` is the most recent day.
        days: iterable of ``delta_days`` values to build a feature for.
            Defaults to the two most recent days (1 and 2).

    Returns:
        DataFrame with one row per distinct ``picture_id`` of ``data`` (in
        order of first appearance) and one ``click_<d>`` column per requested
        day. Pictures without clicks on a day get 0.
    """
    days = list(days)  # allow one-shot iterables; the sequence is used twice
    per_day_counts = [
        data[data.delta_days == delta_days].groupby('picture_id')['user_id'].count()
        for delta_days in days
    ]

    features = pd.concat(per_day_counts, axis=1).fillna(0)
    features.columns = ['click_%d' % delta_days for delta_days in days]

    # Pictures with no click on any requested day are missing after the concat;
    # reindexing over every picture adds them back as all-zero rows.
    return features.reindex(data.picture_id.unique()).fillna(0)

In [9]:
# Training features: per-picture click counts on the last days of the training window.
X = picture_features(train)
# Average clicks per picture for each feature column (displayed as the cell output).
X.mean(axis=0)

click_1    0.046854
click_2    0.046599
dtype: float64

In [10]:
def clicks_count(data, index):
    """Return the number of click rows per picture, aligned to ``index``.

    Pictures listed in ``index`` that never occur in ``data`` get a count of 0.
    """
    per_picture = data.groupby('picture_id')['user_id'].count()
    aligned = per_picture.reindex(index)
    return aligned.fillna(0)
    
# Regression target: clicks each picture received in the held-out week,
# aligned row-for-row with the feature matrix X.
y = clicks_count(target_week, X.index)
y.shape, y.mean()

((1070979,), 0.15237180187473331)

## Train a model predicting popular pictures next week

In [11]:
# Learn to predict a picture's clicks over the following week from its
# click counts on the last days of the training window.
popularity_model.fit(X, y)

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       learning_rate=0.1, max_depth=-1, min_child_samples=20,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
       n_jobs=-1, num_leaves=31, objective=None, random_state=None,
       reg_alpha=0.0, reg_lambda=0.0, seed=0, silent=True, subsample=1.0,
       subsample_for_bin=200000, subsample_freq=0)

In [12]:
# Same features, now computed on the full history, so that the model's
# prediction refers to the week after the last day of `data`.
X_test = picture_features(data)
X_test.mean(axis=0)

click_1    0.042347
click_2    0.038887
dtype: float64

In [13]:
# Predicted number of clicks per picture for the coming week.
X_test['p'] = popularity_model.predict(X_test)
# A regressor may output negative values, but a click count cannot be negative.
# Clamping also keeps (1 + p) non-negative for the power applied later.
X_test.loc[X_test['p'] < 0, 'p'] = 0
X_test['p'].mean()

0.14701204450320687

## Generate dict with predicted clicks for every picture


In [14]:
# Map picture_id -> predicted clicks for the coming week.
# This prediction is used later to correct the recommender score.
picture = dict(X_test['p'])

# Recommender part

## Generate prediction using ALS approach

In [15]:
import os
os.environ['OPENBLAS_NUM_THREADS'] = "1"

def als_baseline(
    train, test_users,
    factors_n, last_days, trail_days, output_candidates_count, last_days_weight,
    iterations=None
):
    """Fit implicit ALS on recent clicks and attach top-N candidates to ``test_users``.

    Args:
        train: click log with ``user_id``, ``picture_id`` and ``delta_days``
            columns. The ids are used directly as matrix row/column indices.
        test_users: DataFrame with a ``user_id`` column. It is modified in
            place (two columns are added) and also returned.
        factors_n: number of latent factors of the ALS model.
        last_days: clicks with ``delta_days <= last_days`` get extra weight.
        trail_days: only clicks with ``delta_days <= trail_days`` are used.
        output_candidates_count: number of candidates requested per user.
        last_days_weight: additional weight of a recent click; recent clicks
            weigh ``1 + last_days_weight``, all others weigh 1.
        iterations: number of ALS iterations. ``None`` (the default) keeps the
            historical behaviour of using ``factors_n`` iterations.

    Returns:
        ``test_users`` with two new columns: ``predictions_full`` (list of
        ``(picture_id, score)`` pairs per user) and ``predictions`` (list of
        picture ids per user, in the same order).
    """
    if iterations is None:
        # Backward compatible default: the iteration count used to be tied
        # to the number of factors.
        iterations = factors_n

    # Sorting by age first makes drop_duplicates keep the MOST RECENT click of
    # every (user, picture) pair. Without it the surviving row, and therefore
    # the recency weight, depended on the row order of the input.
    recent = (
        train[train.delta_days <= trail_days]
        .sort_values('delta_days')
        .drop_duplicates(['user_id', 'picture_id'])
    )

    weights = 1 + last_days_weight * (recent.delta_days <= last_days)
    user_item = coo_matrix((weights, (recent.user_id, recent.picture_id)))

    model = implicit.als.AlternatingLeastSquares(factors=factors_n, iterations=iterations)
    # The model is fitted on the transposed (picture x user) matrix, while
    # recommend() below receives the (user x picture) matrix.
    model.fit(user_item.T.tocsr())
    user_item_csr = user_item.tocsr()

    rows = [
        [
            (picture_id, score)
            for picture_id, score in model.recommend(
                user_id, user_item_csr, N=output_candidates_count
            )
        ]
        for user_id in tqdm.tqdm_notebook(test_users.user_id.values)
    ]

    test_users['predictions_full'] = rows
    test_users['predictions'] = [
        [picture_id for picture_id, _ in candidates] for candidates in rows
    ]
    return test_users

In [16]:
# The final model is fitted on the full click history (`data`), not on the `train` split.
# The call adds `predictions_full` and `predictions` columns to `test_users`.
test_users = als_baseline(
    data, test_users, factors_count, last_days, trail_days, output_candidates_count, last_days_weight)

100%|██████████| 100.0/100 [11:00<00:00,  6.78s/it]


HBox(children=(IntProgress(value=0, max=1380), HTML(value='')))




## Collect each user's historical clicks to exclude them from the results. According to the task, such clicks are excluded from the test set

In [17]:
# One row per user; the `picture_id` column holds the set of pictures that user clicked in `data`.
clicked = data.groupby('user_id').agg({'picture_id': set})

def substract_clicked(p, c, limit=100):
    """Return the first ``limit`` pictures of ``p`` that are not in ``c``.

    Args:
        p: iterable of picture ids, best candidates first.
        c: container of already-clicked picture ids (anything supporting ``in``).
        limit: maximum number of pictures to return. Defaults to 100;
            ``None`` returns every remaining picture.

    Returns:
        List of at most ``limit`` picture ids in their original order.
    """
    from itertools import islice

    # islice stops consuming candidates once `limit` pictures are collected,
    # so the rest of a long candidate list is never filtered.
    not_clicked = (picture for picture in p if picture not in c)
    return list(islice(not_clicked, limit))

## Heuristic approach: reweight the ALS score by each picture's predicted popularity

For every user, the recommender returns (picture, score) pairs sorted by decreasing score.

For every user we replace each picture's $score_p$ with $score_p \cdot (1 + popularity_{p})^{0.2}$ (the exponent is `heuristic_alpha`) and re-sort the list by the new score.

$popularity_{p}$ is the number of clicks predicted for picture $p$ over the next week.

This slightly moves popular pictures toward the top of the list for every user.

In [18]:
import math

# Per-user lists of (picture_id, score) candidates stored by als_baseline.
rows = test_users['predictions_full']

def correct_with_popularity(items, picture, alpha):
    """Re-rank one user's candidates by a popularity-adjusted score.

    Args:
        items: iterable of ``(picture_id, score)`` pairs.
        picture: mapping ``picture_id -> predicted popularity``; pictures
            missing from it are treated as having popularity 0.
        alpha: exponent applied to ``1 + popularity``.

    Returns:
        List of ``(adjusted_score, picture_id, score, popularity)`` tuples in
        descending tuple order, i.e. highest adjusted score first.
    """
    adjusted = []
    for picture_id, score in items:
        popularity = picture.get(picture_id, 0)
        adjusted.append(
            (score * (1 + popularity) ** alpha, picture_id, score, popularity)
        )
    adjusted.sort(reverse=True)
    return adjusted

# For every user keep only the picture ids, ordered by popularity-adjusted score.
corrected_rows = [
    [
        picture_id
        for _, picture_id, _, _ in correct_with_popularity(items, picture, heuristic_alpha)
    ]
    for items in rows
]

## Submission formatting

In [19]:
# Plain dict user_id -> set of clicked pictures: one lookup per user instead of
# an index membership test plus `.loc`. It also avoids `row[0]`, which relied on
# positional access into a label-indexed Series.
clicked_by_user = clicked['picture_id'].to_dict()

# Space-separated picture ids per user, with already-clicked pictures removed.
# Users without any click history simply have nothing to remove.
test_users['predictions'] = [
    ' '.join(map(str, substract_clicked(p, clicked_by_user.get(user_id, frozenset()))))
    for p, user_id in zip(corrected_rows, test_users.user_id.values)
]

In [20]:
# Write the submission: one row per test user with the space-separated picture ids.
test_users[['user_id', 'predictions']].to_csv('submit.csv', index=False)