In [None]:
%load_ext kedro.extras.extensions.ipython

In [None]:
%reload_kedro

In [None]:
import pandas as pd
import numpy as np

In [None]:
pd.set_option('display.float_format', lambda x: '%.4f' % x)

In [None]:
[data for data in catalog.list() if not data.startswith('params:')]

In [None]:
[data for data in catalog.list() if data.startswith('params:')]

In [None]:
context.catalog.load('recommendations').head()

### loading data

In [None]:
candidates = context.catalog.load('candidates_similarity_features')

In [None]:
candidates.shape

In [None]:
val_transactions = context.catalog.load('val_transactions')

In [None]:
val_transactions = val_transactions[['customer_id', 'article_id']].assign(label=lambda x: 1)

In [None]:
val_transactions.shape

In [None]:
val_transactions = val_transactions.drop_duplicates()

In [None]:
val_transactions.shape

In [None]:
candidates.groupby(['customer_id'])['article_id'].nunique().describe(percentiles=
                                                                     [0.001, 0.1, 0.2, 0.5, 0.8, 0.9, 0.99])

##### only 1 way needed

`sample(frac=1)` (shuffling the whole frame before taking the head of each group) is inefficient, but probably more correct

## TODO: maybe keep all positive examples?

candidates = candidates.merge(val_transactions, on=['customer_id', 'article_id'], how='left').fillna({'label': 0})

candidates['label'] = candidates['label'].astype(int)

positive_candidates = candidates.groupby(['customer_id'])['label'].max().reset_index()

positive_candidates.shape

positive_candidates = positive_candidates[positive_candidates.label>0]

positive_candidates.shape

candidates.shape

candidates = candidates[candidates.customer_id.isin(positive_candidates.customer_id.to_list())]

candidates.shape

In [None]:
# Down-sample to at most N_CANDIDATES_PER_CUSTOMER random candidates per customer.
# Shuffling the whole frame and taking the head of every group also works for
# customers that have fewer candidates than the cap.
N_CANDIDATES_PER_CUSTOMER = 10
candidates = (
    candidates
    .sample(frac=1, random_state=888)
    .groupby(['customer_id'])
    .head(N_CANDIDATES_PER_CUSTOMER)
    .reset_index(drop=True)
)

# Alternative sampling strategy -- use it INSTEAD of the block above, never after it:
# `groupby(...).sample(n=...)` raises ValueError for every customer with fewer than
# `n` rows, so it cannot succeed once the frame is capped at 10 rows per customer.
# candidates = candidates.groupby(['customer_id']).sample(n=200, random_state=888).reset_index(drop=True)

In [None]:
candidates.shape

In [None]:
val_transactions.shape

In [None]:
candidates = candidates.merge(val_transactions, on=['customer_id', 'article_id'], how='left').fillna({'label': 0})

In [None]:
candidates['label'] = candidates['label'].astype(int)

In [None]:
candidates.shape

In [None]:
candidates.customer_id.nunique()

In [None]:
candidates.head()

In [None]:
candidates.label.value_counts(normalize=True)

In [None]:
candidates.label.value_counts(normalize=False)

In [None]:
137175296/22704

In [None]:
candidates.groupby(['customer_id'])['article_id'].nunique().describe()

### optimizing data

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that fits.

    The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose int16/32/64 and float16/32/64 columns (NumPy or pandas
        nullable flavour) are candidates for downcasting. Other columns are
        left untouched.
    verbose : bool, optional
        Print the memory usage before and after, by default ``True``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.

    Notes
    -----
    * Integer bounds are inclusive, so a column whose maximum is exactly 127
      still fits ``int8``.
    * Integer columns that contain missing values keep a nullable dtype
      (``Int8`` ... ``Int64``) because NumPy integers cannot hold NA.
    * Columns that contain only missing values are skipped.
    * Floats are stored as ``float32`` whenever their range allows it, which
      reduces precision.
    """
    numerics = {
        'int16', 'int32', 'int64', 'float16', 'float32', 'float64',
        # pandas nullable extension dtypes are spelled with a capital letter
        'Int16', 'Int32', 'Int64', 'Float16', 'Float32', 'Float64',
    }
    # (NumPy dtype, matching pandas nullable dtype), narrowest first.
    int_targets = (
        (np.int8, 'Int8'),
        (np.int16, 'Int16'),
        (np.int32, 'Int32'),
        (np.int64, 'Int64'),
    )

    start_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print(f'Starting usage memory: {start_mem:.2f}')

    for col in df.columns:
        type_name = str(df[col].dtype)
        if type_name not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        # min()/max() of an all-missing column is NaN / pd.NA: nothing to size against.
        if pd.isna(c_min) or pd.isna(c_max):
            continue

        if type_name.lower().startswith('int'):
            has_na = df[col].isna().any()
            for np_type, nullable_name in int_targets:
                bounds = np.iinfo(np_type)
                if bounds.min <= c_min and c_max <= bounds.max:
                    df[col] = df[col].astype(nullable_name if has_na else np_type)
                    break
        elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
            df[col] = df[col].astype(np.float32)
        else:
            df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Avoid dividing by zero for a frame that occupies no memory at all.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
candidates = reduce_mem_usage(candidates)

### regex

In [None]:
import re

In [None]:
count_pattern = '^[Cc][Oo][Uu][Nn][Tt]'

In [None]:
def fill_na_int(df, regex_pattern, fill_na_value):
    """Fill missing values in the regex-selected columns and cast them to ``int``.

    The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to update.
    regex_pattern : str
        Pattern applied with ``re.match`` (i.e. anchored at the start) to every
        column name; matching columns are processed.
    fill_na_value : int
        Replacement for missing values in the selected columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    cols = [col for col in df.columns if re.match(regex_pattern, str(col))]
    if cols:
        # Assign whole columns: `df.loc[:, cols] = ...` would write the values into the
        # existing (float) columns and silently discard the integer dtype.
        df[cols] = df[cols].fillna(fill_na_value).astype(int)
    return df

### articles

In [None]:
automated_articles_features = context.catalog.load('automated_articles_features')

In [None]:
manual_article_features = context.catalog.load('manual_article_features')

In [None]:
automated_articles_features = fill_na_int(automated_articles_features, count_pattern, 0)

In [None]:
automated_articles_features = reduce_mem_usage(automated_articles_features)

In [None]:
manual_article_features = reduce_mem_usage(manual_article_features)

In [None]:
candidates = candidates.merge(automated_articles_features, how='left', on='article_id')

In [None]:
candidates = candidates.merge(manual_article_features, how='left', on='article_id')

In [None]:
import gc

In [None]:
del automated_articles_features
gc.collect()

In [None]:
del manual_article_features
gc.collect()

In [None]:
candidates.shape

### customers

In [None]:
automated_customers_features = context.catalog.load('automated_customers_features')

In [None]:
manual_customer_features = context.catalog.load('manual_customer_features')

In [None]:
automated_customers_features = fill_na_int(automated_customers_features, count_pattern, 0)

In [None]:
manual_customer_features = fill_na_int(manual_customer_features, count_pattern, 0)

In [None]:
automated_customers_features = reduce_mem_usage(automated_customers_features)

In [None]:
manual_customer_features = reduce_mem_usage(manual_customer_features)

In [None]:
candidates = candidates.merge(automated_customers_features, how='left', on='customer_id')

In [None]:
candidates = candidates.merge(manual_customer_features, how='left', on='customer_id')

In [None]:
candidates = fill_na_int(candidates, count_pattern, 0)

In [None]:
candidates = reduce_mem_usage(candidates)

In [None]:
del automated_customers_features
gc.collect()

In [None]:
del manual_customer_features
gc.collect()

In [None]:
candidates.shape

In [None]:
candidates['strategy_name'] = candidates['strategy_name'].astype('category')

In [None]:
candidates.memory_usage().sum() / 1024**2

### dictionary features

In [None]:
articles = context.catalog.load('articles')

In [None]:
customers = context.catalog.load('customers')

In [None]:
drop_cols_articles = ['prod_name', 'product_type_name', 'graphical_appearance_name', 'colour_group_name', 'perceived_colour_value_name', 'perceived_colour_master_name', 'index_name', 'index_group_name', 'section_name', 'garment_group_name', 'detail_desc']

In [None]:
# these cols are just the same values as _no (OR SIMILAR, HAVEN'T CHECKED) but mostly DUPLICATES
articles.drop(drop_cols_articles, axis=1, inplace=True)

In [None]:
articles = reduce_mem_usage(articles)

In [None]:
customers = reduce_mem_usage(customers)

In [None]:
candidates = candidates.merge(customers, how='left', on='customer_id')

In [None]:
import gc

In [None]:
del customers
gc.collect()

In [None]:
candidates = candidates.merge(articles, how='left', on='article_id')

In [None]:
del articles
gc.collect()

In [None]:
candidates = reduce_mem_usage(candidates)

In [None]:
candidates.shape

In [None]:
# Cast these columns to pandas categoricals.
# Each column is re-assigned as a whole: `candidates.loc[:, cols] = ...` writes the
# values into the existing columns on current pandas and keeps their old dtype.
categorical_cols = ['index_code', 'club_member_status',
       'fashion_news_frequency', 'postal_code', 'product_group_name',
       'department_name', 'FN', 'Active']
for col in categorical_cols:
    candidates[col] = candidates[col].astype('category')

In [None]:
# Article-side categoricals. Re-assign each column as a whole; assigning through
# `.loc[:, cols]` writes into the existing columns and keeps their old dtype.
for col in ['index_code', 'product_group_name', 'department_name']:
    candidates[col] = candidates[col].astype('category')

In [None]:
# Customer-side categoricals. Re-assign each column as a whole; assigning through
# `.loc[:, cols]` writes into the existing columns and keeps their old dtype.
for col in ['club_member_status', 'fashion_news_frequency', 'postal_code', 'FN', 'Active']:
    candidates[col] = candidates[col].astype('category')

In [None]:
candidates.memory_usage().sum() / 1024**2

### import lightgbm

In [None]:
import lightgbm as lgb

In [None]:
candidates.select_dtypes(include='category').columns

In [None]:
# train positive rate
candidates.label.mean() * 10

In [None]:
features = [col for col in candidates.columns if col not in ['label', 'customer_id', 'article_id']]

In [None]:
cat_features = candidates.select_dtypes(include='category').columns.to_list()

In [None]:
cat_features

### ranker

In [None]:
params = {
    "objective": "lambdarank",
    "boosting_type": "gbdt",
    "metric": "map",
    "max_depth": 8,
    "num_leaves": 128,
    "learning_rate": 0.03,
    "verbose": 1,
    "eval_at": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
    # 'device':'gpu'
}

In [None]:
candidates.head()

In [None]:
df_split = candidates.groupby(['customer_id'])['label'].max().reset_index()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
train_candidates, val_candidates = train_test_split(df_split, test_size=0.15, random_state=42, stratify=df_split['label'])

In [None]:
# `candidates` was shuffled during sampling, so one customer's rows are scattered.
# LightGBM's `group` sizes describe consecutive runs of rows, therefore the frame is
# sorted by customer (same order as `groupby('customer_id').size()`) and re-indexed.
train_candidates = (
    candidates[candidates.customer_id.isin(train_candidates['customer_id'].unique())]
    .sort_values('customer_id', kind='stable')
    .reset_index(drop=True)
)

In [None]:
# `candidates` was shuffled during sampling, so one customer's rows are scattered.
# LightGBM's `group` sizes describe consecutive runs of rows, therefore the frame is
# sorted by customer (same order as `groupby('customer_id').size()`) and re-indexed.
val_candidates = (
    candidates[candidates.customer_id.isin(val_candidates['customer_id'].unique())]
    .sort_values('customer_id', kind='stable')
    .reset_index(drop=True)
)

In [None]:
# Free the full candidate frame now that the train/val frames exist.
# NOTE(review): later cells (the binary-model split and the StratifiedGroupKFold loop)
# still read `candidates`; after this cell they raise NameError unless it is rebuilt.
del candidates
gc.collect()

In [None]:
# train positive rate
train_candidates.label.mean() * 10

In [None]:
# val positive rate
val_candidates.label.mean() * 10

##### groups

In [None]:
train_group = train_candidates[['customer_id', 'article_id']]

In [None]:
train_group = train_group.groupby(['customer_id']).size().values

In [None]:
val_group = val_candidates[['customer_id', 'article_id']]

In [None]:
val_group = val_group.groupby(['customer_id']).size().values

##### datasets

In [None]:
train_set = lgb.Dataset(
        data=train_candidates[features],
        label=train_candidates["label"],
        group=train_group,
        feature_name=features,
        categorical_feature=cat_features,
    )

In [None]:
val_set = lgb.Dataset(
        data=val_candidates[features],
        label=val_candidates["label"],
        group=val_group,
        feature_name=features,
        categorical_feature=cat_features,
    )

In [None]:
ranker = lgb.train(
        params,
        train_set,
        valid_sets=[train_set, val_set],
        valid_names=['train', 'valid'],
        num_boost_round=500,
        callbacks=[lgb.early_stopping(stopping_rounds=10)]
    )

In [None]:
ranker.best_score

In [None]:
ranker.save_model('ranker_model_20220824.txt',
        num_iteration=ranker.best_iteration,
    )

In [None]:
import numpy as np


def _ap_at_k(actual, predicted, k=10):
    if len(predicted) > k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)

    if actual is None:
        return 0.0

    return score / min(len(actual), k)



def map_at_k(actual, predicted, k=12):
    """Mean average precision at ``k`` over a collection of queries.

    Parameters
    ----------
    actual : Iterable
        One collection of relevant items per query. Queries whose entry is
        ``None`` are left out of the mean.
    predicted : Iterable
        One ranked prediction list per query, aligned with ``actual``.
    k : int, optional
        Cut-off passed to the per-query score, by default ``12``.

    Returns
    -------
    float
        Mean of the per-query average precision values.
    """
    per_query_scores = []
    for truth, ranking in zip(actual, predicted):
        if truth is None:
            continue
        per_query_scores.append(_ap_at_k(truth, ranking, k))
    return np.mean(per_query_scores)

In [None]:
def predict(ranker, candidates, top_k=12):
    """Score every candidate row and return the best articles per customer.

    Parameters
    ----------
    ranker : object
        Fitted model exposing ``predict(features)``.
    candidates : pandas.DataFrame
        One row per (customer_id, article_id) pair plus the model features.
        A ``prob`` column holding the scores is added to (or overwritten in)
        this frame.
    top_k : int, optional
        Number of articles kept per customer, by default ``12``.

    Returns
    -------
    pandas.DataFrame
        Columns ``customer_id`` and ``article_id``; the latter holds the list
        of the ``top_k`` highest-scored articles, best first.
    """
    # 'prob' is excluded as well: a score column left over from a previous call must
    # not be passed to the model as if it were a feature.
    non_feature_cols = ['customer_id', 'article_id', 'label', 'prob']
    candidates['prob'] = ranker.predict(candidates.drop(columns=non_feature_cols, errors='ignore'))

    ranked = (
        candidates[['customer_id', 'article_id', 'prob']]
        .sort_values(by=['customer_id', 'prob'], ascending=False)
        .reset_index(drop=True)
        .groupby(['customer_id'])
        .head(top_k)
    )
    return ranked.groupby(['customer_id'])['article_id'].apply(list).reset_index()

In [None]:
ranker.best_score

In [None]:
ranker.best_score

In [None]:
# Remove a score column left over from an earlier prediction run, if there is one.
# `errors='ignore'` keeps the cell runnable on a fresh kernel (no 'prob' column yet);
# re-binding instead of `inplace=True` avoids mutating a slice of `candidates`.
val_candidates = val_candidates.drop(columns=['prob'], errors='ignore')

In [None]:
preds = predict(ranker, val_candidates)

In [None]:
preds

In [None]:
preds['dsada'] = preds.article_id.sample(frac=1).reset_index(drop=True)

In [None]:
preds.columns = ['customer_id', 'prediction_0', 'prediction_1']

In [None]:
preds.shape

In [None]:
preds.head()

In [None]:
preds['prediction_0'] = preds['prediction_0'].apply(lambda x: ' '.join(x))
preds['prediction_1'] = preds['prediction_1'].apply(lambda x: ' '.join(x))

In [None]:
preds.head()

In [None]:
def weight_predictions(preds):
    """Blend the two ranked lists of one row by summed reciprocal rank.

    Parameters
    ----------
    preds : mapping
        Row-like object with ``prediction_0`` and ``prediction_1``. Each entry
        is either a sequence of article ids or a single space-separated string
        of article ids, best first.

    Returns
    -------
    str
        The (at most) 12 best article ids, space-separated, best first.
    """
    rec = []
    for column in ('prediction_0', 'prediction_1'):
        ranking = preds[column]
        # A joined "id id id" string must be split first; enumerating the string itself
        # would score single characters instead of article ids.
        if isinstance(ranking, str):
            ranking = ranking.split()
        rec.append(ranking)

    res = dict()
    for ranking in rec:
        for i, article_id in enumerate(ranking):
            res[article_id] = res.get(article_id, 0) + 1/(i+1)

    # Stable sort: ties keep the order in which the ids were first seen.
    ordered = sorted(res, key=lambda article_id: -res[article_id])
    return ' '.join(ordered[:12])

In [None]:
def weight_predictions(preds):
    """Merge ``prediction_0`` and ``prediction_1`` of one row into a single top-12.

    Every article id receives ``1 / rank`` from each list it appears in
    (rank starts at 1); the ids are then ordered by their total weight.

    Parameters
    ----------
    preds : mapping
        Row-like object holding the two rankings, each given either as a
        sequence of article ids or as one space-separated string.

    Returns
    -------
    str
        Up to 12 article ids, space-separated, highest total weight first.
    """
    def _as_ranking(value):
        # Rankings stored as "id id id" strings have to be tokenised; iterating over
        # the raw string would yield characters, not article ids.
        return value.split() if isinstance(value, str) else value

    # Create a list of all model predictions
    REC = [_as_ranking(preds['prediction_0']), _as_ranking(preds['prediction_1'])]

    # Create a dictionary of the recommended items, weighted by order of appearance
    res = {}
    for ranking in REC:
        for n, article_id in enumerate(ranking):
            res[article_id] = res.get(article_id, 0) + 1/(n+1)

    # Sort by item weight (stable, so ties keep first-seen order) and keep the top 12
    by_weight = sorted(res.items(), key=lambda item: -item[1])
    return ' '.join(article_id for article_id, _ in by_weight[:12])

In [None]:
preds['prediction'] = preds.apply(weight_predictions, axis=1)

In [None]:
list(enumerate(preds.iloc[0].prediction_0))

In [None]:
list(enumerate(preds.iloc[0].prediction_1))

In [None]:
list(enumerate(preds.iloc[0].prediction.split()))

In [None]:
preds

In [None]:
preds

In [None]:
res

In [None]:
res = list(dict(sorted(res.items(), key=lambda item: -item[1])).keys())

In [None]:
def cust_blend(dt, W = [1,1,1,1]):
    """Blend four ranked recommendation lists into one ranking.

    Parameters
    ----------
    dt : mapping
        Row-like object with the iterables ``large_rank``, ``large_binary``,
        ``small_rank`` and ``small_binary`` (best item first).
    W : list of float, optional
        Global weight of each model, in the order listed above.

    Returns
    -------
    list
        All recommended items ordered by decreasing blended weight
        (the list is not truncated).
    """
    # Collect the per-model rankings in the same order as the weights.
    rankings = [
        dt[name]
        for name in ('large_rank', 'large_binary', 'small_rank', 'small_binary')
    ]

    # Every item gets (model weight / rank) from each list it appears in.
    scores = {}
    for model_idx, ranking in enumerate(rankings):
        for position, item in enumerate(ranking):
            contribution = W[model_idx] / (position + 1)
            if item in scores:
                scores[item] += contribution
            else:
                scores[item] = contribution

    # Highest blended weight first; ties keep first-seen order.
    return [item for item, _ in sorted(scores.items(), key=lambda pair: -pair[1])]


In [None]:
len(res)

In [None]:
list(dict(sorted(res.items(), key=lambda item: -item[1])).keys())

In [None]:
def cust_blend(dt, W = [1,1,1]):
    """Blend three space-separated recommendation strings into one top-12 string.

    Parameters
    ----------
    dt : mapping
        Row-like object with the strings ``prediction0``, ``prediction1`` and
        ``prediction2``; each holds article ids separated by whitespace, best
        first.
    W : list of float, optional
        Global ensemble weight of each prediction column, in the order above
        (e.g. ``[1.15, 0.95, 0.85]``).

    Returns
    -------
    str
        Up to 12 article ids, space-separated, highest blended weight first.
    """
    # Create a list of all model predictions
    REC = [
        dt['prediction0'].split(),
        dt['prediction1'].split(),
        dt['prediction2'].split(),
    ]

    # Each item accumulates (global weight / rank) over all lists it appears in.
    res = {}
    for M, ranking in enumerate(REC):
        for n, v in enumerate(ranking):
            res[v] = res.get(v, 0) + W[M] / (n + 1)

    # Sort by item weight; the sort is stable, so ties keep first-seen order.
    ordered = sorted(res, key=lambda item: -res[item])

    # Return the top 12 items only
    return ' '.join(ordered[:12])

# Example usage (`sub0` is not defined in this notebook, so the call is kept as a comment):
# sub0['prediction'] = sub0.apply(cust_blend, W = [1.05,1.00,0.95], axis=1)

In [None]:
preds = (
    val_transactions
        .groupby(['customer_id'])['article_id']
        .apply(list)
        .reset_index()
        .merge(preds, on='customer_id', how='inner')
)

In [None]:
preds.columns = ['customer_id', 'y_true', 'y_pred']

In [None]:
preds.head()

In [None]:
map_at_k(preds['y_true'], preds['y_pred'], k=12)

##### train (MAP@12 of the ranker on the training customers)

In [None]:
# Remove a score column left over from an earlier prediction run, if there is one.
# `errors='ignore'` keeps the cell runnable on a fresh kernel (no 'prob' column yet);
# re-binding instead of `inplace=True` avoids mutating a slice of `candidates`.
train_candidates = train_candidates.drop(columns=['prob'], errors='ignore')

In [None]:
preds = predict(ranker, train_candidates)

In [None]:
preds

In [None]:
preds = (
    val_transactions
        .groupby(['customer_id'])['article_id']
        .apply(list)
        .reset_index()
        .merge(preds, on='customer_id', how='inner')
)

In [None]:
preds

In [None]:
preds.columns = ['customer_id', 'y_true', 'y_pred']

In [None]:
preds.head()

In [None]:
map_at_k(preds['y_true'], preds['y_pred'], k=12)

### binary model

In [None]:
params = {
    "objective": "binary",
    "boosting_type": "gbdt",
    "metric": "auc",
    "max_depth": 8,
    "num_leaves": 128,
    "learning_rate": 0.03,
    "verbose": 1,
    "eval_at": 12,
    # 'device':'gpu'
}

In [None]:
df_split = candidates.groupby(['customer_id'])['label'].max().reset_index()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
train_candidates, val_candidates = train_test_split(df_split, test_size=0.15, random_state=42, stratify=df_split['label'])

In [None]:
train_candidates = candidates[candidates.customer_id.isin(train_candidates['customer_id'].unique())]

In [None]:
val_candidates = candidates[candidates.customer_id.isin(val_candidates['customer_id'].unique())]

In [None]:
del candidates
gc.collect()

In [None]:
# train positive rate
train_candidates.label.mean() * 10

In [None]:
# val positive rate
val_candidates.label.mean() * 10

In [None]:
train_set = lgb.Dataset(
        data=train_candidates[features],
        label=train_candidates["label"],
        feature_name=features,
        categorical_feature=cat_features,
        params=params,
    )

In [None]:
val_set = lgb.Dataset(
        data=val_candidates[features],
        label=val_candidates["label"],
        feature_name=features,
        categorical_feature=cat_features,
        params=params,
    )

In [None]:
ranker = lgb.train(
        params,
        train_set,
        valid_sets=[train_set, val_set],
        valid_names=['train', 'valid'],
        num_boost_round=500,
        callbacks=[lgb.early_stopping(stopping_rounds=10)]
    )

In [None]:
ranker.save_model('binary_model_20220824.txt',
        num_iteration=ranker.best_iteration,
    )

##### val (MAP@12 of the binary model on the validation customers)

In [None]:
# Remove a score column left over from an earlier prediction run, if there is one.
# `errors='ignore'` keeps the cell runnable on a fresh kernel (no 'prob' column yet);
# re-binding instead of `inplace=True` avoids mutating a slice of `candidates`.
val_candidates = val_candidates.drop(columns=['prob'], errors='ignore')

In [None]:
preds = predict(ranker, val_candidates)

In [None]:
preds

In [None]:
preds.columns = ['customer_id', 'y_pred']

In [None]:
preds = (
    val_transactions
        .groupby(['customer_id'])['article_id']
        .apply(list)
        .reset_index()
        .merge(preds, on='customer_id', how='inner')
)

In [None]:
preds

In [None]:
preds.columns = ['customer_id', 'y_true', 'y_pred']

In [None]:
preds.head()

In [None]:
map_at_k(preds['y_true'], preds['y_pred'], k=12)

##### train (MAP@12 of the binary model on the training customers)

In [None]:
# Remove a score column left over from an earlier prediction run, if there is one.
# `errors='ignore'` keeps the cell runnable on a fresh kernel (no 'prob' column yet);
# re-binding instead of `inplace=True` avoids mutating a slice of `candidates`.
train_candidates = train_candidates.drop(columns=['prob'], errors='ignore')

In [None]:
preds = predict(ranker, train_candidates)

In [None]:
preds

In [None]:
preds = (
    val_transactions
        .groupby(['customer_id'])['article_id']
        .apply(list)
        .reset_index()
        .merge(preds, on='customer_id', how='inner')
)

In [None]:
preds

In [None]:
preds.columns = ['customer_id', 'y_true', 'y_pred']

In [None]:
preds.head()

In [None]:
map_at_k(preds['y_true'], preds['y_pred'], k=12)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

feat_importance = pd.DataFrame(
    {"feature": features, "importance": ranker.feature_importance()}
).sort_values(by="importance", ascending=False)

plt.figure(figsize=(8, 24))
sns.barplot(y="feature", x="importance", data=feat_importance)

def predict(ranker, candidates, batch_size = 10_000, feature_cols=None):
    """Score ``candidates`` in batches and return the top 12 articles per customer.

    Parameters
    ----------
    ranker : object
        Fitted model exposing ``predict(features)``.
    candidates : pandas.DataFrame
        One row per (customer_id, article_id) pair plus the model features.
        A ``prob`` column holding the scores is added to (or overwritten in)
        this frame.
    batch_size : int, optional
        Number of rows scored per ``ranker.predict`` call.
    feature_cols : list of str, optional
        Model input columns; defaults to the notebook-level ``features`` list.

    Returns
    -------
    pandas.DataFrame
        Columns ``customer_id`` and ``article_id``; the latter holds the list
        of the 12 highest-scored articles, best first.
    """
    if feature_cols is None:
        feature_cols = features

    n_rows = candidates.shape[0]
    probs = np.zeros(n_rows)
    for start in range(0, n_rows, batch_size):
        stop = start + batch_size
        # Positional slicing: the frame's index labels need not be 0..n-1, so a
        # label-based `.loc[start:stop]` could select the wrong rows.
        probs[start:stop] = ranker.predict(candidates.iloc[start:stop][feature_cols])
    candidates['prob'] = probs

    # Rank the frame that was passed in (not a notebook-level global).
    pred_lgb = (
        candidates[['customer_id', 'article_id', 'prob']]
        .sort_values(by=['customer_id', 'prob'], ascending=False)
        .reset_index(drop=True)
    )
    pred_lgb = pred_lgb.groupby(['customer_id']).head(12)
    return pred_lgb.groupby(['customer_id'])['article_id'].apply(list).reset_index()

### Stratified Group K Fold

In [None]:
from sklearn.model_selection import StratifiedGroupKFold

In [None]:
sgkf = StratifiedGroupKFold(n_splits=4)

In [None]:
for i, (train_idxs, val_idxs) in enumerate(sgkf.split(candidates.index, candidates.label, candidates.customer_id)):
    print(i)
    # `split` yields positional indices, hence `.iloc`. Sorting by customer keeps each
    # query's rows contiguous and in the same order as the `groupby(...).size()` below,
    # which is the layout LightGBM's `group` sizes describe.
    fold_train = candidates.iloc[train_idxs].sort_values('customer_id', kind='stable')
    fold_val = candidates.iloc[val_idxs].sort_values('customer_id', kind='stable')

    # train
    train_group = fold_train.groupby(['customer_id']).size().values

    train_set = lgb.Dataset(
        data=fold_train[features],
        label=fold_train["label"],
        group=train_group,
        feature_name=features,
        categorical_feature=cat_features,
        params=params,
    )

    # val
    val_group = fold_val.groupby(['customer_id']).size().values

    val_set = lgb.Dataset(
        data=fold_val[features],
        label=fold_val["label"],
        group=val_group,
        feature_name=features,
        categorical_feature=cat_features,
        params=params,
    )

    # NOTE: `ranker` is overwritten on every fold; only the last fold's model survives.
    ranker = lgb.train(
        params,
        train_set,
        valid_sets=[train_set, val_set],
        valid_names=['train', 'valid'],
        num_boost_round=30,
    )

In [None]:
ranker

In [None]:
probs = np.zeros(val_candidates.shape[0])

In [None]:
probs

In [None]:
# Slice by position (`iloc`): `val_candidates` is a filtered subset of `candidates`, so
# its index labels are not guaranteed to be 0..n-1 and a label-based `.loc[a:b]` could
# return different rows (and a different row count) than the matching `probs` slice.
PREDICT_BATCH_SIZE = 5_000_000
for batch in range(0, val_candidates.shape[0], PREDICT_BATCH_SIZE):
    batch_rows = val_candidates.iloc[batch : batch + PREDICT_BATCH_SIZE]
    probs[batch : batch + PREDICT_BATCH_SIZE] = ranker.predict(batch_rows[features])

In [None]:
val_candidates["prob"] = probs

In [None]:
pred_lgb = val_candidates[['customer_id', 'article_id', 'prob']].sort_values(by=['customer_id', 'prob'], ascending=False).reset_index(drop=True)

In [None]:
pred_lgb

In [None]:
pred_lgb.prob.describe()

In [None]:
pred_lgb = pred_lgb.groupby(['customer_id']).head(12)

# Re-bind the converted frame: assigning through `.loc[:, cols] = ...` writes into the
# existing columns of a `groupby().head()` result and may leave the dtypes unchanged.
pred_lgb = pred_lgb.astype({'customer_id': str, 'article_id': str})

In [None]:
pred_lgb.customer_id.nunique()

In [None]:
pred_lgb.head(13)

In [None]:
pred_lgb.groupby(['customer_id'])['article_id'].apply(list).reset_index()

### Stratified Group Split

##### groups

In [None]:
train_group = train_candidates[['customer_id', 'article_id']]

In [None]:
train_group = train_group.groupby(['customer_id']).size().values

In [None]:
val_group = val_candidates[['customer_id', 'article_id']]

In [None]:
val_group = val_group.groupby(['customer_id']).size().values

##### datasets

In [None]:
train_set = lgb.Dataset(
        data=train_candidates[features],
        label=train_candidates["label"],
        group=train_group,
        feature_name=features,
        categorical_feature=cat_features,
        params=params,
    )

In [None]:
val_set = lgb.Dataset(
        data=val_candidates[features],
        label=val_candidates["label"],
        group=val_group,
        feature_name=features,
        categorical_feature=cat_features,
        params=params,
    )

del candidates
gc.collect()

In [None]:
ranker = lgb.train(
        params,
        train_set,
        valid_sets=[val_set],
        num_boost_round=30,
    )

In [None]:
ranker

In [None]:
feat_importance = pd.DataFrame(
    {"feature": features, "importance": ranker.feature_importance()}
).sort_values(by="importance", ascending=False)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
plt.figure(figsize=(8, 24))
sns.barplot(y="feature", x="importance", data=feat_importance)

In [None]:
def predict(ranker, candidates, batch_size = 1_000_000, feature_cols=None):
    """Score ``candidates`` in batches and return every customer's ranked articles.

    Parameters
    ----------
    ranker : object
        Fitted model exposing ``predict(features)``.
    candidates : pandas.DataFrame
        One row per (customer_id, article_id) pair plus the model features.
        A ``prob`` column holding the scores is added to (or overwritten in)
        this frame.
    batch_size : int, optional
        Number of rows scored per ``ranker.predict`` call.
    feature_cols : list of str, optional
        Model input columns; defaults to the notebook-level ``features`` list.

    Returns
    -------
    pandas.DataFrame
        Columns ``customer_id`` and ``prediction``; the latter holds the list
        of that customer's distinct articles ordered by decreasing score
        (the list is not truncated).
    """
    if feature_cols is None:
        feature_cols = features

    n_rows = candidates.shape[0]
    probs = np.zeros(n_rows)
    for start in range(0, n_rows, batch_size):
        stop = start + batch_size
        # Positional slicing: the frame's index labels need not be 0..n-1, so a
        # label-based `.loc[start:stop]` could select the wrong rows.
        probs[start:stop] = ranker.predict(candidates.iloc[start:stop][feature_cols])
    candidates["prob"] = probs

    pred_lgb = (
        candidates[['customer_id', 'article_id', 'prob']]
        .sort_values(by=['customer_id', 'prob'], ascending=False)
        .reset_index(drop=True)
        .rename(columns={'article_id': 'prediction'})
        # keep the best-scored occurrence of a repeated (customer, article) pair
        .drop_duplicates(['customer_id', 'prediction'], keep='first')
    )
    return pred_lgb.groupby("customer_id")["prediction"].apply(list).reset_index()

In [None]:
pred = predict(ranker, val_candidates)

In [None]:
pred

In [None]:
probs = np.zeros(val_candidates.shape[0])

In [None]:
probs

In [None]:
# Slice by position (`iloc`): `val_candidates` is a filtered subset of `candidates`, so
# its index labels are not guaranteed to be 0..n-1 and a label-based `.loc[a:b]` could
# return different rows (and a different row count) than the matching `probs` slice.
PREDICT_BATCH_SIZE = 5_000_000
for batch in range(0, val_candidates.shape[0], PREDICT_BATCH_SIZE):
    batch_rows = val_candidates.iloc[batch : batch + PREDICT_BATCH_SIZE]
    probs[batch : batch + PREDICT_BATCH_SIZE] = ranker.predict(batch_rows[features])

In [None]:
val_candidates["prob"] = probs

In [None]:
pred_lgb = val_candidates[['customer_id', 'article_id', 'prob']].sort_values(by=['customer_id', 'prob'], ascending=False).reset_index(drop=True)

In [None]:
pred_lgb

In [None]:
pred_lgb.prob.describe()

In [None]:
pred_lgb = pred_lgb.groupby(['customer_id']).head(12)

# Re-bind the converted frame: assigning through `.loc[:, cols] = ...` writes into the
# existing columns of a `groupby().head()` result and may leave the dtypes unchanged.
pred_lgb = pred_lgb.astype({'customer_id': str, 'article_id': str})

In [None]:
pred_lgb.customer_id.nunique()

In [None]:
pred_lgb.head(13)

In [None]:
pred_lgb.groupby(['customer_id'])['article_id'].apply(list).reset_index()