In [None]:
%load_ext kedro.extras.extensions.ipython

In [None]:
%reload_kedro

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Render floats with four decimal places in DataFrame/Series output.
pd.set_option('display.float_format', '{:.4f}'.format)

In [None]:
catalog.list()

In [None]:
[data for data in catalog.list() if not data.startswith('params:')]

### loading data

In [None]:
candidates = context.catalog.load('candidates_similarity_features')

In [None]:
val_transactions = context.catalog.load('val_transactions')

In [None]:
# Keep a single row per (customer_id, article_id) pair and mark it as a positive.
# Without drop_duplicates, a pair that appears several times here would
# multiply the matching candidate rows in the left merge onto `candidates`.
val_transactions = (
    val_transactions[['customer_id', 'article_id']]
    .drop_duplicates()
    .assign(label=1)
)

In [None]:
val_transactions.shape

In [None]:
val_transactions.head()

In [None]:
candidates = candidates.merge(val_transactions, on=['customer_id', 'article_id'], how='left').fillna({'label': 0})

In [None]:
candidates['label'] = candidates['label'].astype(int)

### filter

In [None]:
# First 10,000 distinct customer ids, in order of first appearance.
user_ids = list(candidates['customer_id'].unique()[:10_000])

In [None]:
candidates = candidates[candidates.customer_id.isin(user_ids)]

In [None]:
candidates.shape

### downsampling (not used right now)

This doesn't make sense: 2 million negative samples spread over 137,198 customers is only about 14.5 per customer, so we would effectively be picking random articles for each customer.

In [None]:
neg_samples = 2_000_000
seed = 42

In [None]:
# Keep every positive row plus a reproducible random subset of the negatives.
negatives = candidates[candidates['label'] == 0]
candidates = pd.concat(
    [
        candidates[candidates['label'] > 0],
        # Cap n at the number of available negatives: sample() raises when
        # asked for more rows than exist (sampling without replacement).
        negatives.sample(n=min(neg_samples, len(negatives)), random_state=seed),
    ],
    axis=0,
)
del negatives

In [None]:
2_000_000/137_198

In [None]:
candidates.shape

In [None]:
candidates.customer_id.nunique()

### continue

In [None]:
candidates.shape

In [None]:
candidates.customer_id.nunique()

In [None]:
candidates.head()

### optimizing data

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` in place to narrower dtypes.

    Columns whose dtype is int16/int32/int64 (numpy, or pandas nullable
    ``Int16/Int32/Int64``) are cast to the narrowest of int8/int16/int32/int64
    whose inclusive range contains the column's min and max. Columns whose
    dtype is float16/float32/float64 (or the capitalised pandas spellings) are
    cast to float32 when min and max fit into float32, otherwise to float64.
    All other columns are left untouched.

    Note: casting float64 to float32 reduces precision.

    Args:
        df: DataFrame to shrink. It is modified in place.
        verbose: When True, print memory usage before and after.

    Returns:
        The same ``df`` object, with downcast columns.
    """
    numerics = {'int16', 'int32', 'int64', 'float16', 'float32', 'float64'}
    int_targets = (
        (np.int8, 'Int8'),
        (np.int16, 'Int16'),
        (np.int32, 'Int32'),
        (np.int64, 'Int64'),
    )

    start_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print(f'Starting usage memory: {start_mem:.2f}')

    for col in df.columns:
        # Lower-casing folds the pandas nullable spellings (Int64, Float64, ...)
        # onto the numpy ones, so a single name check covers both families.
        type_name = str(df[col].dtype).lower()
        if type_name not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        # Empty or all-null column: there is no range to base a decision on,
        # and comparing pd.NA in an `if` raises.
        if pd.isna(c_min) or pd.isna(c_max):
            continue

        if type_name.startswith('int'):
            # Only nullable integer columns can contain missing values; those
            # cannot be cast to a numpy integer dtype, so keep them nullable.
            has_missing = bool(df[col].isna().any())
            for np_type, nullable_name in int_targets:
                bounds = np.iinfo(np_type)
                # Inclusive bounds: a column spanning exactly -128..127 fits int8.
                if bounds.min <= c_min and c_max <= bounds.max:
                    df[col] = df[col].astype(nullable_name if has_missing else np_type)
                    break
        else:
            f32 = np.finfo(np.float32)
            if f32.min <= c_min and c_max <= f32.max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-sized starting frame.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
candidates = reduce_mem_usage(candidates)

### regex

In [None]:
import re

In [None]:
count_pattern = '^[Cc][Oo][Uu][Nn][Tt]'

In [None]:
def fill_na_int(df, regex_pattern, fill_na_value):
    """Fill missing values in regex-selected columns and cast those columns to int.

    Args:
        df: DataFrame to update. It is modified in place.
        regex_pattern: Pattern applied with ``re.match`` (anchored at the start)
            to every string column name; matching columns are processed.
        fill_na_value: Replacement for missing values in the matching columns.

    Returns:
        The same ``df`` object.
    """
    # Non-string labels cannot be passed to re.match, so they never match.
    cols = [col for col in df.columns if isinstance(col, str) and re.match(regex_pattern, col)]
    filled = df[cols].fillna(fill_na_value).astype(int)
    # Assign column by column: this replaces each column, so the integer dtype
    # sticks. Writing through df.loc[:, cols] = ... stores the values into the
    # existing columns and keeps their previous (float) dtype.
    for col in cols:
        df[col] = filled[col]
    return df

### not now

##### customer_id optimization -> index -> customer_id (mapping)
id_to_index_dict = dict(zip(customers["customer_id"], customers.index))
index_to_id_dict = dict(zip(customers.index, customers["customer_id"]))

#for memory efficiency
transactions["customer_id"] = transactions["customer_id"].map(id_to_index_dict)

#for switching back for submission
sub["customer_id"] = sub["customer_id"].map(index_to_id_dict)

###### article_id optimization str -> int32 -> str
train['article_id'] = train.article_id.astype('int32')
train['article_id'] = '0' + train.article_id.astype('str')

In [None]:
candidates['article_id'] = candidates['article_id'].astype('int32')

In [None]:
candidates.memory_usage().sum() / 1024**2

### continue

### articles

In [None]:
automated_articles_features = context.catalog.load('automated_articles_features')

In [None]:
manual_article_features = context.catalog.load('manual_article_features')

In [None]:
automated_articles_features = fill_na_int(automated_articles_features, count_pattern, 0)

In [None]:
automated_articles_features.shape, manual_article_features.shape

In [None]:
automated_articles_features.dtypes

In [None]:
automated_articles_features = reduce_mem_usage(automated_articles_features)

In [None]:
automated_articles_features.dtypes

In [None]:
manual_article_features.dtypes

In [None]:
manual_article_features = reduce_mem_usage(manual_article_features)

In [None]:
manual_article_features.dtypes

In [None]:
candidates = candidates.merge(automated_articles_features, how='left', on='article_id').merge(manual_article_features, how='left', on='article_id')

In [None]:
import gc

In [None]:
del automated_articles_features
gc.collect()

In [None]:
del manual_article_features
gc.collect()

In [None]:
candidates.shape

### customers

In [None]:
automated_customers_features = context.catalog.load('automated_customers_features')

In [None]:
manual_customer_features = context.catalog.load('manual_customer_features')

In [None]:
automated_customers_features = fill_na_int(automated_customers_features, count_pattern, 0)

In [None]:
manual_customer_features = fill_na_int(manual_customer_features, count_pattern, 0)

In [None]:
automated_customers_features.dtypes

In [None]:
automated_customers_features = reduce_mem_usage(automated_customers_features)

In [None]:
automated_customers_features.dtypes

In [None]:
manual_customer_features.dtypes

In [None]:
manual_customer_features = reduce_mem_usage(manual_customer_features)

In [None]:
manual_customer_features.dtypes

In [None]:
manual_customer_features.isna().sum()

In [None]:
automated_customers_features.shape, manual_customer_features.shape

In [None]:
candidates = candidates.merge(automated_customers_features, how='left', on='customer_id').merge(manual_customer_features, how='left', on='customer_id')

In [None]:
candidates = fill_na_int(candidates, count_pattern, 0)

In [None]:
candidates = reduce_mem_usage(candidates)

In [None]:
del automated_customers_features
gc.collect()

In [None]:
del manual_customer_features
gc.collect()

In [None]:
candidates.shape

In [None]:
pd.set_option('display.max_rows', None)

In [None]:
candidates.dtypes

In [None]:
pd.set_option('display.max_rows', 10)

In [None]:
candidates['strategy_name'] = candidates['strategy_name'].astype('category')

In [None]:
candidates.memory_usage().sum() / 1024**2

### dictionary features

In [None]:
articles = context.catalog.load('articles')

In [None]:
customers = context.catalog.load('customers')

In [None]:
drop_cols_articles = ['prod_name', 'product_type_name', 'graphical_appearance_name', 'colour_group_name', 'perceived_colour_value_name', 'perceived_colour_master_name', 'index_name', 'index_group_name', 'section_name', 'garment_group_name', 'detail_desc']

In [None]:
# these cols are just the same values as _no (OR SIMILAR, HAVEN'T CHECKED) but mostly DUPLICATES
articles.drop(drop_cols_articles, axis=1, inplace=True)

In [None]:
articles.dtypes

In [None]:
articles = reduce_mem_usage(articles)

In [None]:
articles.dtypes

In [None]:
customers.dtypes

In [None]:
customers = reduce_mem_usage(customers)

In [None]:
customers.dtypes

In [None]:
candidates = candidates.merge(customers, how='left', on='customer_id')

In [None]:
del customers
gc.collect()

In [None]:
candidates = candidates.merge(articles, how='left', on='article_id')

In [None]:
del articles
gc.collect()

In [None]:
candidates = reduce_mem_usage(candidates)

In [None]:
candidates.shape

In [None]:
category_cols = ['index_code', 'club_member_status',
                 'fashion_news_frequency', 'postal_code', 'product_group_name',
                 'department_name', 'FN', 'Active']

# Replace each column outright so the category dtype is actually kept.
# Assigning through candidates.loc[:, cols] = ... writes the values into the
# existing columns, which keep their previous dtype.
for col in category_cols:
    candidates[col] = candidates[col].astype('category')

In [None]:
pd.set_option('display.max_rows', None)

In [None]:
candidates.dtypes

In [None]:
pd.set_option('display.max_rows', 10)

In [None]:
candidates.memory_usage().sum() / 1024**2

### why would anybody do that?

lol, bad

In [None]:
# NOTE(review): convert_dtypes switches columns to pandas extension dtypes
# (string and floating conversion are disabled here). The section header calls
# this step "bad" — confirm whether it should still run, and compare the
# memory_usage figure in the next cell with the one taken before this cell.
candidates = candidates.convert_dtypes(convert_string=False, convert_floating=False)

In [None]:
candidates.memory_usage().sum() / 1024**2

### import lightgbm

In [None]:
import lightgbm as lgb

In [None]:
candidates.select_dtypes(include='category').columns

In [None]:
# train positive rate
# NOTE(review): the mean of the 0/1 label is scaled by 10 here; if a percentage
# was intended the factor would be 100 — confirm.
candidates.label.mean() * 10

# 
ignore customers with full negative samples
candidates.groupby(['customer_id'])['label'].max().reset_index().where()

In [None]:
features = [col for col in candidates.columns if col not in ['label', 'customer_id', 'article_id']]

In [None]:
cat_features = candidates.select_dtypes(include='category').columns.to_list()

In [None]:
cat_features = ['strategy_name',
 'FN',
 'Active',
 'club_member_status',
 'fashion_news_frequency',
 'postal_code',
 'product_group_name',
 'department_name',
 'index_code']

In [None]:
cat_features

In [None]:
params = {
    "objective": "lambdarank",
    "boosting_type": "gbdt",
    "metric": "map",
    "max_depth": 8,
    "num_leaves": 128,
    "learning_rate": 0.03,
    "verbose": -1,
    "eval_at": 12,
    # 'device':'gpu'
}

In [None]:
# Hold out 20% of the distinct customers for validation. The fixed
# random_state makes the split identical on every kernel restart.
validation_customers = (
    pd.Series(candidates.customer_id.unique())
    .sample(frac=0.2, random_state=42)
    .values
)

In [None]:
len(validation_customers)

In [None]:
is_validation = candidates.customer_id.isin(validation_customers)

# The ranking group sizes are computed further down with
# groupby('customer_id').size(), which returns counts in sorted key order.
# LightGBM reads group sizes as consecutive row counts, so the rows themselves
# must be ordered by customer_id for the groups to line up with the data.
# mergesort is stable, so the original order within a customer is preserved.
train_candidates = candidates[~is_validation].sort_values('customer_id', kind='mergesort')
val_candidates = candidates[is_validation].sort_values('customer_id', kind='mergesort')

In [None]:
train_candidates.shape, val_candidates.shape

In [None]:
train_candidates.customer_id.nunique(), val_candidates.customer_id.nunique()

##### groups

In [None]:
train_group = train_candidates[['customer_id', 'article_id']]

In [None]:
train_group = train_group.groupby(['customer_id']).size().values

In [None]:
val_group = val_candidates[['customer_id', 'article_id']]

In [None]:
val_group = val_group.groupby(['customer_id']).size().values

##### datasets

In [None]:
train_set = lgb.Dataset(
        data=train_candidates[features],
        label=train_candidates["label"],
        group=train_group,
        feature_name=features,
        categorical_feature=cat_features,
        params=params,
    )

In [None]:
val_set = lgb.Dataset(
        data=val_candidates[features],
        label=val_candidates["label"],
        group=val_group,
        feature_name=features,
        categorical_feature=cat_features,
        params=params,
    )

del candidates
gc.collect()

In [None]:
ranker = lgb.train(
        params,
        train_set,
        valid_sets=[val_set],
        num_boost_round=30,
    )

In [None]:
ranker

In [None]:
feat_importance = pd.DataFrame(
    {"feature": features, "importance": ranker.feature_importance()}
).sort_values(by="importance", ascending=False)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
plt.figure(figsize=(8, 24))
sns.barplot(y="feature", x="importance", data=feat_importance)

In [None]:
def predict(ranker, candidates, batch_size=1_000_000, feature_names=None):
    """Score candidates in batches and return ranked article lists per customer.

    Args:
        ranker: Object exposing ``predict(frame)`` that returns one score per row.
        candidates: DataFrame with ``customer_id``, ``article_id`` and the
            feature columns. It is not modified.
        batch_size: Number of rows scored per ``ranker.predict`` call.
        feature_names: Columns passed to the ranker. Defaults to the
            notebook-level ``features`` list when omitted.

    Returns:
        DataFrame with one row per customer: ``customer_id`` and ``prediction``,
        a list of that customer's article ids ordered by descending score
        (duplicate articles keep their best-scored occurrence).
    """
    columns = features if feature_names is None else feature_names
    n_rows = candidates.shape[0]
    probs = np.zeros(n_rows)
    for start in range(0, n_rows, batch_size):
        stop = start + batch_size
        # Slice by position: the frame may be a filtered subset whose index
        # labels are not 0..n-1, so label-based .loc slicing would select the
        # wrong rows or fail.
        probs[start:stop] = ranker.predict(candidates.iloc[start:stop][columns])

    # Build a new frame instead of adding a "prob" column to the caller's data.
    pred_lgb = candidates[['customer_id', 'article_id']].assign(prob=probs)
    pred_lgb = pred_lgb.sort_values(by=['customer_id', 'prob'], ascending=False).reset_index(drop=True)
    pred_lgb = pred_lgb.rename(columns={'article_id': 'prediction'})
    pred_lgb = pred_lgb.drop_duplicates(['customer_id', 'prediction'], keep='first')
    return pred_lgb.groupby("customer_id")["prediction"].apply(list).reset_index()

In [None]:
pred = predict(ranker, val_candidates)

In [None]:
pred

In [None]:
probs = np.zeros(val_candidates.shape[0])

In [None]:
probs

In [None]:
batch_size = 5_000_000
for batch in range(0, val_candidates.shape[0], batch_size):
    # Slice by position with .iloc: val_candidates is a filtered subset of
    # candidates, so its index labels are not guaranteed to be 0..n-1 and a
    # label-based .loc slice would not line up with the positions in `probs`.
    outputs = ranker.predict(val_candidates.iloc[batch : batch + batch_size][features])
    probs[batch : batch + batch_size] = outputs

In [None]:
# val_candidates was produced by filtering candidates; assign() builds a new
# frame rather than writing a column into that filtered slice.
val_candidates = val_candidates.assign(prob=probs)

In [None]:
pred_lgb = val_candidates[['customer_id', 'article_id', 'prob']].sort_values(by=['customer_id', 'prob'], ascending=False).reset_index(drop=True)

In [None]:
pred_lgb

In [None]:
pred_lgb.prob.describe()

In [None]:
# Keep the first 12 rows per customer (pred_lgb was sorted by prob, descending,
# when it was built).
pred_lgb = pred_lgb.groupby(['customer_id']).head(12)

# astype returns a new frame with string columns. Writing the converted values
# back through .loc[:, cols] would store them into the existing columns of a
# groupby().head() slice instead of replacing those columns.
pred_lgb = pred_lgb.astype({'customer_id': str, 'article_id': str})

In [None]:
pred_lgb.customer_id.nunique()

In [None]:
pred_lgb.head(13)

In [None]:
pred_lgb.groupby(['customer_id'])['article_id'].apply(list).reset_index()