In [1]:
from datetime import datetime 

start_time = datetime.now() 

In [2]:
N_JOBS = N_BATCHES = 32

In [3]:
from datetime import date
from collections import defaultdict
from tqdm.notebook import tqdm

import pandas as pd
import numpy as np
import joblib



In [4]:
%%time

train_target = pd.read_csv('data/alfabattle2_abattle_train_target.csv', parse_dates=['timestamp'])
train_target = train_target.sort_values('timestamp')
train_target.shape

CPU times: user 11.3 s, sys: 1.34 s, total: 12.6 s
Wall time: 12.6 s


In [5]:
train_target.shape

(5065350, 4)

In [6]:
%%time

test_session = pd.read_csv('data/alfabattle2_prediction_session_timestamp.csv', parse_dates=['timestamp'])
test_session = test_session.sort_values('timestamp')

CPU times: user 82 ms, sys: 8.62 ms, total: 90.6 ms
Wall time: 87 ms


In [7]:
test_session.shape

(79268, 2)

# Partitioning for parallel processing

In [8]:
def client_sample(data, clients, batch_i, n_jobs, column='client_pin'):
    clients_batch = clients[batch_i::n_jobs]
    return data[data[column].isin(clients_batch)]

In [9]:
!mkdir data/partitioned

mkdir: невозможно создать каталог «data/partitioned»: Файл существует


In [10]:
clients = sorted(train_target.client_pin.unique())
for partition_i in tqdm(range(N_BATCHES)):
    train_sample = client_sample(train_target, clients, partition_i, N_BATCHES)
    test_sample = client_sample(test_session, clients, partition_i, N_BATCHES)
    joblib.dump((train_sample, test_sample), f'data/partitioned/no_parquet_{partition_i}.dump')

    del train_sample
    del test_sample    

del train_target
del test_session

HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))




# Feature generation 

In [11]:
def add_timestamp_columns(data):
    data['date'] = data['timestamp'].apply(lambda x: x.date())
    data['weekday'] = data['date'].apply(lambda x: x.weekday())
    data['hour'] = data['timestamp'].apply(lambda x: x.hour)
    data['date_i'] = data['date'].apply(lambda x: (x - date(2020, 10, 1)).days)
    
    return data

In [12]:
def load_data(partition_i):
    train, test = joblib.load(f'data/partitioned/no_parquet_{partition_i}.dump')
    return train, test


def get_last_i(s, i):
    if s.shape[0] > i:
        return s.iloc[-i-1]
    else:
        return None
    
    
def train_features(client):
    """This is used per client"""
    features = {
        'count': client.shape[0],
    }
    
    for c, tail_size in [
        ('multi_class_target', 20),
        ('weekday', 1),
        ('date_i', 3),
    ]:
        features.update(dict([(
            f'last__{c}__{i}', get_last_i(client[c], i),
        ) for i in range(tail_size)]))
        
    # Histogram over all sessions
    for c in [
        'hour',
        'weekday',
        'multi_class_target',
    ]:
        frequencies = client[c].value_counts(normalize=True).to_dict()
        features.update(dict([
            (f'{c}__{k}', v)
            for k, v in frequencies.items()
        ]))
        
    # Histogram over `tail_size` last sessions
    for tail_size in [
        10, 20, 40
    ]:
        for c in [
            'multi_class_target',
        ]:
            frequencies = client[c].tail(tail_size).value_counts(normalize=False).to_dict()
            features.update(dict([
                (f'tail{tail_size}__{c}__{k}', v)
                for k, v in frequencies.items()
            ]))
    
    # Histogram over `day_tail_size` last *days* in client history
    for day_tail_size in [
        1, 7, 14, 28
    ]:
        for c in [
            'multi_class_target',
        ]:
            last_date = get_last_i(client['date_i'], 0)
            tail = client[client.date_i > (last_date - day_tail_size)][c]
            frequencies = tail.value_counts(normalize=True).to_dict()
            features.update(dict([
                (f'day_tail{day_tail_size}__{c}__{k}', v)
                for k, v in frequencies.items()
            ]))
            
            features.update(dict([
                (f'day_tail{day_tail_size}__count', tail.shape[0])
            ]))

    return pd.Series(dict(features))

In [13]:
def generate_features(batch_i, client_session_index):
    """`client_session_index` - is a index of session to predict (starting from last in train set)
    client_session_index == -1 is used for test session
    """
    train, test = load_data(batch_i)

    train = add_timestamp_columns(train)
    test = add_timestamp_columns(test)
    
    # Needs every client sessions to be sorted ascending
    train['client_session_index'] = train.groupby('client_pin').cumcount(ascending=False)

    # Only client sessions before prediction used for features
    
    # This information is from prei
    features_prediction_moment = ['weekday', 'hour', 'date_i']
    if client_session_index >= 0:
        prediction_moment = train[train['client_session_index'] == client_session_index][
            ['client_pin', 'multi_class_target'] + features_prediction_moment
        ]        
    elif client_session_index == -1:
        prediction_moment = test[['client_pin'] + features_prediction_moment]
    else:
        assert client_session_index >= -1
    train_mask = train['client_session_index'] > client_session_index
    train = train[train_mask]

    features = pd.DataFrame(
        train.groupby('client_pin').apply(train_features)
    )
    
    features = features.reset_index()
    
    # This needs because of groupby apply aggregation magic
    features = features.pivot(index='client_pin', columns='level_1', values=0)
    
    features = features.merge(prediction_moment, on='client_pin')
    features['client_session_index'] = client_session_index
    features['session_delta_date'] = features['date_i'] - features['last__date_i__0']
    return features


def no_parquet_features_joblib(n_jobs, index):
    features_all = joblib.Parallel(n_jobs=n_jobs)(
        joblib.delayed(generate_features)(batch_i, index) 
        for batch_i in range(N_BATCHES)
    )
    
    return pd.concat(features_all, axis=0)

In [14]:
%%time

test_features = no_parquet_features_joblib(n_jobs=N_JOBS, index=-1)

CPU times: user 1.46 s, sys: 869 ms, total: 2.32 s
Wall time: 39 s


In [15]:
test_features.shape

(79268, 146)

In [16]:
len(test_features.columns)

146

In [17]:
test_features.columns

Index(['client_pin', 'count', 'day_tail14__count',
       'day_tail14__multi_class_target__card2card_transfer',
       'day_tail14__multi_class_target__card_recharge',
       'day_tail14__multi_class_target__chat',
       'day_tail14__multi_class_target__credit_info',
       'day_tail14__multi_class_target__invest',
       'day_tail14__multi_class_target__main_screen',
       'day_tail14__multi_class_target__mobile_recharge',
       ...
       'weekday__2', 'weekday__3', 'weekday__4', 'weekday__5', 'weekday__6',
       'weekday', 'hour', 'date_i', 'client_session_index',
       'session_delta_date'],
      dtype='object', length=146)

In [18]:
sessions_count = 6

In [19]:
Xy_all = []
for i in tqdm(range(sessions_count)):
    Xy = no_parquet_features_joblib(n_jobs=N_JOBS, index=i)
    Xy_all.append(Xy)

HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))




In [20]:
# In histograms could be some rare values
# They will result in features that are not present in all feature sets, we exclude them 
common_features = set(test_features.columns).intersection(
    *[set(features_.columns) for features_ in Xy_all]
)
len(common_features)

146

In [21]:
common_features

{'client_pin',
 'client_session_index',
 'count',
 'date_i',
 'day_tail14__count',
 'day_tail14__multi_class_target__card2card_transfer',
 'day_tail14__multi_class_target__card_recharge',
 'day_tail14__multi_class_target__chat',
 'day_tail14__multi_class_target__credit_info',
 'day_tail14__multi_class_target__invest',
 'day_tail14__multi_class_target__main_screen',
 'day_tail14__multi_class_target__mobile_recharge',
 'day_tail14__multi_class_target__own_transfer',
 'day_tail14__multi_class_target__phone_money_transfer',
 'day_tail14__multi_class_target__statement',
 'day_tail1__count',
 'day_tail1__multi_class_target__card2card_transfer',
 'day_tail1__multi_class_target__card_recharge',
 'day_tail1__multi_class_target__chat',
 'day_tail1__multi_class_target__credit_info',
 'day_tail1__multi_class_target__invest',
 'day_tail1__multi_class_target__main_screen',
 'day_tail1__multi_class_target__mobile_recharge',
 'day_tail1__multi_class_target__own_transfer',
 'day_tail1__multi_class_targ

# Dataset preparation

In [22]:
Xy = []
for Xy_i in Xy_all:
    Xy_i = Xy_i[[
        c for c in Xy_i.columns 
        if c in common_features or c in ['multi_class_target', 'client_session_index']
    ]]
    Xy.append(Xy_i)

Xy = pd.concat(Xy, axis=0, ignore_index=True)

In [23]:
Xy.shape

(442071, 147)

In [24]:
target_column = 'multi_class_target'
features = [
    f for f in Xy.columns 
    if f not in ['multi_class_target', 'client_session_index', 'client_pin', 'date_i']
]
categorical = [
    f for f in features 
    if f.startswith('last_')
    and not f.startswith('last__date_i__')
    and not f.startswith('last__hour__')
    and not f.startswith('last__weekday__')
]
categorical, features

(['last__multi_class_target__0',
  'last__multi_class_target__1',
  'last__multi_class_target__10',
  'last__multi_class_target__11',
  'last__multi_class_target__12',
  'last__multi_class_target__13',
  'last__multi_class_target__14',
  'last__multi_class_target__15',
  'last__multi_class_target__16',
  'last__multi_class_target__17',
  'last__multi_class_target__18',
  'last__multi_class_target__19',
  'last__multi_class_target__2',
  'last__multi_class_target__3',
  'last__multi_class_target__4',
  'last__multi_class_target__5',
  'last__multi_class_target__6',
  'last__multi_class_target__7',
  'last__multi_class_target__8',
  'last__multi_class_target__9'],
 ['count',
  'day_tail14__count',
  'day_tail14__multi_class_target__card2card_transfer',
  'day_tail14__multi_class_target__card_recharge',
  'day_tail14__multi_class_target__chat',
  'day_tail14__multi_class_target__credit_info',
  'day_tail14__multi_class_target__invest',
  'day_tail14__multi_class_target__main_screen',
  'd

In [25]:
train_mask = Xy['client_session_index'] > 0
X_train = Xy[train_mask][features].fillna(0)
y_train = Xy[train_mask][target_column]
X_eval = Xy[~train_mask][features].fillna(0)
y_eval = Xy[~train_mask][target_column]

# Building model

In [26]:
from catboost import CatBoostClassifier

params_binary = {
    'eval_metric': 'Logloss', 
    'learning_rate': 0.1,
    'iterations': 200,
    'random_state': 0,
    'max_ctr_complexity': 2,
}

In [27]:
unique_targets = sorted(y_train.value_counts().index)
unique_targets

['card2card_transfer',
 'card_recharge',
 'chat',
 'credit_info',
 'invest',
 'main_screen',
 'mobile_recharge',
 'own_transfer',
 'phone_money_transfer',
 'statement']

In [28]:
def train_models_separate(X_train, y_train, X_eval, y_eval, unique_targets):
    models = {}
    
    for t in tqdm(unique_targets):
        features_subset = list(X_train.columns)

        y_train_binary = y_train == t
        y_eval_binary = y_eval == t

        model = CatBoostClassifier(thread_count=N_JOBS, **params_binary)
        model.fit(
            X_train[features_subset], 
            y_train_binary,
            eval_set=(X_eval[features_subset], y_eval_binary),
            cat_features=categorical,
            verbose=100,
        )
        models[t] = model
        
    return models


def predict_separate(models, X, unique_targets):
    predictions = []
    for t in unique_targets:
        model = models[t]
        p = model.predict_proba(X)[:, 1]
        predictions.append(p)
        
    predictions = np.column_stack(predictions)
    
    return predictions


In [29]:
models = train_models_separate(X_train, y_train, X_eval, y_eval, unique_targets)

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

0:	learn: 0.5315007	test: 0.5307566	best: 0.5307566 (0)	total: 232ms	remaining: 46.1s
100:	learn: 0.1144550	test: 0.1126501	best: 0.1126501 (100)	total: 15.3s	remaining: 15s
199:	learn: 0.1123128	test: 0.1125189	best: 0.1125123 (198)	total: 29.5s	remaining: 0us

bestTest = 0.1125123067
bestIteration = 198

Shrink model to first 199 iterations.
0:	learn: 0.5261377	test: 0.5267078	best: 0.5267078 (0)	total: 175ms	remaining: 34.9s
100:	learn: 0.1103231	test: 0.1159296	best: 0.1159296 (100)	total: 15.5s	remaining: 15.2s
199:	learn: 0.1081999	test: 0.1156271	best: 0.1156257 (196)	total: 29.6s	remaining: 0us

bestTest = 0.1156256693
bestIteration = 196

Shrink model to first 197 iterations.
0:	learn: 0.5558937	test: 0.5566408	best: 0.5566408 (0)	total: 167ms	remaining: 33.2s
100:	learn: 0.1621855	test: 0.1676909	best: 0.1676772 (97)	total: 15.3s	remaining: 15s
199:	learn: 0.1600412	test: 0.1675070	best: 0.1674456 (173)	total: 29.8s	remaining: 0us

bestTest = 0.1674456317
bestIteration = 173


In [30]:
predictions_eval_separate = predict_separate(models, X_eval, unique_targets)
predictions_eval_separate.shape

(77631, 10)

In [31]:
def separate_models_fi(models):
    d = defaultdict(float)
    for model in models.values():
        for score, f in zip(model.get_feature_importance(), model.feature_names_):
            d[f] += score
            
    for score, f in sorted(d.items(), reverse=True, key=lambda x: x[1])[:50]:
        print(f'\t{f}: {score}')
    
separate_models_fi(models)

	51.25458569057098: session_delta_date
	39.47054587887865: count
	39.017878351430255: multi_class_target__own_transfer
	38.70406800870731: multi_class_target__credit_info
	36.09245603673381: multi_class_target__mobile_recharge
	26.738520956895485: multi_class_target__main_screen
	23.666983966564587: multi_class_target__card_recharge
	22.742532306492233: multi_class_target__card2card_transfer
	22.300502194466542: multi_class_target__phone_money_transfer
	22.162914743068782: multi_class_target__statement
	15.709608333068262: tail40__multi_class_target__mobile_recharge
	15.579705340362121: tail10__multi_class_target__main_screen
	14.905400615498145: day_tail1__multi_class_target__main_screen
	14.870408176347116: hour
	14.622719024827548: last__multi_class_target__0
	13.865865381047987: tail40__multi_class_target__card2card_transfer
	13.731302996789811: day_tail28__multi_class_target__credit_info
	13.398898913877648: day_tail1__multi_class_target__chat
	13.223416213118012: tail40__multi_cl

# Prediction post-processing

In [32]:
from sklearn.metrics import f1_score

def get_class(predictions, weights):
    return np.array(unique_targets)[(predictions * np.array([weights])).argmax(axis=1)]


def calculate_f1_score(predictions, y_eval, weights):
    prediction_class = get_class(predictions, weights)
    total_f1 = []
    for t in unique_targets:
        y_eval_binary = y_eval == t
        p_binary = prediction_class == t
        f1 = f1_score(y_eval_binary, p_binary)
        total_f1.append(f1)
        
    return (np.mean(total_f1), weights)


def find_best_weights(predictions, y_eval):
    best_weights = np.array([1.0] * len(unique_targets))
    best_f1 = 0
    for i in tqdm(range(100)):
        options = []
        for t_index in range(10):
            weights = best_weights.copy()
            weights[t_index] += 0.2
            options.append(weights)
        
        alternatives = joblib.Parallel(n_jobs=len(options))(
            joblib.delayed(calculate_f1_score)                                                
            (predictions, y_eval, weights)
            for weights in options
        )
    
        f1, best_weights = max(alternatives)
        if f1 <= best_f1:
            break
        best_f1 = f1
        
    return best_weights, f1

In [33]:
best_weights_separate, f1 = find_best_weights(predictions_eval_separate, y_eval)
f1

HBox(children=(FloatProgress(value=0.0), HTML(value='')))




0.42081715963307154

In [34]:
list(zip(unique_targets, best_weights_separate))

[('card2card_transfer', 2.4),
 ('card_recharge', 1.7999999999999998),
 ('chat', 2.6),
 ('credit_info', 1.2),
 ('invest', 1.2),
 ('main_screen', 1.0),
 ('mobile_recharge', 2.8000000000000003),
 ('own_transfer', 1.5999999999999999),
 ('phone_money_transfer', 2.1999999999999997),
 ('statement', 1.5999999999999999)]

In [35]:
prediction_class = get_class(predictions_eval_separate, best_weights_separate)
total_f1 = []
for t in unique_targets:
    y_eval_binary = y_eval == t
    p_binary = prediction_class == t
    f1 = f1_score(y_eval_binary, p_binary)
    print(f'{f1:.3f}\t{t}')
    total_f1.append(f1)
    
print(f'{np.mean(total_f1):.3f}\tf1_total')

0.304	card2card_transfer
0.396	card_recharge
0.314	chat
0.599	credit_info
0.390	invest
0.679	main_screen
0.335	mobile_recharge
0.430	own_transfer
0.315	phone_money_transfer
0.446	statement
0.421	f1_total


# Submission generation

In [36]:
%%time

predictions_test_separate = predict_separate(models, test_features[features].fillna(0), unique_targets)

CPU times: user 16.9 s, sys: 3.42 s, total: 20.4 s
Wall time: 7.62 s


In [37]:
def make_submit(predictions, test_features, best_weights, target_column):
    submission = test_features.copy()
    submission[target_column] = get_class(predictions, best_weights)
    submission = submission.reset_index()[['client_pin', target_column]]
    print(submission[target_column].value_counts())
    
    return submission

In [38]:
submission_separate = make_submit(predictions_test_separate, test_features, best_weights_separate, target_column)

main_screen             35077
statement               12365
credit_info             11701
mobile_recharge          3891
phone_money_transfer     3626
chat                     3534
card_recharge            3377
own_transfer             3126
card2card_transfer       2219
invest                    352
Name: multi_class_target, dtype: int64


In [39]:
submission_separate.to_csv(
    'submissions/submission_no_parquet.csv.gz', 
    compression='infer',
    index=False,
)

In [40]:
print('Time elapsed (hh:mm:ss.ms) {}'.format(datetime.now() - start_time))

Time elapsed (hh:mm:ss.ms) 0:11:35.044488
