In [None]:
import os
import sys

PROJECT_ROOT = os.path.join(os.path.abspath(os.path.dirname('./../../')), 'src')

sys.path.append(PROJECT_ROOT)

import warnings
warnings.filterwarnings('ignore')

%load_ext autoreload
%autoreload 2

In [2]:
from data_utils import *
from config import Config
from feature_extraction_utils import GlobalClientFeaturesExtractor
from matplotlib import pyplot as plt

**Load data**

In [3]:
%%time
cfg = Config()
ds = DataSource(cfg['data_source'])
ds.read_data()
ds.validation_split()

Reading groups_desc...
Reading train_x...
Reading test_x...
Reading train_target...
Reading test_target...
Validation split: by clientID
CPU times: user 27.3 s, sys: 4.65 s, total: 31.9 s
Wall time: 21 s


In [4]:
cfg['data_source'].keys()

dict_keys(['groups_desc', 'train_x', 'test_x', 'train_target', 'test_target'])

In [5]:
print(ds)

Dataset: groups_desc, shape: (204, 2)
Dataset: train_x, shape: (21295142, 4)
Dataset: test_x, shape: (17667328, 4)
Dataset: train_target, shape: (24145, 2)
Dataset: test_target, shape: (20000, 1)
Dataset: valid_x, shape: (5155435, 4)
Dataset: valid_target, shape: (5855, 2)


**Groups descriptions**

In [6]:
# groups = ds.get_data('groups_desc')
# groups

**Train target**

In [7]:
# train_target = ds.get_data('train_target')
# print(train_target.shape)
# train_target.head()

Train clientID distribution

In [8]:
#train_target.client_id.hist(bins=1000);

In [9]:
#train_target.bins.hist();

**Train transactions**

In [10]:
train_x = ds.get_data('train_x')
train_x.shape

(21295142, 4)

In [11]:
#train_x.head()

In [12]:
#train_x.groupby('client_id').trans_date.nunique().hist(bins=100);

In [13]:
#train_x.groupby('client_id').trans_date.max().max()#.hist(bins=100);

In [14]:
#train_x.groupby('client_id').trans_date.min().min()#.hist(bins=100);

In [15]:
#train_x.groupby('client_id').small_group.nunique().hist(bins=100);

In [16]:
#train_x.groupby('client_id').amount_rur.max().hist(bins=100);

Number of clients which spent huge amounts of money in a single transaction

In [17]:
#train_x[train_x.amount_rur > 30000].client_id.nunique()

Some groups are popular, some are not

In [18]:
#train_x.groupby('small_group').client_id.nunique().hist(bins=100)

# Features playground

In [19]:
train_x.head()

Unnamed: 0,client_id,trans_date,small_group,amount_rur
0,3694,26,1,10.006
1,3694,29,3,54.955
2,3694,31,1,10.945
3,3694,34,36,61.721
4,3694,36,25,4.579


In [32]:
from functools import partial
import pandas as pd
import numpy as np
import tqdm

PERCENTILES = (10, 33, 66, 90, 95)
HIGH_LEVEL_STATS = (np.min, np.max, np.std, np.mean, np.median, *[partial(np.percentile, q=p) for p in PERCENTILES])

def convert_dates(dates, output=None):
    if output == 'day_of_week':
        return dates % 7
    if output == 'month':
        return (dates // 30) % 12
    if output == 'quarter':
        return (dates // 90) % 4
    return dates

class GlobalClientFeaturesExtractor:
    AGG_LEVELS = (('month', 30), ('quarter', 90))
    def __init__(self, cfg=None):
        self._cfg = cfg
    
    def extract(self, df, client_id):
        transactions = df.query(f'client_id')
        n = transactions.shape[0]
        features = []
        
        ############################################
        # Dates ####################################
        ############################################
        print('Dates features extraction...')
        t_dates = transactions.trans_date
        
        unique_dates = t_dates.nunique()
        period_length = t_dates.max() - t_dates.min() + 1
        
        # fraction of transactions for a week day
        dow_dates = convert_dates(t_dates, 'day_of_week')
        week_day_fracs = np.array([(dow_dates == day_of_week).sum() / n for day_of_week in range(7)])
        # fraction of transactions for a month
        m_dates = convert_dates(t_dates, 'month')
        month_fracs = np.array([(m_dates == month).sum() / n for month in range(12)])
        # fraction of transactions for a quarter
        q_dates = convert_dates(t_dates, 'quarter')
        quarter_fracs = np.array([(q_dates == quarter).sum() / n for quarter in range(4)])
        transactions_per_day_info = transactions.groupby('trans_date').trans_date.count().describe()
        
        features.extend([
            period_length,
            unique_dates / period_length, # prob of [>=1] transactions for a given day
            *list(week_day_fracs), week_day_fracs.mean(), week_day_fracs.std(), week_day_fracs.min(), week_day_fracs.max(),
            *list(month_fracs), month_fracs.mean(), month_fracs.std(), month_fracs.min(), month_fracs.max(),
            *list(quarter_fracs), quarter_fracs.mean(), quarter_fracs.std(), quarter_fracs.min(), quarter_fracs.max(),
            *list(transactions_per_day_info)
        ])
        
        ############################################
        # Money ####################################
        ############################################
        print('Money features extraction...')
        money = transactions.amount_rur
        
        week_day_money_stats = np.array([list(money[dow_dates == day_of_week].describe()) for day_of_week in range(7)]).ravel()
        month_money_stats = np.array([list(money[m_dates == month].describe()) for month in range(12)]).ravel()
        quarter_money_stats = np.array([list(money[q_dates == quarter].describe()) for quarter in range(4)]).ravel()
        
        agg_features = []
        for name, duration in tqdm.tqdm_notebook(self.AGG_LEVELS):
            print(f'Money features, agg level: {name}')
            buckets = transactions.assign(bucket_id = transactions.trans_date // duration)
            for bucker_id, bucket_df in buckets.groupby('bucket_id'):
                bucket_money = bucket_df.amount_rur.values
                for _ in tqdm.tqdm_notebook(HIGH_LEVEL_STATS):
                    agg_features.append(_(bucket_money))
        
        features.extend([
            *list(money.describe()), *list(week_day_money_stats), *list(month_money_stats), *list(quarter_money_stats),
            *agg_features,
            # TODO(dzmr): Add more aggregations
        ])    
        
        # TODO(dzmr): Outliers
        # TODO(dzmr): Groups
        # TODO(dzmr): Clients clustering
        
        return features

In [33]:
global_extractor = GlobalClientFeaturesExtractor(None)
features = global_extractor.extract(train_x, 3694)

Dates features extraction...
Money features extraction...


HBox(children=(IntProgress(value=0, max=2), HTML(value='')))

Money features, agg level: month


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

Money features, agg level: quarter


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




In [35]:
print(len(features))
features

577


[730,
 1.0,
 0.1294300831616901,
 0.14029354676291897,
 0.1399529996090188,
 0.14317617604991786,
 0.1477848797627177,
 0.15297808298249432,
 0.1463842316712422,
 0.14285714285714285,
 0.006913239500277161,
 0.1294300831616901,
 0.15297808298249432,
 0.09243267783797826,
 0.07681728536959274,
 0.0787579627315939,
 0.08066952547205368,
 0.08331482363442329,
 0.0806593353545142,
 0.08027525714550295,
 0.07709495433277692,
 0.08714508689352717,
 0.08142082358502235,
 0.08721890654685467,
 0.09419336109615986,
 0.08333333333333333,
 0.0054767059078878935,
 0.07681728536959274,
 0.09419336109615986,
 0.2480079259391649,
 0.24464368446099116,
 0.24451529837180705,
 0.2628330912280369,
 0.25,
 0.007540368983589252,
 0.24451529837180705,
 0.2628330912280369,
 730.0,
 29171.427397260275,
 6871.238399258201,
 8900.0,
 24416.5,
 28865.5,
 33917.0,
 50387.0,
 21295142.0,
 59.33464234673788,
 169.0539297447481,
 0.043,
 11.040999999999999,
 23.791,
 54.038000000000004,
 6508.428000000001,
 2756232.