We want to share the method we used to extract a very useful set of features. This notebook creates 400-500 features, and in all our models at least 6-7 of them were among the top 10 features. We call it SequentialEncoder (implemented below as the `SequentialTransformer` class), and it produces one feature per raw feature.
For example, the `P_2_seq` feature is much more informative than `P_2_last`.


The classic way of extracting features from a sequence (in our case a sequence of length 13) is to use reduce functions such as mean, min, max, std, first, last, etc. and then feed them into LightGBM; in many cases that is good enough. But this method does not take the time dynamics of the sequence into consideration.


One of the methods to extract those dynamics is an LSTM. One can simply feed a univariate feature into an LSTM model and, using k-fold cross-validation, extract an out-of-fold prediction for each sample, then use those predictions as a feature in any tabular model. You do not have to use an LSTM - there are plenty of other architectures, such as the many variations of 1D-CNN. Those models are great for extracting signal from time-series data.


Unfortunately, in our tests those models were not able to achieve decent performance, probably due to the short length of the sequences. Instead, we chose to use a tree-based version of that approach. We extracted many features from each sequence (602 features in total), which are averages of different parts of the sequence (different positions and different lengths); we also subtracted averages from other averages to capture the derivatives of the sequence. For example `np.mean(P_2[7:12]) - np.mean(P_2[3:5])`. Then we fed all those features into LightGBM with 5 folds and used the out-of-fold (OOF) predictions as a stand-alone feature.


We extracted those features for every raw feature and also for different interactions between features (for example, `P_2-B_9`).

---

**Important:** use these features in addition to other features, not as a substitute for them. Each such feature manages to capture a lot of signal, but loses all cross-feature information.


I also want to apologize for not polishing the notebook. You can't run it here, so it doesn't make sense :) Just sharing ideas with the best community ever :)

This is not the full solution - only part. More to come... 

In [None]:
import gc
import warnings
import pathlib
import pandas as pd
import random
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder
import plotly.express as px

import joblib
import pathlib
import lightgbm as lgb
import gc
from functools import reduce
import re
import itertools
from tqdm import tqdm

In [None]:
# Notebook-wide display settings: mute warnings and let pandas print wide frames.
warnings.filterwarnings('ignore')
pd.set_option('display.width', 1000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

# Directory layout: every input and artefact lives under `root_folder`.
root_folder = pathlib.Path('choose_your_own_path')
dataset_name = 'final'
input_folder = root_folder.joinpath('input')
features_folder = root_folder.joinpath('features', dataset_name)

# Make sure the feature output directory exists before anything is written to it.
features_folder.mkdir(parents=True, exist_ok=True)

In [None]:
class CFG:
    # Name of the label column joined onto the engineered features.
    target = 'target'
    # Number of cross-validation folds (also the modulus used for fold assignment).
    n_folds = 5
    # Seed passed to `seed_everything`.
    seed = 16


def seed_everything(seed):
    """Seed Python's, NumPy's and (when importable) PyTorch's random generators.

    Also exports ``PYTHONHASHSEED``. Seeding torch is best-effort: any failure
    while importing or seeding it is reported with a message and otherwise
    ignored.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)

    try:
        import torch
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)
        torch.backends.cudnn.deterministic = True
    except Exception:
        # torch is optional for this notebook; carry on without it.
        print('torch was not seeded!')



In [None]:
def prepare_payment_rank(df):
    """Attach per-customer time-step columns to a long statement table.

    The frame is expected to have ``customer_ID`` and ``S_2`` (statement date)
    columns. The input frame is modified in place (columns are added and
    ``S_2`` is converted to datetime) before a re-indexed copy is returned.

    Returns:
        The frame indexed by ``customer_ID`` with the added columns ``rank``,
        ``total_rows``, ``cohort``, ``rank2`` and ``len_period``; the helper
        columns ``month`` and ``first_rank`` are dropped.
    """
    df.S_2 = pd.to_datetime(df.S_2)
    # Position of each statement within its customer's history, by date.
    df['rank'] = df.groupby('customer_ID').S_2.rank()
    # Largest rank per customer, broadcast back onto every row of that customer.
    df['total_rows'] = df['customer_ID'].map(df.groupby('customer_ID')['rank'].max())
    # Right-align: each customer's latest statement gets rank 13.
    df['rank'] = df['rank'] - df['total_rows'] + 13
    # First day of the statement's calendar month.
    df['month'] = pd.to_datetime(df.S_2.dt.year.astype(str) + '-' + df.S_2.dt.month.astype(str) + '-01')
    # Cohort = the customer's most recent statement month.
    df['cohort'] = df.customer_ID.map(df.groupby('customer_ID').month.max())
    # For every cohort, learn the month -> rank mapping from customers whose
    # total_rows equals 13.
    month_dict = {cohort: df.loc[lambda dx: dx.cohort.eq(cohort)].loc[lambda dx: dx.total_rows.eq(13)].loc[:,
                          ['rank', 'month']].drop_duplicates().set_index('month').to_dict()['rank'] for cohort in
                  df.cohort.unique()}
    # Re-derive every row's rank from its calendar month using the cohort's
    # mapping, so rank reflects the month rather than the row position.
    # NOTE(review): months missing from a cohort's mapping presumably end up
    # as NaN here (Series.map with a dict) - confirm this is intended.
    for cohort in df.cohort.unique():
        df.loc[lambda dx: dx.cohort.eq(cohort), 'rank'] = df.loc[lambda dx: dx.cohort.eq(cohort)].month.map(
            month_dict[cohort])

    # rank2 restarts at 1 on each customer's earliest rank.
    df['first_rank'] = df.customer_ID.map(df.groupby(['customer_ID'])['rank'].min())
    df['rank2'] = df['rank'] - df['first_rank'] + 1
    df = df.set_index('customer_ID')
    # Largest rank2 per customer; the assignment aligns on the customer_ID index.
    df['len_period'] = df.groupby('customer_ID')['rank2'].max()
    return df.drop(columns=['month', 'first_rank'])
  
    
def numeric_feature_eng(df, num_features, funcs):
    """Aggregate numeric columns per ``customer_ID``.

    Args:
        df: Frame containing a ``customer_ID`` column (or index level).
        num_features: Columns to aggregate.
        funcs: Aggregations passed straight to ``GroupBy.agg``.

    Returns:
        One row per customer; each resulting column label is flattened by
        joining its parts with ``_`` (e.g. ``P_2_mean``).
    """
    per_customer = df.groupby("customer_ID")[num_features]
    aggregated = per_customer.agg(funcs)
    aggregated.columns = list(map('_'.join, aggregated.columns))
    return aggregated


def categorical_feature_eng(df, cat_features, funcs):
    """Aggregate categorical columns per ``customer_ID``.

    Args:
        df: Frame containing a ``customer_ID`` column (or index level).
        cat_features: Columns to aggregate.
        funcs: Aggregations passed straight to ``GroupBy.agg``.

    Returns:
        One row per customer; each resulting column label is flattened by
        joining its parts with ``_`` (e.g. ``D_63_last``).
    """
    per_customer = df.groupby("customer_ID")[cat_features]
    aggregated = per_customer.agg(funcs)
    aggregated.columns = list(map('_'.join, aggregated.columns))
    return aggregated


def pre_flattening(df):
    """Add ``<col>-P_2`` and ``<col>-P_3`` difference columns in place.

    For each of a fixed set of base columns that is present in ``df``, the
    ``P_2`` and ``P_3`` columns are subtracted from it and stored under the
    name ``'<col>-<P col>'``. Base columns that are absent are skipped.

    Returns:
        The same (mutated) frame.
    """
    base_columns = ['B_11', 'B_14', 'B_17', 'D_39', 'D_131', 'S_16', 'S_23']
    payment_columns = ['P_2', 'P_3']

    for base in base_columns:
        if base not in df.columns:
            continue
        for payment in payment_columns:
            df[f'{base}-{payment}'] = df[base] - df[payment]
    return df

def check_input(arr):
    """Coerce a label/score container to a one-dimensional NumPy array.

    Accepts a DataFrame (its first column is used, by position), a Series,
    a NumPy array, or any plain sequence such as a list. Subclasses of
    DataFrame/Series are handled like their base classes.

    Args:
        arr: Values to normalise.

    Returns:
        ``numpy.ndarray``; for inputs with more than one dimension the first
        column (``[:, 0]``) is returned.
    """
    if isinstance(arr, pd.DataFrame):
        # Positional access also works when column labels are duplicated.
        arr = arr.iloc[:, 0]

    if isinstance(arr, pd.Series):
        arr = arr.values

    # Lists/tuples have no `.shape`; an ndarray passes through without a copy.
    arr = np.asarray(arr)

    if arr.ndim > 1:
        arr = arr[:, 0]

    return arr


def gini(cs_0, cs_1, sum_0, sum_1):
    """Gini component of the score.

    Args:
        cs_0: Cumulative sum of ``sum_0``.
        cs_1: Cumulative sum of ``sum_1``.
        sum_0: Per-sample weights of the negative class, in score order.
        sum_1: Per-sample indicators of the positive class, in score order.

    Returns:
        ``2 * AUC - 1`` as a Python float, where the AUC is accumulated from
        the cumulative sums.
    """
    weighted_area = ((cs_0 - sum_0 / 2) * sum_1).sum()
    normaliser = cs_0[-1] * cs_1[-1]
    auc = float(weighted_area / normaliser)
    return 2 * auc - 1


def recall_at4(cs_0, cs_1, sum_1):
    """Recall component of the score.

    Args:
        cs_0: Cumulative weight of the negative class, in score order.
        cs_1: Cumulative count of the positive class, in score order.
        sum_1: Per-sample positive indicators, in score order.

    Returns:
        Share of all positives that sit at or beyond 96% of the total
        cumulative weight, as a Python float.
    """
    cumulative_weight = cs_0 + cs_1
    cutoff = cumulative_weight[-1] * 0.96
    in_top_slice = cumulative_weight >= cutoff
    captured = sum_1[in_top_slice].sum()
    return float(captured / cs_1[-1])
    
    
def _amex_score(y_true, y_pred):
    """NumPy implementation of the score for one orientation of the predictions.

    Labels are ordered by ascending prediction, negatives are weighted by 20,
    and the result is the mean of the ``gini`` and ``recall_at4`` components.
    """
    labels = check_input(y_true)
    scores = check_input(y_pred)

    # Positive indicators and weighted negatives, both in ascending score order.
    sum_1 = labels[scores.argsort()]
    sum_0 = (1 - sum_1) * 20

    cs_0 = np.cumsum(sum_0, dtype=np.float64)
    cs_1 = np.cumsum(sum_1, dtype=np.float64)

    gini_part = gini(cs_0, cs_1, sum_0, sum_1)
    recall_part = recall_at4(cs_0, cs_1, sum_1)

    return (gini_part + recall_part) / 2
  
def amex_score(y_true, y_pred):
    """Return the better of the scores for ``y_pred`` and for ``-y_pred``.

    Evaluating both orientations makes the result independent of whether a
    higher prediction means a higher or a lower risk.
    """
    as_given = _amex_score(y_true, y_pred)
    flipped = _amex_score(y_true, -y_pred)
    return np.max([as_given, flipped])
  
    
def generate_features(data: pd.DataFrame, feature_name: str):
    """Build window-average and window-difference columns for one sequence feature.

    Args:
        data: Long-format frame with a ``rank`` column (time step) and a
            ``feature_name`` column. Its index is expected to be
            ``customer_ID``, because the fold assignment reads that name
            after ``reset_index``.
        feature_name: Column of ``data`` to expand.

    Returns:
        Wide frame with one row per index entry, holding the raw per-rank
        values, the window averages and their pairwise differences (all
        prefixed with ``<feature_name>_``), plus a ``fold_id`` column.
        Column names are sanitised to ``[A-Za-z0-9_]``.
    """
    # Pivot to wide form: one column per distinct rank value.
    dt = data.set_index('rank', append=True).loc[:, feature_name].unstack()
    # metadata[col] records the span of time steps a column covers.
    metadata = dict()

    # Raw columns cover a single time step, numbered from 1 in column order.
    for idx, col in enumerate(dt.columns, start=1):
        metadata[col] = dict(min_ts=idx, max_ts=idx)
    df = []
    # Averages over every positional window [s, e) spanning at least 3 columns.
    for e in range(1, 14):
        for s in range(1, e):
            if e - s < 3:
                continue
            name = f'avg({s}:{e})'
            # NOTE(review): iloc[:, s:e] selects 0-based positions s..e-1, i.e.
            # time steps s+1..e in the 1-based numbering used for the raw
            # columns above, while min_ts is recorded as s. Confirm whether
            # this one-step offset (and never averaging the first column) is
            # intended before changing it - saved models depend on it.
            df.append(dt.iloc[:, s:e].mean(1).to_frame(name))
            metadata[name] = dict(min_ts=s, max_ts=e)

    df = pd.concat(df, axis=1)
    df = df.join(dt)
    # Differences "later minus earlier": keep only ordered pairs where col1
    # starts strictly after col2 ends according to the metadata.
    # permutations() takes the column list as it is at this point, so the
    # sub(...) columns added inside the loop are not paired again.
    for col1, col2 in itertools.permutations(df.columns, 2):
        if metadata[col1]['min_ts'] <= metadata[col2]['max_ts']:
            continue
        df[f'sub({col1}, {col2})'] = df[col1] - df[col2]
        all_ts = [metadata[col1]['min_ts'], metadata[col1]['max_ts'], metadata[col2]['min_ts'],
                  metadata[col2]['max_ts']]

        metadata[f'sub({col1}, {col2})'] = dict(min_ts=np.min(all_ts), max_ts=np.max(all_ts))
    df = df.add_prefix(f'{feature_name}_')
    print(f'shape: {df.shape}')

    # Fold = last 16 characters of customer_ID read as a hex number, modulo
    # the number of folds.
    df['fold_id'] = df.reset_index().customer_ID.str[-16:].apply(lambda x: int(x, 16) % CFG.n_folds).values

    # Replace every run of characters outside [A-Za-z0-9_] with '_'
    # (presumably because LightGBM rejects special characters in feature
    # names - confirm).
    df = df.rename(columns=lambda x: re.sub('[^A-Za-z0-9_]+', '_', x))
    return df


In [None]:
# Seed the random number generators once, before any model is trained.
seed_everything(CFG.seed)

In [None]:
class SequentialTransformer:
    """K-fold LightGBM model that condenses one sequence feature into one score.

    ``fit`` trains one binary LightGBM booster per fold on the engineered
    columns of a single sequence feature (rows are assigned to folds through
    the ``fold_id`` column of ``X``). ``predict`` averages the predictions of
    all fold boosters.
    """

    def __init__(self, seed, n_folds, target, feature_name):
        """Store the configuration; nothing is trained here.

        Args:
            seed: Seed passed to LightGBM.
            n_folds: Number of folds; ``fold_id`` values are expected to be
                ``0 .. n_folds - 1``.
            target: Name of the label column.
            feature_name: Name given to the prediction column.
        """
        # One trained booster per fold, in fold-id order.
        self.models = []
        self.seed = seed
        self.n_folds = n_folds
        self.target = target
        self.feature_name = feature_name
        # Model input columns, filled in by `fit`.
        self.features = []

    def _select_features(self, columns):
        """Return the model input columns: all except ``fold_id`` and the label."""
        excluded = {'fold_id', self.target}
        return [col for col in columns if col not in excluded]

    @staticmethod
    def _early_stopping_kwargs(stopping_rounds=50, log_period=100):
        """Return the ``lgb.train`` keyword arguments for early stopping and logging.

        Newer LightGBM releases only accept callbacks, older ones only know
        the ``early_stopping_rounds`` / ``verbose_eval`` keywords; pick
        whichever the installed version provides. Callbacks are created
        afresh on every call so that no state leaks between folds.
        """
        if hasattr(lgb, 'early_stopping') and hasattr(lgb, 'log_evaluation'):
            return {'callbacks': [lgb.early_stopping(stopping_rounds=stopping_rounds),
                                  lgb.log_evaluation(period=log_period)]}
        return {'early_stopping_rounds': stopping_rounds, 'verbose_eval': log_period}

    def fit(self, X, y):
        """Train one booster per fold and print per-fold and overall OOF scores.

        Args:
            X: Feature frame that includes a ``fold_id`` column.
            y: Label Series named ``self.target``, aligned with ``X`` by index.

        Returns:
            ``self``.
        """
        df = X.join(y)
        self.features = self._select_features(df.columns)
        # Start from scratch so a second `fit` does not stack extra boosters
        # (callers pair `self.models[i]` with fold i).
        self.models = []

        params = {
            'objective': 'binary',
            'metric': "binary_logloss",
            'seed': self.seed
        }

        print(f'numerical features ({len(self.features)}): ')
        for i in range(0, len(self.features), 10):
            print(f'\t{self.features[i:i + 10]}')
        oof = []
        for fold_id in range(self.n_folds):
            print(' ')
            print('-' * 50)
            print(f'Training fold {fold_id} with {len(self.features)} features...')

            x_trn = df.loc[lambda dx: dx.fold_id.ne(fold_id), self.features]
            x_val = df.loc[lambda dx: dx.fold_id.eq(fold_id), self.features]
            y_trn = df.loc[lambda dx: dx.fold_id.ne(fold_id), self.target]
            y_val = df.loc[lambda dx: dx.fold_id.eq(fold_id), self.target]

            lgb_train = lgb.Dataset(x_trn, y_trn)
            lgb_valid = lgb.Dataset(x_val, y_val)

            # The round limit is only an upper bound; early stopping on the
            # validation fold decides the actual number of trees.
            model = lgb.train(params=params,
                              train_set=lgb_train,
                              num_boost_round=100000,
                              valid_sets=[lgb_train, lgb_valid],
                              **self._early_stopping_kwargs()
                              )
            val_pred = model.predict(x_val)
            score = amex_score(y_val, val_pred)
            print(f'Our fold {fold_id} CV score is {score}')
            oof.append(pd.DataFrame(val_pred, index=x_val.index, columns=[self.feature_name]))
            self.models.append(model)
        oof = pd.concat(oof)
        r = oof.join(y)
        # Overall out-of-fold score across all folds.
        print(amex_score(r[self.target], r[self.feature_name]))
        return self

    def predict(self, X):
        """Average the predictions of all fold boosters.

        Args:
            X: Frame containing every column listed in ``self.features``.

        Returns:
            DataFrame with a single ``self.feature_name`` column holding the
            mean prediction per index value (grouped by the first index level).
        """
        predictions = []
        for model in self.models:
            predictions.append(pd.DataFrame(model.predict(X.loc[:, self.features]),
                                            index=X.index,
                                            columns=[self.feature_name]))
        predictions = pd.concat(predictions).groupby(level=0).mean()
        return predictions

In [None]:
# Load the training statements and add the per-customer rank columns;
# `prepare_payment_rank` returns the frame indexed by customer_ID.
train = pd.read_parquet(input_folder / 'train.parquet').pipe(prepare_payment_rank)
# Labels, indexed by customer_ID so they can be joined onto the features.
train_labels = pd.read_csv(input_folder / 'train_labels.csv').set_index('customer_ID')

In [None]:
# Pairwise difference features to build, each written as '<first>-<second>':
# `create_new_seq_set` splits a name on '-' and subtracts the second column
# from the first.
new_features = ['D_51-R_7', 'P_2-D_72', 'D_41-B_33', 'P_2-S_17', 'B_2-R_1', 'B_2-R_2', 'D_51-S_23', 'R_2-B_6', 'B_5-S_22', 'D_41-B_10', 'P_2-D_109',
                'P_2-B_22', 'P_2-R_17', 'R_5-D_115', 'P_2-D_111', 'P_2-D_41', 'P_2-S_16', 'P_2-B_28', 'P_2-R_8', 'D_44-D_47', 'D_51-D_65', 'P_2-R_10',
                'B_2-D_58', 'B_12-D_131', 'B_2-B_9', 'P_2-B_3', 'D_81-D_115', 'D_71-R_7', 'P_2-R_18', 'P_2-B_26', 'D_51-S_5', 'P_2-B_1', 'R_4-D_115',
                'R_6-B_33', 'D_45-R_4', 'P_2-S_23', 'P_2-D_93', 'B_6-B_9', 'S_23-P_3', 'B_25-D_112', 'D_41-D_45', 'P_2-R_7', 'P_2-D_87', 'P_2-B_31',
                'B_5-S_24', 'D_45-D_84', 'P_2-B_21', 'D_52-D_65', 'P_2-R_15', 'P_2-D_133', 'R_2-S_25', 'B_2-D_41', 'D_47-R_4', 'S_5-B_12', 'S_23-P_2',
                'R_2-D_52', 'D_51-B_26', 'P_2-B_9', 'S_16-P_2', 'D_78-B_33', 'D_45-R_5', 'D_51-R_6', 'D_71-R_8', 'D_45-B_22', 'S_16-P_3', 'R_2-D_71',
                'B_2-D_84', 'D_47-B_22', 'P_2-D_83', 'P_2-R_22', 'P_2-D_78', 'P_2-R_25', 'P_2-R_2', 'R_1-D_52', 'P_2-R_19', 'D_47-R_5', 'P_2-D_140',
                'P_2-B_32', 'B_2-D_44', 'D_51-D_79', 'B_8-D_112', 'P_2-S_20', 'P_2-D_138', 'B_12-S_24', 'D_71-D_81', 'R_2-B_10', 'D_45-D_72', 'D_52-R_5',
                'D_52-R_6', 'B_9-D_52', 'D_44-B_33', 'P_2-B_37', 'D_52-S_23', 'D_41-D_47', 'P_2-B_27', 'R_2-D_47', 'B_17-P_2', 'P_2-R_24', 'D_51-R_5',
                'P_2-R_5', 'D_47-D_78', 'D_39-P_3', 'B_5-B_9', 'B_9-B_12', 'B_14-P_2', 'D_71-D_72', 'D_51-R_10', 'D_51-D_84', 'B_17-P_3', 'D_45-D_65',  
                'D_131-P_2', 'R_1-D_47',  'D_71-D_84', 'D_45-B_9', 'D_45-R_2', 'D_52-B_22',  'D_52-D_84', 'D_84-B_33', 'D_41-B_18', 'D_51-B_14', 'B_9-D_54', 
                'S_5-S_13', 'R_5-D_71','B_2-S_23', 'B_9-B_10', 'D_39-P_2',  'D_45-R_10', 'D_51-S_24', 'B_22-D_112',   'D_51-D_81', 'R_8-D_115', 'B_12-D_79', 
                'R_1-B_6',  'D_51-R_4',  'B_9-B_18', 'B_12-B_14',  'R_4-D_71', 'D_45-R_7',  'S_8-S_22', 'B_33-S_23', 'B_11-P_3', 'S_13-S_24',   'R_2-D_51', 
                'R_1-B_33','S_8-S_24', 'R_2-B_18',  'D_51-D_72',   'D_52-B_28', 'D_51-R_26', 'D_47-D_65', 'R_2-B_33', 'D_72-D_115',  'R_1-B_18', 'R_8-D_121',  
                'D_51-B_9', 'B_11-P_2', 'D_131-P_3', 'D_41-D_52', 'D_52-D_78', 'R_1-D_45', 'D_47-S_23', 'D_45-R_6',  'D_47-R_6', 'D_47-D_84', 'D_45-D_78', 
                'D_47-B_9', 'R_1-B_10', 'D_71-R_10', 'B_9-B_33', 'D_47-R_10',   'D_45-S_23', 'D_45-D_81', 'B_14-P_3', 'D_45-B_26',  'P_2-B_11']

In [None]:
def create_new_seq_set(df):
    """Build the frame of pairwise difference sequences listed in ``new_features``.

    Each name in the module-level ``new_features`` list has the form
    ``'<first>-<second>'``; the resulting column holds ``df[first] - df[second]``.

    Returns:
        A new frame with the ``rank`` column of ``df`` followed by one column
        per entry of ``new_features``. Progress (each name, then the final
        shape) is printed.
    """
    def _difference(pair_name):
        print(pair_name)
        minuend, subtrahend = pair_name.split('-')
        return (df[minuend] - df[subtrahend]).to_frame(pair_name)

    pieces = [df.loc[:, ['rank']]]
    pieces.extend(_difference(pair_name) for pair_name in new_features)
    combined = pd.concat(pieces, axis=1)
    print(combined.shape)
    return combined

In [None]:
# Replace the raw training frame with the 'rank' column plus the difference
# features, then attach the labels by index.
train = create_new_seq_set(train).join(train_labels)

In [None]:
# Train and persist one SequentialTransformer per sequence feature.
# 'rank' (the time-step helper) and the label are columns of `train` but are
# not features: 'rank' cannot be expanded by `generate_features`, and a model
# on the label would only be trained on the label itself. Skip both rather
# than let them fail inside the loop.
train_feature_columns = [col for col in train.columns if col not in ('rank', CFG.target)]
for idx, feature_name in enumerate(train_feature_columns):
    try:
        print(f'starting {feature_name} {idx}/{len(train_feature_columns)}')
        model_name = feature_name
        models_folder = root_folder / 'models' / 'transactional' / model_name
        oof_folder = root_folder / 'oof' / 'transactional' / model_name
        predictions_folder = root_folder / 'predictions' / 'transactional' / model_name
        model_path = models_folder / 'model.pkl'
        # Resume support: a feature whose model is already on disk is skipped.
        if model_path.exists():
            continue

        df = generate_features(data=train, feature_name=feature_name)
        df = df.join(train_labels)
        X = df.drop(columns=[CFG.target])
        y = df.loc[:, CFG.target]

        # NOTE(review): the model seed is 42 while CFG.seed is 16 - kept as is
        # so results stay reproducible; confirm whether that is intended.
        model = SequentialTransformer(seed=42, n_folds=CFG.n_folds, target=CFG.target, feature_name=feature_name)
        model.fit(X, y)

        models_folder.mkdir(parents=True, exist_ok=True)
        oof_folder.mkdir(parents=True, exist_ok=True)
        predictions_folder.mkdir(parents=True, exist_ok=True)
        joblib.dump(model, model_path)
        gc.collect()
    except Exception as e:
        # Best effort: report the failure and move on to the next feature.
        print(f'failed to process {feature_name}')
        print(e)

In [None]:
# Recompute and store the out-of-fold prediction of every trained model.
# 'rank' and the label are columns of `train` but never have a model, so they
# are skipped instead of failing inside the loop.
train_feature_columns = [col for col in train.columns if col not in ('rank', CFG.target)]
for idx, feature_name in enumerate(train_feature_columns):
    try:
        print(f'starting {feature_name} {idx}/{len(train_feature_columns)}')
        model_name = feature_name

        models_folder = root_folder / 'models' / 'transactional' / model_name
        oof_folder = root_folder / 'oof' / 'transactional' / model_name

        # Resume support: skip features whose OOF file already exists.
        if (oof_folder / 'oof.parquet').exists():
            continue
        model = joblib.load(models_folder / 'model.pkl')
        X = generate_features(data=train, feature_name=feature_name)

        oof = []
        # models[i] was trained with fold i held out, so it predicts fold i.
        for fold_id, m in enumerate(model.models):
            x_val = X.loc[lambda dx: dx.fold_id.eq(fold_id), model.features]
            val_pred = m.predict(x_val)
            oof.append(pd.DataFrame(val_pred, index=x_val.index, columns=[feature_name]))
        oof = pd.concat(oof)
        # Do not rely on another cell having created the output folder.
        oof_folder.mkdir(parents=True, exist_ok=True)
        oof.to_parquet(oof_folder / 'oof.parquet')
    except Exception as e:
        # Best effort: report the failure and move on to the next feature.
        print(f'failed to predict {feature_name}')
        print(e)

In [None]:
# The test set is stored as 16 parquet chunks, one per hexadecimal digit.
# Read them all, stack them, and add the per-customer rank columns.
test = pd.concat(
    [pd.read_parquet(input_folder / f'test_chunk_{chunk_id}.parquet') for chunk_id in '0123456789abcdef']
).pipe(prepare_payment_rank)

# Keep only 'rank' plus the difference features, as for the training set.
test = create_new_seq_set(test)

In [None]:
# Scratch cell: the loop breaks on its first iteration, so it only binds
# `idx` to 0 and `feature_name` to the second column of `test` (the first
# column is skipped by the `[1:]` slice).
for idx, feature_name in enumerate(test.columns[1:]):
    break
print(f'starting {feature_name} {idx}/{len(test.columns)}')
model_name = feature_name

# Per-feature artefact folders for that single feature.
models_folder = root_folder / 'models' / 'transactional' / model_name
oof_folder = root_folder / 'oof' / 'transactional' / model_name
predictions_folder = root_folder / 'predictions' / 'transactional' / model_name

models_folder.mkdir(parents=True, exist_ok=True)
oof_folder.mkdir(parents=True, exist_ok=True)
predictions_folder.mkdir(parents=True, exist_ok=True)

In [None]:
# Predict every sequence feature on the test set with its trained model.
# 'rank' is a helper column of `test`, not a feature: it has no model, so it
# is skipped instead of creating folders for it and then failing.
test_feature_columns = [col for col in test.columns if col not in ('rank', CFG.target)]
for idx, feature_name in enumerate(test_feature_columns):
    print(f'starting {feature_name} {idx}/{len(test_feature_columns)}')
    try:
        model_name = feature_name
        models_folder = root_folder / 'models' / 'transactional' / model_name
        oof_folder = root_folder / 'oof' / 'transactional' / model_name
        predictions_folder = root_folder / 'predictions' / 'transactional' / model_name
        savepath = predictions_folder / 'predictions.parquet'
        # Resume support: skip features that already have predictions.
        if savepath.exists():
            continue
        models_folder.mkdir(parents=True, exist_ok=True)
        oof_folder.mkdir(parents=True, exist_ok=True)
        predictions_folder.mkdir(parents=True, exist_ok=True)
        model = joblib.load(models_folder / 'model.pkl')
        x_tst = generate_features(data=test, feature_name=feature_name)
        predictions = pd.DataFrame(model.predict(x_tst), index=x_tst.index, columns=[feature_name])
        # customer_ID is written as a regular column; the aggregation step
        # sets it back as the index after reading.
        predictions.reset_index().to_parquet(savepath)
    except Exception as e:
        # Best effort: report the failure and move on to the next feature.
        print(f'failed to predict {feature_name}')
        print(e)

In [None]:
# Gather every per-feature OOF file into one training feature table.
# The paths are sorted because `rglob` yields them in no guaranteed order;
# sorting keeps the column order reproducible and, when the same features
# exist on both sides, identical between the train and test tables.
oof_folder = root_folder / 'oof' / 'transactional'
oof = [pd.read_parquet(p) for p in tqdm(sorted(oof_folder.rglob('*.parquet')))]
oof = pd.concat(oof, axis=1).add_suffix('_seq')
oof.to_parquet(features_folder / 'train_seq.parquet')

# ##

# Same for the test predictions; customer_ID was stored as a regular column,
# so it is restored as the index before the column-wise concat.
predictions_folder = root_folder / 'predictions' / 'transactional'
predictions = [pd.read_parquet(p).set_index('customer_ID')
               for p in tqdm(sorted(predictions_folder.rglob('*.parquet')))]
predictions = pd.concat(predictions, axis=1).add_suffix('_seq')
predictions.to_parquet(features_folder / 'test_seq.parquet')