In [1]:
import pandas as pd
import numpy as np
import plotly
from pathlib import Path
import plotly.express as px
import plotly.graph_objects as go
from tqdm.auto import tqdm

from sklearn.metrics import f1_score
from sklearn.model_selection import KFold
from lightgbm import LGBMClassifier
import joblib

In [2]:
df = pd.read_csv(Path('../data/train.csv'))

In [3]:
df = df.sort_values(['id', 'time'])

In [4]:
samples = df['id'].unique()
WINDOW_SIZE = 25
HALF_WINDOW = int((WINDOW_SIZE-1)/2)
QART_WINDOW = int((WINDOW_SIZE-1)/4)
LOW_BOUND = 380
UPPER_BOUND = 1400

In [5]:
def pop_arr(arr, num):
    s = len(arr)
    return np.resize(np.repeat(np.resize(arr[:s], (1,s)), num, axis=0), (num*s,))

In [6]:
def timedomain(rr, mask):
    results = {}
    
    rr = rr[mask==1]
    
    if len(rr) > 1:
        hr = 60000/rr
    
        results['mean_RR'] = np.mean(rr)
        results['std_rr_sdnn'] = np.std(rr)
        results['mean_hr_kubious'] = 60000/np.mean(rr)
        results['mean_hr'] = np.mean(hr)
        results['std_hr'] = np.std(hr)
        results['min_hr'] = np.min(hr)
        results['max_hr'] = np.max(hr)
        results['rmssd'] = np.sqrt(np.mean(np.square(np.diff(rr))))
        results['nn_xx'] = np.sum(np.abs(np.diff(rr)) > 50)*1
        results['pnn_xx'] = 100 * np.sum((np.abs(np.diff(rr)) > 50)*1) / len(rr)
    else:
        results['mean_RR'] = 0
        results['std_rr_sdnn'] = 0
        results['mean_hr_kubious'] = 0
        results['mean_hr'] = 0
        results['std_hr'] = 0
        results['min_hr'] = 0
        results['max_hr'] = 0
        results['rmssd'] = 0
        results['nn_xx'] = 0
        results['pnn_xx'] = 0

    return results

In [7]:
new_df = []
for sample in tqdm(samples):
    # get array
    loc_df = df[df.id==sample].sort_values('time').reset_index(drop=True)
    arr = loc_df['x'].values
    
    target = loc_df['y'].values
    
    time = loc_df['time'].values
    
    mask = ((arr>LOW_BOUND) & (arr < UPPER_BOUND)).astype(int)
    # ext array with 8 points to each direction
    
    arr = np.concatenate((
        pop_arr(arr[:2], QART_WINDOW),
        arr,
        pop_arr(arr[-2:], QART_WINDOW),
    ))
    
    mask = np.concatenate((
        pop_arr([0], HALF_WINDOW),
        mask,
        pop_arr([0], HALF_WINDOW)
    ))
    for i in range(HALF_WINDOW+1, len(arr)-HALF_WINDOW+1):
        local_window = arr[i-HALF_WINDOW-1:i+HALF_WINDOW+2]
        local_mask = mask[i-HALF_WINDOW-1:i+HALF_WINDOW+2]
        new_df.append({'id': sample, 
                       'ts': time[i-HALF_WINDOW-1],
                       **{f"x_{x}":y for x,y in zip(range(-8,9), local_window)},
                      'target': target[i-HALF_WINDOW-1],
                       **{f"mask_{x}":y for x,y in zip(range(-8,9), local_mask)},
                       **timedomain(local_window, local_mask),
                       **{f"delta_{x}":y for x,y in zip(range(WINDOW_SIZE-1), np.diff(local_window))},
                      })


  0%|          | 0/229 [00:00<?, ?it/s]

In [8]:
new_df=pd.DataFrame(new_df)

In [9]:
new_df.shape

(60487, 71)

In [10]:
from sklearn.metrics import roc_curve, precision_recall_curve, f1_score

def threshold_search(y_true, y_proba):
    precision , recall, thresholds = precision_recall_curve(y_true, y_proba)
    thresholds = np.append(thresholds, 1.001) 
    F = 2 / (1 / (precision + 1e-18) + 1 / (recall + 1e-18))
    F[F>1.0] = 0
    best_score = np.max(F)
    best_th = thresholds[np.argmax(F)]
    return best_th , best_score

In [11]:
lgbm_params = {
    'class_weight': 'balanced',
    'colsample_bytree': 0.7601761436423559,
    'learning_rate': 0.0032229857347791584,
    'min_child_samples': 35,
    'min_split_gain': 0.23401305259347366,
    'num_leaves': 78,
    'reg_alpha': 0.012458045234238267,
    'reg_lambda': 1.1061281247384551,
    'subsample': 0.6527164692215225,
    'subsample_freq': 9,
    'boosting_type': 'gbdt',  
    'objective': 'binary',  
    'n_estimators': 3000, 
    'random_state': 13
}

In [12]:
drop_cols = ['id', 'ts', 'target']
traget_col = 'target'

In [13]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)
inner_kf = KFold(n_splits=5, shuffle=True, random_state=239)

df_ids = df['id'].unique()
folds_data = []

predicts = dict()

for outer_fold, (train_id, val_id) in enumerate(kf.split(df_ids)):

    outer_train_samples = df_ids[train_id]
    outer_val_samples = df_ids[val_id]
    
    outer_train_df = new_df[new_df.id.isin(outer_train_samples)]
    outer_val_df = new_df[new_df.id.isin(outer_val_samples)]
    
    predicts[outer_fold] = []
    
    for inner_fold, (inner_train_id, inner_val_id) in enumerate(inner_kf.split(outer_train_samples)):
        model = LGBMClassifier(**lgbm_params)
        inner_train_samples = outer_train_samples[inner_train_id]
        inner_val_samples = outer_train_samples[inner_val_id]
        
        inner_train_df = new_df[new_df.id.isin(inner_train_samples)]
        inner_val_df = new_df[new_df.id.isin(inner_val_samples)].reset_index(drop=True)
    
        model.fit(inner_train_df.drop(columns=drop_cols), inner_train_df[traget_col],
                  eval_set = (outer_val_df.drop(columns=drop_cols),
                              outer_val_df[traget_col]),
                  early_stopping_rounds=150,
                  eval_metric='logloss',
                  verbose=False)
        
        joblib.dump(model, f"lgbm_model_o{outer_fold}_i{inner_fold}.joblib")
        
        valid_y_true = outer_val_df[traget_col].values
        valid_y_proba = model.predict_proba(outer_val_df.drop(columns=drop_cols))[:,1]
        best_th , best_score = threshold_search(valid_y_true, valid_y_proba)
                     

        inner_test_y_true = inner_val_df[traget_col].values.astype(int)
        inner_test_y_predict = model.predict_proba(inner_val_df.drop(columns=drop_cols))[:,1]
        inner_val_df['predict'] = inner_test_y_predict
        
        predicts[outer_fold].append(inner_val_df[['id', 'ts', 'predict']])
        
        score = f1_score(inner_test_y_true, (inner_test_y_predict > best_th).astype(int))
        
        fold_data = {"outer_fold": outer_fold, 
                     "inner_fold": inner_fold, 
                     "th": best_th, 
                     "test_f1": score, 
                     "val_f1": best_score}
        folds_data.append(fold_data)
        print(fold_data)


{'outer_fold': 0, 'inner_fold': 0, 'th': 0.5657908885761026, 'test_f1': 0.8664987405541561, 'val_f1': 0.8413954851949575}
{'outer_fold': 0, 'inner_fold': 1, 'th': 0.5529787833180821, 'test_f1': 0.8620462046204621, 'val_f1': 0.8364365511314809}
{'outer_fold': 0, 'inner_fold': 2, 'th': 0.6557861618651933, 'test_f1': 0.7162764771460425, 'val_f1': 0.8505611340815121}
{'outer_fold': 0, 'inner_fold': 3, 'th': 0.5979417937866851, 'test_f1': 0.8760669170365312, 'val_f1': 0.8397660818713448}
{'outer_fold': 0, 'inner_fold': 4, 'th': 0.594316378026916, 'test_f1': 0.8688109866281172, 'val_f1': 0.8302872062663185}
{'outer_fold': 1, 'inner_fold': 0, 'th': 0.7265470665979523, 'test_f1': 0.8615494978479196, 'val_f1': 0.7608591885441527}
{'outer_fold': 1, 'inner_fold': 1, 'th': 0.671881389666264, 'test_f1': 0.862937062937063, 'val_f1': 0.7586044318717586}
{'outer_fold': 1, 'inner_fold': 2, 'th': 0.6974402504102639, 'test_f1': 0.8510329829648423, 'val_f1': 0.7580908032596042}
{'outer_fold': 1, 'inner_fo

In [14]:
results = pd.DataFrame(folds_data)

In [15]:
results.test_f1.mean()

0.8368666489626839

In [16]:
results.val_f1.mean()

0.8407358895450208

In [18]:
results.th.mean()

0.6183858871781912

In [52]:
df_feat = df[['id', 'time']].copy()

In [53]:
for i,v in predicts.items():
    part_df = pd.concat(v)
    part_df.columns = ['id', 'time', i]
    df_feat = df_feat.merge(part_df, how='left', left_on=['id', 'time'], right_on=['id', 'time'])

In [58]:
df_feat

Unnamed: 0,id,time,0,1,2,3,4
0,1,0,0.014039,0.057720,,0.029558,0.010225
1,1,780,0.022028,0.059655,,0.030751,0.004688
2,1,1572,0.013267,0.057079,,0.024134,0.007385
3,1,2392,0.019658,0.071962,,0.043476,0.021869
4,1,3196,0.022806,0.067860,,0.038839,0.010556
...,...,...,...,...,...,...,...
60482,275,115360,0.974030,0.884394,0.984954,0.971759,
60483,275,115912,0.952401,0.811262,0.970726,0.958469,
60484,275,116464,0.653215,0.655431,0.714523,0.718442,
60485,275,117020,0.180580,0.257770,0.156364,0.164410,


In [19]:
from scipy.stats import gmean

In [None]:
def gmean(x):
    

In [63]:
df_feat['mean'] = df_feat[[0, 1, 2, 3, 4]].mean(axis=1)

In [65]:
joblib.dump(df_feat[['id', 'time', 'mean']], 'feat.joblib')

['feat.joblib']

In [21]:
df_feat[[0, 1, 2, 3, 4]].loc[0]

NameError: name 'df_feat' is not defined