In [1]:
import pandas as pd
import numpy as np
import plotly
from pathlib import Path
import plotly.express as px
import plotly.graph_objects as go
from tqdm.auto import tqdm

from sklearn.metrics import f1_score
from sklearn.model_selection import KFold
from lightgbm import LGBMClassifier
import joblib

In [2]:
# Load the training table; later cells read its 'id', 'time', 'x' and 'y' columns.
df = pd.read_csv(Path('../data/train.csv'))

In [3]:
# Order rows by sample id, then by time within each sample.
df = df.sort_values(['id', 'time'])

In [4]:
# Distinct sample ids; the feature-building loop iterates over these.
samples = df['id'].unique()

# Sliding-window geometry used by the feature-building loop.
# NOTE(review): the recorded output of this notebook (new_df with 63 columns)
# appears to correspond to WINDOW_SIZE = 17, not 25 — confirm which value the
# saved models expect before re-running.
WINDOW_SIZE = 25
HALF_WINDOW = (WINDOW_SIZE - 1) // 2   # points on each side of the centre
QART_WINDOW = (WINDOW_SIZE - 1) // 4   # repetitions of a 2-point edge pattern when padding

# x values must lie strictly between these bounds to be marked valid in the mask.
LOW_BOUND = 380
UPPER_BOUND = 1400

In [5]:
def pop_arr(arr, num):
    """Return the contents of ``arr`` repeated ``num`` times as a flat array.

    Example: ``pop_arr([1, 2], 3)`` -> ``[1, 2, 1, 2, 1, 2]``.

    Parameters
    ----------
    arr : sequence or np.ndarray
        Values to repeat (1-D in every call made by this notebook).
    num : int
        Number of back-to-back copies.

    Returns
    -------
    np.ndarray
        1-D array of length ``num * len(arr)``.
    """
    # The first len(arr) values of the flattened input; for 1-D input this is
    # simply the input itself.
    pattern = np.ravel(arr)[:len(arr)]
    return np.tile(pattern, num)

In [6]:
def timedomain(rr, mask):
    """Compute time-domain statistics over the entries of ``rr`` where ``mask == 1``.

    Parameters
    ----------
    rr : np.ndarray
        Interval values (presumably RR intervals in milliseconds, which would
        make ``60000 / rr`` a heart rate in beats per minute — confirm).
    mask : np.ndarray
        Same length as ``rr``; only positions equal to 1 are used.

    Returns
    -------
    dict
        Keys, in this order: mean_RR, std_rr_sdnn, mean_hr_kubious, mean_hr,
        std_hr, min_hr, max_hr, rmssd, nn_xx, pnn_xx. If fewer than two values
        are selected, every entry is 0.
    """
    stat_names = (
        'mean_RR', 'std_rr_sdnn', 'mean_hr_kubious', 'mean_hr', 'std_hr',
        'min_hr', 'max_hr', 'rmssd', 'nn_xx', 'pnn_xx',
    )

    valid = rr[mask == 1]

    # Differences need at least two points; otherwise report all zeros.
    if len(valid) <= 1:
        return dict.fromkeys(stat_names, 0)

    hr = 60000 / valid
    # Differences between consecutive *selected* values (masked-out gaps are
    # skipped over, not treated as breaks).
    successive = np.diff(valid)
    large_jump = np.abs(successive) > 50

    return {
        'mean_RR': np.mean(valid),
        'std_rr_sdnn': np.std(valid),
        'mean_hr_kubious': 60000 / np.mean(valid),
        'mean_hr': np.mean(hr),
        'std_hr': np.std(hr),
        'min_hr': np.min(hr),
        'max_hr': np.max(hr),
        'rmssd': np.sqrt(np.mean(np.square(successive))),
        # Count of consecutive differences larger than 50 in absolute value.
        'nn_xx': np.sum(large_jump) * 1,
        # Same count as a percentage of the number of selected values.
        'pnn_xx': 100 * np.sum(large_jump * 1) / len(valid),
    }

In [7]:
# Build one feature row per input point: a sliding window of raw x values,
# the matching validity mask, time-domain statistics of the valid values in
# the window, and successive differences of the window.
new_df = []

# Offsets of the x_/mask_ columns relative to the row's own point. Derived
# from HALF_WINDOW instead of the former hard-coded range(-8, 9), which only
# matched WINDOW_SIZE == 17 and otherwise truncated the window and attached
# the wrong offset labels to the values.
# NOTE(review): the resulting column set must match what the saved models
# were trained on — confirm before changing WINDOW_SIZE.
offsets = range(-HALF_WINDOW, HALF_WINDOW + 1)

for sample in tqdm(samples):
    # Rows of this sample in time order.
    loc_df = df[df.id == sample].sort_values('time').reset_index(drop=True)
    arr = loc_df['x'].values
    target = loc_df['y'].values
    time = loc_df['time'].values

    # 1 where x lies strictly inside (LOW_BOUND, UPPER_BOUND), otherwise 0.
    mask = ((arr > LOW_BOUND) & (arr < UPPER_BOUND)).astype(int)

    # Pad both ends so that edge points also get a window. The series is
    # extended by repeating its first / last two values QART_WINDOW times;
    # the mask is extended with HALF_WINDOW zeros so padded values are never
    # selected by timedomain().
    # NOTE: the two pads have equal length only when WINDOW_SIZE - 1 is
    # divisible by 4 (true for 17 and 25).
    arr = np.concatenate((
        pop_arr(arr[:2], QART_WINDOW),
        arr,
        pop_arr(arr[-2:], QART_WINDOW),
    ))
    mask = np.concatenate((
        pop_arr([0], HALF_WINDOW),
        mask,
        pop_arr([0], HALF_WINDOW),
    ))

    # `start` is both the index of the row's point in the unpadded arrays and
    # the left edge of its window in the padded ones.
    for start in range(len(arr) - 2 * HALF_WINDOW):
        # The slice holds up to 2 * HALF_WINDOW + 3 values (fewer for the last
        # rows of a sample). zip() below keeps the first 2 * HALF_WINDOW + 1
        # of them for the x_/mask_ columns, while timedomain() receives the
        # whole slice.
        stop = start + 2 * HALF_WINDOW + 3
        local_window = arr[start:stop]
        local_mask = mask[start:stop]
        new_df.append({
            'id': sample,
            'ts': time[start],
            **{f"x_{x}": y for x, y in zip(offsets, local_window)},
            'target': target[start],
            **{f"mask_{x}": y for x, y in zip(offsets, local_mask)},
            **timedomain(local_window, local_mask),
            **{f"delta_{x}": y for x, y in zip(range(WINDOW_SIZE - 1), np.diff(local_window))},
        })


  0%|          | 0/229 [00:00<?, ?it/s]

In [8]:
# Turn the list of per-point row dicts into a table (one column per dict key).
new_df=pd.DataFrame(new_df)

In [9]:
# Sanity check: (rows, columns) of the feature table.
new_df.shape

(60487, 63)

In [10]:
from sklearn.metrics import roc_curve, precision_recall_curve, f1_score

def threshold_search(y_true, y_proba):
    """Find the probability cut-off with the highest F1 on the given data.

    Parameters
    ----------
    y_true : array-like
        Ground-truth binary labels.
    y_proba : array-like
        Predicted scores for the positive class.

    Returns
    -------
    tuple
        ``(best_threshold, best_f1)``.
    """
    precision, recall, thresholds = precision_recall_curve(y_true, y_proba)

    # precision/recall carry one more entry than thresholds; pad with a
    # sentinel above 1 so that indices line up.
    thresholds = np.append(thresholds, 1.001)

    # Harmonic mean of precision and recall; eps keeps the divisions finite
    # when either term is 0.
    eps = 1e-18
    f_scores = 2 / (1 / (precision + eps) + 1 / (recall + eps))
    # Anything above 1 is not a valid F1 value — discard it.
    f_scores[f_scores > 1.0] = 0

    best_idx = np.argmax(f_scores)
    return thresholds[best_idx], f_scores[best_idx]

In [11]:
# LightGBM hyper-parameters.
# NOTE(review): not referenced by any cell shown here (models are loaded from
# disk); presumably the settings the saved models were trained with — confirm.
lgbm_params = dict(
    # tree shape and learning rate
    num_leaves=52,
    learning_rate=0.028067101239171117,
    class_weight=None,
    min_split_gain=0.2079596284724723,
    min_child_samples=26,
    # row / column subsampling
    subsample=0.9964018869175387,
    subsample_freq=3,
    colsample_bytree=0.7621469158134937,
    # regularisation
    reg_alpha=0.0007822636610799144,
    reg_lambda=4.076057666254058,
    # task setup
    boosting_type='gbdt',
    objective='binary',
    n_estimators=3000,
    random_state=13,
)

In [12]:
# Name of the label column in new_df.
target_col = 'target'
# Backward-compatible alias: the original, misspelled name is kept so that any
# cell still referring to `traget_col` continues to work.
traget_col = target_col

# Non-feature columns removed from new_df before it is handed to the models.
drop_cols = ['id', 'ts', target_col]

In [16]:
# NOTE(review): kf is not used by any cell shown here; kept in case other
# cells rely on it.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# The feature matrix is the same for every model, so build it once instead of
# re-dropping the columns on each of the 25 iterations.
features = new_df.drop(columns=drop_cols)

# NOTE(review): every model scores all rows of new_df. If these models were
# trained on subsets of this same data, metrics computed from `predicts` are
# partly in-sample and therefore optimistic — confirm.
predicts = dict()
for outer_fold in range(5):
    for inner_fold in tqdm(range(5), desc=f"o{outer_fold}"):
        # SECURITY: joblib.load unpickles the file and can execute arbitrary
        # code — only load model files from a trusted source.
        model = joblib.load(f"lgbm_model_o{outer_fold}_i{inner_fold}.joblib")
        # Second column of predict_proba: probability of class index 1.
        predict = model.predict_proba(features)[:, 1]
        # One prediction vector per (outer, inner) fold model.
        predicts[f"o{outer_fold}_i{inner_fold}"] = predict

o0:   0%|          | 0/5 [00:00<?, ?it/s]

o1:   0%|          | 0/5 [00:00<?, ?it/s]

o2:   0%|          | 0/5 [00:00<?, ?it/s]

o3:   0%|          | 0/5 [00:00<?, ?it/s]

o4:   0%|          | 0/5 [00:00<?, ?it/s]

In [22]:
# F1 of the ensemble (mean probability over all fold models) at a fixed cut-off.
th = 0.4
mean_proba = pd.DataFrame(predicts).mean(axis=1)
y_true = new_df.target.values.astype(int)
y_pred = (mean_proba > th).astype(int)
f1_score(y_true, y_pred)

0.922933157431838

In [23]:
# F1 of the ensemble (mean probability over all fold models) at a tuned cut-off.
# NOTE(review): this value presumably comes from threshold_search(); if it was
# tuned on the same rows it is scored on here, the result is optimistic — confirm.
th = 0.33471250635551086
mean_proba = pd.DataFrame(predicts).mean(axis=1)
y_true = new_df.target.values.astype(int)
y_pred = (mean_proba > th).astype(int)
f1_score(y_true, y_pred)

0.9233153422754234