In [1]:
import pandas as pd
import numpy as np
import plotly
from pathlib import Path
import plotly.express as px
import plotly.graph_objects as go
from tqdm.auto import tqdm

from sklearn.metrics import f1_score
from sklearn.model_selection import KFold
from lightgbm import LGBMClassifier
import joblib

In [2]:
# Load the raw test set; later cells use its 'id', 'time' and 'x' columns.
df = pd.read_csv(Path('../data/test.csv'))

In [3]:
# Order rows by id, then by time within each id (the index is not reset here).
df = df.sort_values(['id', 'time'])

In [4]:
# Distinct ids, in order of first appearance in df.
samples = df['id'].unique()

# Window geometry used by the feature-extraction loop.
WINDOW_SIZE = 25
HALF_WINDOW = (WINDOW_SIZE - 1) // 2
QART_WINDOW = (WINDOW_SIZE - 1) // 4

# Values of x strictly between these two bounds are flagged as valid in the mask.
LOW_BOUND = 380
UPPER_BOUND = 1400

In [5]:
def pop_arr(arr, num):
    """Repeat a short sequence back-to-back and return the result as a flat array.

    Args:
        arr: Sequence (list or array) whose first ``len(arr)`` flattened values
            form the pattern to repeat.
        num: Number of times the pattern is repeated.

    Returns:
        1-D numpy array of length ``num * len(arr)``: the pattern followed by
        itself ``num`` times, e.g. ``[a, b]`` with ``num=3`` gives
        ``[a, b, a, b, a, b]``.
    """
    length = len(arr)
    pattern = np.ravel(arr)[:length]
    return np.tile(pattern, num)

In [6]:
def timedomain(rr, mask):
    """Compute summary statistics over the masked-in values of a window.

    Args:
        rr: 1-D numpy array of interval values (used as milliseconds by the
            ``60000 / rr`` heart-rate conversion).
        mask: Array of the same length as ``rr``; only positions where
            ``mask == 1`` contribute to the statistics.

    Returns:
        dict with the keys ``mean_RR``, ``std_rr_sdnn``, ``mean_hr_kubious``,
        ``mean_hr``, ``std_hr``, ``min_hr``, ``max_hr``, ``rmssd``, ``nn_xx``
        and ``pnn_xx`` (in that order). When fewer than two values survive
        the mask, every entry is ``0``.
    """
    feature_names = (
        'mean_RR', 'std_rr_sdnn', 'mean_hr_kubious', 'mean_hr', 'std_hr',
        'min_hr', 'max_hr', 'rmssd', 'nn_xx', 'pnn_xx',
    )

    valid_rr = rr[mask == 1]

    # Not enough data for differences / dispersion: emit the all-zero row.
    if len(valid_rr) <= 1:
        return dict.fromkeys(feature_names, 0)

    heart_rate = 60000 / valid_rr
    successive_diff = np.diff(valid_rr)
    # Successive differences larger than 50 in absolute value.
    large_jump = np.abs(successive_diff) > 50

    return {
        'mean_RR': np.mean(valid_rr),
        'std_rr_sdnn': np.std(valid_rr),
        'mean_hr_kubious': 60000 / np.mean(valid_rr),
        'mean_hr': np.mean(heart_rate),
        'std_hr': np.std(heart_rate),
        'min_hr': np.min(heart_rate),
        'max_hr': np.max(heart_rate),
        'rmssd': np.sqrt(np.mean(np.square(successive_diff))),
        'nn_xx': np.sum(large_jump) * 1,
        # Percentage is taken relative to the number of values, not differences.
        'pnn_xx': 100 * np.sum(large_jump * 1) / len(valid_rr),
    }

In [7]:
# Build one feature record per input row (collected as dicts, converted to a
# DataFrame in a later cell).
new_df = []
window_span = 2 * HALF_WINDOW + 3  # number of items requested by each window slice
for sample in tqdm(samples):
    # Rows of the current id in chronological order.
    sample_rows = df[df.id == sample].sort_values('time').reset_index(drop=True)
    values = sample_rows['x'].values
    timestamps = sample_rows['time'].values

    # The 'target' column is filled with zeros for every row.
    labels = np.zeros(len(values))

    # 1 where the raw value lies strictly inside (LOW_BOUND, UPPER_BOUND), else 0.
    valid = ((values > LOW_BOUND) & (values < UPPER_BOUND)).astype(int)

    # Pad the values on both ends by repeating the first / last two values
    # QART_WINDOW times each.
    padded_values = np.concatenate((
        pop_arr(values[:2], QART_WINDOW),
        values,
        pop_arr(values[-2:], QART_WINDOW),
    ))

    # Pad the mask with HALF_WINDOW zeros on both ends, so padded positions
    # never count as valid.
    padded_valid = np.concatenate((
        pop_arr([0], HALF_WINDOW),
        valid,
        pop_arr([0], HALF_WINDOW),
    ))

    # `start` is both the left edge of the window in the padded arrays and the
    # index of the original row that supplies 'ts' and 'target'.
    for start in range(len(padded_values) - 2 * HALF_WINDOW):
        local_window = padded_values[start:start + window_span]
        local_mask = padded_valid[start:start + window_span]
        # zip() truncates: only the first 17 window items become x_*/mask_*
        # columns and only the first WINDOW_SIZE-1 differences become delta_*
        # columns, while timedomain() receives the whole slice.
        # NOTE(review): the saved models consume exactly these columns - confirm
        # against the training notebook before changing any of this arithmetic.
        new_df.append({
            'id': sample,
            'ts': timestamps[start],
            **{f"x_{x}": y for x, y in zip(range(-8, 9), local_window)},
            'target': labels[start],
            **{f"mask_{x}": y for x, y in zip(range(-8, 9), local_mask)},
            **timedomain(local_window, local_mask),
            **{f"delta_{x}": y for x, y in zip(range(WINDOW_SIZE - 1), np.diff(local_window))},
        })


  0%|          | 0/46 [00:00<?, ?it/s]

In [8]:
# Convert the list of per-row feature dicts into a DataFrame (one column per dict key).
new_df=pd.DataFrame(new_df)

In [9]:
# Sanity check: the row count should match df.shape[0] shown in the next cell.
new_df.shape

(15034, 71)

In [10]:
# Shape of the raw input, for comparison with the feature frame above.
df.shape

(15034, 3)

In [11]:
from sklearn.metrics import roc_curve, precision_recall_curve, f1_score

def threshold_search(y_true, y_proba):
    """Find the probability cut-off with the highest F-score.

    Args:
        y_true: Ground-truth binary labels.
        y_proba: Predicted scores / probabilities for the positive class.

    Returns:
        Tuple ``(best_th, best_score)``: the threshold with the highest
        F-score and that F-score.
    """
    eps = 1e-18  # keeps the reciprocals finite when precision or recall is 0
    precision, recall, cutoffs = precision_recall_curve(y_true, y_proba)
    # Append a sentinel threshold so cutoffs lines up index-for-index with the
    # precision/recall arrays (scikit-learn documents them as one entry longer).
    cutoffs = np.append(cutoffs, 1.001)
    f_scores = 2 / (1 / (precision + eps) + 1 / (recall + eps))
    # Discard values pushed above 1.0 by the epsilon terms.
    f_scores[f_scores > 1.0] = 0
    best_idx = np.argmax(f_scores)
    return cutoffs[best_idx], f_scores[best_idx]

In [13]:
# Non-feature columns removed from new_df before it is passed to the models.
drop_cols = ['id', 'ts', 'target']
# NOTE(review): the name is misspelled ('traget') and is not referenced in the
# visible cells; kept as-is in case other code relies on it.
traget_col = 'target'

In [14]:
# NOTE(review): kf is not used in the visible cells; kept so that any code
# relying on the name keeps working.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Build the model input once: every fold model scores the same feature frame,
# so dropping the non-feature columns inside the loop only repeated the work.
features = new_df.drop(columns=drop_cols)

# Positive-class probability per (outer, inner) fold model, keyed "o{outer}_i{inner}".
predicts = dict()
for outer_fold in range(5):
    for inner_fold in tqdm(range(5), desc=f"o{outer_fold}"):
        # Load the fitted model for this fold pair.
        # NOTE(review): joblib.load unpickles arbitrary objects - only load
        # model files from a trusted source.
        model = joblib.load(f"lgbm_model_o{outer_fold}_i{inner_fold}.joblib")
        # Column 1 of predict_proba is taken as the positive-class probability.
        predict = model.predict_proba(features)[:, 1]
        predicts[f"o{outer_fold}_i{inner_fold}"] = predict

o0:   0%|          | 0/5 [00:00<?, ?it/s]

o1:   0%|          | 0/5 [00:00<?, ?it/s]

o2:   0%|          | 0/5 [00:00<?, ?it/s]

o3:   0%|          | 0/5 [00:00<?, ?it/s]

o4:   0%|          | 0/5 [00:00<?, ?it/s]

In [19]:
# Decision threshold applied to the fold-averaged probability when creating
# the hard 'predict' labels.
th = 0.4

In [20]:
# Hard label: 1 where the mean probability across all fold models exceeds th.
mean_proba = pd.DataFrame(predicts).mean(axis=1)
new_df['predict'] = mean_proba.gt(th).astype(int)

In [25]:
# Soft score: arithmetic mean of the per-fold probabilities for each row.
new_df['proba'] = pd.DataFrame(predicts).mean(axis='columns')

In [21]:
# Submission frame: id / time / predicted label, joined back to the raw x
# value of the same (id, time) row.
submit_df = (
    new_df[['id', 'ts', 'predict']]
    .rename(columns={'ts': 'time', 'predict': 'y'})
    .merge(df[['id', 'time', 'x']], on=['id', 'time'])
)
submit_df

Unnamed: 0,id,time,y,x
0,8,0,0,748
1,8,744,0,744
2,8,1476,0,732
3,8,2208,0,732
4,8,2976,0,768
...,...,...,...,...
15029,274,94380,0,580
15030,274,95564,0,1184
15031,274,96156,0,592
15032,274,96740,0,584


In [40]:
# Rename the timestamp column 'ts' -> 'time' by name. The previous positional
# assignment of a hard-coded 74-name list only worked when new_df already held
# exactly those columns (including 'gmean') in that order, and raised a
# length-mismatch error otherwise. Renaming by name is order-independent and
# safe to re-run.
# NOTE(review): this cell was executed out of order (In [40]); any cell that
# still needs the 'ts' name has to account for this rename.
new_df = new_df.rename(columns={'ts': 'time'})

In [22]:
# Write id/time/x/y predictions to disk.
# NOTE(review): a later cell writes `submit` to the same file name, so this
# output is overwritten when the notebook runs top to bottom.
submit_df[['id', 'time', 'x', 'y']].to_csv('submit_2_g_naive.csv', header=True, index=False)

In [23]:
# Load the sample submission; its 'y' column is replaced with our predictions
# in the following cells.
submit = pd.read_csv('../data/sample_submission.csv')

In [29]:
# Discard the placeholder labels from the sample submission.
submit = submit.drop(columns=['y'])

In [30]:
# Attach the predicted label to each sample-submission row by (id, time).
# Default inner join: rows without a matching prediction are dropped.
submit = pd.merge(submit, submit_df[['id', 'time', 'y']], on=['id', 'time'])

In [32]:
# Write the submission in the sample-submission layout (same file name as the
# earlier submit_df export, which it replaces).
submit.to_csv('submit_2_g_naive.csv', header=True, index=False)

In [35]:
# Persist the fold-averaged probability per (id, ts) row.
# The rename makes this cell independent of whether the timestamp column is
# currently called 'ts' or has already been renamed to 'time' by another cell
# (rename ignores names that are absent); the saved frame always has the
# columns id / ts / proba.
joblib.dump(
    new_df.rename(columns={'time': 'ts'})[['id', 'ts', 'proba']],
    'feat1_test.joblib',
)

['feat1_test.joblib']

In [37]:
from scipy.stats import gmean
def nan_gmean(x):
    """Geometric mean of ``x`` taken over its non-NaN entries only.

    Args:
        x: 1-D numeric array or pandas Series.

    Returns:
        The result of ``scipy.stats.gmean`` on the entries of ``x`` that are
        not NaN.
    """
    not_missing = np.logical_not(np.isnan(x))
    return gmean(x[not_missing])

In [38]:
# Row-wise geometric mean of the per-fold probabilities (NaNs skipped).
fold_probas = pd.DataFrame(predicts)
new_df['gmean'] = fold_probas.apply(nan_gmean, axis=1)

In [41]:
# Persist the geometric-mean score per (id, time) row.
# The rename makes this cell work whether the timestamp column is still called
# 'ts' or has already been renamed to 'time' (rename ignores names that are
# absent); the saved frame always has the columns id / time / gmean.
joblib.dump(
    new_df.rename(columns={'ts': 'time'})[['id', 'time', 'gmean']],
    'feat1_test_gmean.joblib',
)

['feat1_test_gmean.joblib']