In [1]:
import pandas as pd
import os
from omegaconf import OmegaConf
import joblib
from sklearn.metrics import log_loss
import warnings
warnings.simplefilter('ignore')
import sys
sys.path.append('..')
from src.preprocessing import create_training_and_testing_data
from src import modelling

In [2]:
%%time
Config = OmegaConf.load('../configs/data.yaml')

train_labels = pd.read_csv(os.path.join(Config.DATA_DIR, 'train_labels.csv'))
val_labels = pd.read_csv(os.path.join(Config.DATA_DIR, 'val_labels.csv'))
metadata = pd.read_csv(os.path.join(Config.DATA_DIR, 'metadata.csv'))

CPU times: user 10.6 ms, sys: 5.68 ms, total: 16.3 ms
Wall time: 14.6 ms


In [3]:
# Keep only the metadata rows belonging to the train and val splits,
# renumbering the index from 0 so it is contiguous after filtering.
train_meta = (
    metadata
    .loc[lambda frame: frame['split'].isin(['train', 'val'])]
    .reset_index(drop=True)
)
print(train_meta.shape)

(1059, 5)


In [4]:
# Stack the validation labels under the training labels so both splits are
# used for fitting. pd.concat replaces DataFrame.append, which was deprecated
# in pandas 1.4 and removed in pandas 2.0; ignore_index=True yields the same
# fresh 0..n-1 index that .reset_index(drop=True) produced.
# NOTE: this rebinds train_labels, so re-running this cell without re-running
# the loading cell first would stack val_labels a second time.
train_labels = pd.concat([train_labels, val_labels], ignore_index=True)
print(train_labels.shape)

(1059, 11)


In [5]:

# Show how many samples come from each split before building features.
print(train_meta.loc[:, 'split'].value_counts())

# Build the modelling frame from the metadata rows and their labels.
train = create_training_and_testing_data(
    train_meta,
    labels=train_labels,
)

train    766
val      293
Name: split, dtype: int64
Reading sample data


100%|██████████| 1059/1059 [00:05<00:00, 207.36it/s]


Starting Feature Engineering


100%|██████████| 6/6 [00:48<00:00,  8.15s/it]
100%|██████████| 5/5 [00:08<00:00,  1.66s/it]


In [6]:
# Percentage of missing values per column, highest first.
null_pct = (
    train.isnull().sum()
    .div(len(train))
    .mul(100)
    .sort_values(ascending=False)
)
null_pct

sample_id                                        0.0
sample_id_m/z_top_0.1_temp_abundance_mean53_4    0.0
sample_id_m/z_top_0.1_temp_abundance_mean61_0    0.0
sample_id_m/z_top_0.1_temp_abundance_mean61_1    0.0
sample_id_m/z_top_0.1_temp_abundance_mean61_3    0.0
                                                ... 
m/z_0_temp_51                                    0.0
m/z_0_temp_52                                    0.0
m/z_0_temp_53                                    0.0
m/z_0_temp_54                                    0.0
sample_id_pct_count_m/z_temp_bin_99_4            0.0
Length: 16493, dtype: float64

In [7]:
# Columns that must never be used as model inputs: identifiers, file metadata,
# the target, per-label columns and prediction columns. The `_x` / `_y`
# variants are presumably suffixes left behind by a merge -- TODO confirm
# against src.preprocessing.
drop_cols = ['sample_id', 'split','features_path', 'instrument_type', 'features_md5_hash', 'target',
             'target_label_0', 'target_label_1', 'target_label_2', 'target_label_3',
       'target_label_4', 'target_label_5', 'target_label_6', 'target_label_7',
       'target_label_8', 'target_label_9', 'preds','target_label_0_x', 'target_label_1_x',
             'target_label_2_x', 'target_label_3_x', 'target_label_4_x', 'target_label_5_x',
             'target_label_6_x', 'target_label_7_x', 'target_label_8_x', 'target_label_9_x',
             'target_label_0_y', 'target_label_1_y', 'target_label_2_y', 'target_label_3_y',
             'target_label_4_y', 'target_label_5_y', 'target_label_6_y', 'target_label_7_y',
             'target_label_8_y', 'target_label_9_y']

# Zero-variance numeric columns carry no signal. numeric_only=True is needed
# because `train` still contains string columns (e.g. sample_id, split):
# pandas >= 2.0 raises TypeError instead of silently skipping them.
std_fts = train.std(numeric_only=True).sort_values()
drop_std_fts = std_fts[std_fts == 0].index.tolist()

# Columns with a single distinct value (of any dtype) are constant as well.
nunq_fts = train.nunique()
drop_nunq_fts = nunq_fts[nunq_fts == 1].index.tolist()

drop_cols = drop_cols + drop_nunq_fts + drop_std_fts

# Vectorised membership test: keeps the original column order while avoiding
# a linear scan of the drop_cols list for every one of the ~16k columns.
fts = train.columns[~train.columns.isin(drop_cols)].tolist()

TARGET_COL = 'target'
len(fts)

16320

In [8]:
# Configure the MetaModel from src.modelling on the selected feature columns.
# No categorical columns are passed; 'sample_id' is supplied as the grouping
# column, and hyperparameters are read from the YAML file below.
clf = modelling.MetaModel(
    fts=fts,
    cat_cols=None,
    group_col='sample_id',
    n_splits_base=25,
    n_splits_meta=31,
    hyper_conf_path='../configs/hyperparams.yaml',
)
# Fit on the full train+val frame against the 'target' column.
clf.fit(train, train['target'])

Starting to fit base models ..................


Total features being used are: 16320


[50]	training's binary_logloss: 0.134915	valid_1's binary_logloss: 0.222948
[100]	training's binary_logloss: 0.0635724	valid_1's binary_logloss: 0.173095
[150]	training's binary_logloss: 0.0361804	valid_1's binary_logloss: 0.159233
[200]	training's binary_logloss: 0.023067	valid_1's binary_logloss: 0.152427
[250]	training's binary_logloss: 0.0155806	valid_1's binary_logloss: 0.150144
[300]	training's binary_logloss: 0.0108505	valid_1's binary_logloss: 0.148789
[350]	training's binary_logloss: 0.00799978	valid_1's binary_logloss: 0.148402
[400]	training's binary_logloss: 0.00622012	valid_1's binary_logloss: 0.15093
[450]	training's binary_logloss: 0.00492507	valid_1's binary_logloss: 0.153326
Fold 0 val score: 0.14840230149851608
[50]	training's binary_logloss: 0.13406	valid_1's binary_logloss: 0.195835
[100]	training's binary_logloss: 0.063291	valid_1's binary_logloss: 0.149241
[150]	training's bina

In [9]:
# Create the output directory first: joblib.dump does not create missing
# parent directories, and failing here would lose the fitted model after the
# long training run above.
os.makedirs(Config.MODEL_DIR, exist_ok=True)
# Persist the fitted model; joblib.dump returns the list of written file paths.
joblib.dump(clf, os.path.join(Config.MODEL_DIR, 'model_final.joblib'))

['../models/model_final.joblib']