In [1]:
import pandas as pd
import os
from omegaconf import OmegaConf
import joblib
from sklearn.metrics import log_loss
import warnings
warnings.simplefilter('ignore')
import sys
sys.path.append('..')
from src.preprocessing import create_training_and_testing_data
from src import modelling

In [2]:
%%time
Config = OmegaConf.load('../configs/data.yaml')

train_labels = pd.read_csv(os.path.join(Config.DATA_DIR, 'train_labels.csv'))
val_labels = pd.read_csv(os.path.join(Config.DATA_DIR, 'val_labels.csv'))
metadata = pd.read_csv(os.path.join(Config.DATA_DIR, 'metadata.csv'))

CPU times: user 14.2 ms, sys: 2.77 ms, total: 17 ms
Wall time: 15.1 ms


In [3]:
# Stack the validation labels underneath the training labels so a single
# label table covers both splits.
#
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat with ignore_index=True produces the same rows with a fresh
# 0..n-1 index (what append(...).reset_index(drop=True) used to return).
#
# NOTE: this cell reassigns `train_labels` from itself, so running it more
# than once stacks `val_labels` again. Re-run the loading cell first if
# this cell has to be executed a second time.
train_labels = pd.concat([train_labels, val_labels], ignore_index=True)

In [4]:
# Keep only the metadata rows whose `split` is 'train', renumbered from 0.
train_meta = metadata.loc[metadata['split'].eq('train')].reset_index(drop=True)

# Build the modelling frame from the training metadata and the label table.
# (The recorded output shows the helper reading sample data and then running
# a feature-engineering stage; its internals live in src.preprocessing.)
train = create_training_and_testing_data(train_meta, labels=train_labels)

Reading sample data


100%|██████████| 766/766 [00:03<00:00, 191.89it/s]


Starting Feature Engineering


100%|██████████| 6/6 [00:35<00:00,  5.84s/it]
100%|██████████| 5/5 [00:06<00:00,  1.21s/it]


In [5]:
# Share of missing values per column, expressed as a percentage and listed
# from the most to the least affected column.
null_counts = train.isnull().sum()
null_pct = null_counts.div(len(train)).mul(100).sort_values(ascending=False)
null_pct

sample_id                                        0.0
sample_id_m/z_top_0.1_temp_abundance_mean78_1    0.0
sample_id_m/z_top_0.1_temp_abundance_mean75_0    0.0
sample_id_m/z_top_0.1_temp_abundance_mean75_2    0.0
sample_id_m/z_top_0.1_temp_abundance_mean75_3    0.0
                                                ... 
m/z_4_abun_72                                    0.0
m/z_4_abun_73                                    0.0
m/z_4_abun_74                                    0.0
m/z_4_abun_75                                    0.0
sample_id_pct_count_m/z_temp_bin_99_4            0.0
Length: 16261, dtype: float64

In [6]:
# Columns that must not be fed to the model as features. Names listed here
# that are absent from `train` are simply ignored by the filter below.
drop_cols = ['sample_id', 'split','features_path', 'instrument_type', 'features_md5_hash', 'target', 'preds']

# Every remaining column of `train`, in its original column order.
is_feature = ~train.columns.isin(drop_cols)
fts = train.columns[is_feature].tolist()

# Display the number of selected features as the cell output.
len(fts)


16255

In [7]:
# Configure the project's MetaModel on the selected feature columns.
# The recorded log shows base models being fitted first, with a validation
# log-loss reported per fold.
# NOTE(review): n_splits_base / n_splits_meta presumably set the number of
# CV folds for the base and meta stages, and group_col the grouping key for
# the splits - confirm against src.modelling.
clf = modelling.MetaModel(
    fts=fts,
    cat_cols=None,
    group_col='sample_id',
    n_splits_base=25,
    n_splits_meta=50,
    hyper_conf_path='../configs/hyperparams.yaml',
)

# Fit on the full training frame; the label is the `target` column.
y_train = train['target']
clf.fit(train, y_train)

Starting to fit base models ..................


Total features being used are: 16255


[50]	training's binary_logloss: 0.154368	valid_1's binary_logloss: 0.271013
[100]	training's binary_logloss: 0.0800131	valid_1's binary_logloss: 0.227856
[150]	training's binary_logloss: 0.0475057	valid_1's binary_logloss: 0.216483
[200]	training's binary_logloss: 0.0288335	valid_1's binary_logloss: 0.214223
[250]	training's binary_logloss: 0.0195761	valid_1's binary_logloss: 0.22213
Fold 0 val score: 0.2130920446417906
[50]	training's binary_logloss: 0.157789	valid_1's binary_logloss: 0.230401
[100]	training's binary_logloss: 0.0794758	valid_1's binary_logloss: 0.182426
[150]	training's binary_logloss: 0.0448827	valid_1's binary_logloss: 0.166951
[200]	training's binary_logloss: 0.0285202	valid_1's binary_logloss: 0.160705
[250]	training's binary_logloss: 0.0192364	valid_1's binary_logloss: 0.163242
Fold 1 val score: 0.1605023385866901
[50]	training's binary_logloss: 0.155883	valid_1's binary_loglo

In [8]:
#### Prediction is run as a separate step from training.
#### Per the original author's note, it can also be done one sample_id at a
#### time, as the competition requires: only the metadata for that sample
#### (with all of its columns) has to be passed in.

# Rebuild the feature frame for the validation split.
val_meta = metadata.loc[metadata['split'].eq('val')].reset_index(drop=True)
val = create_training_and_testing_data(val_meta, labels=val_labels)

# Predict with the fitted model and score against the validation labels.
preds = clf.predict(val)
score = log_loss(y_true=val['target'], y_pred=preds)
print(f'Val score: {score}')

Reading sample data


100%|██████████| 293/293 [00:02<00:00, 111.57it/s]


Starting Feature Engineering


100%|██████████| 6/6 [00:11<00:00,  2.00s/it]
100%|██████████| 5/5 [00:02<00:00,  2.22it/s]


Predicting on base models ..................



100%|██████████| 25/25 [00:15<00:00,  1.67it/s]



Adding meta model features ..................

Predicting on meta model ..................



100%|██████████| 50/50 [00:18<00:00,  2.66it/s]

Val score: 0.12573647041336636





In [9]:
# Persist the fitted model under Config.MODEL_DIR.
# joblib.dump does not create missing directories, so make sure the target
# directory exists first; otherwise a long training run could end in a
# FileNotFoundError with nothing saved. exist_ok=True makes this a no-op
# when the directory is already there.
os.makedirs(Config.MODEL_DIR, exist_ok=True)

# Last expression of the cell: joblib.dump returns the list of written paths.
joblib.dump(clf, os.path.join(Config.MODEL_DIR, 'model_train_only.joblib'))

['../models/model_train_only.joblib']