In [1]:
import pandas as pd
import os
from omegaconf import OmegaConf
import joblib
from sklearn.metrics import log_loss
import warnings
warnings.simplefilter('ignore')
import sys
sys.path.append('..')
from src.preprocessing import create_training_and_testing_data
from src import modelling
import importlib

In [2]:
%%time
Config = OmegaConf.load('../configs/data.yaml')

metadata = pd.read_csv(os.path.join(Config.DATA_DIR, 'metadata.csv'))
submission_format = pd.read_csv(os.path.join(Config.DATA_DIR, 'submission_format.csv'))

CPU times: user 14 ms, sys: 2.68 ms, total: 16.7 ms
Wall time: 14.3 ms


In [3]:
# Echo the resolved model path so it can be checked before the model is loaded.
os.path.join(Config.MODEL_DIR, 'model_final.joblib')

'../models/model_final.joblib'

In [4]:
# Load the serialized model from MODEL_DIR.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code —
# only load model artifacts that come from a trusted source.
clf = joblib.load(os.path.join(Config.MODEL_DIR, 'model_final.joblib'))

In [5]:
# Inference is run here as its own, standalone step.
# It can also be run one sample_id at a time, as the competition requires:
# only the metadata for that sample (with all columns) needs to be passed in.

# Keep only the rows belonging to the 'val' and 'test' splits.
in_eval_split = metadata['split'].isin(['val', 'test'])
test_meta = metadata.loc[in_eval_split].reset_index(drop=True)

# Build the model input without labels, then predict with the loaded model.
test = create_training_and_testing_data(test_meta, labels=None)
preds = clf.predict(test)

Reading sample data


100%|██████████| 804/804 [00:04<00:00, 199.98it/s]


Starting Feature Engineering


100%|██████████| 6/6 [00:38<00:00,  6.41s/it]
100%|██████████| 5/5 [00:05<00:00,  1.19s/it]


Predicting on base models ..................



100%|██████████| 25/25 [00:38<00:00,  1.55s/it]



Adding meta model features ..................

Predicting on meta model ..................



100%|██████████| 31/31 [00:12<00:00,  2.44it/s]


In [6]:
# Store the predictions on `test` as a new 'preds' column.
# NOTE(review): assumes `preds` holds one value per row of `test`, in row order —
# confirm against what clf.predict returns.
test['preds'] = preds

In [7]:
# Integer target code -> label name; a name's position in the list is its code.
target_inv_mapper = dict(enumerate([
    'basalt',
    'carbonate',
    'chloride',
    'iron_oxide',
    'oxalate',
    'oxychlorine',
    'phyllosilicate',
    'silicate',
    'sulfate',
    'sulfide',
]))

In [8]:
# Reshape the predictions from long to wide: one row per sample_id, one column
# per target_label, cells holding the 'preds' value (pivot_table's default
# aggregation applies if a sample_id/target_label pair occurs more than once).
tmp = test[['sample_id', 'target_label', 'preds']].pivot_table(
    index='sample_id',
    columns='target_label',
    values='preds',
)
# Relabel the columns from integer codes to names; a code missing from the
# mapper raises KeyError.
tmp.columns = [target_inv_mapper[label] for label in tmp.columns]

In [9]:
# Index the template by sample_id so the wide predictions can be written in by
# row label and column name, then turn sample_id back into a regular column.
submission_by_sample = submission_format.set_index('sample_id')
submission_by_sample.loc[tmp.index, tmp.columns] = tmp
submission_format = submission_by_sample.reset_index()

In [10]:
# Display the filled-in submission frame for a visual check before it is saved.
submission_format

Unnamed: 0,sample_id,basalt,carbonate,chloride,iron_oxide,oxalate,oxychlorine,phyllosilicate,silicate,sulfate,sulfide
0,S0766,0.005956,0.006670,0.005801,0.006181,0.005039,0.005492,0.005877,0.018863,0.005895,0.004924
1,S0767,0.010228,0.006565,0.006220,0.006668,0.006094,0.006319,0.009484,0.029787,0.007979,0.005950
2,S0768,0.986097,0.117195,0.010101,0.024809,0.007059,0.030536,0.980213,0.984519,0.139901,0.007205
3,S0769,0.004880,0.004983,0.016521,0.018518,0.005603,0.967942,0.067041,0.017040,0.973216,0.010303
4,S0770,0.004149,0.004910,0.020514,0.963360,0.014386,0.977964,0.977981,0.005547,0.004682,0.003245
...,...,...,...,...,...,...,...,...,...,...,...
799,S1565,0.249272,0.261199,0.029856,0.049306,0.021002,0.036268,0.043449,0.045787,0.051178,0.041913
800,S1566,0.195007,0.081339,0.015895,0.030167,0.009723,0.033261,0.118643,0.012025,0.016656,0.008129
801,S1567,0.037196,0.139999,0.019558,0.018965,0.011895,0.022444,0.224224,0.058456,0.069179,0.018665
802,S1568,0.025866,0.013000,0.009186,0.010708,0.022307,0.037386,0.014534,0.008838,0.008679,0.017175


In [11]:
# Write the final submission CSV (without the row index) into OUTPUT_DIR.
# to_csv does not create missing directories, so make sure the target directory
# exists first; exist_ok=True makes this a no-op when it is already there.
output_dir = Config.OUTPUT_DIR
if output_dir:  # an empty path means "current directory" — nothing to create
    os.makedirs(output_dir, exist_ok=True)

submission_path = os.path.join(output_dir, 'submission_final.csv')
submission_format.to_csv(submission_path, index=False)