In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [2]:
# Essential DS libraries
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
import torch
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import cv2
# LightAutoML presets, task and report generation
from lightautoml.automl.presets.image_presets import TabularCVAutoML
from lightautoml.tasks import Task

In [3]:
np.random.seed(42)
torch.set_num_threads(2)

In [4]:
INPUT_DIR = './data/icebergs'

In [5]:
train = pd.read_json(f'{INPUT_DIR}/train.json')
train['inc_angle'] = pd.to_numeric(train['inc_angle'],errors='coerce')

In [6]:
train.head()

Unnamed: 0,id,band_1,band_2,inc_angle,is_iceberg
0,dfd5f913,"[-27.878360999999998, -27.15416, -28.668615, -...","[-27.154118, -29.537888, -31.0306, -32.190483,...",43.9239,0
1,e25388fd,"[-12.242375, -14.920304999999999, -14.920363, ...","[-31.506321, -27.984554, -26.645678, -23.76760...",38.1562,0
2,58b2aaa0,"[-24.603676, -24.603714, -24.871029, -23.15277...","[-24.870956, -24.092632, -20.653963, -19.41104...",45.2859,1
3,4cfc3a18,"[-22.454607, -23.082819, -23.998013, -23.99805...","[-27.889421, -27.519794, -27.165262, -29.10350...",43.8306,0
4,271f93f4,"[-26.006956, -23.164886, -23.164886, -26.89116...","[-27.206915, -30.259186, -30.259186, -23.16495...",35.6256,0


## Add features from EDA

In [7]:
def get_stats(train: pd.DataFrame,
              label:int = 1) -> pd.DataFrame:
    train['max'+str(label)] = [np.max(np.array(x)) for x in train['band_'+ str(label)] ]
    train['min'+str(label)] = [np.min(np.array(x)) for x in train['band_'+ str(label)] ]
    train['std'+str(label)] = [np.std(np.array(x)) for x in train['band_'+ str(label)] ]
    train['mean'+str(label)] = [np.mean(np.array(x)) for x in train['band_'+ str(label)] ]
    return train
train = get_stats(train, 1)
train = get_stats(train, 2)

In [8]:
train.head()

Unnamed: 0,id,band_1,band_2,inc_angle,is_iceberg,max1,min1,std1,mean1,max2,min2,std2,mean2
0,dfd5f913,"[-27.878360999999998, -27.15416, -28.668615, -...","[-27.154118, -29.537888, -31.0306, -32.190483,...",43.9239,0,-0.213149,-38.211376,2.764537,-27.911043,-11.252153,-41.135918,2.381284,-29.910117
1,e25388fd,"[-12.242375, -14.920304999999999, -14.920363, ...","[-31.506321, -27.984554, -26.645678, -23.76760...",38.1562,0,12.570409,-23.125309,3.142532,-13.566554,0.044052,-34.765831,2.934098,-25.359106
2,58b2aaa0,"[-24.603676, -24.603714, -24.871029, -23.15277...","[-24.870956, -24.092632, -20.653963, -19.41104...",45.2859,1,-9.918477,-33.391197,2.223905,-23.053698,-15.605879,-34.148819,2.125275,-24.839821
3,4cfc3a18,"[-22.454607, -23.082819, -23.998013, -23.99805...","[-27.889421, -27.519794, -27.165262, -29.10350...",43.8306,0,4.795627,-32.204136,2.566233,-23.210771,-5.554516,-39.564053,2.38998,-29.567913
4,271f93f4,"[-26.006956, -23.164886, -23.164886, -26.89116...","[-27.206915, -30.259186, -30.259186, -23.16495...",35.6256,0,-6.956036,-35.010487,2.305288,-25.280029,-9.434006,-40.276115,2.276244,-25.627726


## Get images paths and EDA features into DataFrame

In [9]:
new_imgs = []
for i, row in train.iterrows():
    new_imgs.append((row.id, f'./data/icebergs/train_imgs/{row.id}_1.jpg', row.is_iceberg, row.inc_angle, row.max1, row.min1, row.std1, row.mean1))
    new_imgs.append((row.id, f'./data/icebergs/train_imgs/{row.id}_2.jpg', row.is_iceberg, row.inc_angle, row.max2, row.min2, row.std2, row.mean2))
train_data = pd.DataFrame(new_imgs, columns = ['id', 'path', 'label', 'angle', "max", "min", "std", "mean"])
train_data.head()

Unnamed: 0,id,path,label,angle,max,min,std,mean
0,dfd5f913,./data/icebergs/train_imgs/dfd5f913_1.jpg,0,43.9239,-0.213149,-38.211376,2.764537,-27.911043
1,dfd5f913,./data/icebergs/train_imgs/dfd5f913_2.jpg,0,43.9239,-11.252153,-41.135918,2.381284,-29.910117
2,e25388fd,./data/icebergs/train_imgs/e25388fd_1.jpg,0,38.1562,12.570409,-23.125309,3.142532,-13.566554
3,e25388fd,./data/icebergs/train_imgs/e25388fd_2.jpg,0,38.1562,0.044052,-34.765831,2.934098,-25.359106
4,58b2aaa0,./data/icebergs/train_imgs/58b2aaa0_1.jpg,1,45.2859,-9.918477,-33.391197,2.223905,-23.053698


## Lama pipeline in use

In [11]:
# scoring metric for this competition is F1 score
task = Task(name='binary',
            metric = lambda y_true, y_pred: f1_score(y_true, (y_pred > 0.5)*1))

In [12]:
roles = {
    'target': 'label',
    'path': ['path'],
    'group': 'id',
    'numeric': ['angle', 'max', 'min', 'std', 'mean']
}

In [27]:
automl = TabularCVAutoML(task=task,
                         timeout=5*3600,
                         cpu_limit=6,
                         reader_params = {'cv': 3, 'random_state': 42})

In [28]:
oof_pred = automl.fit_predict(train_data, roles=roles, verbose=3)

[22:25:04] Stdout logging level is INFO3.
[22:25:04] Task: binary

[22:25:04] Start automl preset with listed constraints:
[22:25:04] - time: 18000.00 seconds
[22:25:04] - CPU: 6 cores
[22:25:04] - memory: 16 GB

[22:25:04] [1mTrain data shape: (3208, 8)[0m

[22:25:08] Feats was rejected during automatic roles guess: []
[22:25:08] Layer [1m1[0m train process start. Time left 17996.41 secs
Loaded pretrained weights for efficientnet-b0
[22:25:16] Load saved dataset for path
[22:25:16] Feature path transformed
[22:25:16] Start fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m ...
[22:25:16] ===== Start working with [1mfold 0[0m for [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m =====
[22:25:17] Linear model: C = 1e-05 score = 0.7220573689416419
[22:25:17] Linear model: C = 5e-05 score = 0.7531956735496559
[22:25:17] Linear model: C = 0.0001 score = 0.771964461994077
[22:25:17] Linear model: C = 0.0005 score = 0.7983870967741935
[22:25:17] Linear model: C = 0.001 score = 0.8040609137055836
[22:25:17]



[22:25:25] 0:	learn: 0.6775307	test: 0.6777931	best: 0.6777931 (0)	total: 37.9ms	remaining: 18.9s
[22:25:28] bestTest = 0.4392635167
[22:25:28] bestIteration = 498
[22:25:28] Shrink model to first 499 iterations.
[22:25:29] ===== Start working with [1mfold 1[0m for [1mLvl_0_Pipe_1_Mod_0_CatBoost[0m =====
[22:25:29] 0:	learn: 0.6800608	test: 0.6807594	best: 0.6807594 (0)	total: 8.76ms	remaining: 4.37s




[22:25:32] bestTest = 0.4375001426
[22:25:32] bestIteration = 482
[22:25:32] Shrink model to first 483 iterations.
[22:25:32] ===== Start working with [1mfold 2[0m for [1mLvl_0_Pipe_1_Mod_0_CatBoost[0m =====
[22:25:32] 0:	learn: 0.6812192	test: 0.6816955	best: 0.6816955 (0)	total: 7.58ms	remaining: 3.78s




[22:25:35] bestTest = 0.4159949942
[22:25:35] bestIteration = 498
[22:25:35] Shrink model to first 499 iterations.
[22:25:35] Fitting [1mLvl_0_Pipe_1_Mod_0_CatBoost[0m finished. score = [1m0.7831131163390267[0m
[22:25:35] [1mLvl_0_Pipe_1_Mod_0_CatBoost[0m fitting and predicting completed
[22:25:35] Time left 17968.91 secs

[22:25:35] [1mLayer 1 training completed.[0m

[22:25:35] Blending: optimization starts with equal weights and score [1m0.8415550473701405[0m
[22:25:35] Blending: iteration [1m0[0m: score = [1m0.8425865447419987[0m, weights = [1m[0.4984867 0.5015133][0m
[22:25:35] Blending: iteration [1m1[0m: score = [1m0.8425865447419987[0m, weights = [1m[0.4984867 0.5015133][0m
[22:25:35] Blending: no score update. Terminated

[22:25:35] [1mAutoml preset training completed in 31.16 seconds[0m

[22:25:35] Model description:
Final prediction for new objects (level 0) = 
	 0.49849 * (3 averaged models Lvl_0_Pipe_0_Mod_0_LinearL2) +
	 0.50151 * (3 averaged models

In [52]:
preds = train_data[['id', 'label']]

In [53]:
preds['pred'] = oof_pred.data[:]
preds = preds.groupby(['id', 'label']).mean().reset_index()
# accuracy
print(f'acc: {np.mean((preds.pred > 0.5).ravel() == preds.label)}')

acc: 0.8802992518703242


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  preds['pred'] = oof_pred.data[:]


# Preprocess data the same way for the test dataset

In [41]:
test = pd.read_json(f'{INPUT_DIR}/test.json')
test['inc_angle'] = pd.to_numeric(test['inc_angle'],errors='coerce')

In [42]:
test

Unnamed: 0,id,band_1,band_2,inc_angle
0,5941774d,"[-15.863251, -15.201077, -17.887735, -19.17248...","[-21.629612, -21.142353, -23.908337, -28.34524...",34.966400
1,4023181e,"[-26.058969497680664, -26.058969497680664, -26...","[-25.754207611083984, -25.754207611083984, -25...",32.615072
2,b20200e4,"[-14.14109992980957, -15.064241409301758, -17....","[-14.74563980102539, -14.590410232543945, -14....",37.505433
3,e7f018bb,"[-12.167478, -13.706167, -16.54837, -13.572674...","[-24.32222, -26.375538, -24.096739, -23.8769, ...",34.473900
4,4371c8c3,"[-23.37459373474121, -26.02718162536621, -28.1...","[-25.72234344482422, -27.011577606201172, -23....",43.918874
...,...,...,...,...
8419,16ee9b50,"[-25.082357, -26.71583, -24.599827, -25.082571...","[-25.860718, -23.29442, -25.860861, -25.334354...",34.795500
8420,5a599eb7,"[-21.031391143798828, -21.031391143798828, -21...","[-23.755836486816406, -23.755836486816406, -23...",32.246683
8421,df30d6dd,"[-28.609278, -26.514626, -26.514679, -26.83061...","[-28.609278, -29.437183, -30.35239, -31.375494...",39.503200
8422,18af95b1,"[-27.068821, -27.068892, -23.970854, -22.38730...","[-29.991381, -29.163599, -24.886002, -27.71266...",33.638000


In [43]:
test = get_stats(test, 1)
test = get_stats(test, 2)

In [44]:
new_imgs_test = []
for i, row in test.iterrows():
    band_1 = np.reshape(np.array(row.band_1),(75,75))
    band_1 = cv2.normalize(band_1, None, alpha=0, beta=1, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_32F)
    band_2 = np.reshape(np.array(row.band_2),(75,75))
    band_2 = cv2.normalize(band_2, None, alpha=0, beta=1, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_32F)
    cv2.imwrite(f'./data/icebergs/test_imgs/{row.id}_1.jpg', band_1*255)
    cv2.imwrite(f'./data/icebergs/test_imgs/{row.id}_2.jpg', band_2*255)
    new_imgs_test.append((row.id, f'./data/icebergs/test_imgs/{row.id}_1.jpg', row.inc_angle, row.max1, row.min1, row.std1, row.mean1))
    new_imgs_test.append((row.id, f'./data/icebergs/test_imgs/{row.id}_2.jpg', row.inc_angle, row.max2, row.min2, row.std2, row.mean2))

In [45]:
test_data  = pd.DataFrame(new_imgs_test, columns = ['id', 'path', 'angle', "max", "min", "std", "mean"])

In [46]:
te_pred = automl.predict(test_data)
print(f'Prediction for te_data:\n{te_pred}\nShape = {te_pred.shape}')

[22:31:55] Load saved dataset for path
[22:31:55] Feature path transformed
Prediction for te_data:
array([[0.05492836],
       [0.669874  ],
       [0.2610988 ],
       ...,
       [0.89462507],
       [0.02455477],
       [0.05861698]], dtype=float32)
Shape = (16848, 1)


In [47]:
submission = test_data[['id']]
submission['is_iceberg'] = te_pred.data
submission = submission.groupby(['id']).mean().reset_index()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submission['is_iceberg'] = te_pred.data


In [48]:
submission

Unnamed: 0,id,is_iceberg
0,000c65d7,0.736580
1,001680af,0.526701
2,00193999,0.572906
3,00247e48,0.851838
4,00503faa,0.502105
...,...,...
8419,ffd5b968,0.411623
8420,ffda8ccc,0.906196
8421,ffe50836,0.100369
8422,ffef5f6b,0.798795


In [49]:
submission.to_csv('./data/submissions/iceberg_submission_lama_eda_feat.csv', index=False)

## Now let's try pipeline with some different settings

Add LGBM to blending, change visual encoder to efficientnet-b3, change number of CV folds

In [13]:
automl_b3_lgb = TabularCVAutoML(task = task,
                         general_params={
                           'use_algos': [['linear_l2', 'cb', 'lgb']]  
                         },
                         timeout=5 * 3600,
                         autocv_features={"embed_model": 'efficientnet-b3'},
                         cpu_limit = 2,
                         reader_params = {'cv': 5, 'random_state': 42})

In [14]:
oof_pred = automl_b3_lgb.fit_predict(train_data, roles=roles, verbose=3)

[23:54:15] Stdout logging level is INFO3.
[23:54:15] Task: binary

[23:54:15] Start automl preset with listed constraints:
[23:54:15] - time: 18000.00 seconds
[23:54:15] - CPU: 2 cores
[23:54:15] - memory: 16 GB

[23:54:15] [1mTrain data shape: (3208, 8)[0m

[23:54:18] Feats was rejected during automatic roles guess: []
[23:54:18] Layer [1m1[0m train process start. Time left 17997.39 secs
Loaded pretrained weights for efficientnet-b3
[23:54:23] Load saved dataset for path
[23:54:23] Feature path transformed
[23:54:24] Start fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m ...
[23:54:24] ===== Start working with [1mfold 0[0m for [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m =====
[23:54:24] Linear model: C = 1e-05 score = 0.7196401799100451
[23:54:24] Linear model: C = 5e-05 score = 0.7566718995290422
[23:54:24] Linear model: C = 0.0001 score = 0.7740916271721958
[23:54:24] Linear model: C = 0.0005 score = 0.8063492063492065
[23:54:24] Linear model: C = 0.001 score = 0.8205128205128205
[23:54:24



[23:54:43] 0:	learn: 0.6805706	test: 0.6827102	best: 0.6827102 (0)	total: 22.8ms	remaining: 11.4s
[23:54:47] bestTest = 0.4785069736
[23:54:47] bestIteration = 477
[23:54:47] Shrink model to first 478 iterations.
[23:54:47] ===== Start working with [1mfold 1[0m for [1mLvl_0_Pipe_1_Mod_1_CatBoost[0m =====
[23:54:47] 0:	learn: 0.6787738	test: 0.6800371	best: 0.6800371 (0)	total: 6.96ms	remaining: 3.47s




[23:54:50] bestTest = 0.4250654654
[23:54:50] bestIteration = 493
[23:54:50] Shrink model to first 494 iterations.
[23:54:50] ===== Start working with [1mfold 2[0m for [1mLvl_0_Pipe_1_Mod_1_CatBoost[0m =====
[23:54:50] 0:	learn: 0.6815882	test: 0.6808770	best: 0.6808770 (0)	total: 7.07ms	remaining: 3.53s




[23:54:53] bestTest = 0.441986084
[23:54:53] bestIteration = 396
[23:54:53] Shrink model to first 397 iterations.
[23:54:53] ===== Start working with [1mfold 3[0m for [1mLvl_0_Pipe_1_Mod_1_CatBoost[0m =====
[23:54:53] 0:	learn: 0.6790970	test: 0.6808030	best: 0.6808030 (0)	total: 6.74ms	remaining: 3.36s




[23:54:56] bestTest = 0.428965013
[23:54:56] bestIteration = 496
[23:54:56] Shrink model to first 497 iterations.
[23:54:56] ===== Start working with [1mfold 4[0m for [1mLvl_0_Pipe_1_Mod_1_CatBoost[0m =====
[23:54:56] 0:	learn: 0.6787101	test: 0.6782697	best: 0.6782697 (0)	total: 6.18ms	remaining: 3.08s




[23:55:00] bestTest = 0.391325593
[23:55:00] bestIteration = 499
[23:55:00] Fitting [1mLvl_0_Pipe_1_Mod_1_CatBoost[0m finished. score = [1m0.7854381443298968[0m
[23:55:00] [1mLvl_0_Pipe_1_Mod_1_CatBoost[0m fitting and predicting completed
[23:55:00] Time left 17955.45 secs

[23:55:00] [1mLayer 1 training completed.[0m

[23:55:00] Blending: optimization starts with equal weights and score [1m0.8437602358336064[0m
[23:55:00] Blending: iteration [1m0[0m: score = [1m0.8540218470705064[0m, weights = [1m[0.5361579  0.3768723  0.08696972][0m
[23:55:00] Blending: iteration [1m1[0m: score = [1m0.8567682726974762[0m, weights = [1m[0.300882   0.618034   0.08108404][0m
[23:55:00] Blending: iteration [1m2[0m: score = [1m0.8567682726974762[0m, weights = [1m[0.300882   0.618034   0.08108404][0m
[23:55:00] Blending: no score update. Terminated

[23:55:00] [1mAutoml preset training completed in 44.66 seconds[0m

[23:55:00] Model description:
Final prediction for new object

In [15]:
preds = train_data[['id', 'label']]

In [16]:
preds['pred'] = oof_pred.data[:]
preds = preds.groupby(['id', 'label']).mean().reset_index()
# accuracy
print(f'acc: {np.mean((preds.pred > 0.5).ravel() == preds.label)}')

acc: 0.89214463840399


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  preds['pred'] = oof_pred.data[:]


In [21]:
te_pred = automl_b3_lgb.predict(test_data)
print(f'Prediction for te_data:\n{te_pred}\nShape = {te_pred.shape}')

100%|██████████| 132/132 [06:18<00:00,  2.87s/it]


[00:35:15] Feature path transformed
Prediction for te_data:
array([[0.03721453],
       [0.64732355],
       [0.58381045],
       ...,
       [0.9313757 ],
       [0.03356019],
       [0.06219441]], dtype=float32)
Shape = (16848, 1)


In [22]:
submission = test_data[['id']]
submission['is_iceberg'] = te_pred.data
submission = submission.groupby(['id']).mean().reset_index()
submission.to_csv('./data/submissions/iceberg_submission_lama_eda_feat_effb3_lgbm.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submission['is_iceberg'] = te_pred.data
