# Hybrid models based on stacking


## Python imports

In [4]:
import os
import pickle

import pandas as pd
import sklearn
import sklearn.ensemble
import sklearn.linear_model
import sklearn.neighbors
import tensorflow as tf
from keras.preprocessing.image import ImageDataGenerator

## Parameters

In [27]:
# Base identifiers of the pre-trained first-level models. They are turned into
# file names under models/brssi/ and models/vision/ when the models are loaded.
BRSSI_MODEL_ID = 'RandomForest'
VISION_MODEL_ID = 'MobileNetV1'

# Value of the `source` column to keep in the training data; it is also appended
# to the model IDs and to the result file names. None disables all of that.
SOURCE_FILTER = 'Pixel'

# Identifiers of the evaluation datasets.
# NOTE(review): not referenced by the visible cells; the evaluation loop lists
# the same three IDs explicitly.
TEST_SETS = {'TS', 'PW', 'RW'}

# The 21 class labels; used as the per-class probability columns of the result files.
LABELS = [
    "AH",
    "AT_CA", "AT_CH", "AT_I1", "AT_I2", "AT_M", "AT_O1", "AT_O2", "AT_S",
    "CN",
    "DC", "DF", "DG",
    "ES",
    "GL",
    "HA",
    "SA", "SN", "SS",
    "TMA", "TS",
]

## Load BRSSI and vision models 

In [28]:
# Select the model variants trained for the configured source. The suffix is
# only appended when it is not already there, so re-running this cell does not
# yield IDs such as 'RandomForest_Pixel_Pixel'.
if SOURCE_FILTER is not None:
    source_suffix = '_' + SOURCE_FILTER
    if not BRSSI_MODEL_ID.endswith(source_suffix):
        BRSSI_MODEL_ID += source_suffix
    if not VISION_MODEL_ID.endswith(source_suffix):
        VISION_MODEL_ID += source_suffix

# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load model files that come from a trusted location.
# The context manager closes the file handle once the model is deserialised.
with open('models/brssi/' + BRSSI_MODEL_ID + '.pkl', 'rb') as brssi_model_file:
    BRSSI_MODEL = pickle.load(brssi_model_file)

VISION_MODEL = tf.keras.models.load_model('models/vision/' + VISION_MODEL_ID)

## Function to load BRSSI and vision data & model outputs

In [29]:
def load_data(id):
    """Build the stacking feature table for one dataset.

    Reads ``datasets/brssi/<id>.tsv`` and ``datasets/vision/<id>.tsv``, keeps
    the samples that appear in both (matched on label, time and source), runs
    the already loaded ``BRSSI_MODEL`` and ``VISION_MODEL`` on them and returns
    the per-class outputs of both models side by side.

    Parameters
    ----------
    id : str
        Dataset identifier, i.e. the base name of the TSV files. For
        ``'train'`` the rows are additionally restricted to ``SOURCE_FILTER``
        when that is not None.

    Returns
    -------
    pandas.DataFrame
        One row per matched sample with the columns ``label``, ``time`` and
        ``source`` plus, for every class ``c`` in ``BRSSI_MODEL.classes_``,
        ``b_<c>`` (BRSSI model output) and ``v_<c>`` (vision model output).

    Raises
    ------
    ValueError
        If the two modalities cannot be aligned row by row after merging, or
        if the vision model returns a different number of predictions than
        there are rows (e.g. because image files were dropped as invalid).
    """
    key_columns = ['label', 'time', 'source']

    brssi_data = pd.read_csv('datasets/brssi/' + id + '.tsv', sep='\t')
    vision_data = pd.read_csv('datasets/vision/' + id + '.tsv', sep='\t')
    if id == 'train' and SOURCE_FILTER is not None:
        brssi_data = brssi_data[brssi_data.source == SOURCE_FILTER]
        vision_data = vision_data[vision_data.source == SOURCE_FILTER]

    # Sort both tables the same way (without inplace, so a filtered slice is
    # never mutated), then keep only the samples present in both modalities.
    brssi_data = brssi_data.sort_values(key_columns)
    vision_data = vision_data.sort_values(key_columns)
    brssi_data = brssi_data.merge(vision_data[key_columns], on=key_columns)
    vision_data = vision_data.merge(brssi_data[key_columns], on=key_columns)

    # The outputs of the two models are combined by row position below, so the
    # key columns must match row for row. Duplicate keys in either file would
    # otherwise misalign the two modalities silently.
    brssi_keys = brssi_data[key_columns].reset_index(drop=True)
    vision_keys = vision_data[key_columns].reset_index(drop=True)
    if len(brssi_keys) != len(vision_keys) or not (brssi_keys == vision_keys).all().all():
        raise ValueError(
            "BRSSI and vision rows of dataset '" + str(id) + "' do not align on "
            + str(key_columns) + "; check both files for duplicate keys"
        )

    brssi_model_outputs = BRSSI_MODEL.predict_proba(brssi_data.drop(columns=key_columns))

    # shuffle=False keeps the predictions in dataframe order.
    keras_dataset = ImageDataGenerator(rescale=1.0/255.0).flow_from_dataframe(
        vision_data,
        target_size=(224,224),
        batch_size=32,
        class_mode='sparse',
        shuffle=False,
        seed=1234567,
        x_col='image',
        y_col='label'
    )
    vision_model_outputs = VISION_MODEL.predict(keras_dataset)

    # flow_from_dataframe reports only "validated" image filenames; if it
    # dropped any, the predictions no longer line up with the BRSSI rows.
    if len(vision_model_outputs) != len(brssi_data):
        raise ValueError(
            "vision model returned " + str(len(vision_model_outputs))
            + " predictions for " + str(len(brssi_data)) + " rows of dataset '"
            + str(id) + "'; some image files may be missing or invalid"
        )

    combined_df = brssi_data[key_columns].copy()
    # NOTE(review): assumes the vision model's output columns follow the same
    # class order as BRSSI_MODEL.classes_ - TODO confirm against its training.
    for idx, cat in enumerate(BRSSI_MODEL.classes_):
        combined_df['b_' + cat] = brssi_model_outputs[:, idx]
        combined_df['v_' + cat] = vision_model_outputs[:, idx]
    return combined_df

## Load training data

In [30]:
# Build the stacking training table: outputs of the BRSSI and vision models
# for the 'train' dataset, together with the label/time/source columns.
train_df = load_data('train')

# Bare last expression so the notebook renders the frame.
train_df

Found 2059 validated image filenames belonging to 21 classes.


Unnamed: 0,label,time,source,b_AH,v_AH,b_AT_CA,v_AT_CA,b_AT_CH,v_AT_CH,b_AT_I1,...,b_SA,v_SA,b_SN,v_SN,b_SS,v_SS,b_TMA,v_TMA,b_TS,v_TS
0,AH,0,Pixel,0.64,0.985869,0.0,1.753445e-04,0.0,1.555948e-03,0.0,...,0.28,6.977582e-05,0.0,1.491326e-03,0.0,1.558152e-03,0.0,8.319007e-04,0.02,2.609553e-05
1,AH,1,Pixel,0.89,0.999383,0.0,1.568524e-05,0.0,5.473981e-05,0.0,...,0.11,1.131431e-06,0.0,6.388238e-07,0.0,9.960706e-08,0.0,1.055467e-04,0.00,4.483639e-06
2,AH,2,Pixel,0.94,0.999999,0.0,7.980377e-10,0.0,1.689834e-08,0.0,...,0.06,1.698887e-10,0.0,1.543366e-09,0.0,3.452477e-09,0.0,1.245083e-07,0.00,2.692221e-08
3,AH,3,Pixel,0.91,0.997499,0.0,1.120578e-07,0.0,3.103148e-06,0.0,...,0.04,9.727854e-06,0.0,7.542567e-04,0.0,2.469560e-04,0.0,8.496677e-04,0.04,1.838710e-06
4,AH,4,Pixel,1.00,0.949283,0.0,2.855478e-05,0.0,1.484073e-05,0.0,...,0.00,6.597480e-06,0.0,2.617431e-05,0.0,5.630431e-06,0.0,1.836065e-04,0.00,5.967029e-03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2054,TS,117,Pixel,0.01,0.000149,0.0,6.245918e-07,0.0,6.298470e-06,0.0,...,0.00,4.431462e-05,0.0,4.718353e-06,0.0,4.643252e-06,0.0,1.781402e-06,0.99,9.988378e-01
2055,TS,118,Pixel,0.27,0.000863,0.0,1.217730e-06,0.0,6.344961e-07,0.0,...,0.00,1.353652e-06,0.0,5.460336e-06,0.0,1.818763e-06,0.0,8.259837e-06,0.73,9.988846e-01
2056,TS,120,Pixel,0.09,0.000085,0.0,4.683734e-06,0.0,8.616818e-08,0.0,...,0.00,9.407765e-07,0.0,1.098324e-04,0.0,8.360281e-08,0.0,1.733616e-05,0.91,9.992898e-01
2057,TS,121,Pixel,0.08,0.000002,0.0,8.780786e-08,0.0,2.841117e-08,0.0,...,0.03,1.417771e-07,0.0,8.395741e-07,0.0,6.314052e-08,0.0,6.031767e-07,0.89,9.999548e-01


## Train models

In [31]:
# Second-level (stacking) classifiers, keyed by the name that is later used in
# the printed scores and in the result file names.
MODELS = dict(
    KNN=sklearn.neighbors.KNeighborsClassifier(n_neighbors=20),
    LogisticRegression=sklearn.linear_model.LogisticRegression(random_state=54321),
    RandomForest=sklearn.ensemble.RandomForestClassifier(
        n_estimators=50,
        random_state=54321,
    ),
)

# Target is the label; features are all remaining columns except the
# identifying ones, i.e. the b_* / v_* outputs of the first-level models.
y = train_df['label']
X = train_df.drop(columns=['label', 'time', 'source'])

# Fit every stacking model on the same training table.
for stacking_model in MODELS.values():
    stacking_model.fit(X, y)

## Evaluate models on the test sets

In [32]:
# Make sure the output directory exists before any result file is written.
os.makedirs('results/hybrid', exist_ok=True)

# The test sets are listed explicitly (instead of iterating the TEST_SETS set)
# to keep a fixed evaluation order.
for test_id in ['TS', 'PW', 'RW']:
    test_df = load_data(test_id)
    # Dedicated names so the training X / y of the previous cell stay intact.
    X_test = test_df.drop(columns=['label', 'time', 'source'])
    y_test = test_df['label']
    for mid, m in MODELS.items():
        if SOURCE_FILTER is not None:
            mid += '_' + SOURCE_FILTER
        # Mean accuracy of the stacking model on this test set.
        print(mid, m.score(X_test, y_test))

        # predict_proba columns follow m.classes_, so label them with that and
        # only then bring them into LABELS order. A class the model never saw
        # during training gets probability 0 instead of shifting the columns.
        probabilities = pd.DataFrame(
            m.predict_proba(X_test),
            columns=m.classes_,
            index=test_df.index,
        ).reindex(columns=LABELS, fill_value=0.0)

        results = test_df[['label', 'time', 'source']].copy()
        results['prediction'] = probabilities.idxmax(axis='columns')
        # Final column order: label, time, source, prediction, then one
        # probability column per entry of LABELS.
        results = pd.concat([results, probabilities], axis='columns')
        results.to_csv('results/hybrid/' + mid + '_' + test_id + '.tsv', header=True, index=False, sep='\t')


Found 941 validated image filenames belonging to 21 classes.
KNN_Pixel 0.8862911795961743
LogisticRegression_Pixel 0.900106269925611
RandomForest_Pixel 0.8437832093517534
Found 363 validated image filenames belonging to 21 classes.
KNN_Pixel 0.7961432506887053
LogisticRegression_Pixel 0.8099173553719008
RandomForest_Pixel 0.7575757575757576
Found 446 validated image filenames belonging to 21 classes.
KNN_Pixel 0.8004484304932735
LogisticRegression_Pixel 0.8094170403587444
RandomForest_Pixel 0.7242152466367713
