# Hybrid models based on stacking


## Python imports

In [6]:
import pickle

import pandas as pd
import sklearn
import sklearn.ensemble
import sklearn.linear_model
import sklearn.neighbors
import tensorflow as tf
from keras.preprocessing.image import ImageDataGenerator

## Parameters

In [7]:
# Identifiers of the two base models; they are used as file stems when the
# models are loaded from models/brssi/ and models/vision/.
BRSSI_MODEL_ID = 'RandomForest'
VISION_MODEL_ID = 'MobileNetV1'

# When not None, the training data is restricted to rows whose `source`
# equals this value and '_<SOURCE_FILTER>' is appended to the model IDs.
SOURCE_FILTER = None

# Names of the test splits.
TEST_SETS = {'TS', 'PW', 'RW'}

# Class labels, used as the probability column names in the result files.
LABELS = [
    "AH",
    "AT_CA", "AT_CH", "AT_I1", "AT_I2", "AT_M", "AT_O1", "AT_O2", "AT_S",
    "CN",
    "DC", "DF", "DG",
    "ES",
    "GL",
    "HA",
    "SA", "SN", "SS",
    "TMA", "TS",
]

## Load BRSSI and vision models 

In [8]:
# Models trained on a single source are stored with a '_<source>' suffix.
if SOURCE_FILTER is not None:
    _suffix = '_' + SOURCE_FILTER
    # Guard against re-running this cell: without it the suffix would be
    # appended again on every execution and the model paths would break.
    if not BRSSI_MODEL_ID.endswith(_suffix):
        BRSSI_MODEL_ID += _suffix
    if not VISION_MODEL_ID.endswith(_suffix):
        VISION_MODEL_ID += _suffix

# NOTE(security): pickle.load executes arbitrary code from the file; only load
# model files that come from a trusted source.
# The context manager closes the file handle once the model is loaded.
with open('models/brssi/' + BRSSI_MODEL_ID + '.pkl', 'rb') as model_file:
    BRSSI_MODEL = pickle.load(model_file)
VISION_MODEL = tf.keras.models.load_model('models/vision/' + VISION_MODEL_ID)

## Function to load BRSSI and vision (CNN) data & model outputs

In [9]:
def load_data(id):
    """Build the stacking feature table for one dataset split.

    Reads ``datasets/brssi/<id>.tsv`` and ``datasets/vision/<id>.tsv``, keeps
    the samples present in both tables (matched on label, time and source),
    runs both base models on them and returns their per-class outputs side by
    side.

    Parameters
    ----------
    id : str
        File stem of the split to load (e.g. ``'train'``, ``'TS'``). For
        ``'train'`` the rows are additionally restricted to ``SOURCE_FILTER``
        when that is not None. The name shadows the builtin but is kept so
        existing callers are unaffected.

    Returns
    -------
    pandas.DataFrame
        Columns ``label``, ``time``, ``source`` followed by ``b_<class>`` and
        ``v_<class>`` for every class in ``BRSSI_MODEL.classes_``.

    Raises
    ------
    ValueError
        If the BRSSI and vision rows do not line up one-to-one after matching,
        or if the vision model returns a different number of predictions than
        there are rows.
    """
    key_cols = ['label', 'time', 'source']
    sort_cols = ['label', 'source', 'time']

    brssi_data = pd.read_csv('datasets/brssi/' + id + '.tsv', sep='\t')
    vision_data = pd.read_csv('datasets/vision/' + id + '.tsv', sep='\t')
    if id == 'train' and SOURCE_FILTER is not None:
        brssi_data = brssi_data[brssi_data.source == SOURCE_FILTER]
        vision_data = vision_data[vision_data.source == SOURCE_FILTER]

    # Sort out-of-place: an in-place sort on a filtered slice can trigger
    # pandas' SettingWithCopyWarning.
    brssi_data = brssi_data.sort_values(sort_cols)
    vision_data = vision_data.sort_values(sort_cols)

    # Inner joins on the shared key columns keep only samples present in both
    # tables while preserving the (sorted) order of the left-hand frame.
    brssi_data = brssi_data.merge(vision_data[key_cols])
    vision_data = vision_data.merge(brssi_data[key_cols])

    # Both model outputs are combined purely by row position below, so make
    # sure the two tables really are aligned (duplicate keys would break this).
    if len(brssi_data) != len(vision_data) or not (
        brssi_data[key_cols].to_numpy() == vision_data[key_cols].to_numpy()
    ).all():
        raise ValueError(
            "BRSSI and vision rows for '" + id + "' do not align one-to-one "
            "on " + str(key_cols) + " (duplicate keys?)"
        )

    brssi_model_outputs = BRSSI_MODEL.predict_proba(brssi_data.drop(columns=key_cols))
    keras_dataset = ImageDataGenerator(rescale=1.0/255.0).flow_from_dataframe(
        vision_data,
        target_size=(224,224),
        batch_size=32,
        class_mode='sparse',
        shuffle=False,  # keep prediction order identical to the dataframe order
        seed=1234567,
        x_col='image',
        y_col='label'
    )
    vision_model_outputs = VISION_MODEL.predict(keras_dataset)

    # flow_from_dataframe validates image filenames and may skip rows; a
    # shorter prediction array would silently shift the vision features.
    if len(vision_model_outputs) != len(brssi_data):
        raise ValueError(
            "Vision model produced " + str(len(vision_model_outputs)) +
            " predictions for " + str(len(brssi_data)) + " rows in '" + id + "'"
        )

    combined_df = brssi_data[key_cols].copy()
    # NOTE(review): assumes VISION_MODEL's output columns follow the same class
    # order as BRSSI_MODEL.classes_ — confirm against the vision training code.
    for idx, cat in enumerate(BRSSI_MODEL.classes_):
        combined_df['b_' + cat] = brssi_model_outputs[:, idx]
        combined_df['v_' + cat] = vision_model_outputs[:, idx]
    return combined_df

## Load training data

In [10]:
# Build the stacked training table from the 'train' split.
train_df = load_data('train')

# Bare last expression so the notebook renders the frame.
train_df

Found 3743 validated image filenames belonging to 21 classes.


Unnamed: 0,label,time,source,b_AH,v_AH,b_AT_CA,v_AT_CA,b_AT_CH,v_AT_CH,b_AT_I1,...,b_SA,v_SA,b_SN,v_SN,b_SS,v_SS,b_TMA,v_TMA,b_TS,v_TS
0,AH,0,Pixel,0.7250,0.991760,0.000000,4.414013e-06,0.000000,2.960334e-05,0.000000,...,0.230,1.300369e-05,0.002000,2.953717e-04,0.0,3.398636e-04,0.000000,7.766753e-04,0.005000,1.016225e-06
1,AH,1,Pixel,0.9000,0.999195,0.000000,2.040466e-07,0.000000,2.273519e-06,0.000000,...,0.095,3.908642e-09,0.000000,1.188104e-07,0.0,4.361221e-09,0.000000,8.002517e-05,0.005000,9.453680e-07
2,AH,2,Pixel,0.9400,1.000000,0.000000,1.153878e-12,0.000000,9.031167e-11,0.000000,...,0.050,2.174078e-13,0.000000,4.090981e-11,0.0,1.865411e-11,0.000000,2.532474e-08,0.010000,9.667375e-11
3,AH,3,Pixel,0.9150,0.999945,0.000000,7.601166e-11,0.000000,8.605655e-09,0.000000,...,0.040,1.480408e-06,0.000000,2.031269e-07,0.0,1.068915e-06,0.000000,4.370165e-05,0.035000,2.222027e-08
4,AH,4,Pixel,0.9900,0.984727,0.000000,2.516413e-07,0.000000,1.458451e-07,0.000000,...,0.000,2.513575e-07,0.000000,8.459638e-07,0.0,7.135672e-08,0.000000,1.976761e-05,0.010000,3.579576e-03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3738,TS,112,Redmi,0.1575,0.005610,0.000000,8.182667e-05,0.001147,3.113529e-03,0.000789,...,0.005,3.497143e-02,0.000000,7.668749e-03,0.0,2.246108e-03,0.000263,8.424406e-03,0.812412,8.160450e-01
3739,TS,113,Redmi,0.1550,0.000625,0.000000,1.068341e-04,0.000000,3.428302e-04,0.000000,...,0.005,1.915657e-03,0.000000,7.101412e-05,0.0,2.275016e-05,0.000000,3.671278e-05,0.815000,9.628345e-01
3740,TS,114,Redmi,0.0500,0.017462,0.000000,6.985118e-05,0.000000,3.235603e-05,0.000000,...,0.005,5.227939e-06,0.000000,7.831494e-05,0.0,2.224267e-08,0.000000,6.089124e-07,0.945000,9.718162e-01
3741,TS,120,Redmi,0.0150,0.037314,0.000000,1.664548e-05,0.000000,1.208513e-04,0.001176,...,0.000,6.907595e-04,0.000000,5.082106e-04,0.0,1.050581e-06,0.015882,2.014956e-04,0.825882,9.396654e-01


## Train models

In [11]:
# Candidate stacking classifiers, keyed by the name used when printing scores
# and naming the result files.
MODELS = {
    'KNN': sklearn.neighbors.KNeighborsClassifier(n_neighbors=20),
    'LogisticRegression': sklearn.linear_model.LogisticRegression(random_state=54321),
    'RandomForest': sklearn.ensemble.RandomForestClassifier(random_state=54321, n_estimators=50)
}

# Features are the b_*/v_* columns; the label/time/source columns are dropped.
X = train_df.drop(columns=['label','time','source'])
y = train_df['label']
# Only the estimators are needed here, so iterate over the values.
for m in MODELS.values():
    m.fit(X, y)

## Evaluate models on the test sets

In [12]:
# Score every stacking model on each test set and save its per-class
# probabilities plus the arg-max prediction to results/hybrid/.
# `test_id` (rather than `id`) keeps the builtin `id` usable in later cells.
for test_id in ['TS', 'PW', 'RW']:
    test_df = load_data(test_id)
    X = test_df.drop(columns=['label','time','source'])
    y = test_df['label']
    for mid, m in MODELS.items():
        if SOURCE_FILTER is not None:
            mid += '_' + SOURCE_FILTER
        print(mid, m.score(X, y))

        # predict_proba columns follow m.classes_, which only matches LABELS
        # when the model was trained on every label in that exact order.
        unknown = set(m.classes_) - set(LABELS)
        if unknown:
            raise ValueError(
                "Model '" + mid + "' predicts classes missing from LABELS: " +
                str(sorted(unknown))
            )
        # Name the columns by the model's own classes, then lay them out in
        # LABELS order; classes the model never saw get probability 0.
        probabilities = pd.DataFrame(
            m.predict_proba(X), columns=m.classes_, index=test_df.index
        ).reindex(columns=LABELS, fill_value=0.0)

        results = test_df[['label','time','source']].copy()
        results['prediction'] = probabilities.idxmax(axis='columns')
        # Column order: label, time, source, prediction, then one per label.
        results = pd.concat([results, probabilities], axis='columns')
        results.to_csv('results/hybrid/' + mid + '_' + test_id + '.tsv', header=True, index=False, sep='\t')


Found 941 validated image filenames belonging to 21 classes.
KNN 0.9766206163655685
LogisticRegression 0.9776833156216791
RandomForest 0.9617428267800212
Found 363 validated image filenames belonging to 21 classes.
KNN 0.8539944903581267
LogisticRegression 0.859504132231405
RandomForest 0.8402203856749312
Found 446 validated image filenames belonging to 21 classes.
KNN 0.9013452914798207
LogisticRegression 0.9080717488789237
RandomForest 0.8609865470852018
