# Bluetooth RSSI - model training

## Python imports

In [1]:
import os.path
import pickle

import numpy as np
import pandas as pd
import sklearn
import sklearn.ensemble
import sklearn.linear_model
import sklearn.model_selection
import sklearn.neighbors
import sklearn.neural_network
import sklearn.preprocessing
import sklearn.svm
import sklearn.tree

## Read dataset 

In [2]:
# Location of the tab-separated training set.
DATASET_FILE = "datasets/brssi/train.tsv"

# read_table uses a tab delimiter by default, so no explicit `sep` is needed.
# NOTE(review): readings of -200.0 look like a placeholder for "beacon not
# observed" -- confirm against the script that produced the file.
train_data = pd.read_table(DATASET_FILE)
train_data

Unnamed: 0,time,label,source,6859b8e6126b,c69294c41e47,e7b2d23d89ec,e9bdcc7d8fe6,ef3b3dd2a002,ef3b3dd2a003,ef3b3dd2a005,...,ef3b3dd2a019,ef3b3dd2a020,ef3b3dd2e001,ef3b3dd2e003,ef3b3dd2e004,ef3b3dd2e006,ef3b3dd2e007,ef3b3dd2e008,ef3b3dd2e009,f80332eda645
0,116,DG,Pixel,-89.5,-200.00,-98.000000,-84.000000,-94.500000,-200.000000,-101.000000,...,-82.000000,-200.000000,-95.0,-92.000000,-200.0,-200.000000,-200.000000,-200.00,-200.0,-98.000000
1,65,GL,Pixel,-200.0,-200.00,-100.666667,-200.000000,-88.666667,-87.000000,-98.250000,...,-104.000000,-94.000000,-200.0,-102.666667,-200.0,-89.333333,-200.000000,-98.50,-200.0,-200.000000
2,6,TS,Pixel,-85.5,-200.00,-200.000000,-87.000000,-200.000000,-200.000000,-200.000000,...,-96.000000,-200.000000,-200.0,-200.000000,-200.0,-200.000000,-200.000000,-200.00,-200.0,-200.000000
3,73,AT_O2,Redmi,-200.0,-200.00,-82.200000,-94.500000,-66.500000,-86.000000,-71.000000,...,-96.250000,-90.500000,-200.0,-92.000000,-200.0,-81.500000,-200.000000,-200.00,-95.0,-94.000000
4,26,AT_S,Redmi,-98.0,-94.25,-91.333333,-85.666667,-84.000000,-95.000000,-86.200000,...,-83.500000,-93.000000,-200.0,-75.000000,-200.0,-200.000000,-91.250000,-200.00,-200.0,-86.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3836,78,TMA,Pixel,-200.0,-200.00,-90.750000,-92.750000,-92.500000,-200.000000,-97.000000,...,-85.000000,-200.000000,-87.0,-200.000000,-96.5,-200.000000,-200.000000,-200.00,-200.0,-200.000000
3837,11,AT_CH,Pixel,-104.0,-106.00,-93.750000,-100.000000,-92.500000,-101.000000,-94.500000,...,-90.500000,-100.000000,-105.0,-86.333333,-200.0,-200.000000,-92.600000,-200.00,-200.0,-100.000000
3838,4,ES,Redmi,-200.0,-92.00,-91.000000,-97.000000,-89.500000,-86.333333,-88.500000,...,-200.000000,-80.666667,-200.0,-89.000000,-200.0,-94.000000,-200.000000,-74.25,-85.0,-96.500000
3839,118,SA,Pixel,-200.0,-200.00,-101.500000,-95.000000,-200.000000,-200.000000,-92.000000,...,-200.000000,-200.000000,-200.0,-89.500000,-200.0,-200.000000,-82.333333,-200.00,-200.0,-85.000000


## Train models

In [3]:
# One seed shared by every estimator that accepts `random_state`, so that a
# re-run of the notebook fits the same models.
ESTIMATOR_SEED = 54321
# Iteration cap offered as the only `max_iter` candidate in both SVC grids.
SVC_MAX_ITER = 100000

# Registry of candidate classifiers, keyed by the name used for console output
# and for the pickle file written by the training loop. Each entry holds:
#   "estimator"  - an unfitted scikit-learn classifier
#   "param_grid" - the hyper-parameter grid handed to GridSearchCV
# The key order is the order in which the models are trained.
model_types = {
    "AdaBoost": {
        # NOTE(review): the `algorithm` argument is deprecated in recent
        # scikit-learn releases -- confirm against the version in use.
        "estimator":
            sklearn.ensemble.AdaBoostClassifier(random_state=ESTIMATOR_SEED, algorithm="SAMME"),
        "param_grid": {
            "n_estimators":  [200, 300, 400],
            "learning_rate": [1.0, 0.75, 0.5]
        }
    },
    "DecisionTree": {
        "estimator":
            sklearn.tree.DecisionTreeClassifier(random_state=ESTIMATOR_SEED),
        "param_grid": {
            "max_features": [None, 'log2', 'sqrt'],
            # An int is an absolute sample count, a float a fraction of the samples.
            "min_samples_split": [2, 0.01, 0.02]
        }
    },
    "GradientBoost": {
        "estimator":
            sklearn.ensemble.GradientBoostingClassifier(random_state=ESTIMATOR_SEED),
        "param_grid": {
            "n_estimators":  [50, 100],
            "learning_rate": [0.1, 0.2]
        }
    },
    "KNN": {
        "estimator":
            sklearn.neighbors.KNeighborsClassifier(),
        "param_grid": {
            "n_neighbors":  [5, 20, 80, 160],
            "weights": ["uniform", "distance"],
            "algorithm": ["ball_tree", "kd_tree", "brute"]
        },
    },
    "LinearSVM":  {
        # probability=True so the pickled model can return class probabilities.
        "estimator":
            sklearn.svm.SVC(kernel="linear", random_state=ESTIMATOR_SEED, probability=True),
        "param_grid": {
            "C":  [4.0, 2.0, 1.0, 0.5, 0.25, 0.125, 0.06, 0.03, 0.01, 0.005, 0.001, 0.0001],
            "max_iter": [SVC_MAX_ITER]
        },
    },
    "MLP": {
        "estimator":
            sklearn.neural_network.MLPClassifier(random_state=ESTIMATOR_SEED, early_stopping=True, max_iter=100),
        "param_grid": {
            # One to four hidden layers, each layout with 1000 hidden units in total.
            "hidden_layer_sizes":  [[1000], [500, 500], [333, 334, 333], [250, 250, 250, 250]],
            "alpha": [0.01, 0.05, 0.125, 0.25, 0.5, 1.0]
        },
    },
    "RandomForest":  {
        "estimator":
            sklearn.ensemble.RandomForestClassifier(random_state=ESTIMATOR_SEED),
        "param_grid": {
            "n_estimators":  [50, 100, 200],
            "max_depth": [None, 10, 20]
        },
    },
    "RBFSVM":  {
        # probability=True so the pickled model can return class probabilities.
        "estimator":
            sklearn.svm.SVC(kernel="rbf", random_state=ESTIMATOR_SEED, probability=True),
        "param_grid": {
            "C":  [1024.0, 512.0, 256.0, 128.0, 64.0, 32.0, 16.0, 8.0, 4.0, 2.0, 1.5, 1.0, 0.5, 0.25, 0.125],
            "gamma": ['scale'],
            "max_iter": [SVC_MAX_ITER]
        },
    }
}
# --- Run configuration -------------------------------------------------------
K_FOLD_SPLITS = 4        # number of cross-validation folds
MODEL_FILTER = ''        # train only models whose name contains this; '' or None trains all
SOURCE_FILTER = None     # keep only rows with this `source` value; None keeps every row
SUFFIX = ''              # appended to each model's pickle file name
MODEL_DIR = 'models/brssi/'

# Filter into a new frame instead of overwriting `train_data`, so this cell can
# be re-run with a different SOURCE_FILTER without reloading the dataset.
train_subset = train_data
if SOURCE_FILTER is not None:
    SUFFIX = '_' + SOURCE_FILTER
    train_subset = train_data[train_data.source == SOURCE_FILTER]
    if train_subset.empty:
        raise ValueError(
            "SOURCE_FILTER=%r matches no rows in the training data" % (SOURCE_FILTER,)
        )

# Every column except the bookkeeping ones is used as a feature; `label` is the target.
X = train_subset.drop(columns=['time', 'source', 'label'])
y = train_subset['label']
cv_k_fold = sklearn.model_selection.KFold(n_splits=K_FOLD_SPLITS, shuffle=True, random_state=12345)

# Make sure the output directory exists before any search is started.
os.makedirs(MODEL_DIR, exist_ok=True)

# One [name, best score, best estimator, best params] entry per trained model.
model_summary = []
for name, prop in model_types.items():
    if MODEL_FILTER is not None and MODEL_FILTER not in name:
        continue

    print("====", name, "====")
    gs = sklearn.model_selection.GridSearchCV(
        prop["estimator"],
        cv=cv_k_fold,
        param_grid=prop["param_grid"],
        return_train_score=True,
        verbose=3
    )
    gs.fit(X, y)

    # Persist the best estimator found for this model type.
    with open(os.path.join(MODEL_DIR, name + SUFFIX + ".pkl"), 'wb') as output_file:
        pickle.dump(gs.best_estimator_, output_file)

    model_summary.append([
        name,
        gs.best_score_, gs.best_estimator_, gs.best_params_
    ])

==== AdaBoost ====
Fitting 4 folds for each of 9 candidates, totalling 36 fits
[CV 1/4] END learning_rate=1.0, n_estimators=200;, score=(train=0.711, test=0.665) total time=   1.2s
[CV 2/4] END learning_rate=1.0, n_estimators=200;, score=(train=0.690, test=0.670) total time=   1.2s
[CV 3/4] END learning_rate=1.0, n_estimators=200;, score=(train=0.661, test=0.641) total time=   1.2s
[CV 4/4] END learning_rate=1.0, n_estimators=200;, score=(train=0.666, test=0.652) total time=   1.2s
[CV 1/4] END learning_rate=1.0, n_estimators=300;, score=(train=0.685, test=0.660) total time=   1.9s
[CV 2/4] END learning_rate=1.0, n_estimators=300;, score=(train=0.712, test=0.691) total time=   1.8s
[CV 3/4] END learning_rate=1.0, n_estimators=300;, score=(train=0.698, test=0.669) total time=   1.8s
[CV 4/4] END learning_rate=1.0, n_estimators=300;, score=(train=0.701, test=0.701) total time=   1.9s
[CV 1/4] END learning_rate=1.0, n_estimators=400;, score=(train=0.713, test=0.686) total time=   2.4s
[CV

[CV 2/4] END learning_rate=0.1, n_estimators=100;, score=(train=1.000, test=0.905) total time=  18.9s
[CV 3/4] END learning_rate=0.1, n_estimators=100;, score=(train=1.000, test=0.889) total time=  18.8s
[CV 4/4] END learning_rate=0.1, n_estimators=100;, score=(train=1.000, test=0.898) total time=  18.9s
[CV 1/4] END learning_rate=0.2, n_estimators=50;, score=(train=1.000, test=0.883) total time=   9.5s
[CV 2/4] END learning_rate=0.2, n_estimators=50;, score=(train=1.000, test=0.905) total time=   9.4s
[CV 3/4] END learning_rate=0.2, n_estimators=50;, score=(train=1.000, test=0.892) total time=   9.4s
[CV 4/4] END learning_rate=0.2, n_estimators=50;, score=(train=1.000, test=0.880) total time=   9.4s
[CV 1/4] END learning_rate=0.2, n_estimators=100;, score=(train=1.000, test=0.890) total time=  18.7s
[CV 2/4] END learning_rate=0.2, n_estimators=100;, score=(train=1.000, test=0.916) total time=  18.9s
[CV 3/4] END learning_rate=0.2, n_estimators=100;, score=(train=1.000, test=0.898) tot

[CV 1/4] END algorithm=kd_tree, n_neighbors=160, weights=distance;, score=(train=1.000, test=0.523) total time=   0.1s
[CV 2/4] END algorithm=kd_tree, n_neighbors=160, weights=distance;, score=(train=1.000, test=0.531) total time=   0.1s
[CV 3/4] END algorithm=kd_tree, n_neighbors=160, weights=distance;, score=(train=1.000, test=0.551) total time=   0.1s
[CV 4/4] END algorithm=kd_tree, n_neighbors=160, weights=distance;, score=(train=1.000, test=0.536) total time=   0.1s
[CV 1/4] END algorithm=brute, n_neighbors=5, weights=uniform;, score=(train=0.705, test=0.565) total time=   0.1s
[CV 2/4] END algorithm=brute, n_neighbors=5, weights=uniform;, score=(train=0.704, test=0.558) total time=   0.0s
[CV 3/4] END algorithm=brute, n_neighbors=5, weights=uniform;, score=(train=0.714, test=0.556) total time=   0.0s
[CV 4/4] END algorithm=brute, n_neighbors=5, weights=uniform;, score=(train=0.704, test=0.570) total time=   0.0s
[CV 1/4] END algorithm=brute, n_neighbors=5, weights=distance;, scor



[CV 1/4] END C=4.0, max_iter=100000;, score=(train=0.804, test=0.697) total time=   8.9s




[CV 2/4] END C=4.0, max_iter=100000;, score=(train=0.780, test=0.677) total time=   9.3s




[CV 3/4] END C=4.0, max_iter=100000;, score=(train=0.813, test=0.699) total time=   7.8s




[CV 4/4] END C=4.0, max_iter=100000;, score=(train=0.761, test=0.646) total time=   8.9s




[CV 1/4] END C=2.0, max_iter=100000;, score=(train=0.805, test=0.685) total time=   8.6s




[CV 2/4] END C=2.0, max_iter=100000;, score=(train=0.783, test=0.662) total time=   9.3s




[CV 3/4] END C=2.0, max_iter=100000;, score=(train=0.829, test=0.699) total time=   7.8s




[CV 4/4] END C=2.0, max_iter=100000;, score=(train=0.754, test=0.646) total time=   8.9s




[CV 1/4] END C=1.0, max_iter=100000;, score=(train=0.853, test=0.709) total time=   7.9s




[CV 2/4] END C=1.0, max_iter=100000;, score=(train=0.831, test=0.699) total time=   8.7s




[CV 3/4] END C=1.0, max_iter=100000;, score=(train=0.862, test=0.729) total time=   7.3s




[CV 4/4] END C=1.0, max_iter=100000;, score=(train=0.829, test=0.718) total time=   8.5s




[CV 1/4] END C=0.5, max_iter=100000;, score=(train=0.907, test=0.748) total time=   7.2s




[CV 2/4] END C=0.5, max_iter=100000;, score=(train=0.899, test=0.741) total time=   7.9s




[CV 3/4] END C=0.5, max_iter=100000;, score=(train=0.912, test=0.785) total time=   6.7s




[CV 4/4] END C=0.5, max_iter=100000;, score=(train=0.900, test=0.746) total time=   7.8s




[CV 1/4] END C=0.25, max_iter=100000;, score=(train=0.931, test=0.761) total time=   6.3s




[CV 2/4] END C=0.25, max_iter=100000;, score=(train=0.929, test=0.746) total time=   6.8s




[CV 3/4] END C=0.25, max_iter=100000;, score=(train=0.932, test=0.785) total time=   5.9s




[CV 4/4] END C=0.25, max_iter=100000;, score=(train=0.924, test=0.768) total time=   6.6s




[CV 1/4] END C=0.125, max_iter=100000;, score=(train=0.928, test=0.763) total time=   5.3s




[CV 2/4] END C=0.125, max_iter=100000;, score=(train=0.928, test=0.747) total time=   5.8s




[CV 3/4] END C=0.125, max_iter=100000;, score=(train=0.933, test=0.784) total time=   5.2s




[CV 4/4] END C=0.125, max_iter=100000;, score=(train=0.923, test=0.774) total time=   5.7s




[CV 1/4] END C=0.06, max_iter=100000;, score=(train=0.927, test=0.761) total time=   4.7s




[CV 2/4] END C=0.06, max_iter=100000;, score=(train=0.926, test=0.749) total time=   4.9s




[CV 3/4] END C=0.06, max_iter=100000;, score=(train=0.930, test=0.787) total time=   4.5s




[CV 4/4] END C=0.06, max_iter=100000;, score=(train=0.924, test=0.780) total time=   4.9s




[CV 1/4] END C=0.03, max_iter=100000;, score=(train=0.925, test=0.772) total time=   3.9s




[CV 2/4] END C=0.03, max_iter=100000;, score=(train=0.922, test=0.754) total time=   4.2s




[CV 3/4] END C=0.03, max_iter=100000;, score=(train=0.928, test=0.793) total time=   3.9s




[CV 4/4] END C=0.03, max_iter=100000;, score=(train=0.922, test=0.775) total time=   4.2s




[CV 1/4] END C=0.01, max_iter=100000;, score=(train=0.920, test=0.778) total time=   2.8s




[CV 2/4] END C=0.01, max_iter=100000;, score=(train=0.919, test=0.762) total time=   3.0s




[CV 3/4] END C=0.01, max_iter=100000;, score=(train=0.918, test=0.792) total time=   2.7s




[CV 4/4] END C=0.01, max_iter=100000;, score=(train=0.915, test=0.777) total time=   3.1s




[CV 1/4] END C=0.005, max_iter=100000;, score=(train=0.912, test=0.777) total time=   2.1s




[CV 2/4] END C=0.005, max_iter=100000;, score=(train=0.916, test=0.770) total time=   2.1s




[CV 3/4] END C=0.005, max_iter=100000;, score=(train=0.917, test=0.791) total time=   2.0s




[CV 4/4] END C=0.005, max_iter=100000;, score=(train=0.910, test=0.775) total time=   2.1s
[CV 1/4] END C=0.001, max_iter=100000;, score=(train=0.889, test=0.767) total time=   1.0s
[CV 2/4] END C=0.001, max_iter=100000;, score=(train=0.894, test=0.765) total time=   1.1s
[CV 3/4] END C=0.001, max_iter=100000;, score=(train=0.901, test=0.787) total time=   1.0s
[CV 4/4] END C=0.001, max_iter=100000;, score=(train=0.894, test=0.778) total time=   1.0s
[CV 1/4] END C=0.0001, max_iter=100000;, score=(train=0.796, test=0.724) total time=   0.7s
[CV 2/4] END C=0.0001, max_iter=100000;, score=(train=0.813, test=0.700) total time=   0.7s
[CV 3/4] END C=0.0001, max_iter=100000;, score=(train=0.807, test=0.726) total time=   0.7s
[CV 4/4] END C=0.0001, max_iter=100000;, score=(train=0.804, test=0.711) total time=   0.7s




==== MLP ====
Fitting 4 folds for each of 24 candidates, totalling 96 fits
[CV 1/4] END alpha=0.01, hidden_layer_sizes=[1000];, score=(train=0.762, test=0.664) total time=   3.0s
[CV 2/4] END alpha=0.01, hidden_layer_sizes=[1000];, score=(train=0.796, test=0.646) total time=   2.8s
[CV 3/4] END alpha=0.01, hidden_layer_sizes=[1000];, score=(train=0.785, test=0.647) total time=   2.7s
[CV 4/4] END alpha=0.01, hidden_layer_sizes=[1000];, score=(train=0.841, test=0.680) total time=   3.2s
[CV 1/4] END alpha=0.01, hidden_layer_sizes=[500, 500];, score=(train=0.934, test=0.698) total time=   6.4s
[CV 2/4] END alpha=0.01, hidden_layer_sizes=[500, 500];, score=(train=0.818, test=0.664) total time=   4.7s
[CV 3/4] END alpha=0.01, hidden_layer_sizes=[500, 500];, score=(train=0.852, test=0.667) total time=   4.7s
[CV 4/4] END alpha=0.01, hidden_layer_sizes=[500, 500];, score=(train=0.822, test=0.693) total time=   3.5s
[CV 1/4] END alpha=0.01, hidden_layer_sizes=[333, 334, 333];, score=(train=0.

[CV 3/4] END alpha=0.5, hidden_layer_sizes=[333, 334, 333];, score=(train=0.944, test=0.706) total time=   6.0s
[CV 4/4] END alpha=0.5, hidden_layer_sizes=[333, 334, 333];, score=(train=0.912, test=0.686) total time=   5.0s
[CV 1/4] END alpha=0.5, hidden_layer_sizes=[250, 250, 250, 250];, score=(train=0.856, test=0.657) total time=   4.2s
[CV 2/4] END alpha=0.5, hidden_layer_sizes=[250, 250, 250, 250];, score=(train=0.918, test=0.670) total time=   6.1s
[CV 3/4] END alpha=0.5, hidden_layer_sizes=[250, 250, 250, 250];, score=(train=0.945, test=0.703) total time=   7.1s
[CV 4/4] END alpha=0.5, hidden_layer_sizes=[250, 250, 250, 250];, score=(train=0.858, test=0.666) total time=   4.1s
[CV 1/4] END alpha=1.0, hidden_layer_sizes=[1000];, score=(train=0.793, test=0.668) total time=   2.5s
[CV 2/4] END alpha=1.0, hidden_layer_sizes=[1000];, score=(train=0.788, test=0.656) total time=   2.2s
[CV 3/4] END alpha=1.0, hidden_layer_sizes=[1000];, score=(train=0.774, test=0.673) total time=   2.3s

[CV 1/4] END C=32.0, gamma=scale, max_iter=100000;, score=(train=0.991, test=0.745) total time=   1.3s
[CV 2/4] END C=32.0, gamma=scale, max_iter=100000;, score=(train=0.989, test=0.757) total time=   1.3s
[CV 3/4] END C=32.0, gamma=scale, max_iter=100000;, score=(train=0.989, test=0.739) total time=   1.3s
[CV 4/4] END C=32.0, gamma=scale, max_iter=100000;, score=(train=0.988, test=0.768) total time=   1.3s
[CV 1/4] END C=16.0, gamma=scale, max_iter=100000;, score=(train=0.978, test=0.734) total time=   1.2s
[CV 2/4] END C=16.0, gamma=scale, max_iter=100000;, score=(train=0.974, test=0.758) total time=   1.3s
[CV 3/4] END C=16.0, gamma=scale, max_iter=100000;, score=(train=0.978, test=0.746) total time=   1.3s
[CV 4/4] END C=16.0, gamma=scale, max_iter=100000;, score=(train=0.975, test=0.758) total time=   1.3s
[CV 1/4] END C=8.0, gamma=scale, max_iter=100000;, score=(train=0.947, test=0.714) total time=   1.3s
[CV 2/4] END C=8.0, gamma=scale, max_iter=100000;, score=(train=0.949, tes

In [4]:
# Show the grid-search results collected by the training loop: one
# [name, best_score_, best_estimator_, best_params_] entry per trained model,
# in training order.
model_summary

[['AdaBoost',
  0.6995610041623309,
  AdaBoostClassifier(algorithm='SAMME', n_estimators=400, random_state=54321),
  {'learning_rate': 1.0, 'n_estimators': 400}],
 ['DecisionTree',
  0.7383584807492196,
  DecisionTreeClassifier(random_state=54321),
  {'max_features': None, 'min_samples_split': 2}],
 ['GradientBoost',
  0.898466224419008,
  GradientBoostingClassifier(learning_rate=0.2, random_state=54321),
  {'learning_rate': 0.2, 'n_estimators': 100}],
 ['KNN',
  0.5797964360041623,
  KNeighborsClassifier(algorithm='ball_tree', n_neighbors=20),
  {'algorithm': 'ball_tree', 'n_neighbors': 20, 'weights': 'uniform'}],
 ['LinearSVM',
  0.7781829908081859,
  SVC(C=0.005, kernel='linear', max_iter=100000, probability=True,
      random_state=54321),
  {'C': 0.005, 'max_iter': 100000}],
 ['MLP',
  0.6972150862816511,
  MLPClassifier(alpha=1.0, early_stopping=True,
                hidden_layer_sizes=[250, 250, 250, 250], max_iter=100,
                random_state=54321),
  {'alpha': 1.0, 'hidd