In [37]:
from IPython.display import HTML

HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
The raw code for this IPython notebook is by default hidden for easier reading.
To toggle on/off the raw code, click <a href="javascript:code_toggle()">here</a>.''')

# Appendix - More variables introduced

In [38]:
# Data reading and processing
import re
import pandas as pd
import numpy as np
import warnings
from scipy.stats import randint as sp_randint
from datetime import datetime

import os
from os import path

# Visualisation
from matplotlib import pyplot as plt
from pandas_ml import ConfusionMatrix

# Modelling
from sklearn.linear_model import LogisticRegression, LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.multioutput import MultiOutputRegressor

from sklearn.model_selection import KFold, cross_val_score, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, r2_score, mean_squared_error

# Suppress warnings
warnings.filterwarnings("ignore")

In [39]:
# Setting current directory as same directory where code is stored
try:
    CUR_DIR = dirname(realpath(__file__))
except NameError:
    CUR_DIR = os.getcwd()

In [74]:
## Calculating distance        
def pythagoras(long1, long2, lat1, lat2):
    import math
    a = abs(long1-long2)**2
    b = abs(lat1-lat2)**2
    return math.sqrt(a+b)

## Data filtering and processing

In [41]:
# Import data files
train = pd.DataFrame.from_csv(path.join(CUR_DIR, 'trainingData.csv'),index_col=None)
test = pd.DataFrame.from_csv(path.join(CUR_DIR, 'validationData.csv'),index_col=None)

print('Training dataset (length, width) = %s' % str(train.shape))
print('Validation dataset (length, width) = %s' % str(test.shape))

Training dataset (length, width) = (19937, 529)
Validation dataset (length, width) = (1111, 529)


In [42]:
sub_train = train[[x for x in train.columns if 'WAP' not in x]]
sub_train['max_signal'] = train.loc[:,[x for x in train.columns if 'WAP' in x]].replace(100,np.NaN).max(axis=1)
nan_sub = sub_train.loc[np.isnan(sub_train['max_signal']),]
nan_rows = sub_train['max_signal'].isnull().sum()
nan_rows_index = nan_sub.index.tolist()
train = train.iloc[[x for x in train.index if x not in nan_rows_index]]
wap_max = train[[x for x in train.columns if 'WAP' in x]].replace(100,np.nan).max(axis=0).reset_index()
wap_max = wap_max.rename(columns={'index':'wap_name', 0: 'max_signal'}).fillna(10)
cols_torm = wap_max.loc[wap_max['max_signal']==10,'wap_name'].tolist()
train = train.drop(train[cols_torm],axis=1)
test = test.drop(test[cols_torm],axis=1)

# Creates single variable combining both targets
train['target'] = 'B' + train['BUILDINGID'].astype(str) + ' x F' + train['FLOOR'].astype(str)
test['target'] = 'B' + test['BUILDINGID'].astype(str) + ' x F' + test['FLOOR'].astype(str)

## Data Modelling

## Optimised Classification Model

In [43]:
class_rf = RandomForestClassifier(random_state=20, n_jobs=-1, bootstrap=False, criterion='gini', max_depth=44, max_features='auto', min_samples_leaf=1,min_samples_split=3, n_estimators=1925)
class_rf.fit(train[[x for x in train.columns if 'WAP' in x]], train['target'].ravel())
class_preds = class_rf.predict(test[[x for x in train.columns if 'WAP' in x]])

class_acc = accuracy_score(test['target'].ravel(), class_preds)
print("Accuracy: %.2f%%" % (class_acc*100))

actual_class = test['target'].tolist()
bldg_results = []
floor_results = []
for i, pred in enumerate(class_preds):
    pred = pred.split(' x ')
    act = actual_class[i].split(' x ')
    bldg_results.append(pred[0] == act[0])
    floor_results.append(pred[1] == act[1])


pred_bldg = sum(bldg_results)/len(bldg_results)
pred_floor = sum(floor_results)/len(floor_results)
print("\tBuilding prediction accuracy: %.2f%%" % (pred_bldg*100))
print("\tFloor prediction accuracy: %.2f%%" % (pred_floor*100))

Accuracy: 88.66%
	Building prediction accuracy: 100.00%
	Floor prediction accuracy: 88.66%


## Optimised Regression Model

In [44]:
reg_knn = MultiOutputRegressor(KNeighborsRegressor(n_neighbors=3), n_jobs=-1)
reg_knn.fit(train[[x for x in train.columns if 'WAP' in x]], train[['LATITUDE','LONGITUDE']].values)
reg_preds = reg_knn.predict(test[[x for x in train.columns if 'WAP' in x]])
reg_preds = tuple(map(tuple, reg_preds))

rsq = r2_score(test[['LATITUDE','LONGITUDE']].values, reg_preds)
print('R-squared score: %.4f' % rsq)

actual_ll = tuple(map(tuple, test[['LATITUDE','LONGITUDE']].values))

dist_results = []
for i, pred in enumerate(reg_preds):
    dist_results.append(pythagoras(lat1=pred[0],lat2=actual_ll[i][0],long1=pred[1],long2=actual_ll[i][1]))

pred_radius = sum(dist_results) / float(len(dist_results))
print("Accuracy radius: %.2fm" % pred_radius)

R-squared score: 0.9779
	Accuracy radius: 12.94m


## Adding variables to modelling

In [45]:
train['time_weekday'] = train['TIMESTAMP'].apply(lambda x: datetime.fromtimestamp(x).strftime("%w"))
train['time_hour'] = train['TIMESTAMP'].apply(lambda x: datetime.fromtimestamp(x).strftime("%H"))

test['time_weekday'] = test['TIMESTAMP'].apply(lambda x: datetime.fromtimestamp(x).strftime("%w"))
test['time_hour'] = test['TIMESTAMP'].apply(lambda x: datetime.fromtimestamp(x).strftime("%H"))

### 1. Time of day

In [46]:
rf_tuned = RandomForestClassifier(random_state=20, n_jobs=-1, bootstrap=False, criterion='gini', max_depth=44, max_features='auto', min_samples_leaf=1,min_samples_split=3, n_estimators=1925)
rf_tuned.fit(train[[x for x in train.columns if 'WAP' in x or x == 'time_hour']], train['target'].ravel())
class_preds_tuned = rf_tuned.predict(test[[x for x in train.columns if 'WAP' in x or x == 'time_hour']])
print("Classification Model -")
pred_class_acc = accuracy_score(test['target'].ravel(), class_preds_tuned)
print("\tOverall accuracy score: %.2f%% (%.2f%% from original)" % ((pred_class_acc*100),((pred_class_acc-class_acc)/class_acc*100)))

bldg_results_new = []
floor_results_new = []
for i, pred in enumerate(class_preds_tuned):
    pred = pred.split(' x ')
    act = actual_class[i].split(' x ')
    bldg_results_new.append(pred[0] == act[0])
    floor_results_new.append(pred[1] == act[1])

pred_bldg_new = sum(bldg_results_new)/len(bldg_results_new)
pred_floor_new = sum(floor_results_new)/len(floor_results_new)
print("\tBuilding prediction accuracy: %.2f%% (%.2f%% from original)" % ((pred_bldg_new*100), ((pred_bldg_new-pred_bldg)/pred_bldg*100)))
print("\tFloor prediction accuracy: %.2f%% (%.2f%% from original)" % ((pred_floor_new*100), ((pred_floor_new-pred_floor)/pred_floor*100)))

reg_knn_tuned = MultiOutputRegressor(KNeighborsRegressor(n_neighbors=3), n_jobs=-1)
reg_knn_tuned.fit(train[[x for x in train.columns if 'WAP' in x or x == 'time_hour']], train[['LATITUDE','LONGITUDE']].values)
reg_preds_tuned = reg_knn_tuned.predict(test[[x for x in train.columns if 'WAP' in x or x == 'time_hour']])

print("Regression Model -")
new_rsq = r2_score(test[['LATITUDE','LONGITUDE']].values, reg_preds_tuned)
print('\tR-squared score: %.4f (%.2f%% from original)' % (new_rsq,((new_rsq-rsq)/rsq*100)))

dist_results = []
for i, pred in enumerate(reg_preds):
    dist_results.append(pythagoras(lat1=pred[0],lat2=actual_ll[i][0],long1=pred[1],long2=actual_ll[i][1]))

pred_radius_new = sum(dist_results) / float(len(dist_results))
print("\tAccuracy radius: %.2fm (%.2f%% from original)" % (pred_radius_new, ((pred_radius-pred_radius_new)/pred_radius)*100))

Classification Model -
	Overall accuracy score: 81.28% (-8.32% from original)
	Building prediction accuracy: 99.73% (-0.27% from original)
	Floor prediction accuracy: 81.28% (-8.32% from original)
Regression Model -
	R-squared score: 0.9780 (0.01% from original)
	Accuracy radius: 12.94m (0.00% from original)


### 2. Day of Week

In [53]:
rf_tuned = RandomForestClassifier(random_state=20, n_jobs=-1, bootstrap=False, criterion='gini', max_depth=44, max_features='auto', min_samples_leaf=1,min_samples_split=3, n_estimators=1925)
rf_tuned.fit(train[[x for x in train.columns if 'WAP' in x or x == 'time_weekday']], train['target'].ravel())
class_preds_tuned = rf_tuned.predict(test[[x for x in train.columns if 'WAP' in x or x == 'time_weekday']])
print("Classification Model - ")
pred_class_acc = accuracy_score(test['target'].ravel(), class_preds_tuned)
print("\tOverall accuracy score: %.2f%% (%.2f%% from original)" % ((pred_class_acc*100),((pred_class_acc-class_acc)/class_acc*100)))

bldg_results_new = []
floor_results_new = []
for i, pred in enumerate(class_preds_tuned):
    pred = pred.split(' x ')
    act = actual_class[i].split(' x ')
    bldg_results_new.append(pred[0] == act[0])
    floor_results_new.append(pred[1] == act[1])

pred_bldg_new = sum(bldg_results_new)/len(bldg_results_new)
pred_floor_new = sum(floor_results_new)/len(floor_results_new)
print("\tBuilding prediction accuracy: %.2f%% (%.2f%% from original)" % ((pred_bldg_new*100), ((pred_bldg_new-pred_bldg)/pred_bldg*100)))
print("\tFloor prediction accuracy: %.2f%% (%.2f%% from original)" % ((pred_floor_new*100), ((pred_floor_new-pred_floor)/pred_floor*100)))

reg_knn_tuned = MultiOutputRegressor(KNeighborsRegressor(n_neighbors=3), n_jobs=-1)
reg_knn_tuned.fit(train[[x for x in train.columns if 'WAP' in x or x == 'time_weekday']], train[['LATITUDE','LONGITUDE']].values)
reg_preds_tuned = reg_knn_tuned.predict(test[[x for x in train.columns if 'WAP' in x or x == 'time_weekday']])

print("Regression Model -")
new_rsq = r2_score(test[['LATITUDE','LONGITUDE']].values, reg_preds_tuned)
print('\tR-squared score: %.4f (%.2f%% from original)' % (new_rsq,((new_rsq-rsq)/rsq*100)))

dist_results = []
for i, pred in enumerate(reg_preds):
    dist_results.append(pythagoras(lat1=pred[0],lat2=actual_ll[i][0],long1=pred[1],long2=actual_ll[i][1]))

pred_radius_new = sum(dist_results) / float(len(dist_results))
print("\tAccuracy radius: %.2fm (%.2f%% from original)" % (pred_radius_new, ((pred_radius-pred_radius_new)/pred_radius)*100))

Classification Model - 
	Overall accuracy score: 86.50% (-2.44% from original)
	Building prediction accuracy: 99.91% (-0.09% from original)
	Floor prediction accuracy: 86.59% (-2.34% from original)
Regression Model -
	R-squared score: 0.9779 (0.00% from original)
	Accuracy radius: 12.94m (0.00% from original)


### 3. Phone Model

In [72]:
rf_tuned = RandomForestClassifier(random_state=20, n_jobs=-1, bootstrap=False, criterion='gini', max_depth=44, max_features='auto', min_samples_leaf=1,min_samples_split=3, n_estimators=1925)
rf_tuned.fit(train[[x for x in train.columns if 'WAP' in x or x == 'PHONEID']], train['target'].ravel())
class_preds_tuned = rf_tuned.predict(test[[x for x in train.columns if 'WAP' in x or x == 'PHONEID']])
print("Classification Model - ")
pred_class_acc = accuracy_score(test['target'].ravel(), class_preds_tuned)
print("\tOverall accuracy score: %.2f%% (%.2f%% from original)" % ((pred_class_acc*100),((pred_class_acc-class_acc)/class_acc*100)))

bldg_results_new = []
floor_results_new = []
for i, pred in enumerate(class_preds_tuned):
    pred = pred.split(' x ')
    act = actual_class[i].split(' x ')
    bldg_results_new.append(pred[0] == act[0])
    floor_results_new.append(pred[1] == act[1])

pred_bldg_new = sum(bldg_results_new)/len(bldg_results_new)
pred_floor_new = sum(floor_results_new)/len(floor_results_new)
print("\tBuilding prediction accuracy: %.2f%% (%.2f%% from original)" % ((pred_bldg_new*100), ((pred_bldg_new-pred_bldg)/pred_bldg*100)))
print("\tFloor prediction accuracy: %.2f%% (%.2f%% from original)" % ((pred_floor_new*100), ((pred_floor_new-pred_floor)/pred_floor*100)))

reg_knn_tuned = MultiOutputRegressor(KNeighborsRegressor(n_neighbors=3), n_jobs=-1)
reg_knn_tuned.fit(train[[x for x in train.columns if 'WAP' in x or x == 'PHONEID']], train[['LATITUDE','LONGITUDE']].values)
reg_preds_tuned = reg_knn_tuned.predict(test[[x for x in train.columns if 'WAP' in x or x == 'PHONEID']])

print("Regression Model -")
new_rsq = r2_score(test[['LATITUDE','LONGITUDE']].values, reg_preds_tuned)
print('\tR-squared score: %.4f (%.2f%% from original)' % (new_rsq,((new_rsq-rsq)/rsq*100)))

dist_results = []
for i, pred in enumerate(reg_preds):
    dist_results.append(pythagoras(lat1=pred[0],lat2=actual_ll[i][0],long1=pred[1],long2=actual_ll[i][1]))

pred_radius_new = sum(dist_results) / float(len(dist_results))
print("\tAccuracy radius: %.2fm (%.2f%% from original)" % (pred_radius_new, ((pred_radius-pred_radius_new)/pred_radius)*100))

Classification Model - 
	Overall accuracy score: 86.50% (-2.44% from original)
	Building prediction accuracy: 99.91% (-0.09% from original)
	Floor prediction accuracy: 86.50% (-2.44% from original)
Regression Model -
	R-squared score: 0.9779 (-0.00% from original)
	Accuracy radius: 12.94m (0.00% from original)


### 4. Phone Model + Time of Day

In [50]:
rf_tuned = RandomForestClassifier(random_state=20, n_jobs=-1, bootstrap=False, criterion='gini', max_depth=44, max_features='auto', min_samples_leaf=1,min_samples_split=3, n_estimators=1925)
rf_tuned.fit(train[[x for x in train.columns if 'WAP' in x or x == 'PHONEID' or x == 'time_hour']], train['target'].ravel())
class_preds_tuned = rf_tuned.predict(test[[x for x in train.columns if 'WAP' in x or x == 'PHONEID' or x == 'time_hour']])
print("Classification Model - ")
pred_class_acc = accuracy_score(test['target'].ravel(), class_preds_tuned)
print("\tOverall accuracy score: %.2f%% (%.2f%% from original)" % ((pred_class_acc*100),((pred_class_acc-class_acc)/class_acc*100)))

bldg_results_new = []
floor_results_new = []
for i, pred in enumerate(class_preds_tuned):
    pred = pred.split(' x ')
    act = actual_class[i].split(' x ')
    bldg_results_new.append(pred[0] == act[0])
    floor_results_new.append(pred[1] == act[1])

pred_bldg_new = sum(bldg_results_new)/len(bldg_results_new)
pred_floor_new = sum(floor_results_new)/len(floor_results_new)
print("\tBuilding prediction accuracy: %.2f%% (%.2f%% from original)" % ((pred_bldg_new*100), ((pred_bldg_new-pred_bldg)/pred_bldg*100)))
print("\tFloor prediction accuracy: %.2f%% (%.2f%% from original)" % ((pred_floor_new*100), ((pred_floor_new-pred_floor)/pred_floor*100)))

reg_knn_tuned = MultiOutputRegressor(KNeighborsRegressor(n_neighbors=3), n_jobs=-1)
reg_knn_tuned.fit(train[[x for x in train.columns if 'WAP' in x or x == 'PHONEID' or x == 'time_hour']], train[['LATITUDE','LONGITUDE']].values)
reg_preds_tuned = reg_knn_tuned.predict(test[[x for x in train.columns if 'WAP' in x or x == 'PHONEID' or x == 'time_hour']])

print("Regression Model -")
new_rsq = r2_score(test[['LATITUDE','LONGITUDE']].values, reg_preds_tuned)
print('\tR-squared score: %.4f (%.2f%% from original)' % (new_rsq,((new_rsq-rsq)/rsq*100)))

dist_results = []
for i, pred in enumerate(reg_preds):
    dist_results.append(pythagoras(lat1=pred[0],lat2=actual_ll[i][0],long1=pred[1],long2=actual_ll[i][1]))

pred_radius_new = sum(dist_results) / float(len(dist_results))
print("\tAccuracy radius: %.2fm (%.2f%% from original)" % (pred_radius_new, ((pred_radius-pred_radius_new)/pred_radius)*100))

Classification Model - 
	Overall accuracy score: 81.28% (-8.32% from original)
	Building prediction accuracy: 99.64% (-0.36% from original)
	Floor prediction accuracy: 81.28% (-8.32% from original)
Regression Model -
	R-squared score: 0.9780 (0.01% from original)
	Accuracy radius: 12.94m (0.00% from original)


### 5. Phone Model + Day of Week

In [51]:
rf_tuned = RandomForestClassifier(random_state=20, n_jobs=-1, bootstrap=False, criterion='gini', max_depth=44, max_features='auto', min_samples_leaf=1,min_samples_split=3, n_estimators=1925)
rf_tuned.fit(train[[x for x in train.columns if 'WAP' in x or x == 'PHONEID' or x == 'time_weekday']], train['target'].ravel())
class_preds_tuned = rf_tuned.predict(test[[x for x in train.columns if 'WAP' in x or x == 'PHONEID' or x == 'time_weekday']])
print("Classification Model - ")
pred_class_acc = accuracy_score(test['target'].ravel(), class_preds_tuned)
print("\tOverall accuracy score: %.2f%% (%.2f%% from original)" % ((pred_class_acc*100),((pred_class_acc-class_acc)/class_acc*100)))

bldg_results_new = []
floor_results_new = []
for i, pred in enumerate(class_preds_tuned):
    pred = pred.split(' x ')
    act = actual_class[i].split(' x ')
    bldg_results_new.append(pred[0] == act[0])
    floor_results_new.append(pred[1] == act[1])

pred_bldg_new = sum(bldg_results_new)/len(bldg_results_new)
pred_floor_new = sum(floor_results_new)/len(floor_results_new)
print("\tBuilding prediction accuracy: %.2f%% (%.2f%% from original)" % ((pred_bldg_new*100), ((pred_bldg_new-pred_bldg)/pred_bldg*100)))
print("\tFloor prediction accuracy: %.2f%% (%.2f%% from original)" % ((pred_floor_new*100), ((pred_floor_new-pred_floor)/pred_floor*100)))

reg_knn_tuned = MultiOutputRegressor(KNeighborsRegressor(n_neighbors=3), n_jobs=-1)
reg_knn_tuned.fit(train[[x for x in train.columns if 'WAP' in x or x == 'PHONEID' or x == 'time_weekday']], train[['LATITUDE','LONGITUDE']].values)
reg_preds_tuned = reg_knn_tuned.predict(test[[x for x in train.columns if 'WAP' in x or x == 'PHONEID' or x == 'time_weekday']])

print("Regression Model -")
new_rsq = r2_score(test[['LATITUDE','LONGITUDE']].values, reg_preds_tuned)
print('\tR-squared score: %.4f (%.2f%% from original)' % (new_rsq,((new_rsq-rsq)/rsq*100)))

dist_results = []
for i, pred in enumerate(reg_preds):
    dist_results.append(pythagoras(lat1=pred[0],lat2=actual_ll[i][0],long1=pred[1],long2=actual_ll[i][1]))

pred_radius_new = sum(dist_results) / float(len(dist_results))
print("\tAccuracy radius: %.2fm (%.2f%% from original)" % (pred_radius_new, ((pred_radius-pred_radius_new)/pred_radius)*100))

Classification Model - 
	Overall accuracy score: 84.61% (-4.57% from original)
	Building prediction accuracy: 99.37% (-0.63% from original)
	Floor prediction accuracy: 84.70% (-4.47% from original)
Regression Model -
	R-squared score: 0.9779 (-0.00% from original)
	Accuracy radius: 12.94m (0.00% from original)


### 6. Time of Day + Day of Week

In [52]:
rf_tuned = RandomForestClassifier(random_state=20, n_jobs=-1, bootstrap=False, criterion='gini', max_depth=44, max_features='auto', min_samples_leaf=1,min_samples_split=3, n_estimators=1925)
rf_tuned.fit(train[[x for x in train.columns if 'WAP' in x or 'time_' in x]], train['target'].ravel())
class_preds_tuned = rf_tuned.predict(test[[x for x in train.columns if 'WAP' in x or 'time_' in x]])
print("Classification Model - ")
pred_class_acc = accuracy_score(test['target'].ravel(), class_preds_tuned)
print("\tOverall accuracy score: %.2f%% (%.2f%% from original)" % ((pred_class_acc*100),((pred_class_acc-class_acc)/class_acc*100)))

bldg_results_new = []
floor_results_new = []
for i, pred in enumerate(class_preds_tuned):
    pred = pred.split(' x ')
    act = actual_class[i].split(' x ')
    bldg_results_new.append(pred[0] == act[0])
    floor_results_new.append(pred[1] == act[1])

pred_bldg_new = sum(bldg_results_new)/len(bldg_results_new)
pred_floor_new = sum(floor_results_new)/len(floor_results_new)
print("\tBuilding prediction accuracy: %.2f%% (%.2f%% from original)" % ((pred_bldg_new*100), ((pred_bldg_new-pred_bldg)/pred_bldg*100)))
print("\tFloor prediction accuracy: %.2f%% (%.2f%% from original)" % ((pred_floor_new*100), ((pred_floor_new-pred_floor)/pred_floor*100)))

reg_knn_tuned = MultiOutputRegressor(KNeighborsRegressor(n_neighbors=3), n_jobs=-1)
reg_knn_tuned.fit(train[[x for x in train.columns if 'WAP' in x or 'time_' in x]], train[['LATITUDE','LONGITUDE']].values)
reg_preds_tuned = reg_knn_tuned.predict(test[[x for x in train.columns if 'WAP' in x or 'time_' in x]])

print("Regression Model -")
new_rsq = r2_score(test[['LATITUDE','LONGITUDE']].values, reg_preds_tuned)
print('\tR-squared score: %.4f (%.2f%% from original)' % (new_rsq,((new_rsq-rsq)/rsq*100)))

dist_results = []
for i, pred in enumerate(reg_preds):
    dist_results.append(pythagoras(lat1=pred[0],lat2=actual_ll[i][0],long1=pred[1],long2=actual_ll[i][1]))

pred_radius_new = sum(dist_results) / float(len(dist_results))
print("\tAccuracy radius: %.2fm (%.2f%% from original)" % (pred_radius_new, ((pred_radius-pred_radius_new)/pred_radius)*100))

Classification Model - 
	Overall accuracy score: 83.08% (-6.29% from original)
	Building prediction accuracy: 99.55% (-0.45% from original)
	Floor prediction accuracy: 83.26% (-6.09% from original)
Regression Model -
	R-squared score: 0.9780 (0.01% from original)
	Accuracy radius: 12.94m (0.00% from original)


### 7. Combination of all 3

In [49]:
rf_tuned = RandomForestClassifier(random_state=20, n_jobs=-1, bootstrap=False, criterion='gini', max_depth=44, max_features='auto', min_samples_leaf=1,min_samples_split=3, n_estimators=1925)
rf_tuned.fit(train[[x for x in train.columns if 'WAP' in x or 'time_' in x or x == 'PHONEID']], train['target'].ravel())
class_preds_tuned = rf_tuned.predict(test[[x for x in train.columns if 'WAP' in x or 'time_' in x or x == 'PHONEID']])
print("Classification Model - ")
pred_class_acc = accuracy_score(test['target'].ravel(), class_preds_tuned)
print("\tOverall accuracy score: %.2f%% (%.2f%% from original)" % ((pred_class_acc*100),((pred_class_acc-class_acc)/class_acc*100)))

bldg_results_new = []
floor_results_new = []
for i, pred in enumerate(class_preds_tuned):
    pred = pred.split(' x ')
    act = actual_class[i].split(' x ')
    bldg_results_new.append(pred[0] == act[0])
    floor_results_new.append(pred[1] == act[1])

pred_bldg_new = sum(bldg_results_new)/len(bldg_results_new)
pred_floor_new = sum(floor_results_new)/len(floor_results_new)
print("\tBuilding prediction accuracy: %.2f%% (%.2f%% from original)" % ((pred_bldg_new*100), ((pred_bldg_new-pred_bldg)/pred_bldg*100)))
print("\tFloor prediction accuracy: %.2f%% (%.2f%% from original)" % ((pred_floor_new*100), ((pred_floor_new-pred_floor)/pred_floor*100)))

reg_knn_tuned = MultiOutputRegressor(KNeighborsRegressor(n_neighbors=3), n_jobs=-1)
reg_knn_tuned.fit(train[[x for x in train.columns if 'WAP' in x or 'time_' in x or x == 'PHONEID']], train[['LATITUDE','LONGITUDE']].values)
reg_preds_tuned = reg_knn_tuned.predict(test[[x for x in train.columns if 'WAP' in x or 'time_' in x or x == 'PHONEID']])

print("Regression Model -")
new_rsq = r2_score(test[['LATITUDE','LONGITUDE']].values, reg_preds_tuned)
print('\tR-squared score: %.4f (%.2f%% from original)' % (new_rsq,((new_rsq-rsq)/rsq*100)))

dist_results = []
for i, pred in enumerate(reg_preds):
    dist_results.append(pythagoras(lat1=pred[0],lat2=actual_ll[i][0],long1=pred[1],long2=actual_ll[i][1]))

pred_radius_new = sum(dist_results) / float(len(dist_results))
print("\tAccuracy radius: %.2fm (%.2f%% from original)" % (pred_radius_new, ((pred_radius-pred_radius_new)/pred_radius)*100))

Classification Model - 
	Overall accuracy score: 81.37% (-8.22% from original)
	Building prediction accuracy: 99.01% (-0.99% from original)
	Floor prediction accuracy: 81.55% (-8.02% from original)
Regression Model -
	R-squared score: 0.9780 (0.01% from original)
	Accuracy radius: 12.94m (0.00% from original)


### With the additional variables, the regression model largely remained the same while the classification model worsened in accuracy.
* None of the additional models could provide a 100% prediction for Building
* Day of Week and Phone Model (individually) provided the closest estimate to the original optimised model