# Deep Learning for Predicting Indoor Location Using WiFi Fingerprinting
Ha Vu Tran

In [1]:
# necessary Libraries
import numpy as np
import pandas as pd
import time
import pprint

#Visualizations
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.plotting import scatter_matrix
# magic word for producing visualizations in notebook
get_ipython().run_line_magic('matplotlib', 'inline')

#Preprocessing
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA 
from scipy.sparse import lil_matrix

#Scoring Metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error


import tensorflow as tf
from keras.models import Sequential
from keras.layers import *

examples.directory is deprecated; in the future, examples will be found relative to the 'datapath' directory.
  "found relative to the 'datapath' directory.".format(key))
Using TensorFlow backend.


# Preprocess Data

In [2]:
df = pd.read_csv("challenge1.csv")

#df.head(10)

In [3]:
#df.describe()

In [4]:

#Drop unneeded data
df.drop(['Unnamed: 0', 'USERID', 'PHONEID', 'TIMESTAMP'], axis = 1, inplace=True)

#Remove "NaN" value
col = df.columns[0:520]
for i in col:
    df[i].fillna(0, inplace=True)
df.dropna(subset=['LONGITUDE','LATITUDE', 'FLOOR', 'BUILDINGID' ], inplace=True)
#trainingData.isnull().sum()



#Process "WAP" data
df.iloc[:, 0:520] = np.where(df.iloc[:, 0:520] <= 0, 
                        df.iloc[:, 0:520] + 105, 
                        df.iloc[:, 0:520] - 100)

#Process Longtitude
df.iloc[:, 520] = np.where(df.iloc[:, 520] <= 0, 
                        -df.iloc[:, 520], 
                        df.iloc[:, 520])


df.describe()

Unnamed: 0,WAP001,WAP002,WAP003,WAP004,WAP005,WAP006,WAP007,WAP008,WAP009,WAP010,...,WAP514,WAP515,WAP516,WAP517,WAP518,WAP519,WAP520,LONGITUDE,LATITUDE,BUILDINGID
count,19159.0,19159.0,19159.0,19159.0,19159.0,19159.0,19159.0,19159.0,19159.0,19159.0,...,19159.0,19159.0,19159.0,19159.0,19159.0,19159.0,19159.0,19159.0,19159.0,19159.0
mean,0.95172,0.965082,1.145415,1.150895,0.979644,1.396263,1.750822,1.821859,1.907198,1.182891,...,1.13889,1.103868,5.919202,7.535832,1.1534,1.048176,1.161856,7464.202052,4864871.0,1.213581
std,9.907839,9.94587,10.907006,10.93278,9.975353,10.995386,11.424759,11.274403,11.739711,10.859246,...,10.38636,10.499751,16.221807,16.427428,10.88876,10.432185,10.984137,123.311468,66.96052,0.832702
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7300.81899,4864746.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7359.1485,4864821.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7423.0609,4864852.0,1.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7594.2641,4864930.0,2.0
max,105.0,105.0,105.0,105.0,105.0,105.0,105.0,105.0,105.0,105.0,...,105.0,105.0,105.0,105.0,105.0,105.0,105.0,7691.3384,4865017.0,2.0


In [5]:

min_LGT = 7300.818990
min_LAT = 4.864746e+06

df.iloc[:,520] = (df.iloc[:, 520] - min_LGT + 10)/1000
df.iloc[:,521] = (df.iloc[:, 521] - min_LAT + 10)/1000


In [6]:
def preprocess_data(df):
    """
    Separates trainingData into Features and Targets
    Will also be applied to validationData
    
    INPUT: Cleaned trainingData DataFrame
    OUTPUT: trainingData as Features and Targets
    """
    
    global X1
    global y1
    global X2
    global y2
    global X3
    global y3
    
    # split the data set into features and targets(Floor and BuildingID)
    X1 = df.drop(['LONGITUDE', 'LATITUDE', 'BUILDINGID','FLOOR'], axis=1)
    y1 = df[[ 'BUILDINGID']]
    
    X2 = df.drop(['LONGITUDE', 'LATITUDE','FLOOR'], axis=1)
    y2 = df[['FLOOR']]
    
    
    X3 = df.drop(['LONGITUDE', 'LATITUDE','FLOOR'], axis=1)
    y3 = df[[ 'LONGITUDE']]
    
    #create Dummies for the targets to feed into the model
    y1 = pd.get_dummies(data=y1, columns=[ 'BUILDINGID'])
    y2 = pd.get_dummies(data=y2, columns=[ 'FLOOR']) 
    X3 = pd.get_dummies(data=X3, columns=['BUILDINGID']) 
    
    return X1, y1, X2, y2, X3, y3

In [7]:
def split_data(X, y):
# TO AVOID OVERFITTING: Split the training data into training and testing sets 
    global X_train
    global X_test
    global y_train
    global y_test
    
    X_train, X_test, y_train, y_test = train_test_split(X, 
                                                        y, 
                                                        test_size = 0.3, 
                                                        random_state = 42,
                                                        shuffle=True)

    # Show the results of the split
    print("Training set has {} samples.".format(X_train.shape[0]))
    print("Testing set has {} samples.".format(X_test.shape[0]))
    return X_train, X_test, y_train, y_test

In [8]:
X1, y1, X2, y2, X3, y3 = preprocess_data(df)
X_train1, X_test1, y_train1, y_test1 = split_data(X1,y1)
X_train2, X_test2, y_train2, y_test2 = split_data(X2,y2)
X_train3, X_test3, y_train3, y_test3 = split_data(X3,y3)

Training set has 13411 samples.
Testing set has 5748 samples.
Training set has 13411 samples.
Testing set has 5748 samples.
Training set has 13411 samples.
Testing set has 5748 samples.


In [9]:
#Scale Data with Standard Scaler

scaler1 = StandardScaler()
scaler1.fit(X_train1)
    
# Apply transform to both the training set and the test set.
X_train1 = scaler1.transform(X_train1)
X_test1 = scaler1.transform(X_test1)

scaler2 = StandardScaler()
scaler2.fit(X_train2)
    
# Apply transform to both the training set and the test set.
X_train2 = scaler2.transform(X_train2)
X_test2 = scaler2.transform(X_test2)

scaler3 = StandardScaler()
scaler3.fit(X_train3)
    
# Apply transform to both the training set and the test set.
X_train3 = scaler3.transform(X_train3)
X_test3 = scaler3.transform(X_test3)


In [10]:
y_train1 = lil_matrix(y_train1).toarray()
y_test1 = lil_matrix(y_test1).toarray()
y_train2 = lil_matrix(y_train2).toarray()
y_test2 = lil_matrix(y_test2).toarray()
y_train3 = lil_matrix(y_train3).toarray()
y_test3 = lil_matrix(y_test3).toarray()

# Model Training 

In [11]:
start_time = time.time()

# Define the model
model_1 = Sequential()
model_1.add(Dense(50, input_dim=520, activation='relu'))
model_1.add(Dense(80, activation='relu'))
model_1.add(Dense(50, activation='relu'))
model_1.add(Dense(3, activation='softmax'))
model_1.compile(loss='binary_crossentropy', optimizer='adam')

# Train the model
model_1.fit(
    X_train1,
    y_train1,
    epochs=70,
    shuffle=True,
    verbose=2
)

predictions1 = np.round(model_1.predict(X_test1))

# accuracy
print("Accuracy of predicting buildings = ",accuracy_score(y_test1,predictions1))

print("--- Run time: %s mins ---" % np.round(((time.time() - start_time)/60),2))

Epoch 1/70
 - 2s - loss: 0.1184
Epoch 2/70
 - 1s - loss: 0.0233
Epoch 3/70
 - 1s - loss: 0.0077
Epoch 4/70
 - 1s - loss: 0.0020
Epoch 5/70
 - 1s - loss: 5.1251e-04
Epoch 6/70
 - 1s - loss: 1.8397e-04
Epoch 7/70
 - 1s - loss: 7.8302e-05
Epoch 8/70
 - 1s - loss: 4.1393e-05
Epoch 9/70
 - 1s - loss: 2.7416e-05
Epoch 10/70
 - 1s - loss: 1.9206e-05
Epoch 11/70
 - 1s - loss: 1.3813e-05
Epoch 12/70
 - 1s - loss: 1.0253e-05
Epoch 13/70
 - 1s - loss: 7.5874e-06
Epoch 14/70
 - 1s - loss: 5.7467e-06
Epoch 15/70
 - 1s - loss: 4.3924e-06
Epoch 16/70
 - 1s - loss: 3.3827e-06
Epoch 17/70
 - 1s - loss: 2.6166e-06
Epoch 18/70
 - 1s - loss: 2.0319e-06
Epoch 19/70
 - 1s - loss: 1.5768e-06
Epoch 20/70
 - 1s - loss: 1.2323e-06
Epoch 21/70
 - 1s - loss: 9.7495e-07
Epoch 22/70
 - 1s - loss: 7.7370e-07
Epoch 23/70
 - 1s - loss: 6.2623e-07
Epoch 24/70
 - 1s - loss: 5.0832e-07
Epoch 25/70
 - 1s - loss: 4.1790e-07
Epoch 26/70
 - 1s - loss: 3.4780e-07
Epoch 27/70
 - 1s - loss: 2.9307e-07
Epoch 28/70
 - 1s - loss: 

In [12]:
start_time = time.time()

# Define the model
model_2 = Sequential()
model_2.add(Dense(50, input_dim=521, activation='relu'))
model_2.add(Dense(80, activation='relu'))
model_2.add(Dense(50, activation='relu'))
model_2.add(Dense(5, activation='softmax'))
model_2.compile(loss='binary_crossentropy', optimizer='adam')

# Train the model
model_2.fit(
    X_train2,
    y_train2,
    epochs=100,
    shuffle=True,
    verbose=2
)

predictions2 = np.round(model_2.predict(X_test2))

# accuracy
print("Accuracy of predicting floors = ",accuracy_score(y_test2,predictions2))

print("--- Run time: %s mins ---" % np.round(((time.time() - start_time)/60),2))

Epoch 1/100
 - 2s - loss: 0.2926
Epoch 2/100
 - 1s - loss: 0.1516
Epoch 3/100
 - 1s - loss: 0.0954
Epoch 4/100
 - 1s - loss: 0.0600
Epoch 5/100
 - 1s - loss: 0.0347
Epoch 6/100
 - 1s - loss: 0.0214
Epoch 7/100
 - 1s - loss: 0.0125
Epoch 8/100
 - 1s - loss: 0.0150
Epoch 9/100
 - 2s - loss: 0.0157
Epoch 10/100
 - 2s - loss: 0.0163
Epoch 11/100
 - 1s - loss: 0.0090
Epoch 12/100
 - 2s - loss: 0.0043
Epoch 13/100
 - 1s - loss: 0.0029
Epoch 14/100
 - 2s - loss: 0.0022
Epoch 15/100
 - 1s - loss: 0.0043
Epoch 16/100
 - 1s - loss: 0.0169
Epoch 17/100
 - 1s - loss: 0.0168
Epoch 18/100
 - 1s - loss: 0.0084
Epoch 19/100
 - 1s - loss: 0.0028
Epoch 20/100
 - 1s - loss: 9.5103e-04
Epoch 21/100
 - 1s - loss: 4.5170e-04
Epoch 22/100
 - 1s - loss: 0.0013
Epoch 23/100
 - 1s - loss: 0.0056
Epoch 24/100
 - 1s - loss: 0.0160
Epoch 25/100
 - 1s - loss: 0.0080
Epoch 26/100
 - 1s - loss: 0.0043
Epoch 27/100
 - 1s - loss: 0.0026
Epoch 28/100
 - 1s - loss: 0.0018
Epoch 29/100
 - 1s - loss: 0.0023
Epoch 30/100
 -

In [13]:
predictions = np.hstack((predictions1, predictions2)) 
y_test = np.hstack((y_test1, y_test2))  
# accuracy
print("Total Accuracy = ",accuracy_score(y_test,predictions))

print("--- Run time: %s mins ---" % np.round(((time.time() - start_time)/60),2))

Total Accuracy =  0.8284620737647878
--- Run time: 2.22 mins ---


In [17]:
start_time = time.time()

# Define the model
model_3 = Sequential()
model_3.add(Dense(50, input_dim=523, activation='relu'))
model_3.add(Dense(80, activation='relu'))
model_3.add(Dense(50, activation='relu'))
model_3.add(Dense(1, activation='linear'))
model_3.compile(loss='mean_squared_error', optimizer='adam')

# Train the model
model_3.fit(
    X_train3,
    y_train3,
    epochs=100,
    shuffle=True,
    verbose=2
)

predictions3 = (model_3.predict(X_test3))

# accuracy
print("RMSE of predicting LONGTITUDE = ", mean_squared_error(y_test3[:,0]*1000,predictions3[:,0]*1000)**(0.5) )
#print("Accuracy of predicting LATITUDE = ", mean_squared_error(y_test3[:,1],predictions3[:,1]))


print("--- Run time: %s mins ---" % np.round(((time.time() - start_time)/60),2))

Epoch 1/100
 - 3s - loss: 0.0231
Epoch 2/100
 - 1s - loss: 0.0032
Epoch 3/100
 - 1s - loss: 0.0019
Epoch 4/100
 - 1s - loss: 0.0014
Epoch 5/100
 - 1s - loss: 0.0011
Epoch 6/100
 - 1s - loss: 8.1361e-04
Epoch 7/100
 - 1s - loss: 6.9446e-04
Epoch 8/100
 - 1s - loss: 5.6776e-04
Epoch 9/100
 - 1s - loss: 5.4293e-04
Epoch 10/100
 - 1s - loss: 5.0378e-04
Epoch 11/100
 - 1s - loss: 4.9295e-04
Epoch 12/100
 - 1s - loss: 4.5074e-04
Epoch 13/100
 - 1s - loss: 3.9059e-04
Epoch 14/100
 - 1s - loss: 3.5785e-04
Epoch 15/100
 - 1s - loss: 3.5069e-04
Epoch 16/100
 - 1s - loss: 3.5783e-04
Epoch 17/100
 - 1s - loss: 3.2465e-04
Epoch 18/100
 - 1s - loss: 2.9517e-04
Epoch 19/100
 - 1s - loss: 2.6478e-04
Epoch 20/100
 - 1s - loss: 2.6751e-04
Epoch 21/100
 - 1s - loss: 2.3837e-04
Epoch 22/100
 - 1s - loss: 2.2372e-04
Epoch 23/100
 - 1s - loss: 2.0705e-04
Epoch 24/100
 - 2s - loss: 2.0446e-04
Epoch 25/100
 - 1s - loss: 1.9677e-04
Epoch 26/100
 - 1s - loss: 1.7678e-04
Epoch 27/100
 - 1s - loss: 1.6693e-04
Epo