# Indoor localization using deep learning with DNN and KNN

## 1. Importing the packages

In [1]:
import pandas as pd
import numpy as np
import os
import xml.etree.ElementTree as ET
import tensorflow as tf
import torch
from sklearn.preprocessing import scale, LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, mean_squared_error, classification_report
from sklearn.neighbors import KNeighborsRegressor
import geopy.distance
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
%matplotlib inline

  from .autonotebook import tqdm as notebook_tqdm


## 2. Importing the data

### 2.1. UJIIndoorLoc dataset
* Dataset: UJIIndoorLoc (https://archive.ics.uci.edu/ml/datasets/ujiindoorloc)
* 529 attributes and 19937 rows

In [4]:
# UJIIndoorloc dataset
dataset_ujloc = pd.read_csv("../../data_UJIndoorLoc/trainingData.csv")
validation_dataset_ujloc = pd.read_csv("../../data_UJIndoorLoc/validationData.csv")
dataset_ujloc.head(5)

Unnamed: 0,WAP001,WAP002,WAP003,WAP004,WAP005,WAP006,WAP007,WAP008,WAP009,WAP010,...,WAP520,LONGITUDE,LATITUDE,FLOOR,BUILDINGID,SPACEID,RELATIVEPOSITION,USERID,PHONEID,TIMESTAMP
0,100,100,100,100,100,100,100,100,100,100,...,100,-7541.2643,4864921.0,2,1,106,2,2,23,1371713733
1,100,100,100,100,100,100,100,100,100,100,...,100,-7536.6212,4864934.0,2,1,106,2,2,23,1371713691
2,100,100,100,100,100,100,100,-97,100,100,...,100,-7519.1524,4864950.0,2,1,103,2,2,23,1371714095
3,100,100,100,100,100,100,100,100,100,100,...,100,-7524.5704,4864934.0,2,1,102,2,2,23,1371713807
4,100,100,100,100,100,100,100,100,100,100,...,100,-7632.1436,4864982.0,0,0,122,2,11,13,1369909710


In [5]:
# Optional sanity checks on the per-building row counts:
# print(dataset_ujloc.shape)
# print(dataset_ujloc[dataset_ujloc['BUILDINGID'] == 0].shape)
# print(dataset_ujloc[dataset_ujloc['BUILDINGID'] == 1].shape)
# print(dataset_ujloc[dataset_ujloc['BUILDINGID'] == 2].shape)

# Keep only the samples recorded in building 2.
# `.copy()` makes the filtered frames independent of the originals, so the
# in-place column drops done later in the notebook do not operate on a slice
# (which would raise pandas' SettingWithCopyWarning).
dataset_ujloc = dataset_ujloc[dataset_ujloc['BUILDINGID'] == 2].copy()
validation_dataset_ujloc = validation_dataset_ujloc[validation_dataset_ujloc['BUILDINGID'] == 2].copy()

### 2.2. PrecisLoc dataset

In [None]:
# PrecisLoc dataset
dataset_dir = "../../data_PrecisLoc/FINAL_PRECISLOC_DATASET"
dataset_scen_1 = os.path.join(dataset_dir, "Scenario_1", "11-05-07")
dataset_scen_1

In [None]:
# Locate the sensor-readings file ("Sensor*") and the ground-truth file
# ("ground*") inside the scenario folder.
# Both names start as None so a missing file is reported explicitly instead
# of surfacing as a NameError (or silently reusing a value from an earlier run).
sensor_readings = None
ground_truth = None

# os.listdir returns entries in arbitrary order; sorting makes the choice
# deterministic when several files share a prefix (the last one wins).
for filename in sorted(os.listdir(dataset_scen_1)):
    if filename.startswith("Sensor"):
        sensor_readings = filename
    elif filename.startswith("ground"):
        ground_truth = filename

if sensor_readings is None or ground_truth is None:
    raise FileNotFoundError(
        f"Expected a 'Sensor*' and a 'ground*' file in {dataset_scen_1!r}; "
        f"found sensor_readings={sensor_readings!r}, ground_truth={ground_truth!r}"
    )

print(sensor_readings)
print(ground_truth)

In [None]:
# Full paths (inside the scenario folder) of the two files found above
sensor_read_scen_1, ground_truth_scen_1 = (
    os.path.join(dataset_scen_1, name)
    for name in (sensor_readings, ground_truth)
)
sensor_read_scen_1

In [None]:
# Getting the unique MAC addresses of APs
tree = ET.parse(sensor_read_scen_1)

ap_list = [] 
for r in tree.iter(tag='r'):
    ap_list.append(r.attrib['b'])

ap_set = set(ap_list)
ap_set

In [None]:
# Reading the Sensor Readings for the scenario 1, 11-05-07
tree = ET.parse(sensor_read_scen_1)

time_data_ap_dict = {}

for wr in tree.iter(tag = 'wr'):
    time_data_ap_dict[wr.attrib['st']] = wr.iter(tag='r')

# for key in time_data_dict:
#     print(key, ': ', time_data_dict[key])
#     for r in time_data_dict[key]:
#         print(r.attrib)
time_data_ap_dict

In [None]:
# Reading the Ground Truth for the scenario 1, 11-05-07
tree1 = ET.parse(ground_truth_scen_1)

time_location_dict = {}

for position in tree1.iter(tag = 'position'):
#     print(position.attrib['time'].split(':')[:-1])
    time_location_dict[position.attrib['time']] = position.attrib
time_location_dict

In [None]:
# Column-oriented container intended for a future DataFrame; every column
# starts as an empty list.
dict_for_df = {column: [] for column in ('lat', 'long', 'timestamp')}

# Placeholder: the loop walks the ground-truth entries but does not fill
# dict_for_df yet.
for timestamp, location in time_location_dict.items():
    continue

## 3. Data visualisation

In [None]:
# Making the bar chart for buildings
# Distribution of samples per building (no KDE overlay)
sns.displot(data=dataset_ujloc[['BUILDINGID']], kde=False)

In [None]:
# Making the bar chart for floors
# Distribution of samples per floor (no KDE overlay)
sns.displot(data=dataset_ujloc[['FLOOR']], kde=False)

In [None]:
# The scatter plot of the available data points | 14,700 m2
# One marker/colour pair per building id present in the frame.
markers = ('s', 'X', 'o')
colors = ('red', 'yellow', 'lightgreen')
building_ids = np.unique(dataset_ujloc['BUILDINGID'])
cmap = ListedColormap(colors[:len(building_ids)])

fig = plt.figure(figsize=(10, 10))
for idx, cl in enumerate(building_ids):
    # Rows belonging to the current building
    in_building = dataset_ujloc[dataset_ujloc.BUILDINGID == cl]
    plt.scatter(
        x=in_building['LATITUDE'],
        y=in_building['LONGITUDE'],
        alpha=0.6,
        c=[cmap(idx)],
        edgecolor='black',
        marker=markers[idx],
        label=cl,
    )

plt.xlabel('Latitude')
plt.ylabel('Longitude')
plt.legend(loc='upper right')
plt.tight_layout()

In [None]:
# Sample count per floor, split by building
# (buildings 0/1 have 4 floors, building 2 has 5)
sns.countplot(data=dataset_ujloc, x="FLOOR", hue="BUILDINGID", orient="v")

In [None]:
# 3D scatter of every sample: longitude on x, latitude on y, floor on z.
fig = plt.figure(figsize=(15, 15))
ax = fig.add_subplot(111, projection='3d')
xs = dataset_ujloc['LONGITUDE']
ys = dataset_ujloc['LATITUDE']
zs = dataset_ujloc['FLOOR']
ax.scatter(xs, ys, zs, alpha=0.5, s=10, marker='o')

ax.set_title("Location points with respect to the floor")

# The labels must follow the plotted data: x carries LONGITUDE and y carries
# LATITUDE (they were previously swapped).
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
ax.set_zlabel('Floor')

plt.show()

## 4. Data preprocessing

In [None]:
# Removing the unused columns
# Columns that are not used as features or targets in this notebook.
unused_columns = ["SPACEID", "RELATIVEPOSITION", "USERID", "PHONEID", "TIMESTAMP"]

# Rebind to the returned frames instead of dropping in place: both frames
# come from a boolean filter, and in-place mutation of such a result raises
# pandas' SettingWithCopyWarning.
dataset_ujloc = dataset_ujloc.drop(columns=unused_columns)
validation_dataset_ujloc = validation_dataset_ujloc.drop(columns=unused_columns)

In [None]:
# Display the training frame
dataset_ujloc

### 4.1. Dropping BUILDINGID for cases where we train on a single building

In [None]:
# Removing the building (for scenario with only a building selected)
# dataset_ujloc.drop(["BUILDINGID"], axis=1, inplace=True)
# validation_dataset_ujloc.drop(["BUILDINGID"], axis=1, inplace=True)

In [None]:
# dataset_ujloc.head(5)

### 4.2. Encoding a POINT column (FLOOR+BUILDINGID) for cases of multi-building classification

In [None]:
# Creating the output (target) column for the DNN (for the FLOOR+BUILDING case; otherwise the floor alone is enough when only one building is selected)
# dataset_ujloc['POINT'] = dataset_ujloc['FLOOR'].astype(str) + dataset_ujloc['BUILDINGID'].astype(str)
# validation_dataset_ujloc['POINT'] = validation_dataset_ujloc['FLOOR'].astype(str) + \
#                                     validation_dataset_ujloc['BUILDINGID'].astype(str)

In [None]:
# dataset_ujloc.head(5)

In [None]:
# Encoding the FLOOR+BUILDING (=POINT)
# floor_build_encoder = LabelEncoder()
# floor_build_encoder.fit(dataset_ujloc["POINT"])

In [None]:
# floor_build_encoder = floor_build_encoder.transform(dataset_ujloc['POINT'])

In [None]:
# df_point_encoded = pd.DataFrame(floor_build_encoder, columns=['POINT_ENCODED'])

In [None]:
# dataset_ujloc = pd.concat([dataset_ujloc, df_point_encoded], axis=1)

In [None]:
# dataset_ujloc

## ----------------------- Up until here the dataset and validation are the same -------------------

### 4.3. Splitting the data into features and targets for the first DNN (Step 1)

In [None]:
# Features: the first 520 columns of the frame; target: the FLOOR column
X = dataset_ujloc.iloc[:, :520].to_numpy()
y = dataset_ujloc['FLOOR'].to_numpy()

In [None]:
# Display the feature matrix
X

In [None]:
# Display the target vector (floor values)
y

### 4.4. Splitting the dataset into Training and Test set

In [None]:
# Hold out 20% of the samples for testing; fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Display the training features
X_train

### 4.5. Feature scaling

In [None]:
# Fit the scaler on the training features only, then apply the same
# transformation to both the training and the test features.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# Display the training features
X_train

In [None]:
# Display the training targets
y_train

## 5. Building the first DNN model

### 5.1. Initializing the NN

In [None]:
 # Empty sequential model; layers are appended in the following cells
 nn = tf.keras.Sequential()

### 5.2. Adding the input layer and the first hidden layer

In [None]:
# First fully-connected hidden layer (256 units, ReLU).
# Passing input_dim lets Keras create the input layer automatically;
# the input width equals the number of features (520 APs).
input_size = 520
nn.add(tf.keras.layers.Dense(256, activation='relu', input_dim=input_size))

### 5.3. Adding the second layer

In [None]:
# Second fully-connected hidden layer (128 units, ReLU)
nn.add(tf.keras.layers.Dense(128, activation='relu'))

### 5.4. Adding the output layer

In [None]:
# Output layer: one unit per class, softmax because there are > 2 categories
classes = 5
nn.add(tf.keras.layers.Dense(classes, activation='softmax'))

## 6. Training the DNN

### 6.1. Compiling the DNN

In [None]:
# Adam optimizer (a stochastic-gradient-descent variant), sparse categorical
# cross-entropy loss for integer class targets, accuracy as the metric.
nn.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

### 6.2. Training the DNN on the Training set

In [None]:
# Train for 20 epochs with mini-batches of 32 samples
nn.fit(x=X_train, y=y_train, epochs=20, batch_size=32)

## 7. Making predictions and evaluating the model

### 7.1. Predicting the Test set results

In [None]:
# Model outputs for the held-out test features
y_pred = nn.predict(x=X_test)

In [None]:
# Display the raw model outputs for the test set
y_pred

In [None]:
# Shape of the prediction array
y_pred.shape

#### 7.1.1. Transforming the encoded data back to floor number

In [None]:
# Collapse each row of per-floor scores (e.g. a (1899, 5) array) to a single
# floor number: the index of the highest score in the row. On ties the first
# maximal index is taken.
floor_pred = np.argmax(y_pred, axis=1)

In [None]:
# Display the predicted floor numbers for the test set
floor_pred

In [None]:
# Display the true floor numbers of the test set
y_test

### 7.2. Making the confusion matrix and calculating Accuracy 

In [None]:
# Confusion matrix and overall accuracy on the test split
cm = confusion_matrix(y_true=y_test, y_pred=floor_pred)
print(cm)
accuracy = accuracy_score(y_true=y_test, y_pred=floor_pred)
print('Accuracy: ', accuracy)

In [None]:
# Per-class precision / recall / F1 on the test split
print(classification_report(y_true=y_test, y_pred=floor_pred))

### 7.3. Predicting with the Validation set

In [None]:
# Validation features (first 520 columns) and floor targets
X_valid = validation_dataset_ujloc.iloc[:, :520].to_numpy()
y_valid = validation_dataset_ujloc['FLOOR'].to_numpy()

In [None]:
# Scale the validation features with the statistics learned from the
# training set. Re-fitting the scaler here (fit_transform) would standardise
# the validation data with its own mean/variance, so the network would see
# inputs on a different scale than it was trained on, and `sc` itself would
# be overwritten for any later use.
X_valid = sc.transform(X_valid)

In [None]:
# Model outputs for the validation features
y_pred_valid = nn.predict(x=X_valid)
y_pred_valid

In [None]:
# Shape of the validation prediction array
y_pred_valid.shape

In [None]:
# Index of the highest score per row = predicted floor number
floor_pred_valid = np.argmax(y_pred_valid, axis=1)

In [None]:
# Display the predicted floor numbers for the validation set
floor_pred_valid

In [None]:
# Confusion matrix and overall accuracy on the validation set
cm = confusion_matrix(y_true=y_valid, y_pred=floor_pred_valid)
print(cm)
accuracy = accuracy_score(y_true=y_valid, y_pred=floor_pred_valid)
print('Accuracy: ', accuracy)

In [None]:
# Per-class precision / recall / F1 on the validation set
print(classification_report(y_true=y_valid, y_pred=floor_pred_valid))

## 8. Building the KNN regressor

### 8.1. Creating the second training dataset

#### 8.1.1. Selecting the cluster to which our values correspond (the Floor)

In [None]:
# Floor whose samples feed the second (KNN) step; hard-coded for now.
floor = 3
# floor = floor_pred_valid[0] # TODO: verify how can we change this so it gets the floor by default
# Boolean row selector for the chosen floor
mask = dataset_ujloc['FLOOR'].eq(floor)

In [None]:
# Rows of the selected floor: the first 520 columns and the FLOOR column
df_ap = dataset_ujloc[mask].iloc[:, :520]
df_floor = dataset_ujloc.loc[mask, 'FLOOR']

In [None]:
# Step-2 features: the 520 leading columns plus FLOOR, side by side;
# step-2 targets: the (LONGITUDE, LATITUDE) pair of the same rows.
step_2_features = pd.concat([df_ap, df_floor], axis=1)
X_step_2 = step_2_features.to_numpy()
y_step_2 = dataset_ujloc.loc[mask, ['LONGITUDE', 'LATITUDE']].to_numpy()

In [None]:
# Display the step-2 feature matrix
X_step_2

In [None]:
# Display the step-2 targets (longitude, latitude)
y_step_2

### 8.3. Splitting the data in Train and Test

In [None]:
# Hold out 20% of the selected floor's samples; fixed seed for reproducibility
(X_train_step_2,
 X_test_step_2,
 y_train_step_2,
 y_test_step_2) = train_test_split(X_step_2, y_step_2, test_size=0.2, random_state=0)

In [None]:
# Display the step-2 training features
X_train_step_2

In [None]:
# Number of step-2 training samples
len(X_train_step_2)

### 8.4. Building the model

In [None]:
# 3-nearest-neighbour regressor; neighbours are weighted by inverse distance
knn_regressor = KNeighborsRegressor(weights='distance', n_neighbors=3)
knn_regressor.fit(X=X_train_step_2, y=y_train_step_2)

### 8.5. Making predictions and evaluating the model

In [None]:
# Predicted (longitude, latitude) pairs for the step-2 test features
y_pred_step_2 = knn_regressor.predict(X=X_test_step_2)
y_pred_step_2

In [None]:
# Shape of the step-2 predictions
y_pred_step_2.shape

In [None]:
# Shape of the step-2 ground truth
y_test_step_2.shape

In [None]:
# Mean squared error averaged over both outputs
print(mean_squared_error(y_true=y_test_step_2, y_pred=y_pred_step_2))

In [None]:
# Mean squared error reported separately for each output column
print(mean_squared_error(y_true=y_test_step_2, y_pred=y_pred_step_2, multioutput='raw_values'))

In [None]:
# Wrap predictions and ground truth in labelled frames
# (column order follows the targets: longitude first, then latitude).
y_pred_step_2 = pd.DataFrame(y_pred_step_2, columns=['long1', 'lat1'])
y_test_step_2 = pd.DataFrame(y_test_step_2, columns=['long2', 'lat2'])

# Calculating the Euclidean distance between each true and predicted point
lat_gap = y_test_step_2['lat2'] - y_pred_step_2['lat1']
long_gap = y_test_step_2['long2'] - y_pred_step_2['long1']
distance_df = pd.DataFrame(lat_gap**2 + long_gap**2) ** (1/2)

# Mean positioning error = total distance / number of test points
sum_dist = distance_df.sum()
mean_error = sum_dist / len(distance_df)
mean_error

In [None]:
# from math import sin, cos, sqrt, atan2, radians

# # Approximate radius of earth in km
# R = 6373.0

# def calc_dist(lat1, lon1, lat2, lon2):
#     lat1 = radians(lat1)
#     lon1 = radians(lon1)
#     lat2 = radians(lat2)
#     lon2 = radians(lon2)

#     dlon = lon2 - lon1
#     dlat = lat2 - lat1

#     a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
#     c = 2 * atan2(sqrt(a), sqrt(1 - a))

#     distance = R * c
#     return distance

# print("Result: ", distance)
# print("Should be: ", 278.546, "km")

In [None]:
# Predicted and true coordinates side by side, one row per test point
df_final = pd.concat(objs=[y_pred_step_2, y_test_step_2], axis='columns')
df_final

In [None]:
# earth_radius=6371

# df_final['long1'] = np.radians(df_final['long1'])
# df_final['long2'] = np.radians(df_final['long2'])
# df_final['lat1'] = np.radians(df_final['lat1'])
# df_final['lat2'] = np.radians(df_final['lat2'])


# df_final['a'] = np.sin((df_final['lat2']-df_final['lat1'])/2.0)**2 + \
#     np.cos(df_final['lat1']) * np.cos(df_final['lat2']) * np.sin((df_final['long2']-df_final['long1'])/2.0)**2

# df_final['DISTANCE'] = earth_radius * 2 * np.arcsin(np.sqrt(df_final['a']))
# dist_km = df_final['DISTANCE'].sum().mean()
# dist_m = dist_km
# dist_m

In [None]:
# Predicted points (red) over true points (semi-transparent green);
# latitude on x, longitude on y.
plt.figure(figsize=(15, 15))
plt.scatter(x=df_final['lat1'], y=df_final['long1'], color='red')
plt.scatter(x=df_final['lat2'], y=df_final['long2'], color='green', alpha=0.4)