In [11]:
import os
import pickle

import numpy as np
import pandas as pd

In [12]:
# Input and output locations for the UJIIndoorLoc conversion. Change if needed.

work_path = './converted_datasets/UJI1'
training_csv = './raw_datasets/UJI1/trainingData.csv'
validation_csv = './raw_datasets/UJI1/validationData.csv'

# The later cells write their CSV files into work_path; create it up front so
# that a fresh checkout does not fail at the first to_csv call.
os.makedirs(work_path, exist_ok=True)

# '?' entries in the raw files are parsed as missing values (NaN).
# The 'set' column records which source file each row came from.
dataframe = pd.read_csv(training_csv, na_values='?')
dataframe['set'] = 'training'

tmp = pd.read_csv(validation_csv, na_values='?')
tmp['set'] = 'test'

In [13]:
# Combining training and test data into a single dataframe with a fresh 0..n-1 index

dataframe = pd.concat([dataframe, tmp]).reset_index(drop=True)

# A site is labelled "<SPACEID>_<RELATIVEPOSITION>" and each site gets exactly one
# tile, labelled "<site>_tile". Both labels are built with vectorised string
# concatenation instead of a per-row Python loop.
dataframe['SITE'] = dataframe['SPACEID'].astype(str) + "_" + dataframe['RELATIVEPOSITION'].astype(str)
dataframe['tile'] = dataframe['SITE'] + "_tile"

# Upper-case any 'lat' / 'lon' / 'wap' fragment found in a column name.
# NOTE: this is a substring replacement applied to every column name.
dataframe.columns = [x.replace('lat', 'LATITUDE').replace('lon', 'LONGITUDE').replace('wap', 'WAP') for x in dataframe.columns]

dataframe

Unnamed: 0,WAP1,WAP2,WAP3,WAP4,WAP5,WAP6,WAP7,WAP8,WAP9,WAP10,...,FLOOR,BUILDINGID,SPACEID,RELATIVEPOSITION,USERID,PHONEID,TIMESTAMP,set,SITE,tile
0,100,100,100,100,100,100,100,100,100,100,...,2,1,106,2,2,23,1371713733,training,106_2,106_2_tile
1,100,100,100,100,100,100,100,100,100,100,...,2,1,106,2,2,23,1371713691,training,106_2,106_2_tile
2,100,100,100,100,100,100,100,-97,100,100,...,2,1,103,2,2,23,1371714095,training,103_2,103_2_tile
3,100,100,100,100,100,100,100,100,100,100,...,2,1,102,2,2,23,1371713807,training,102_2,102_2_tile
4,100,100,100,100,100,100,100,100,100,100,...,0,0,122,2,11,13,1369909710,training,122_2,122_2_tile
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21043,100,100,100,100,100,100,100,100,100,100,...,3,2,0,0,0,13,1381156711,test,0_0,0_0_tile
21044,100,100,100,100,100,100,100,100,100,100,...,3,2,0,0,0,13,1381156730,test,0_0,0_0_tile
21045,100,100,100,100,100,100,100,100,100,100,...,0,0,0,0,0,13,1381247781,test,0_0,0_0_tile
21046,100,100,100,100,100,100,100,100,100,100,...,0,0,0,0,0,13,1381247807,test,0_0,0_0_tile


In [14]:
# In UJIIndoorloc, fingerprints coming from the test data are not associated with a site.
# Also, fingerprints only consider 2D coordinates, so coord_z is always 'NULL'.

# A reading of 100 is rewritten as 'NULL' (presumably the dataset's marker for an
# access point that was not detected -- confirm against the dataset description).
# Only the WAP columns are touched, so that a metadata value that happens to equal
# 100 is never rewritten by accident.
wap_columns = [x for x in dataframe.columns if 'WAP' in x]
dataframe[wap_columns] = dataframe[wap_columns].replace(100, 'NULL')

is_test = dataframe['set'] == 'test'
dataframe.loc[is_test, ['SITE', 'tile']] = 'NULL'
dataframe['coord_z'] = 'NULL'

In [15]:
# Reorder the columns and give them their output names:
# fingerprint id first, then every WAP column, then the metadata columns.

# (current name, output name) for each metadata column, in output order
metadata_names = [
    ('LONGITUDE', 'coord_x'),
    ('LATITUDE', 'coord_y'),
    ('coord_z', 'coord_z'),
    ('FLOOR', 'floor'),
    ('BUILDINGID', 'building'),
    ('SITE', 'site'),
    ('tile', 'tile'),
    ('USERID', 'user_id'),
    ('PHONEID', 'device_id'),
    ('TIMESTAMP', 'epoch'),
    ('set', 'set'),
]
wap_names = [name for name in dataframe.columns if 'WAP' in name]

# The row index becomes a regular 'index' column, which serves as the fingerprint id.
dataframe = dataframe.reset_index()
dataframe = dataframe[['index'] + wap_names + [old for old, _ in metadata_names]]
dataframe.columns = ['fingerprint_id'] + wap_names + [new for _, new in metadata_names]

In [16]:
# Display the dataframe to inspect its current columns and values.
dataframe

Unnamed: 0,fingerprint_id,WAP1,WAP2,WAP3,WAP4,WAP5,WAP6,WAP7,WAP8,WAP9,...,coord_y,coord_z,floor,building,site,tile,user_id,device_id,epoch,set
0,0,,,,,,,,,,...,4.864921e+06,,2,1,106_2,106_2_tile,2,23,1371713733,training
1,1,,,,,,,,,,...,4.864934e+06,,2,1,106_2,106_2_tile,2,23,1371713691,training
2,2,,,,,,,,-97,,...,4.864950e+06,,2,1,103_2,103_2_tile,2,23,1371714095,training
3,3,,,,,,,,,,...,4.864934e+06,,2,1,102_2,102_2_tile,2,23,1371713807,training
4,4,,,,,,,,,,...,4.864982e+06,,0,0,122_2,122_2_tile,11,13,1369909710,training
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21043,21043,,,,,,,,,,...,4.864796e+06,,3,2,,,0,13,1381156711,test
21044,21044,,,,,,,,,,...,4.864792e+06,,3,2,,,0,13,1381156730,test
21045,21045,,,,,,,,,,...,4.864903e+06,,0,0,,,0,13,1381247781,test
21046,21046,,,,,,,,,,...,4.864905e+06,,0,0,,,0,13,1381247807,test


In [6]:
# Generating the file places.csv

# One row per distinct (building, floor, site) combination; rows whose site is
# 'NULL' are left out. floor_number is a copy of floor, and the size-related
# columns are all written as 'NULL'.
places = (
    dataframe[['building', 'floor', 'site']]
    .drop_duplicates()
    .reset_index(drop=True)
    .loc[lambda frame: frame['site'] != 'NULL']
    .assign(floor_number=lambda frame: frame['floor'])
    [['building', 'floor', 'floor_number', 'site']]
    .assign(
        site_height='NULL',
        site_area='NULL',
        floor_height='NULL',
        floor_area='NULL',
        building_area='NULL',
    )
    # Disabled: places['description'] = 'NULL'
    .astype(object)
)

places.to_csv(work_path + '/places.csv', sep=',', na_rep='NULL', index=False)

places

Unnamed: 0,building,floor,floor_number,site,site_height,site_area,floor_height,floor_area,building_area
0,1,2,2,106_2,,,,,
1,1,2,2,103_2,,,,,
2,1,2,2,102_2,,,,,
3,0,0,0,122_2,,,,,
4,1,2,2,105_2,,,,,
...,...,...,...,...,...,...,...,...,...
900,1,3,3,6_1,,,,,
901,1,3,3,11_2,,,,,
902,1,3,3,14_2,,,,,
903,1,3,3,3_1,,,,,


In [7]:
# Generating the file tessellations.csv

# Corner coordinates of each tile; every one of them is written as 'NULL'.
corner_columns = [
    'coord_a_x', 'coord_a_y',
    'coord_b_x', 'coord_b_y',
    'coord_c_x', 'coord_c_y',
    'coord_d_x', 'coord_d_y',
]

# One row per distinct (building, floor, site, tile) combination, typed as
# 'logical'; rows whose tile is 'NULL' are left out.
tessellations = (
    dataframe[['building', 'floor', 'site', 'tile']]
    .drop_duplicates()
    .assign(tessellation_type='logical', **{name: 'NULL' for name in corner_columns})
    .loc[lambda frame: frame['tile'] != 'NULL']
    .astype(object)
)

tessellations.to_csv(work_path + '/tessellations.csv', sep=',', na_rep='NULL', index=False)

tessellations

Unnamed: 0,building,floor,site,tile,tessellation_type,coord_a_x,coord_a_y,coord_b_x,coord_b_y,coord_c_x,coord_c_y,coord_d_x,coord_d_y
0,1,2,106_2,106_2_tile,logical,,,,,,,,
2,1,2,103_2,103_2_tile,logical,,,,,,,,
3,1,2,102_2,102_2_tile,logical,,,,,,,,
4,0,0,122_2,122_2_tile,logical,,,,,,,,
5,1,2,105_2,105_2_tile,logical,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
19315,1,3,6_1,6_1_tile,logical,,,,,,,,
19316,1,3,11_2,11_2_tile,logical,,,,,,,,
19318,1,3,14_2,14_2_tile,logical,,,,,,,,
19319,1,3,3_1,3_1_tile,logical,,,,,,,,


In [8]:
# Generating the file fingerprints.csv

# Take the per-fingerprint metadata directly in its output column order.
# (No join against the tessellations table is needed: the tile label is already
# a column of the dataframe.)
fingerprints = dataframe[['fingerprint_id', 'coord_x', 'coord_y', 'coord_z', 'building', 'floor', 'site', 'tile', 'user_id', 'device_id', 'epoch', 'set']].copy()

# A user_id of 0 is written as 'NULL'.
fingerprints['user_id'] = fingerprints['user_id'].astype(object).where(fingerprints['user_id'] != 0, 'NULL')

# Training fingerprints are flagged as radio-map entries; test fingerprints are not.
fingerprints['is_radio_map'] = fingerprints['set'] == 'training'
fingerprints['preceded_by'] = 'NULL'
fingerprints['followed_by'] = 'NULL'
fingerprints['notes'] = 'NULL'

# Store every column as object before writing.
fingerprints = fingerprints.astype(object)

fingerprints.to_csv(work_path + '/fingerprints.csv', sep=',', na_rep='NULL', index=False)

fingerprints

Unnamed: 0,fingerprint_id,coord_x,coord_y,coord_z,building,floor,site,tile,user_id,device_id,epoch,set,is_radio_map,preceded_by,followed_by,notes
0,0,-7541.26,4.86492e+06,,1,2,106_2,106_2_tile,2,23,1371713733,training,True,,,
1,1,-7536.62,4.86493e+06,,1,2,106_2,106_2_tile,2,23,1371713691,training,True,,,
2,2,-7519.15,4.86495e+06,,1,2,103_2,103_2_tile,2,23,1371714095,training,True,,,
3,3,-7524.57,4.86493e+06,,1,2,102_2,102_2_tile,2,23,1371713807,training,True,,,
4,4,-7632.14,4.86498e+06,,0,0,122_2,122_2_tile,11,13,1369909710,training,True,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21043,21043,-7317.34,4.8648e+06,,2,3,,,,13,1381156711,test,False,,,
21044,21044,-7313.73,4.86479e+06,,2,3,,,,13,1381156730,test,False,,,
21045,21045,-7637.54,4.8649e+06,,0,0,,,,13,1381247781,test,False,,,
21046,21046,-7636.65,4.8649e+06,,0,0,,,,13,1381247807,test,False,,,


In [9]:
# Generating the file wifi_obs.csv

wap_names = [name for name in dataframe.columns if 'WAP' in name]

# fingerprint_id first, followed by one column per WAP, all stored as object.
wifi_obs = dataframe[['fingerprint_id'] + wap_names].astype(object)

# Each WAP column is renamed to "AP-<name>-NULL"; fingerprint_id keeps its name.
wifi_obs = wifi_obs.rename(columns=lambda name: ("AP-" + name + "-NULL") if "WAP" in name else name)

wifi_obs.to_csv(work_path + '/wifi_obs.csv', sep=',', na_rep='NULL', index=False)

wifi_obs

Unnamed: 0,fingerprint_id,AP-WAP1-NULL,AP-WAP2-NULL,AP-WAP3-NULL,AP-WAP4-NULL,AP-WAP5-NULL,AP-WAP6-NULL,AP-WAP7-NULL,AP-WAP8-NULL,AP-WAP9-NULL,...,AP-WAP511-NULL,AP-WAP512-NULL,AP-WAP513-NULL,AP-WAP514-NULL,AP-WAP515-NULL,AP-WAP516-NULL,AP-WAP517-NULL,AP-WAP518-NULL,AP-WAP519-NULL,AP-WAP520-NULL
0,0,,,,,,,,,,...,,,,,,,,,,
1,1,,,,,,,,,,...,,,,,,,,,,
2,2,,,,,,,,-97,,...,,,,,,,,,,
3,3,,,,,,,,,,...,,,,,,,,,,
4,4,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21043,21043,,,,,,,,,,...,,,,,,,,,,
21044,21044,,,,,,,,,,...,,,,,,,,,,
21045,21045,,,,,,,,,,...,,,,,,,,,,
21046,21046,,,,,,,,,,...,,,,,,,,,,


In [None]:
# Generating the file adjacences.csv

# Every column is read as object (strings), so the labels below can be built by
# plain string concatenation.
adjacences = pd.read_csv('./raw_datasets/UJI1/adjacencesData.csv', sep=',', dtype=object)

# Same labelling as for the fingerprints: a site is "<space_id>_<relative_position>"
# and its tile is "<site>_tile", for both ends of each adjacency.
for side in ('tile_1', 'tile_2'):
    adjacences[side + '_site'] = adjacences[side + '_space_id'] + '_' + adjacences[side + '_relative_position']
    adjacences[side + '_tile'] = adjacences[side + '_site'] + '_tile'

# Keep the output columns in their final order. Any cost read from the raw file
# is discarded and written as 'NULL'.
adjacences = adjacences[['tile_1_building', 'tile_1_floor', 'tile_1_site', 'tile_1_tile', 'tile_2_building', 'tile_2_floor', 'tile_2_site', 'tile_2_tile', 'walkable', 'cost']].assign(cost='NULL')
display(adjacences)

adjacences = adjacences.astype(object)

# na_rep='NULL' matches the other exported files: missing values are written as
# 'NULL' instead of as empty fields.
adjacences.to_csv(work_path + '/adjacences.csv', sep=',', na_rep='NULL', index=False)