In [85]:
import os
import pickle

import numpy as np
import pandas as pd

In [86]:
# This first part of the procedure works for both DSI1 and DSI2 datasets
# Later in the code, there is the part to import DSI-trajectories

In [88]:
dataset_name = 'DSI1' # DSI1 # DSI2


work_path = './' + dataset_name
# Create the output folder up front, so that the to_csv calls of the following cells
# do not fail when the notebook is run on a fresh checkout
os.makedirs(work_path, exist_ok=True)

# SECURITY NOTE: pickle.load can execute arbitrary code; only load dataset files coming from a trusted source
# The "with" block guarantees that the file is closed even if unpickling fails
with open('./raw_datasets/DSI/' + dataset_name + '/dataset.pickle', 'rb') as file:
    dataframe = pickle.load(file)


# Metadata columns that are constant (or not available) for the whole dataset
dataframe['coord_z'] = 'NULL'
dataframe['building_id'] = '11'
dataframe['site'] = 'NULL'
dataframe['tile'] = 'NULL'
dataframe['user_id'] = 1
dataframe['device_id'] = 1
dataframe['epoch'] = 'NULL'

# The value 100 is turned into 'NULL' in the WAP (signal) columns only: applying the replacement
# to every column would also blank out any coordinate or identifier that happens to be exactly 100
wap_columns = [x for x in dataframe.columns if 'WAP' in x]
dataframe[wap_columns] = dataframe[wap_columns].replace(100, 'NULL')

dataframe = dataframe[wap_columns + ['coord_x', 'coord_y', 'coord_z', 'floor_id', 'building_id', 'site', 'tile', 'user_id', 'device_id', 'epoch', 'set']]
# The former index becomes the first column, which is renamed to fingerprint_id just below
dataframe = dataframe.reset_index()

# Positional renaming: index -> fingerprint_id, floor_id -> floor, building_id -> building
dataframe.columns = ['fingerprint_id'] + wap_columns + ['coord_x', 'coord_y', 'coord_z', 'floor', 'building', 'site', 'tile', 'user_id', 'device_id', 'epoch', 'set']


# Assigning the logical tiles to the training set fingerprints
# Note that we do not group by also building and floor, since the tile names need not be distinct among the different floors. In any case, here I have just a single floor.
# One tile is created for each distinct (coord_x, coord_y) pair found among the non-test fingerprints
group_tiles = dataframe[dataframe['set'] != 'test'][['coord_x', 'coord_y', 'set']].groupby(['coord_x', 'coord_y']).count()
coords_map_tile = {coords: 'tile_' + str(tile_count) for tile_count, coords in enumerate(group_tiles.index)}

# Training fingerprints get the tile of their coordinates, all the other fingerprints get 'NULL'
is_training = dataframe['set'] == 'training'
dataframe['tile'] = [
    coords_map_tile[(coord_x, coord_y)] if training else 'NULL'
    for coord_x, coord_y, training in zip(dataframe['coord_x'], dataframe['coord_y'], is_training)
]


dataframe

Unnamed: 0,fingerprint_id,WAP000,WAP001,WAP002,WAP003,WAP004,WAP005,WAP006,WAP007,WAP008,...,coord_y,coord_z,floor,building,site,tile,user_id,device_id,epoch,set
0,0,-53,,,,,,,,,...,54.633661,,1,11,,tile_178,1,1,,training
1,1,-53,,,,,,,,,...,54.633661,,1,11,,tile_178,1,1,,training
2,2,-53,,,,,,,,,...,54.633661,,1,11,,tile_178,1,1,,training
3,3,-58,,,,,,,,,...,53.462735,,1,11,,tile_192,1,1,,training
4,4,-58,,,,,,,,,...,53.462735,,1,11,,tile_192,1,1,,training
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1712,1712,-78,,,,,,,,,...,71.326591,,1,11,,,1,1,,test
1713,1713,-79,,,,,,,,,...,73.526635,,1,11,,,1,1,,test
1714,1714,-79,,,,,,,,,...,74.395074,,1,11,,,1,1,,test
1715,1715,-64,,,,,,,,,...,73.902958,,1,11,,,1,1,,test


In [73]:
# Generating the file places.csv

# One row for each distinct (building, floor, site) combination found in the dataset
places = dataframe[['building', 'floor', 'site']].drop_duplicates().reset_index(drop=True)

# The floor number is a plain copy of the floor identifier
places = places.assign(floor_number=places['floor'])[['building', 'floor', 'floor_number', 'site']]

# Remaining descriptive attributes: only the floor height has a value, the others are exported as NULL
for attribute, value in [('site_height', 'NULL'),
                         ('site_area', 'NULL'),
                         ('floor_height', 3.7),
                         ('floor_area', 'NULL'),
                         ('building_area', 'NULL')]:
    places[attribute] = value

# Every column is cast to object before writing the csv
places = places.astype(object)

places.to_csv(work_path + '/places.csv', sep=',', na_rep='NULL', index=False)

places

Unnamed: 0,building,floor,floor_number,site,site_height,site_area,floor_height,floor_area,building_area
0,11,1,1,,,,3.7,,


In [4]:
# Generating the file tessellations.csv

# One row for each distinct (building, floor, site, tile) combination
tessellations = dataframe[['building', 'floor', 'site', 'tile']].drop_duplicates()

# Fingerprints without a tile ('NULL') do not define a tessellation entry.
# The rows are filtered BEFORE adding the new columns, and an explicit copy is taken, so that the
# assignments below work on an independent dataframe and do not raise SettingWithCopyWarning
tessellations = tessellations[tessellations['tile'] != 'NULL'].copy()

tessellations['tessellation_type'] = 'logical'

# Logical tiles have no geometry: the coordinates of the four corners (a, b, c, d) are all NULL
for corner in ['a', 'b', 'c', 'd']:
    for axis in ['x', 'y']:
        tessellations['coord_' + corner + '_' + axis] = 'NULL'

# Every column is cast to object before writing the csv
tessellations = tessellations.astype(object)


tessellations.to_csv(work_path + '/tessellations.csv', sep=',', na_rep='NULL', index=False)

tessellations

Unnamed: 0,building,floor,site,tile,tessellation_type,coord_a_x,coord_a_y,coord_b_x,coord_b_y,coord_c_x,coord_c_y,coord_d_x,coord_d_y
0,11,1,,tile_178,logical,,,,,,,,
2,11,1,,tile_192,logical,,,,,,,,
5,11,1,,tile_197,logical,,,,,,,,
8,11,1,,tile_199,logical,,,,,,,,
11,11,1,,tile_224,logical,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
568,11,1,,tile_115,logical,,,,,,,,
569,11,1,,tile_111,logical,,,,,,,,
571,11,1,,tile_114,logical,,,,,,,,
573,11,1,,tile_121,logical,,,,,,,,


In [5]:
# Generating the file fingerprints.csv

# The columns are selected directly in their final order.
# (The previous version also merged with the tessellations dataframe to build a 'tile_id' column,
# which was discarded right away by the following column selection: that dead step has been removed.)
fingerprints = dataframe[['fingerprint_id', 'coord_x', 'coord_y', 'coord_z', 'building', 'floor', 'site', 'tile', 'user_id', 'device_id', 'epoch', 'set']].copy()

# A user_id equal to 0 is exported as NULL
fingerprints['user_id'] = ['NULL' if uid == 0 else uid for uid in fingerprints['user_id']]

# Only the fingerprints of the training set belong to the radio map
fingerprints['is_radio_map'] = fingerprints['set'] == 'training'

# No ordering among the fingerprints and no notes are available here
fingerprints['preceded_by'] = 'NULL'
fingerprints['followed_by'] = 'NULL'
fingerprints['notes'] = 'NULL'

# Every column is cast to object before writing the csv
fingerprints = fingerprints.astype(object)


fingerprints.to_csv(work_path + '/fingerprints.csv', sep=',', na_rep='NULL', index=False)

fingerprints

Unnamed: 0,fingerprint_id,coord_x,coord_y,coord_z,building,floor,site,tile,user_id,device_id,epoch,set,is_radio_map,preceded_by,followed_by,notes
0,0,73.7565,54.6337,,11,1,,tile_178,1,1,,training,True,,,
1,1,73.7565,54.6337,,11,1,,tile_178,1,1,,training,True,,,
2,2,76.1996,53.4627,,11,1,,tile_192,1,1,,training,True,,,
3,3,76.1996,53.4627,,11,1,,tile_192,1,1,,training,True,,,
4,4,76.1996,53.4627,,11,1,,tile_192,1,1,,training,True,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
919,919,67.5547,71.3266,,11,1,,,1,1,,test,False,,,
920,920,70.7535,73.5266,,11,1,,,1,1,,test,False,,,
921,921,72.5338,74.3951,,11,1,,,1,1,,test,False,,,
922,922,79.2209,73.903,,11,1,,,1,1,,test,False,,,


In [6]:
# Generating the file wifi_obs.csv

# fingerprint_id first, followed by all the WAP (signal) columns; every column is cast to object
signal_columns = [name for name in dataframe.columns if 'WAP' in name]
wifi_obs = dataframe[['fingerprint_id'] + signal_columns].astype(object)

# Signal columns are renamed to AP-<WAP name>-NULL, the fingerprint_id column keeps its name
renamed_columns = []
for name in wifi_obs.columns:
    if "WAP" in name:
        renamed_columns.append("AP-" + name + "-NULL")
    else:
        renamed_columns.append(name)
wifi_obs.columns = renamed_columns

wifi_obs.to_csv(work_path + '/wifi_obs.csv', sep=',', na_rep='NULL', index=False)

wifi_obs

Unnamed: 0,fingerprint_id,AP-WAP000-NULL,AP-WAP001-NULL,AP-WAP002-NULL,AP-WAP003-NULL,AP-WAP004-NULL,AP-WAP005-NULL,AP-WAP006-NULL,AP-WAP007-NULL,AP-WAP008-NULL,...,AP-WAP147-NULL,AP-WAP148-NULL,AP-WAP149-NULL,AP-WAP150-NULL,AP-WAP151-NULL,AP-WAP152-NULL,AP-WAP153-NULL,AP-WAP154-NULL,AP-WAP155-NULL,AP-WAP156-NULL
0,0,-53,,,,,,,,,...,,,,,,,,,,
1,1,-53,,,,,,,,,...,,,,,,,,,,
2,2,-58,,,,,,,,,...,,,,,,,,,,
3,3,-58,,,,,,,,,...,,,,,,,,,,
4,4,-58,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
919,919,-78,,,,,,,,,...,-87,,,,-96,,,,,
920,920,-79,,,,,,,,,...,,,,,,,,,,
921,921,-79,,,,,,,,,...,,,,,,,,,,
922,922,-64,,,,,,,,,...,,,,,,,,,,


In [None]:
#                                                                                                   #
#                                                                                                   #
################################## PART TO IMPORT DSI-TRAJECTORIES ################################## 
#                                                                                                   #
#                                                                                                   #

In [79]:
dataset_name = 'DSI_trajectories'

work_path = './' + dataset_name
# Create the output folder up front, so that the to_csv calls of the following cells
# do not fail when the notebook is run on a fresh checkout
os.makedirs(work_path, exist_ok=True)


def _read_dsi_trajectories_part(prefix):
    """Read one part of the DSI-trajectories raw files and assemble it into a single dataframe.

    Parameters
    ----------
    prefix : str
        Prefix of the files to read ('rm' or 'tj'). The files <prefix>_rss.csv,
        <prefix>_crd.csv and <prefix>_tms.csv are read (without header) from the
        raw folder of the current dataset_name.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: fingerprint_id, the WAPxxx signal columns, coord_x, coord_y,
        coord_z, floor, building, site, tile, user_id, device_id, epoch, set.
        The three files are joined side by side on the row position.
    """
    raw_prefix = './raw_datasets/DSI/' + dataset_name + '/' + prefix

    # Signal values: -150 is exported as NULL, the row position becomes the fingerprint_id
    # and column i is named WAP followed by i zero-padded to three digits
    rss = pd.read_csv(raw_prefix + '_rss.csv', header=None)
    rss = rss.replace(-150, 'NULL').reset_index()
    rss.columns = ['fingerprint_id'] + ['WAP' + str(x).zfill(3) for x in rss.columns[1:]]

    # Coordinates, plus the metadata that is constant for the whole dataset
    coords = pd.read_csv(raw_prefix + '_crd.csv', header=None)
    coords.columns = ['coord_x', 'coord_y']
    coords['coord_z'] = 'NULL'
    coords['floor'] = 1
    coords['building'] = 11
    coords['site'] = 'NULL'
    coords['tile'] = 'crowd_tile_1'
    coords['user_id'] = 1
    coords['device_id'] = 1

    # Timestamps; every fingerprint is labelled as 'training'
    ep = pd.read_csv(raw_prefix + '_tms.csv', header=None)
    ep.columns = ['epoch']
    ep['set'] = 'training'

    return pd.concat([rss, coords, ep], axis=1)


# Reading the RM files; they just contain the train portion of DSI1 and such info is not needed, but we are still reading it

dataframe_train_part_dsi1 = _read_dsi_trajectories_part('rm')


# Reading the TJ files

dataframe = _read_dsi_trajectories_part('tj')
dataframe

Unnamed: 0,fingerprint_id,WAP000,WAP001,WAP002,WAP003,WAP004,WAP005,WAP006,WAP007,WAP008,...,coord_y,coord_z,floor,building,site,tile,user_id,device_id,epoch,set
0,0,-64,,,,,,,,,...,72.428346,,1,11,,crowd_tile_1,1,1,1462269732,training
1,1,-64,,,,,,,,,...,73.902958,,1,11,,crowd_tile_1,1,1,1462269747,training
2,2,-79,,,,,,,,,...,74.395074,,1,11,,crowd_tile_1,1,1,1462269759,training
3,3,-79,,,,,,,,,...,73.526635,,1,11,,crowd_tile_1,1,1,1462269764,training
4,4,-78,,,,,,,,,...,71.326591,,1,11,,crowd_tile_1,1,1,1462269776,training
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
343,343,-82,,,,,,,,,...,77.775208,,1,11,,crowd_tile_1,1,1,1462272597,training
344,344,-77,,,,,,,,,...,76.994594,,1,11,,crowd_tile_1,1,1,1462272602,training
345,345,-77,,,,,,,,,...,75.375542,,1,11,,crowd_tile_1,1,1,1462272609,training
346,346,-77,,,,,,,,,...,74.320268,,1,11,,crowd_tile_1,1,1,1462272621,training


In [80]:
# Generating the file places.csv

# Distinct (building, floor, site) combinations of the trajectories dataset, renumbered from 0
place_keys = dataframe[['building', 'floor', 'site']].drop_duplicates()
places = place_keys.reset_index(drop=True)

# floor_number mirrors the floor identifier and is placed right after it
places['floor_number'] = places['floor']
places = places.loc[:, ['building', 'floor', 'floor_number', 'site']]

# Descriptive attributes: all unknown (NULL) except for the floor height
unknown = 'NULL'
places['site_height'] = unknown
places['site_area'] = unknown
places['floor_height'] = 3.7
places['floor_area'] = unknown
places['building_area'] = unknown

# Cast all the columns to object before the export
places = places.astype(object)

places.to_csv(work_path + '/places.csv', sep=',', na_rep='NULL', index=False)

places

Unnamed: 0,building,floor,floor_number,site,site_height,site_area,floor_height,floor_area,building_area
0,11,1,1,,,,3.7,,


In [81]:
# Generating the file tessellations.csv

# One row for each distinct (building, floor, site, tile) combination
tessellations = dataframe[['building', 'floor', 'site', 'tile']].drop_duplicates()

# Rows without a tile ('NULL') are discarded. The filter is applied BEFORE adding the new columns
# and an explicit copy is taken, so that the assignments below always work on an independent
# dataframe (no SettingWithCopyWarning when some rows are actually filtered out)
tessellations = tessellations[tessellations['tile'] != 'NULL'].copy()

tessellations['tessellation_type'] = 'crowd'

# No geometry is available for the tile: the coordinates of the four corners (a, b, c, d) are all NULL
for corner in ['a', 'b', 'c', 'd']:
    for axis in ['x', 'y']:
        tessellations['coord_' + corner + '_' + axis] = 'NULL'

# Every column is cast to object before writing the csv
tessellations = tessellations.astype(object)


tessellations.to_csv(work_path + '/tessellations.csv', sep=',', na_rep='NULL', index=False)

tessellations

Unnamed: 0,building,floor,site,tile,tessellation_type,coord_a_x,coord_a_y,coord_b_x,coord_b_y,coord_c_x,coord_c_y,coord_d_x,coord_d_y
0,11,1,,crowd_tile_1,crowd,,,,,,,,


In [82]:
# Generating the file fingerprints.csv

# The columns are selected directly in their final order.
# (The previous version also merged with the tessellations dataframe to build a 'tile_id' column,
# which was discarded right away by the following column selection: that dead step has been removed.)
fingerprints = dataframe[['fingerprint_id', 'coord_x', 'coord_y', 'coord_z', 'building', 'floor', 'site', 'tile', 'user_id', 'device_id', 'epoch', 'set']].copy()

# A user_id equal to 0 is exported as NULL
fingerprints['user_id'] = ['NULL' if uid == 0 else uid for uid in fingerprints['user_id']]

# Only the fingerprints labelled as 'training' belong to the radio map
fingerprints['is_radio_map'] = fingerprints['set'] == 'training'


# The fingerprints are chained in row order: each one is preceded by the fingerprint of the previous
# row and followed by the one of the next row. The first fingerprint has no predecessor and the last
# one has no successor (NULL). The actual neighbouring ids are used, so the ids are not required to be
# consecutive integers; the final slicing keeps the lists aligned with the dataframe even when it is empty
fingerprint_ids = fingerprints['fingerprint_id'].tolist()
preceded_column = (['NULL'] + fingerprint_ids[:-1])[:len(fingerprint_ids)]
followed_column = (fingerprint_ids[1:] + ['NULL'])[:len(fingerprint_ids)]

fingerprints['preceded_by'] = preceded_column
fingerprints['followed_by'] = followed_column
fingerprints['notes'] = 'NULL'


# Every column is cast to object before writing the csv
fingerprints = fingerprints.astype(object)

fingerprints.to_csv(work_path + '/fingerprints.csv', sep=',', na_rep='NULL', index=False)

fingerprints

Unnamed: 0,fingerprint_id,coord_x,coord_y,coord_z,building,floor,site,tile,user_id,device_id,epoch,set,is_radio_map,preceded_by,followed_by,notes
0,0,80.2994,72.4283,,11,1,,crowd_tile_1,1,1,1462269732,training,True,,1,
1,1,79.2209,73.903,,11,1,,crowd_tile_1,1,1,1462269747,training,True,0,2,
2,2,72.5338,74.3951,,11,1,,crowd_tile_1,1,1,1462269759,training,True,1,3,
3,3,70.7535,73.5266,,11,1,,crowd_tile_1,1,1,1462269764,training,True,2,4,
4,4,67.5547,71.3266,,11,1,,crowd_tile_1,1,1,1462269776,training,True,3,5,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
343,343,74.8637,77.7752,,11,1,,crowd_tile_1,1,1,1462272597,training,True,342,344,
344,344,73.5915,76.9946,,11,1,,crowd_tile_1,1,1,1462272602,training,True,343,345,
345,345,74.2131,75.3755,,11,1,,crowd_tile_1,1,1,1462272609,training,True,344,346,
346,346,72.6953,74.3203,,11,1,,crowd_tile_1,1,1,1462272621,training,True,345,347,


In [83]:
# Generating the file wifi_obs.csv

# Keep the fingerprint identifier followed by the WAP (signal) columns, all cast to object
ap_columns = [column for column in dataframe.columns if 'WAP' in column]
wifi_obs = dataframe[['fingerprint_id'] + ap_columns].astype(object)


def _exported_name(column):
    """Return the csv header of a column: AP-<WAP name>-NULL for signal columns, unchanged otherwise."""
    if "WAP" in column:
        return "AP-" + column + "-NULL"
    return column


wifi_obs.columns = [_exported_name(column) for column in wifi_obs.columns]

wifi_obs.to_csv(work_path + '/wifi_obs.csv', sep=',', na_rep='NULL', index=False)

wifi_obs

Unnamed: 0,fingerprint_id,AP-WAP000-NULL,AP-WAP001-NULL,AP-WAP002-NULL,AP-WAP003-NULL,AP-WAP004-NULL,AP-WAP005-NULL,AP-WAP006-NULL,AP-WAP007-NULL,AP-WAP008-NULL,...,AP-WAP147-NULL,AP-WAP148-NULL,AP-WAP149-NULL,AP-WAP150-NULL,AP-WAP151-NULL,AP-WAP152-NULL,AP-WAP153-NULL,AP-WAP154-NULL,AP-WAP155-NULL,AP-WAP156-NULL
0,0,-64,,,,,,,,,...,,,,,,,,,,
1,1,-64,,,,,,,,,...,,,,,,,,,,
2,2,-79,,,,,,,,,...,,,,,,,,,,
3,3,-79,,,,,,,,,...,,,,,,,,,,
4,4,-78,,,,,,,,,...,-87,,,,-96,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
343,343,-82,,,,,,,,,...,,,,,,,,,,
344,344,-77,,,,,,,,,...,,,,,,,,,,
345,345,-77,,,,,,,,,...,,,,,,,,,,
346,346,-77,,,,,,,,,...,,,,,,,,,,
