# Bluetooth RSSI - train & test dataset definition

## Parameters

In [1]:


INPUT_DIRS          = [ ]
USE_PIXEL_DATA = True
USE_XIAOMI_DATA = True

if USE_PIXEL_DATA:
    INPUT_DIRS.append('data/Pixel/Bluetooth')
if USE_XIAOMI_DATA:
    INPUT_DIRS.append('data/Xiaomi/Bluetooth')
    
EXCLUDE_PATTERNS = [
    'atrio_entre',
    'teste_final'
]
EXCLUDE_BEACONS = ['ef3b3dd2c001']
FILE_EXTENSION = '.csv'
BEACON_NOT_VISIBLE_VALUE = -200
MERGE_ATRIO_LABELS       = False
TIME_GRANULARITY = 1_000_000_000 
TEST_SPLIT_FRACTION = 0.2

## Imports

In [2]:
import numpy as np
import os
import os.path
import pandas as pd
import sklearn.model_selection
import sys


## Label conversion

In [3]:
LABEL_CONV = {
  "diversidade_de_formas" : "DF",
  "atrio_caes_principio_etico" : "A6", #"AT_CA",
  "globo" : "GL",
  "teatro_dos_sentidos" : "TS",
  "atrio_a_que_cheira" : "A3", # "AT_CH",
  "comer_e_nao_ser_comido" : "CN",
  "diversidade_genetica_incerteza" : "DG",
  "especiacao" : "ES",
  "atrio_ovos_principio_estetico" : "A8", #"AT_O2",
  "selecao_sexual" : "SS",
  "terra_mar_ar" : "TMA",
  "atrio_ovo_esferico" : "A7", #"AT_O1",
  "atrio_medicamentos_principio_cientifico" : "A2", #"AT_M",
  "selecao_artificial" : "SA",
  "hall_primeiro_andar" : "HA",
  "selecao_natural" : "SN",
  "diversidade_de_cores" : "DC",
  "atrio_sementes_principio_economico" : "A4", #"AT_S",
  "atrio_mamiferos_cadeira" : "A5", # "AT_I2",
  "analogia_homologia" : "AH",
  "atrio_diluicao_como_espetaculo" : "A1", # "AT_I1",
  "atrio" : "A"
}
LABEL_CONV = {
  "diversidade_de_formas" : "DF",
  "atrio_caes_principio_etico" : "AT_CA",
  "globo" : "GL",
  "teatro_dos_sentidos" : "TS",
  "atrio_a_que_cheira" : "AT_CH",
  "comer_e_nao_ser_comido" : "CN",
  "diversidade_genetica_incerteza" : "DG",
  "especiacao" : "ES",
  "atrio_ovos_principio_estetico" : "AT_O2",
  "selecao_sexual" : "SS",
  "terra_mar_ar" : "TMA",
  "atrio_ovo_esferico" : "AT_O1",
  "atrio_medicamentos_principio_cientifico" : "AT_M",
  "selecao_artificial" : "SA",
  "hall_primeiro_andar" : "HA",
  "selecao_natural" : "SN",
  "diversidade_de_cores" : "DC",
  "atrio_sementes_principio_economico" : "AT_S",
  "atrio_mamiferos_cadeira" : "AT_I2",
  "analogia_homologia" : "AH",
  "atrio_diluicao_como_espetaculo" : "AT_I1",
  "atrio" : "A"
}


## Process input files

In [4]:
input_files = [
    os.path.join(d,f) for d in INPUT_DIRS for f in os.listdir(d) \
    if f.endswith(FILE_EXTENSION) and [ p for p in EXCLUDE_PATTERNS if p in f ] == []
]
for f in input_files:
    print(f)

data/Pixel/Bluetooth/1223832739980025_diversidade_de_formas.csv
data/Pixel/Bluetooth/1223693227408262_selecao_artificial.csv
data/Pixel/Bluetooth/87361071907453_globo.csv
data/Pixel/Bluetooth/1223564953187932_analogia_homologia.csv
data/Pixel/Bluetooth/86019977985834_atrio_sementes_principio_economico.csv
data/Pixel/Bluetooth/1218534582745199_atrio_diluicao_como_espetaculo.csv
data/Pixel/Bluetooth/85545998188408_atrio_ovos_principio_estetico.csv
data/Pixel/Bluetooth/1222850011619627_selecao_sexual.csv
data/Pixel/Bluetooth/84821111871196_atrio_medicamentos_principio_cientifico.csv
data/Pixel/Bluetooth/1218268340520551_atrio_a_que_cheira.csv
data/Pixel/Bluetooth/1223978156194824_comer_e_nao_ser_comido.csv
data/Pixel/Bluetooth/1224150232810716_selecao_natural.csv
data/Pixel/Bluetooth/85313083111664_atrio_ovo_esferico.csv
data/Pixel/Bluetooth/1223406240293683_teatro_dos_sentidos.csv
data/Pixel/Bluetooth/86330288461695_atrio_mamiferos_cadeira.csv
data/Pixel/Bluetooth/1223265132794627_divers

In [5]:
COLS = [
    'time', 
    'beacon', 
    'rssi'
]
TYPES = {
    'time': np.int64, 
    'beacon': 'string', 
    'rssi': np.int8 
}
global_df = pd.DataFrame([])
for f in input_files:
    print('processing', f)
    label = '_'.join(os.path.splitext(os.path.basename(f))[0].split("_")[1:])
    if MERGE_ATRIO_LABELS and label.startswith('atrio'):
        label = 'atrio'
    label = LABEL_CONV[label]
    df = pd.read_csv(f, 
                     header=None, 
                     names=COLS,
                     dtype=TYPES)
    # print(df.columns.values)
    df['time'] = df['time'] // TIME_GRANULARITY
    df['time'] = df['time'] - df['time'][0]
    df = df[~df['beacon'].isin(EXCLUDE_BEACONS)]
    df.loc[:,'beacon'] = df.loc[:, 'beacon'].str.lower()
    df = df.sort_values(by=['time','beacon'])
    df = df.groupby(by=['time','beacon']).mean()
    df['label'] = label
    df['source'] = 'Pixel' if 'Pixel' in f else 'Xiaomi'
    print("{}: {} data items".format(label,len(df)))
    if len(global_df) == 0:
        global_df = df
    else:
        global_df = global_df._append(df)
global_df = global_df.reset_index().sort_values(by=['label', 'source','time'])
print("{} data items".format(len(global_df)))

processing data/Pixel/Bluetooth/1223832739980025_diversidade_de_formas.csv
DF: 1304 data items
processing data/Pixel/Bluetooth/1223693227408262_selecao_artificial.csv
SA: 1697 data items
processing data/Pixel/Bluetooth/87361071907453_globo.csv
GL: 1425 data items
processing data/Pixel/Bluetooth/1223564953187932_analogia_homologia.csv
AH: 1284 data items
processing data/Pixel/Bluetooth/86019977985834_atrio_sementes_principio_economico.csv
AT_S: 2511 data items
processing data/Pixel/Bluetooth/1218534582745199_atrio_diluicao_como_espetaculo.csv
AT_I1: 2757 data items
processing data/Pixel/Bluetooth/85545998188408_atrio_ovos_principio_estetico.csv
AT_O2: 2554 data items
processing data/Pixel/Bluetooth/1222850011619627_selecao_sexual.csv
SS: 873 data items
processing data/Pixel/Bluetooth/84821111871196_atrio_medicamentos_principio_cientifico.csv
AT_M: 2554 data items
processing data/Pixel/Bluetooth/1218268340520551_atrio_a_que_cheira.csv
AT_CH: 2467 data items
processing data/Pixel/Bluetoot

## Show resulting data frame

In [6]:
global_df

Unnamed: 0,time,beacon,rssi,label,source
4426,0,e9bdcc7d8fe6,-88.000000,AH,Pixel
4427,0,ef3b3dd2a011,-90.000000,AH,Pixel
4428,0,ef3b3dd2e007,-84.000000,AH,Pixel
4429,0,f80332eda645,-84.000000,AH,Pixel
4430,1,6859b8e6126b,-80.000000,AH,Pixel
...,...,...,...,...,...
70158,122,ef3b3dd2a018,-91.666667,TS,Xiaomi
70159,122,ef3b3dd2a019,-84.500000,TS,Xiaomi
70160,122,ef3b3dd2e003,-96.000000,TS,Xiaomi
70161,122,ef3b3dd2e007,-88.000000,TS,Xiaomi


## Counts per beacon

In [7]:
df = global_df.groupby(by=['beacon']).size()
print(df.count(), 'distinct beacons')
df

27 distinct beacons


beacon
6859b8e6126b    1879
c69294c41e47    2860
e7b2d23d89ec    3835
e9bdcc7d8fe6    3564
ef3b3dd2a002    3980
ef3b3dd2a003    2638
ef3b3dd2a005    3915
ef3b3dd2a006    3779
ef3b3dd2a008    3648
ef3b3dd2a009    3896
ef3b3dd2a011    3821
ef3b3dd2a012    4352
ef3b3dd2a013    3835
ef3b3dd2a014    3879
ef3b3dd2a015    3519
ef3b3dd2a016    3349
ef3b3dd2a018    3245
ef3b3dd2a019    3216
ef3b3dd2a020    3415
ef3b3dd2e001    1581
ef3b3dd2e003    3980
ef3b3dd2e004     701
ef3b3dd2e006    2230
ef3b3dd2e007    2706
ef3b3dd2e008    2619
ef3b3dd2e009    1832
f80332eda645    3329
dtype: int64

## Counts per label

In [8]:
df=global_df.groupby(by=['label']).size()
print(df.count(), 'distinct labels')
df

21 distinct labels


label
AH       3246
AT_CA    5123
AT_CH    4845
AT_I1    5054
AT_I2    5026
AT_M     5096
AT_O1    5052
AT_O2    4828
AT_S     4836
CN       3873
DC       4241
DF       3097
DG       3839
ES       3603
GL       3140
HA       4008
SA       3927
SN       4252
SS       2297
TMA      3682
TS       2538
dtype: int64

## Derive dataset from data

In [9]:
final_df = pd.pivot_table(global_df, 
                    values='rssi', 
                    fill_value=BEACON_NOT_VISIBLE_VALUE,
                    index=['time','label','source'],
                    columns=['beacon'])

final_df.reset_index(inplace=True)
final_df.sort_values(by=['label', 'source','time'],inplace=True)
# final_df=final_df.drop(columns='time')
final_df

beacon,time,label,source,6859b8e6126b,c69294c41e47,e7b2d23d89ec,e9bdcc7d8fe6,ef3b3dd2a002,ef3b3dd2a003,ef3b3dd2a005,...,ef3b3dd2a019,ef3b3dd2a020,ef3b3dd2e001,ef3b3dd2e003,ef3b3dd2e004,ef3b3dd2e006,ef3b3dd2e007,ef3b3dd2e008,ef3b3dd2e009,f80332eda645
0,0,AH,Pixel,-200.0,-200.0,-200.0,-88.000000,-200.000000,-200.0,-200.0,...,-200.00,-200.0,-200.0,-200.00,-200.0,-200.0,-84.000000,-200.0,-200.0,-84.000000
42,1,AH,Pixel,-80.0,-200.0,-200.0,-88.333333,-200.000000,-200.0,-100.0,...,-200.00,-200.0,-200.0,-85.00,-200.0,-200.0,-83.400000,-200.0,-200.0,-80.250000
84,2,AH,Pixel,-78.0,-200.0,-102.0,-86.333333,-200.000000,-200.0,-200.0,...,-200.00,-200.0,-200.0,-84.25,-200.0,-200.0,-85.428571,-200.0,-200.0,-83.250000
126,3,AH,Pixel,-77.0,-200.0,-102.0,-88.000000,-200.000000,-200.0,-97.0,...,-200.00,-200.0,-200.0,-86.00,-200.0,-200.0,-85.000000,-200.0,-200.0,-88.000000
168,4,AH,Pixel,-72.5,-200.0,-101.0,-88.000000,-200.000000,-200.0,-101.0,...,-200.00,-200.0,-200.0,-80.40,-200.0,-200.0,-90.750000,-200.0,-200.0,-78.200000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4507,114,TS,Xiaomi,-67.5,-200.0,-99.5,-78.600000,-200.000000,-200.0,-96.0,...,-88.50,-200.0,-96.0,-84.75,-200.0,-200.0,-87.200000,-200.0,-200.0,-79.200000
4546,115,TS,Xiaomi,-74.5,-103.0,-95.5,-75.666667,-95.666667,-200.0,-97.5,...,-93.00,-200.0,-200.0,-78.50,-200.0,-200.0,-200.000000,-200.0,-200.0,-69.333333
4710,120,TS,Xiaomi,-79.0,-200.0,-200.0,-72.500000,-97.333333,-200.0,-91.0,...,-86.00,-200.0,-97.0,-91.75,-200.0,-200.0,-200.000000,-200.0,-200.0,-86.750000
4730,121,TS,Xiaomi,-91.0,-200.0,-200.0,-78.000000,-91.000000,-200.0,-92.0,...,-83.25,-200.0,-98.0,-90.00,-200.0,-99.0,-200.000000,-200.0,-200.0,-92.600000


## Counts per label

In [10]:
df=final_df.groupby(by=['label']).size()
print(df.count(), 'distinct labels')
df

21 distinct labels


label
AH       228
AT_CA    227
AT_CH    219
AT_I1    231
AT_I2    223
AT_M     234
AT_O1    224
AT_O2    226
AT_S     227
CN       228
DC       229
DF       239
DG       228
ES       234
GL       228
HA       229
SA       236
SN       229
SS       229
TMA      224
TS       230
dtype: int64

## Save it to CSV

In [11]:

def save_dataset(df,id):
    fname = 'datasets/brssi/' + id + '.tsv'
    print('=== Saving - id', id, fname)
    df.to_csv(path_or_buf=fname, sep='\t', index=False,header=True)
    print(df.groupby(by=['label']).size())

train_data, test_data = sklearn.model_selection.train_test_split(final_df, test_size = TEST_SPLIT_FRACTION, random_state=42)

save_dataset(train_data, 'train')
save_dataset(test_data, 'TS')

=== Saving - id train datasets/brssi/train.tsv
label
AH       169
AT_CA    179
AT_CH    172
AT_I1    180
AT_I2    176
AT_M     190
AT_O1    178
AT_O2    184
AT_S     187
CN       196
DC       170
DF       185
DG       192
ES       186
GL       184
HA       182
SA       197
SN       190
SS       184
TMA      178
TS       182
dtype: int64
=== Saving - id TS datasets/brssi/TS.tsv
label
AH       59
AT_CA    48
AT_CH    47
AT_I1    51
AT_I2    47
AT_M     44
AT_O1    46
AT_O2    42
AT_S     40
CN       32
DC       59
DF       54
DG       36
ES       48
GL       44
HA       47
SA       39
SN       39
SS       45
TMA      46
TS       48
dtype: int64
