# Transform Sensor Dataset to Data X-Ray Input
Data Source: http://db.csail.mit.edu/labdata/labdata.html

In [22]:
import pandas as pd
import numpy as np

balance_factor = 1  # sampling fraction for correct tuples
error_target = 'temperature'  # Name of the column for which we search failure causes
min_thresh, max_thresh = 10, 50  # Error thresholds

In [27]:
raw = pd.read_csv('data/intel-lab-data.txt', sep=' ')
raw.head()

Unnamed: 0,date,time,epoch,moteid,temperature,humidity,light,voltage
0,2004-03-31,03:38:15.757551,2,1.0,122.153,-3.91901,11.04,2.03397
1,2004-02-28,00:59:16.02785,3,1.0,19.9884,37.0933,45.08,2.69964
2,2004-02-28,01:03:16.33393,11,1.0,19.3024,38.4629,45.08,2.68742
3,2004-02-28,01:06:16.013453,17,1.0,19.1652,38.8039,45.08,2.68742
4,2004-02-28,01:06:46.778088,18,1.0,19.175,38.8379,45.08,2.69964


In [28]:
print(raw.shape)
raw = raw[pd.notna(raw).all(axis=1)]
raw['is_correct'] = (raw[error_target] >= min_thresh) & (raw[error_target] >= max_thresh)
print(raw.shape, raw['is_correct'].sum())

(2313682, 8)
(2219803, 9) 391135


In [29]:
raw['voltage'] = raw['voltage'].apply(lambda v: round(v, 1))
raw['voltage_buckets'] = pd.cut(raw['voltage'], [-np.inf, 2, 2.3, 2.6, 3, np.inf])
raw['voltage'] = raw['voltage_buckets'].astype(str) + '--///--' + raw['voltage'].astype(str)
raw.drop('voltage_buckets', axis=1, inplace=True)

raw['temperature'] = pd.cut(raw['temperature'], [-np.inf, 0, 16, 20, 25, 30, np.inf])
raw['humidity'] = pd.cut(raw['humidity'], [-np.inf, 0, 15, 30, 35, 40, 45, 60, 80, 100, np.inf])
raw['light'] = pd.cut(raw['light'], [-np.inf, 1, 20, 50, 100, 500, 1000, np.inf])

In [30]:
raw['time'] = raw['time'].apply(lambda t: t[:2])
raw['time'] = raw['time'].apply(lambda x: 'MORNING--///--' + x if 7 <= int(x) < 12 \
                                        else 'DAY--///--' + x if 12 <= int(x) < 17 \
                                        else 'EVENING--///--' + x if 17 <= int(x) < 21 \
                                        else 'NIGHT--///--' + x)

In [31]:
if balance_factor != 1:
    correct_rows = raw[raw['is_correct']]
    error_rows = raw[~raw['is_correct']]
    raw = pd.concat([error_rows, correct_rows.sample(frac=balance_factor, random_state=0)])
print(raw.shape, raw['is_correct'].sum())

(2219803, 9) 391135


In [32]:
raw.head()

Unnamed: 0,date,time,epoch,moteid,temperature,humidity,light,voltage,is_correct
0,2004-03-31,NIGHT--///--03,2,1.0,"(30.0, inf]","(-inf, 0.0]","(1.0, 20.0]","(-inf, 2.0]--///--2.0",True
1,2004-02-28,NIGHT--///--00,3,1.0,"(16.0, 20.0]","(35.0, 40.0]","(20.0, 50.0]","(2.6, 3.0]--///--2.7",False
2,2004-02-28,NIGHT--///--01,11,1.0,"(16.0, 20.0]","(35.0, 40.0]","(20.0, 50.0]","(2.6, 3.0]--///--2.7",False
3,2004-02-28,NIGHT--///--01,17,1.0,"(16.0, 20.0]","(35.0, 40.0]","(20.0, 50.0]","(2.6, 3.0]--///--2.7",False
4,2004-02-28,NIGHT--///--01,18,1.0,"(16.0, 20.0]","(35.0, 40.0]","(20.0, 50.0]","(2.6, 3.0]--///--2.7",False


In [33]:
raw.drop(['date', 'epoch', error_target], axis=1, inplace=True)

In [34]:
raw.head()

Unnamed: 0,time,moteid,humidity,light,voltage,is_correct
0,NIGHT--///--03,1.0,"(-inf, 0.0]","(1.0, 20.0]","(-inf, 2.0]--///--2.0",True
1,NIGHT--///--00,1.0,"(35.0, 40.0]","(20.0, 50.0]","(2.6, 3.0]--///--2.7",False
2,NIGHT--///--01,1.0,"(35.0, 40.0]","(20.0, 50.0]","(2.6, 3.0]--///--2.7",False
3,NIGHT--///--01,1.0,"(35.0, 40.0]","(20.0, 50.0]","(2.6, 3.0]--///--2.7",False
4,NIGHT--///--01,1.0,"(35.0, 40.0]","(20.0, 50.0]","(2.6, 3.0]--///--2.7",False


In [35]:
feature_vector =   'a:a:a:a:a:'
structure_vector = '0:0:0:0:0:'
max_dims =         '2:1:1:1:2:'

assert(len(feature_vector.split(':')) == len(raw.columns))

In [36]:
error_rate = 1 - (raw['is_correct'].sum() / raw.shape[0])
cost = 100.0
error_rate

0.8237974270689786

In [37]:
top_row = feature_vector + '\t' + max_dims + ';' + str(error_rate) + ';' + str(cost) + ';false;' + feature_vector + ';' + structure_vector + ';' + str(raw.shape[0]) + ';0;'
top_row

'a:a:a:a:a:\t2:1:1:1:2:;0.8237974270689786;100.0;false;a:a:a:a:a:;0:0:0:0:0:;2219803;0;'

In [38]:
truth_vals = raw.loc[:, 'is_correct']
entries = raw.loc[:, raw.columns != 'is_correct'].astype(str)

In [39]:
raw['input-str'] = np.sum('a--///--'+entries+'--///----//--', axis=1)
raw['input-str'] = raw['input-str'].str.replace(':', '/COLON/').str.replace(';','/SEMICOLON/').str.replace('=', '/EQ/').str.replace('%', '/PERCENT/').str.replace('_','/UNDERSCORE/').str.replace('--//--',':').str.replace('--///--','_')

In [40]:
with open('./data/sensor-{}-{}-{}-balance{}-input.txt'.format(error_target, min_thresh, max_thresh, balance_factor), 'w') as f:
    f.write(top_row)
    list(f.write('{}%{}%{}='.format(i, truth_vals.iloc[i], raw['input-str'].iloc[i])) for i in range(len(raw)))