In [75]:
from kafka import KafkaProducer
import time
import numpy as np
import pandas as pd

In [76]:
# Reading the compressed sensor log takes about one minute
data = pd.read_csv("../../data/data.conv.txt.gz", sep=" ", header=None)
data.columns = ["Date", "Hour", "Sensor", "Value", "Voltage"]
data = data.sort_values(by=['Date', 'Hour']).reset_index(drop=True)

# Seconds elapsed since the earliest reading (row 0 after the sort above)
timestamps = pd.to_datetime(data['Date'] + ' ' + data['Hour'])
data['seconds'] = (timestamps - timestamps.iloc[0]).dt.total_seconds()

# "Sensor" is split on "-" into an integer sensor id and an integer type code
sensorId_type = data['Sensor'].str.split("-", expand=True)
sensorId_type.columns = ['SensorId', 'Type']
for column in ('SensorId', 'Type'):
    data[column] = sensorId_type[column].astype(int)

# Keep only the features needed for the simulation
data = data.drop(['Sensor', 'Date', 'Hour', 'Voltage'], axis=1)

In [77]:
# Preview the first five prepared readings
data.head()

Unnamed: 0,Value,seconds,SensorId,Type
0,17.6364,0.0,41,0
1,16.6956,0.007,44,0
2,45.7037,0.092,41,1
3,2.3,0.237,44,2
4,47.9942,0.285,44,1


In [78]:
# Keep only the readings whose Type code is 0, renumbered from zero
temp = data.loc[data['Type'] == 0].reset_index(drop=True)

In [79]:
# discard readings whose SensorId is greater than 54
temp = temp.loc[temp['SensorId'] <= 54]
temp.head()

Unnamed: 0,Value,seconds,SensorId,Type
0,17.6364,0.0,41,0
1,16.6956,0.007,44,0
2,19.1456,18.034,33,0
3,17.1268,18.288,53,0
4,18.9104,18.298,19,0


In [80]:
from sklearn.cluster import KMeans
locs = pd.read_csv("../../data/mote_locs.txt", sep=" ", header=None)
locs.columns = ["SensorId", "X", "Y"]
# First column holds the sensor ids, the remaining columns the coordinates
ids, pos = locs.values[:, 0], locs.values[:, 1:]

# Group the sensors into five clusters based on their coordinates
kmeans = KMeans(n_clusters=5, random_state=0)
kmeans.fit(pos)
labels = np.column_stack((ids, kmeans.labels_)).astype(int)
clusters = pd.DataFrame({'SensorId': labels[:, 0], 'ClusterId': labels[:, 1]})

# Attach the cluster of each reading's sensor; the index of temp is preserved
temp_all = temp.join(clusters.set_index('SensorId'), on='SensorId')
temp_all.head()

Unnamed: 0,Value,seconds,SensorId,Type,ClusterId
0,17.6364,0.0,41,0,2
1,16.6956,0.007,44,0,3
2,19.1456,18.034,33,0,2
3,17.1268,18.288,53,0,1
4,18.9104,18.298,19,0,0


In [81]:
# Readings before 8 * 86400 seconds form the training set,
# the following 86400 seconds form the test set
temp_train = temp_all.loc[temp_all['seconds'].lt(8 * 86400)]
is_test_day = temp_all['seconds'].ge(8 * 86400) & temp_all['seconds'].lt(9 * 86400)
temp_test = temp_all.loc[is_test_day]

temp_test.head()

Unnamed: 0,Value,seconds,SensorId,Type,ClusterId
833169,23.4086,691202.37,52,0,3
833170,22.4384,691202.947,31,0,4
833171,21.9484,691203.112,10,0,1
833172,23.1832,691203.132,40,0,2
833173,22.0758,691203.253,8,0,1


In [104]:
def rollup(temp, interval, day_begin=0, n_sensors=54):
    """Average readings into fixed-width time bins, one row of bins per sensor.

    Parameters
    ----------
    temp : pandas.DataFrame
        Readings with the columns ``Value``, ``seconds``, ``SensorId``,
        ``Type`` and ``ClusterId``. ``SensorId - 1`` is used as the row
        index of the result, so ids are expected to lie in ``1..n_sensors``.
    interval : float
        Width of a bin, in the unit of the ``seconds`` column.
    day_begin : int, optional
        Bin 0 starts at ``day_begin * 86400``.
    n_sensors : int, optional
        Number of sensor rows to allocate (54 by default).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_sensors, time_bins, 6)``. The last axis holds
        ``[mean Value, bin index, SensorId, Type, ClusterId, count]``.
        Bins that received no reading stay all zero (``count == 0``).
        An empty ``temp`` yields an array with zero bins.
    """
    first_temp = day_begin * 86400
    if len(temp) == 0:
        return np.zeros((n_sensors, 0, 6))

    # Use the largest timestamp rather than the last row, so the bin count is
    # also correct when the frame is not sorted by time.
    last_temp = temp['seconds'].max()
    time_bins = int((last_temp - first_temp) / interval) + 1
    # Last axis: mean Value, bin, SensorId, Type, ClusterId, number of elements in bin
    temp_sync = np.zeros((n_sensors, time_bins, 6))

    # Iterating over the columns avoids building one Series per row,
    # which is what makes a temp.iloc[i] loop slow on large frames.
    # NOTE: readings with seconds < first_temp are not rejected here.
    columns = (temp['SensorId'], temp['seconds'], temp['Value'],
               temp['Type'], temp['ClusterId'])
    for sensor_id, seconds, value, data_type, cluster_id in zip(*columns):
        row = int(sensor_id - 1)
        bin_idx = int((seconds - first_temp) / interval)
        cell = temp_sync[row, bin_idx]  # view: writes go into temp_sync
        count = cell[5] + 1
        cell[0] = (cell[0] * (count - 1) + value) / count  # running average
        cell[1] = bin_idx
        cell[2] = int(sensor_id)
        cell[3] = int(data_type)
        cell[4] = int(cluster_id)
        cell[5] = count
    return temp_sync

def delta_max(temp):
    """Return the largest time gap between two consecutive readings.

    Parameters
    ----------
    temp : pandas.DataFrame
        Frame with a ``seconds`` column; rows are compared in their
        stored order.

    Returns
    -------
    float or int
        The maximum of ``seconds[i] - seconds[i-1]`` over all rows, or ``0``
        when there are fewer than two rows or no gap is positive.
    """
    seconds = np.asarray(temp['seconds'], dtype=float)
    if seconds.size < 2:
        return 0
    gaps = np.diff(seconds)
    # Only positive gaps can become the maximum (the result starts at 0);
    # this also drops NaN differences, which never compare greater.
    positive_gaps = gaps[gaps > 0]
    return positive_gaps.max() if positive_gaps.size else 0


def interpolate(temp_sync, K):
    """Fill the empty bins of a rolled-up array by linear interpolation.

    ``temp_sync`` is the output of ``rollup``: shape ``(sensors, bins, 6)``
    with the last axis ``[Value, bin, SensorId, Type, ClusterId, count]``.
    A bin is considered empty when its count is 0.

    For each empty bin, the non-empty bins among the ``K/2`` previous and the
    ``K/2`` following bins are used as anchors for ``np.interp``; Type and
    ClusterId are copied from the last anchor in the window. When the window
    holds no anchor, the value is 0 and Type/ClusterId are set to 99
    (unattributed).

    Every filled bin gets count 1, so bins filled earlier in the pass serve
    as anchors for bins processed after them.

    Parameters
    ----------
    temp_sync : numpy.ndarray
        Rolled-up data; modified in place.
    K : int
        Total window size (K/2 past bins and K/2 future bins).

    Returns
    -------
    numpy.ndarray
        The same ``temp_sync`` object, with empty bins filled.
    """
    half_window = int(K / 2)
    for k in range(len(temp_sync)):
        sensor_id = k + 1
        series = temp_sync[k]
        n_bins = len(series)
        for b in range(n_bins):
            if series[b][-1] != 0:
                continue  # bin already holds data

            first = max(0, b - half_window)
            # +1 because range() excludes its end: the window must reach
            # the bin that lies half_window steps ahead.
            stop = min(n_bins, b + half_window + 1)
            anchors = [n for n in range(first, stop) if series[n][-1] != 0]

            if anchors:
                anchor_vals = [series[n][0] for n in anchors]
                val_interp = np.interp(b, anchors, anchor_vals)
                data_type = series[anchors[-1]][3]
                cluster_id = series[anchors[-1]][4]
            else:
                # Nothing to interpolate from inside the window
                val_interp = 0
                data_type = 99  # 99 = unattributed
                cluster_id = 99

            series[b][0] = val_interp
            series[b][1] = b
            series[b][2] = sensor_id
            series[b][3] = data_type
            series[b][4] = cluster_id
            series[b][5] = 1
    return temp_sync

In [100]:
# Build a regularly sampled temperature array:
# the raw stream does not always provide one reading per interval

interval = 30 # seconds

temp_train_sync = rollup(temp_train, interval, day_begin=0)
temp_test_sync = rollup(temp_test, interval, day_begin=8)

# The second axis of each rolled-up array is the number of time bins
print('Number of intervals of length %d in train set: %d' %(interval, temp_train_sync.shape[1]))
print('Number of intervals of length %d in test set: %d' %(interval, temp_test_sync.shape[1]))

Number of intervals of length 30 in train set: 23040
Number of intervals of length 30 in test set: 2880


In [101]:
# Show the rolled-up test bins of the first ten sensor rows
temp_test_sync[:10, :, :]

array([[[  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
        [  2.26932000e+01,   1.00000000e+00,   1.00000000e+00,
           0.00000000e+00,   2.00000000e+00,   1.00000000e+00],
        [  2.26834000e+01,   2.00000000e+00,   1.00000000e+00,
           0.00000000e+00,   2.00000000e+00,   1.00000000e+00],
        ..., 
        [  2.26932000e+01,   2.87700000e+03,   1.00000000e+00,
           0.00000000e+00,   2.00000000e+00,   1.00000000e+00],
        [  2.26932000e+01,   2.87800000e+03,   1.00000000e+00,
           0.00000000e+00,   2.00000000e+00,   1.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           0.00000000e+00,   0.00000000e+00,   0.00000000e+00]],

       [[  2.29284000e+01,   0.00000000e+00,   2.00000000e+00,
           0.00000000e+00,   2.00000000e+00,   1.00000000e+00],
        [  2.29186000e+01,   1.00000000e+00,   2.00000000e+00,
           0.00000000e+00,   2.0

In [106]:
K = 12 # interpolation window; choose it according to interval (and possibly maxT)
temp_train_sync, temp_test_sync = interpolate(temp_train_sync, K), interpolate(temp_test_sync, K)
# First ten bins of the sixth sensor row (index 5)
temp_train_sync[5, :10]

array([[ 19.4984    ,   0.        ,   6.        ,   0.        ,
          1.        ,   1.        ],
       [ 19.4984    ,   1.        ,   6.        ,   0.        ,
          1.        ,   1.        ],
       [ 19.4984    ,   2.        ,   6.        ,   0.        ,
          1.        ,   1.        ],
       [ 19.4984    ,   3.        ,   6.        ,   0.        ,
          1.        ,   1.        ],
       [ 19.49186667,   4.        ,   6.        ,   0.        ,
          1.        ,   1.        ],
       [ 19.48533333,   5.        ,   6.        ,   0.        ,
          1.        ,   1.        ],
       [ 19.4788    ,   6.        ,   6.        ,   0.        ,
          1.        ,   1.        ],
       [ 19.47226667,   7.        ,   6.        ,   0.        ,
          1.        ,   1.        ],
       [ 19.46573333,   8.        ,   6.        ,   0.        ,
          1.        ,   1.        ],
       [ 19.4592    ,   9.        ,   6.        ,   0.        ,
          1.        ,   1. 