In [1]:
from kafka import KafkaProducer
import time
import numpy as np
import pandas as pd

In [2]:
#Takes about one minute to load
data=pd.read_csv("../../data/data.conv.txt.gz",header=None,sep=" ")
data.columns=["Date","Hour","Sensor","Value","Voltage"]
data=data.sort_values(['Date','Hour']).reset_index(drop=True)

data['datetime']=pd.to_datetime(data.Date+' '+data.Hour)
data['relative_datetime']=data['datetime']-data['datetime'][0]
data['seconds']=data['relative_datetime'].dt.total_seconds()

sensorId_type=data.Sensor.str.split("-",expand=True)
sensorId_type.columns=['SensorId','Type']
data['SensorId']=sensorId_type['SensorId'].astype(int)
data['Type']=sensorId_type['Type'].astype(int)

In [3]:
#Drop features not needed for the simulation
data=data.drop(['datetime','relative_datetime','Sensor','Date','Hour','Voltage'],axis=1)

In [4]:
data[:5]

Unnamed: 0,Value,seconds,SensorId,Type
0,17.6364,0.0,41,0
1,16.6956,0.007,44,0
2,45.7037,0.092,41,1
3,2.3,0.237,44,2
4,47.9942,0.285,44,1


In [58]:
temp=data[data.Type==0]
temp=temp.reset_index(drop=True)

In [59]:
# Sensors to predict
sensors_to_predict = [1, 24]
temp = temp[temp.SensorId <= 54]
print(temp[:10])

     Value  seconds  SensorId  Type
0  17.6364    0.000        41     0
1  16.6956    0.007        44     0
2  19.1456   18.034        33     0
3  17.1268   18.288        53     0
4  18.9104   18.298        19     0
5  17.6952   18.416        28     0
6  19.2142   18.431        10     0
7  18.8908   18.478        21     0
8  19.8904   18.486         4     0
9  19.5278   18.631         3     0


In [69]:
day_to_predict = 3 # must be at least 1
temp_train = temp[(temp.seconds < day_to_predict*86400) & (temp.seconds > (day_to_predict-1)*86400)]
temp_train = temp_train.dropna()
#temp_test = temp[(temp.seconds >= day_to_predict*86400) & (temp.seconds < (day_to_predict+1)*86400)]

In [70]:
temp_train.isnull().any().any()

False

In [71]:
def rollup(temp, interval, day_begin=0):
    # roll up data according to a time interval
    first_temp = day_begin*86400
    last_temp = temp.iloc[-1].seconds
    time_bins = int((last_temp-first_temp)/interval) + 1
    temp_sync = np.zeros((time_bins, 5)) # Value, bin, SensorID, Type, number of elem in bin

    for i in range(len(temp)):
        data = temp.iloc[i]
        bin = int((data.seconds-first_temp)/interval)
        n = temp_sync[bin][-1] + 1
        new_value = (temp_sync[bin][0] * (n-1) + data.Value)/n # running avg
        temp_sync[bin][0] = new_value
        temp_sync[bin][1] = bin
        temp_sync[bin][2] = int(data.SensorId)
        temp_sync[bin][3] = int(data.Type)
        temp_sync[bin][4] = n
    return temp_sync


def delta_max(temp):
    # evaluate the maximum time interval between two data
    maxT = 0
    for i in range(1,len(temp)):
        time_now = temp.iloc[i].seconds
        time_before = temp.iloc[i-1].seconds
        delta = time_now - time_before
        if delta > maxT:
            maxT = delta
    return maxT


def interpolate(temp_sync, nbin_left, nbin_right):
    # interpolate missing data in rolled up data using the data of K neighbours
    # rolled up data referes to the output of rollup function
    # nbin_left: number of past data used for interpolation
    # nbin_right: number of futur data used for interpolation
    for bin in range(len(temp_sync)):
        # if no value provided
        left_range = max(0, bin-nbin_left)
        right_range = min(len(temp_sync), bin+nbin_right)
        if temp_sync[bin][-1] == 0:
            bins = []
            vals = []
            sensorId = 0
            for n in range(int(left_range), int(right_range)):
                if temp_sync[n][-1] != 0:
                    bins.append(n)
                    vals.append(temp_sync[n][0])
                    sensorId = int(temp_sync[n][2])
            val_interp = np.interp(bin, bins, vals)
            temp_sync[bin][0] = val_interp
            temp_sync[bin][1] = bin
            temp_sync[bin][2] = sensorId
            temp_sync[bin][3] = 0
            temp_sync[bin][4] = 1
    return temp_sync

In [84]:
delta_max_train = delta_max(temp_train)
print(delta_max_train)

21.595


In [72]:
# Create a cleaned temperature vector:
# we want one data every interval, this is not always the case in the raw data
interval = 30 # seconds
nb_sensors = 54
temp_sync = [0] * nb_sensors

for i in range(nb_sensors):
    sensor_id = i + 1 
    temp_train_i = temp_train[temp_train.SensorId == sensor_id]
    try:
        temp_sync[i] = rollup(temp_train_i, interval, day_to_predict-1)
    except IndexError:
        # if empy data fram
        temp_sync[i] = np.zeros((86400//interval, 5))
    if sensor_id in sensors_to_predict:
        print('Number of intervals of %ds for sensor %d: %d' %(interval, sensor_id, len(temp_sync[i])))

Number of intervals of 30s for sensor 1: 2880
Number of intervals of 30s for sensor 24: 2876


In [73]:
temp_train[temp_train.SensorId == 4][:5]

Unnamed: 0,Value,seconds,SensorId,Type
274756,19.91,172853.033,4,0
274793,19.9002,172884.38,4,0
274978,19.8904,173063.151,4,0
275039,19.8904,173105.731,4,0
275064,19.8708,173126.026,4,0


In [74]:
print('Sensor 1:')
print(temp_sync[0][:10])
print('Sensor 24:')
print(temp_sync[23][:10])

Sensor 1:
[[ 19.3269   0.       1.       0.       2.    ]
 [  0.       0.       0.       0.       0.    ]
 [ 19.3416   2.       1.       0.       1.    ]
 [  0.       0.       0.       0.       0.    ]
 [ 19.3514   4.       1.       0.       1.    ]
 [  0.       0.       0.       0.       0.    ]
 [ 19.322    6.       1.       0.       1.    ]
 [ 19.322    7.       1.       0.       1.    ]
 [  0.       0.       0.       0.       0.    ]
 [  0.       0.       0.       0.       0.    ]]
Sensor 24:
[[ 19.74666667   0.          24.           0.           3.        ]
 [ 19.3808       1.          24.           0.           1.        ]
 [ 19.3808       2.          24.           0.           1.        ]
 [ 19.3808       3.          24.           0.           1.        ]
 [ 19.3808       4.          24.           0.           1.        ]
 [ 19.371        5.          24.           0.           1.        ]
 [ 19.371        6.          24.           0.           1.        ]
 [ 19.3612       7.   

In [75]:
K = 10 # interpolation memory, choose according to interval (and eventually maxT)
for i in range(nb_sensors):
    try:
        temp_sync[i] = interpolate(temp_sync[i], K, K)
    except ValueError:
        temp_sync[i] = np.empty((0, 0))
    
print('Sensor 1:')
print(temp_sync[0][:5])
print('Sensor 24:')
print(temp_sync[23][:5])

Sensor 1:
[[ 19.3269    0.        1.        0.        2.     ]
 [ 19.33425   1.        1.        0.        1.     ]
 [ 19.3416    2.        1.        0.        1.     ]
 [ 19.3465    3.        1.        0.        1.     ]
 [ 19.3514    4.        1.        0.        1.     ]]
Sensor 24:
[[ 19.74666667   0.          24.           0.           3.        ]
 [ 19.3808       1.          24.           0.           1.        ]
 [ 19.3808       2.          24.           0.           1.        ]
 [ 19.3808       3.          24.           0.           1.        ]
 [ 19.3808       4.          24.           0.           1.        ]]


In [76]:
producer = KafkaProducer(bootstrap_servers='kafka1:19092,kafka2:29092,kafka3:39092')

In [77]:
import json
# For each temperature measure to predict in sensors_to_predict,
# we pick the temperature of all the other sensors in the same 30s window of time
# and we send the temperature of each neighboring sensors to estimate the true 
# temperature of the sensors_to_predict

time_resolution = 30 # window of time we use to extract other time-related temperatures
interval = 0.5 # waiting interval between the sending of each temperature to predict
binTot = 2880

step = int(86400/interval) # number of bins in one day
start_t = 0 # (n+1)th day
end_t = start_t + day_to_predict*step

for bin in range(1, binTot):
    tic = time.time()
    temp_neighbor = []
    for i in sensors_to_predict:
        y_bin = temp_sync[i-1][bin]
        for k in range(nb_sensors):
            if (temp_sync[k].size != 0) and (k+1 not in sensors_to_predict) and (len(temp_sync[k]) > bin-1):
                temp_neighbor.append((k+1, temp_sync[k][bin-1]))
        for x in temp_neighbor:
            id_n = x[0]
            val_n = x[1]
            d = [i, bin, float(y_bin[0]), float(val_n[0]), bin-1, int(id_n), 0]
            message = json.dumps(d)
            #if bin % 1 == 0:
            #    print(message)
            producer.send('EMA', message.encode())
    toc = time.time() - tic
    time.sleep(interval - toc)
    
    
    


IndexError: index 2876 is out of bounds for axis 0 with size 2876

In [None]:
for iy, ry in temp_sync_y.iterrows():
    tic = time.time()
    y = ry.Value
    bin_y = ry.seconds//time_resolution
    temp_neighbor = temp_sync[(temp_sync.seconds//time_resolution == bin_y - 1)  & (~temp_sync.SensorId.isin(sensors_to_predict))]
    for ix, rx in temp_neighbor.iterrows():
        x = rx.values
        message = json.dumps(np.append([ry.SensorId, ry.seconds, y], x).tolist())
        if (iy % 1 == 0):
            print(message)
        producer.send('EMA', message.encode())
    toc = time.time() - tic
    time.sleep(interval - toc)