In [1]:
import json
import time

import numpy as np
import pandas as pd
from kafka import KafkaProducer

In [2]:
# Loading the compressed file takes about one minute.
# The file has no header row and is space-separated, so the column names are assigned by hand.
data = pd.read_csv("../../data/data.conv.txt.gz", sep=" ", header=None)
data.columns = ["Date", "Hour", "Sensor", "Value", "Voltage"]
data = data.sort_values(by=["Date", "Hour"]).reset_index(drop=True)

# Build a timestamp per row, then express it as seconds elapsed since the
# first row (index 0 after the sort + reset_index above).
timestamps = pd.to_datetime(data["Date"] + " " + data["Hour"])
data["datetime"] = timestamps
data["relative_datetime"] = timestamps - timestamps[0]
data["seconds"] = data["relative_datetime"].dt.total_seconds()

# "Sensor" is encoded as "<SensorId>-<Type>"; split it into two integer columns.
sensorId_type = data["Sensor"].str.split("-", expand=True)
sensorId_type.columns = ["SensorId", "Type"]
for part in ("SensorId", "Type"):
    data[part] = sensorId_type[part].astype(int)

In [3]:
# Discard the columns the simulation does not use; only
# Value, seconds, SensorId and Type remain afterwards.
data = data.drop(
    columns=["datetime", "relative_datetime", "Sensor", "Date", "Hour", "Voltage"]
)

In [4]:
# Preview the first five rows of the cleaned frame
data.head()

Unnamed: 0,Value,seconds,SensorId,Type
0,17.6364,0.0,41,0
1,16.6956,0.007,44,0
2,45.7037,0.092,41,1
3,2.3,0.237,44,2
4,47.9942,0.285,44,1


In [5]:
# Keep only the Type == 0 rows (presumably the temperature readings, going by
# the variable name -- confirm against the dataset description) and renumber them from 0.
temp = data.loc[data["Type"] == 0].reset_index(drop=True)

In [6]:
# Discard readings coming from sensors whose id is greater than 54
MAX_SENSOR_ID = 54
temp = temp.loc[temp["SensorId"] <= MAX_SENSOR_ID]
temp.head()

Unnamed: 0,Value,seconds,SensorId,Type
0,17.6364,0.0,41,0
1,16.6956,0.007,44,0
2,19.1456,18.034,33,0
3,17.1268,18.288,53,0
4,18.9104,18.298,19,0


In [7]:
SECONDS_PER_DAY = 86400

# Index of the day held out for testing, counted from the first measurement.
# Must be at least 1: the training set is everything before that day, so it
# would be empty for day 0.
day_to_predict = 1
test_start = day_to_predict * SECONDS_PER_DAY
test_end = (day_to_predict + 1) * SECONDS_PER_DAY

# Train on every reading before the test day, test on the test day itself.
temp_train = temp[temp.seconds < test_start]
temp_test = temp[(temp.seconds >= test_start) & (temp.seconds < test_end)]

In [8]:
# Small sample of the training set; show its first row as a sanity check
temp_sample = temp_train.head(20)
first_row = temp_sample.iloc[0]
print(first_row)

Value       17.6364
seconds      0.0000
SensorId    41.0000
Type         0.0000
Name: 0, dtype: float64


In [9]:
# Comma-separated host:port list of the Kafka brokers used to bootstrap the connection
BOOTSTRAP_SERVERS = 'kafka1:19092,kafka2:29092,kafka3:39092'
producer = KafkaProducer(bootstrap_servers=BOOTSTRAP_SERVERS)

In [10]:
# Ids of the sensors whose readings are the prediction targets
sensors_to_predict = [1, 24]

# First ten training readings produced by those target sensors
is_target = temp_train["SensorId"].isin(sensors_to_predict)
temp_train_y = temp_train.loc[is_target].head(10)
print(temp_train_y.head(10))

       Value  seconds  SensorId  Type
64   18.1362   53.492        24     0
79   19.2436   79.124         1     0
81   18.1166   79.230        24     0
114  18.1264  108.864        24     0
146  18.1362  138.638        24     0
188  19.2240  169.155         1     0
224  19.2142  200.931         1     0
247  18.1460  229.040        24     0
305  19.1848  264.276         1     0
321  18.1264  290.305        24     0


In [12]:
# For each target measurement in temp_train_y (a reading from one of the
# sensors_to_predict), pick the readings that all the OTHER sensors produced
# during the PREVIOUS time window (bin_y - 1) and publish one Kafka message per
# neighbouring reading on the 'ClusterRLSTrain' topic.
#
# Each message is a JSON list of floats:
#   [target SensorId, target seconds, target Value,
#    neighbour Value, neighbour seconds, neighbour SensorId, neighbour Type]

time_resolution = 30  # width, in seconds, of the time windows used to group readings
interval = 1  # minimum time, in seconds, between the processing of two consecutive targets
print_every = 1  # echo the messages of targets whose index label is a multiple of this value

# Loop invariants: the window each training row falls in, and whether the row
# comes from a neighbouring (non-target) sensor. Computed once instead of on
# every iteration over the full training frame.
window_of_row = temp_train.seconds // time_resolution
is_neighbor = ~temp_train.SensorId.isin(sensors_to_predict)

for iy, ry in temp_train_y.iterrows():
    tic = time.time()
    y = ry.Value
    bin_y = ry.seconds // time_resolution
    temp_neighbor = temp_train[(window_of_row == bin_y - 1) & is_neighbor]
    for ix, rx in temp_neighbor.iterrows():
        x = rx.values
        message = json.dumps(np.append([ry.SensorId, ry.seconds, y], x).tolist())
        if iy % print_every == 0:
            print(message)
        producer.send('ClusterRLSTrain', message.encode())
    toc = time.time() - tic
    # Sending a window can take longer than `interval`; time.sleep() raises
    # ValueError on a negative duration, so clamp the remaining wait at zero.
    time.sleep(max(0.0, interval - toc))

# send() only queues records in the producer's buffer; block until they have
# actually been delivered so nothing is lost if the kernel stops afterwards.
producer.flush()

[24.0, 53.492000000000004, 18.1362, 17.6364, 0.0, 41.0, 0.0]
[24.0, 53.492000000000004, 18.1362, 16.6956, 0.007, 44.0, 0.0]
[24.0, 53.492000000000004, 18.1362, 19.1456, 18.034000000000002, 33.0, 0.0]
[24.0, 53.492000000000004, 18.1362, 17.1268, 18.288, 53.0, 0.0]
[24.0, 53.492000000000004, 18.1362, 18.9104, 18.298000000000002, 19.0, 0.0]
[24.0, 53.492000000000004, 18.1362, 17.6952, 18.416, 28.0, 0.0]
[24.0, 53.492000000000004, 18.1362, 19.2142, 18.431, 10.0, 0.0]
[24.0, 53.492000000000004, 18.1362, 18.8908, 18.478, 21.0, 0.0]
[24.0, 53.492000000000004, 18.1362, 19.8904, 18.486, 4.0, 0.0]
[24.0, 53.492000000000004, 18.1362, 19.5278, 18.631, 3.0, 0.0]
[24.0, 53.492000000000004, 18.1362, 18.685, 18.717000000000002, 17.0, 0.0]
[24.0, 53.492000000000004, 18.1362, 18.0284, 18.759, 45.0, 0.0]
[24.0, 53.492000000000004, 18.1362, 19.1064, 19.181, 8.0, 0.0]
[24.0, 53.492000000000004, 18.1362, 17.8716, 19.446, 16.0, 0.0]
[24.0, 53.492000000000004, 18.1362, 18.1754, 19.538, 26.0, 0.0]
[24.0, 53.49

[24.0, 138.638, 18.1362, 18.8516, 108.072, 36.0, 0.0]
[24.0, 138.638, 18.1362, 19.42, 108.176, 35.0, 0.0]
[24.0, 138.638, 18.1362, 18.1656, 108.31700000000001, 30.0, 0.0]
[24.0, 138.638, 18.1362, 18.391, 108.44000000000001, 25.0, 0.0]
[24.0, 138.638, 18.1362, 18.538, 108.753, 29.0, 0.0]
[24.0, 138.638, 18.1362, 18.3224, 108.834, 22.0, 0.0]
[24.0, 138.638, 18.1362, 17.4796, 108.879, 12.0, 0.0]
[24.0, 138.638, 18.1362, 18.5674, 108.909, 31.0, 0.0]
[24.0, 138.638, 18.1362, 19.224, 109.01400000000001, 10.0, 0.0]
[24.0, 138.638, 18.1362, 16.8328, 109.093, 54.0, 0.0]
[24.0, 138.638, 18.1362, 18.8614, 109.108, 40.0, 0.0]
[24.0, 138.638, 18.1362, 17.8618, 109.19800000000001, 16.0, 0.0]
[24.0, 138.638, 18.1362, 17.8618, 109.34800000000001, 47.0, 0.0]
[24.0, 138.638, 18.1362, 17.6952, 109.385, 28.0, 0.0]
[24.0, 138.638, 18.1362, 17.9402, 109.462, 51.0, 0.0]
[24.0, 138.638, 18.1362, 19.6258, 110.35300000000001, 37.0, 0.0]
[24.0, 138.638, 18.1362, 18.832, 110.697, 38.0, 0.0]
[24.0, 138.638, 18.136

[24.0, 290.305, 18.1264, 17.46, 257.88300000000004, 12.0, 0.0]
[24.0, 290.305, 18.1264, 18.048, 258.04, 11.0, 0.0]
[24.0, 290.305, 18.1264, 18.8712, 258.60200000000003, 40.0, 0.0]
[24.0, 290.305, 18.1264, 18.1656, 258.65700000000004, 30.0, 0.0]
[24.0, 290.305, 18.1264, 18.5576, 258.856, 31.0, 0.0]
[24.0, 290.305, 18.1264, 18.8908, 258.88300000000004, 19.0, 0.0]
[24.0, 290.305, 18.1264, 18.832, 258.947, 36.0, 0.0]
[24.0, 290.305, 18.1264, 19.5082, 259.165, 7.0, 0.0]
[24.0, 290.305, 18.1264, 17.46, 259.23, 42.0, 0.0]
[24.0, 290.305, 18.1264, 18.146, 259.29400000000004, 26.0, 0.0]
[24.0, 290.305, 18.1264, 18.8712, 259.305, 21.0, 0.0]
[24.0, 290.305, 18.1264, 17.7246, 259.428, 28.0, 0.0]
[24.0, 290.305, 18.1264, 17.9794, 259.73900000000003, 45.0, 0.0]
[24.0, 290.305, 18.1264, 19.2044, 260.02000000000004, 10.0, 0.0]
[24.0, 290.305, 18.1264, 18.979, 260.522, 18.0, 0.0]
[24.0, 290.305, 18.1264, 17.2248, 260.629, 49.0, 0.0]
[24.0, 290.305, 18.1264, 17.9108, 260.72, 51.0, 0.0]
[24.0, 290.305, 1