# CS6301 IOT Project 2

## Fault Diagnostic Tool (Read Side)

This side of the application ingests data from the data generator and ensures it's being imported into the database correctly.

### Before Running this Notebook
1. Ensure the database is running on your localhost. Using docker-compose file in the root of the project directory we can start the InfluxDB with the following command from the root directory:
`dc up timeseriesdb`

2. Turn on the data generator with the following command from the "iot" directory.
`java -jar datagen-2.2-SNAPSHOT.jar -offline`

## Connect to Database

In [1]:
!pip install influxdb



In [73]:
from influxdb import InfluxDBClient
import json
import numpy as np
import pandas as pd
import datetime

In [74]:
client = InfluxDBClient(host='localhost', port=8086, username='admin', password='password')

In [3]:
client.get_list_database()

[{'name': 'timeseriesdb'}, {'name': '_internal'}]

## Ingest Data (Example: Offline Data)
In this section we will ingest the data from a text file for exploration purposes. Normally this data would be ingesteddirctly from the Java data generator.

In [160]:
client.drop_database('timeseriesdb')

In [161]:
# client.create_database('timeseriesdb')
client.query('create database timeseriesdb duration 15m')
client.switch_database('timeseriesdb')

In [15]:
f = open('data/offline-train-SMALL.txt', 'r')
lines = f.readlines()
f.close()

In [16]:
len(lines)

26452

In [17]:
measurement = 'gear_metrics'

In [18]:
data = []

In [19]:
x = json.loads(lines[0])
x

{'metric': 'offline',
 'timestamp': 0,
 'label': 0,
 'sr': 97656.0,
 'rate': 25.0,
 'gs': 0.8407559,
 'load': '270.0'}

In [20]:
data = []
for line in lines:
    json_line = json.loads(line)
    # Form: 'gear_metrics,metric=offline label=0,sr=97656.0,rate=25.0,gs=0.8407559,load=270.0,timestamp=0'
    data.append("{},metric={} label={},sr={},rate={},gs={},load={},timestamp={}".format(measurement, json_line['metric'], json_line['label'], json_line['sr'], json_line['rate'], json_line['gs'], json_line['load'], json_line['timestamp']))
    
    

In [21]:
data[:10]

['gear_metrics,metric=offline label=0,sr=97656.0,rate=25.0,gs=0.8407559,load=270.0,timestamp=0',
 'gear_metrics,metric=offline label=0,sr=97656.0,rate=25.0,gs=0.5152432,load=270.0,timestamp=1',
 'gear_metrics,metric=offline label=0,sr=97656.0,rate=25.0,gs=-0.03834483,load=270.0,timestamp=2',
 'gear_metrics,metric=offline label=0,sr=97656.0,rate=25.0,gs=1.184862,load=270.0,timestamp=3',
 'gear_metrics,metric=offline label=0,sr=97656.0,rate=25.0,gs=0.8497145,load=270.0,timestamp=4',
 'gear_metrics,metric=offline label=0,sr=97656.0,rate=25.0,gs=-0.3333637,load=270.0,timestamp=5',
 'gear_metrics,metric=offline label=0,sr=97656.0,rate=25.0,gs=0.0906198,load=270.0,timestamp=6',
 'gear_metrics,metric=offline label=0,sr=97656.0,rate=25.0,gs=-0.3867708,load=270.0,timestamp=7',
 'gear_metrics,metric=offline label=0,sr=97656.0,rate=25.0,gs=-0.4381106,load=270.0,timestamp=8',
 'gear_metrics,metric=offline label=0,sr=97656.0,rate=25.0,gs=-0.7925295,load=270.0,timestamp=9']

In [None]:
'labeled_data, metric=classification timestamp=2020-03-26T21:14:48.527Z,label=0'

In [41]:
client.write_points(data, database='timeseriesdb', time_precision='ms', batch_size=100, protocol='line')

NameError: name 'data' is not defined

In [83]:
results = client.query('SELECT label, sr, rate, gs, load, ts FROM timeseriesdb.autogen.gear_metrics')

ConnectionError: HTTPConnectionPool(host='localhost', port=8086): Max retries exceeded with url: /query?q=SELECT+label%2C+sr%2C+rate%2C+gs%2C+load%2C+ts+FROM+timeseriesdb.autogen.gear_metrics&db=timeseriesdb (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x107e096d8>: Failed to establish a new connection: [Errno 61] Connection refused'))

In [46]:
len(results.raw['series'][0]['values'])

291742

In [47]:
results.raw['series'][0]['values'][-10:]

[['2020-03-25T06:56:53.605Z', 0, 97656, 25, -0.3707241, '270.0', 268],
 ['2020-03-25T06:56:53.655Z', 0, 97656, 25, -0.2623347, '270.0', 269],
 ['2020-03-25T06:56:53.705Z', 0, 97656, 25, -0.2415872, '270.0', 270],
 ['2020-03-25T06:56:53.754Z', 0, 97656, 25, 0.1525302, '270.0', 271],
 ['2020-03-25T06:56:53.803Z', 0, 97656, 25, -0.1951451, '270.0', 272],
 ['2020-03-25T06:56:53.852Z', 0, 97656, 25, -0.04591288, '270.0', 273],
 ['2020-03-25T06:56:53.9Z', 0, 97656, 25, 0.7581841, '270.0', 274],
 ['2020-03-25T06:56:53.966Z', 0, 97656, 25, 1.201549, '270.0', 275],
 ['2020-03-25T06:56:54.019Z', 0, 97656, 25, 0.6854323, '270.0', 276],
 ['2020-03-25T06:56:54.071Z', 0, 97656, 25, -0.462476, '270.0', 277]]

In [None]:
for point in points:
    print("Time: {}, gs: {}".format(point['timestamp'], point['gs']))

## Offline Training
In this section we visualize the data ingested from the Java data generator. 

In [None]:
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split

tf.logging.set_verbosity(tf.logging.ERROR)

In [None]:
def create_csv():
	file = open('data/offline-train-BIG.txt','r')

	with open('data/data.csv','w') as fcsv:
		fcsv.write("Metric,Timestamp,Label,SR,Rate,GR,Load\n")
		for f in file:
			l= f.split(',')

			m = l[0].split(":")[1].split('"')[1]	
			#metric.append(m)

			t =l[1].split(":")[1]
			#timestamp.append(t)

			la =l[2].split(":")[1]
			#label.append(la)

			s = l[3].split(":")[1]
			#sr.append(s)

			r = l[4].split(":")[1]
			#rate.append(r)

			g = l[5].split(":")[1]
			#gr.append(g)

			lo = l[6].split(":")[1].split('"')[1]	
			#load.append(lo)
			fcsv.write(m+","+t+","+la+","+s+","+r+","+g+","+lo+"\n")

In [None]:
# Convert the JSON to CSV
create_csv()

In [None]:
# We will train the model offline in a py file

## Classify

In [7]:
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
import numpy as np

In [76]:
results = client.query(
        'SELECT  * FROM timeseriesdb.autogen.gear_metrics limit 1')

In [78]:
results.raw

{'statement_id': 0,
 'series': [{'name': 'gear_metrics',
   'columns': ['time', 'gs', 'label', 'load', 'metric', 'rate', 'sr', 'ts'],
   'values': [['2020-03-26T21:05:41.477Z',
     0.8407559,
     0,
     '270.0',
     'online',
     25,
     97656,
     0]]}]}

In [73]:
type(results.raw['series'][0]['values'][-1])

list

In [140]:
mins = 1

In [8]:
data_to_classify = results.raw['series'][0]['values'][0:10]

In [9]:
gear_data=pd.read_csv("data/data_predict.csv")
print(gear_data.head())

cols_to_norm = ['SR', 'GR','Load']

gear_data[cols_to_norm] = gear_data[cols_to_norm].apply(lambda x: (x - x.min()) / (x.max() - x.min()))

sr = tf.feature_column.numeric_column('SR')
rate = tf.feature_column.numeric_column('Rate')
gr = tf.feature_column.numeric_column('GR')
load = tf.feature_column.numeric_column('Load')

print(type(gear_data))

    Metric  Timestamp  Label       SR  Rate        GR   Load
0  offline          0      0  97656.0  25.0  0.840756  270.0
1  offline          1      0  97656.0  25.0  0.515243  270.0
2  offline          2      0  97656.0  25.0 -0.038345  270.0
3  offline          3      0  97656.0  25.0  1.184862  270.0
4  offline          4      0  97656.0  25.0  0.849715  270.0
<class 'pandas.core.frame.DataFrame'>


In [117]:
query = "SELECT label, prediction FROM timeseriesdb.autogen.labeled_data"
print(query)

SELECT label, prediction FROM timeseriesdb.autogen.labeled_data


In [120]:
query = "SELECT label, prediction FROM timeseriesdb.autogen.labeled_data where time > '{}'".format(datetime.datetime.utcnow() - datetime.timedelta(minutes=5))
print(query)

SELECT label, prediction FROM timeseriesdb.autogen.labeled_data where time > '2020-03-27 22:54:54.825002'


In [121]:
classifications = client.query(query)

In [122]:
data = classifications.raw['series'][0]['values']

In [123]:
data

[['2020-03-27T22:54:57Z', 0, 0],
 ['2020-03-27T22:54:58Z', 0, 0],
 ['2020-03-27T22:55:08Z', 0, 0],
 ['2020-03-27T22:55:09Z', 0, 0],
 ['2020-03-27T22:55:19Z', 0, 0],
 ['2020-03-27T22:55:20Z', 0, 0],
 ['2020-03-27T22:55:31Z', 0, 0],
 ['2020-03-27T22:55:32Z', 0, 0],
 ['2020-03-27T22:55:42Z', 0, 0],
 ['2020-03-27T22:55:43Z', 0, 0],
 ['2020-03-27T22:55:54Z', 0, 0],
 ['2020-03-27T22:55:55Z', 0, 0],
 ['2020-03-27T22:56:05Z', 0, 0],
 ['2020-03-27T22:56:06Z', 0, 0],
 ['2020-03-27T22:56:16Z', 0, 0],
 ['2020-03-27T22:56:17Z', 0, 0],
 ['2020-03-27T22:56:27Z', 0, 0],
 ['2020-03-27T22:56:28Z', 0, 0],
 ['2020-03-27T22:56:39Z', 0, 0],
 ['2020-03-27T22:56:40Z', 0, 0],
 ['2020-03-27T22:56:50Z', 0, 0],
 ['2020-03-27T22:56:51Z', 0, 0],
 ['2020-03-27T22:57:02Z', 0, 0],
 ['2020-03-27T22:57:13Z', 0, 0],
 ['2020-03-27T22:57:14Z', 0, 0],
 ['2020-03-27T22:57:25Z', 0, 0],
 ['2020-03-27T22:57:36Z', 0, 0],
 ['2020-03-27T22:57:37Z', 0, 0],
 ['2020-03-27T22:57:48Z', 0, 0],
 ['2020-03-27T22:57:59Z', 0, 0],
 ['2020-03

In [94]:
data = np.array(data)
df = pd.DataFrame(data=data, columns=["time", "label", "prediction"])

In [53]:
df

Unnamed: 0,time,label,prediction
0,2020-03-27T21:50:29Z,0,0
1,2020-03-27T21:50:30Z,0,0
2,2020-03-27T21:50:31Z,0,0
3,2020-03-27T21:50:32Z,0,0
4,2020-03-27T21:50:33Z,0,0
5,2020-03-27T21:50:34Z,0,0
6,2020-03-27T21:50:35Z,0,0
7,2020-03-27T21:50:36Z,0,0
8,2020-03-27T21:50:37Z,0,0
9,2020-03-27T21:50:38Z,0,0


In [43]:
df = df.drop(['time'], axis=1)

In [49]:
labels = df["label"].to_numpy
predictions = df["prediction"].to_numpy

In [47]:
labels

<bound method IndexOpsMixin.to_numpy of 0      0
1      0
2      0
3      0
4      0
5      0
6      0
7      0
8      0
9      0
10     0
11     0
12     0
13     0
14     0
15     0
16     0
17     0
18     0
19     0
20     0
21     0
22     0
23     0
24     0
25     0
26     0
27     0
28     0
29     0
      ..
305    0
306    0
307    0
308    0
309    0
310    0
311    0
312    0
313    0
314    0
315    0
316    0
317    0
318    0
319    0
320    0
321    0
322    0
323    0
324    0
325    0
326    0
327    0
328    0
329    0
330    0
331    0
332    0
333    0
334    0
Name: label, Length: 335, dtype: object>

In [156]:
    query = "SELECT  sr, gs, load, ts, label FROM timeseriesdb.autogen.gear_metrics where ts > 0"
    results = client.query(query)
    data = results.raw['series'][0]['values']
    data = np.array(data)
    df = pd.DataFrame(data=data, columns=[
        "TIME", "SR", "GR", "Load", 'ts', 'label'])


In [157]:
df

Unnamed: 0,TIME,SR,GR,Load,ts,label
0,2020-03-27T21:50:07.411Z,97656,0.0387017,270.0,1,0
1,2020-03-27T21:50:07.481Z,97656,0.4676445,270.0,2,0
2,2020-03-27T21:50:07.541Z,97656,0.6616452,270.0,3,0
3,2020-03-27T21:50:07.604Z,97656,1.770335,270.0,4,0
4,2020-03-27T21:50:07.705Z,97656,1.523985,270.0,5,0
5,2020-03-27T21:50:07.755Z,97656,1.916429,270.0,6,0
6,2020-03-27T21:50:07.806Z,97656,0.5592487,270.0,7,0
7,2020-03-27T21:50:07.871Z,97656,0.3270139,270.0,8,0
8,2020-03-27T21:50:07.923Z,97656,-1.082491,270.0,9,0
9,2020-03-27T21:50:07.972Z,97656,-0.9731784,270.0,10,0
