## Persistence model with Kafka and Spark streaming 

This notebook provides an example of a persistent model on streaming data coming from a Kafka producer. 

This notebook uses 
* the [Python client for the Apache Kafka distributed stream processing system](http://kafka-python.readthedocs.io/en/master/index.html) to receive messages from a Kafka cluster. 
* [Spark streaming](https://spark.apache.org/docs/latest/streaming-programming-guide.html) for processing the streaming data


### General import

In [1]:
import time
import re, ast
import numpy as np
import os
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

### Start Spark session


In [2]:
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

# os.environ['PYSPARK_SUBMIT_ARGS'] = '--master local[*] pyspark-shell'

spark = SparkSession \
    .builder \
    .master("local[*]") \
    .appName("PersistenceReceive") \
    .getOrCreate()

### Connect to Kafka server on topic persistence

In [3]:
#This function creates a connection to a Kafka stream
#You may change the topic, or batch interval
#The Zookeeper server is assumed to be running at 127.0.0.1:2181
#The function returns the Spark context, Spark streaming context, and DStream object
def getKafkaDStream(spark,topic='persistence',batch_interval=10):

    #Get Spark context
    sc=spark.sparkContext

    #Create streaming context, with required batch interval
    ssc = StreamingContext(sc, batch_interval)

    #Checkpointing needed for stateful transforms
    ssc.checkpoint("checkpoint")
    
    #Create a DStream that represents streaming data from Kafka, for the required topic 
    dstream = KafkaUtils.createStream(ssc, "zoo1:2181,zoo2:2181,zoo3:2181", "persistence", {topic: 1})
    
    return [sc,ssc,dstream]


In [4]:
def predictDay(data):
    global state,day
    new_values = data.collect()
    if len(new_values) > 0:
        sensorsToPredict = state[1]
        array_values=np.array(data)
        predictions = dict()
        truths = dict()
        seconds = dict()
        last_temperature = dict()
        for sensor in sensorsToPredict:
            last_temperature[sensor] = state[0][sensor]
            truths[sensor] = []
            predictions[sensor] = []
            seconds[sensor] = []
        for i,array_ in new_values:
            if i != array_[2]:
                print(False)
            if array_[2] in sensorToPredict:
                s = array_[2]
                truths[s].append(array_[0])
                seconds[s].append(array_[1])
                predictions[s].append(last_temperature[s])
        
        for sensor in sensorsToPredict:
            last_temperature[s] = truths[sensor][-1]
            MSE=np.mean((np.array(truths[sensor])-np.array(predictions[sensor]))**2)
            print(day,sensor,MSE)

            fig1,ax1 = plt.subplots()
            ax1.scatter(seconds[sensor],predictions[sensor],label='prediction',marker='.', alpha=0.5)#, linestyle='None')
            ax1.scatter(seconds[sensor],truths[sensor],label='truth',marker='.', alpha=0.5)#, linestyle='None')
            plt.title('Temperature for sensor {0} on day {1}'.format(sensor,day))
            plt.ylabel('Temperature in °C')
            plt.xlabel('Time t in seconds')
            plt.legend()
#             plt.show()
            plt.savefig('sensor_{0}_day_{1}.pdf'.format(sensor,day))
        day+=1
        state = [last_temperature,sensorsToPredict]
                
                

### Define streaming pipeline

* We define one state, which is a list of two elements:
    * The last measurement
    * The output of predictions for sensor 1 for day 8
* We create a DStream, flat map with the sensor ID as key, update state for the stream, and save MSE

In [5]:
#Helper functions

#Print number of partitions and number of records for an RDD
def printInfoRDD(rdd):
    if rdd is not None:
        print("The RDD has "+str(rdd.getNumPartitions())+" partitions")
        print("The RDD has "+str(rdd.count())+" elements")
    else:
        print("No info to provide")
        
#Save state in global Python variable
def saveState(rdd):
    global state_global
    if rdd is not None:
        print('save')
        data=rdd.collect()
#         print(data)
        state_global=data


In [None]:
#Initial state
last_measurement={1:0,24:0}
sensorToPredict=[1,24]
output_day8=None
day = 1
state=[last_measurement,sensorToPredict]

#Batch interval (to be synchronized with KafkaSend)
interval=10

#This variable is used to retrieve state data (through saveState function)
state_global=None

#Create dtsream
[sc,ssc,dstream]=getKafkaDStream(spark=spark,topic='persistence',batch_interval=interval)

#Evaluate string content (a list) and cast as float value

dstream = dstream.map(lambda x: np.array(ast.literal_eval(x[1])))
#Use this for debugging

#Group by sensor id. x[2] is here the sensorId (for example '1'), and x are the sensor measurement, seconds, sensorId and sensor type)
dstream=dstream.flatMap(lambda x: [(x[2],x)])

dstream.foreachRDD(predictDay)

### Start streaming application

In [None]:
#For synchronization with receiver (for the sake of the simulation), starts at a number of seconds multiple of five
current_time=time.time()
time_to_wait=interval-current_time%interval
time.sleep(time_to_wait)

ssc.start()
ssc.awaitTermination()

1 1 404.496782137
1 24 411.965460532
2 1 469.207844804
2 24 50.0523572009
3 1 10.0465324233
3 24 64.218765491
4 1 8.75933262154
4 24 89.2870464723
5 1 10.6557572088
5 24 75.6930710504
6 1 6.90785660112
6 24 25.0309479998
7 1 14.2250965906
7 24 19.1663016444
8 1 14.6868126419
8 24 27.6863858409
