In [3]:
from pyspark.sql.types import *
from pyspark.sql.functions import udf , col, lag, datediff, unix_timestamp,lit,coalesce,concat,split, explode
from pyspark.sql.window import Window

In [4]:
schema_flow = StructType().add('Timestamp', TimestampType(), False) \
        .add('Ds_Reference', StringType(), False) \
        .add('Detector_Number', ShortType(), False) \
        .add('Traffic_Direction', ShortType(), False) \
        .add('Flow_In', ShortType(), False) \
        .add('Average_Speed', ShortType(), False) \
        .add('Sign_Aid_Det_Comms', ShortType(), False) \
        .add('Status', ShortType(), False) \
        .add('Legend_Group', ShortType(), False) \
        .add('Legend_Sign', ShortType(), False) \
        .add('Legend_SubSign', ShortType(), False) \
        .add('Protocol_Version', StringType(), False) 

In [5]:
df_raw = spark.read.csv('data/mcs_201606.csv', sep=';', schema=schema_flow, ignoreLeadingWhiteSpace=True, \
                    ignoreTrailingWhiteSpace=True, timestampFormat='yyyy-MM-dd HH:mm:ss.SSS')
df_raw.printSchema()

root
 |-- Timestamp: timestamp (nullable = true)
 |-- Ds_Reference: string (nullable = true)
 |-- Detector_Number: short (nullable = true)
 |-- Traffic_Direction: short (nullable = true)
 |-- Flow_In: short (nullable = true)
 |-- Average_Speed: short (nullable = true)
 |-- Sign_Aid_Det_Comms: short (nullable = true)
 |-- Status: short (nullable = true)
 |-- Legend_Group: short (nullable = true)
 |-- Legend_Sign: short (nullable = true)
 |-- Legend_SubSign: short (nullable = true)
 |-- Protocol_Version: string (nullable = true)



In [None]:
%%time
df_raw.count()

In [4]:
df_raw.show(5)

+-------------------+------------+---------------+-----------------+-------+-------------+------------------+------+------------+-----------+--------------+----------------+
|          Timestamp|Ds_Reference|Detector_Number|Traffic_Direction|Flow_In|Average_Speed|Sign_Aid_Det_Comms|Status|Legend_Group|Legend_Sign|Legend_SubSign|Protocol_Version|
+-------------------+------------+---------------+-----------------+-------+-------------+------------------+------+------------+-----------+--------------+----------------+
|2016-06-01 00:00:00| E182N 2,015|             49|               78|      0|          252|                 0|     1|         255|          1|             1|               4|
|2016-06-01 00:00:00| E182N 2,015|             50|               78|      0|          252|                 0|     1|         255|          1|             1|               4|
|2016-06-01 00:00:00| E182N 2,015|             51|               78|      0|          252|                 0|     1|         255| 

In [5]:
df_raw.select('Detector_Number','Timestamp','Ds_Reference','Average_Speed','Flow_In','Status').show(10)

+---------------+-------------------+------------+-------------+-------+------+
|Detector_Number|          Timestamp|Ds_Reference|Average_Speed|Flow_In|Status|
+---------------+-------------------+------------+-------------+-------+------+
|             49|2016-06-01 00:00:00| E182N 2,015|          252|      0|     1|
|             50|2016-06-01 00:00:00| E182N 2,015|          252|      0|     1|
|             51|2016-06-01 00:00:00| E182N 2,015|          252|      0|     1|
|             52|2016-06-01 00:00:00| E182N 2,015|          252|      0|     1|
|             49|2016-06-01 00:00:00| E182N 2,325|          252|      0|     1|
|             50|2016-06-01 00:00:00| E182N 2,325|          252|      0|     1|
|             51|2016-06-01 00:00:00| E182N 2,325|          252|      0|     1|
|             49|2016-06-01 00:00:00| E182N 2,690|          252|      0|     1|
|             50|2016-06-01 00:00:00| E182N 2,690|          252|      0|     1|
|             51|2016-06-01 00:00:00| E1

In [6]:
split_schema = StructType([
  StructField('Road', StringType(), False),
  StructField('Km_Ref', IntegerType(), False)
])


@udf(split_schema)
#@udf(StringType())
def split_ds_ref(s):
    try:
        r, km = s.split(' ')
        k, m = km.split(',')
        meter = int(k)*1000 + int(m)
        #var1[:] + meter 
        return r, meter
    except:
        return None
var1 = ''
#@udf(split_schema)
@udf(StringType())
def split_ds_ref2(s):
    try:
        r, km = s.split(' ')
        return r
    except:
        return None 
@udf(StringType())
def split_ds_ref3(s):
    try:
        r, km = s.split(' ')
        k, m = km.split(',')
        meter = int(k)*1000 + int(m)  
        return meter
    except:
        return None     
#def generate_sensor_ids1(*cols):
#    return concat(*[coalesce(c, lit("*")) for c in cols]) 
def generate_sensor_ids(s, d):
    r, km = s.split(' ')
    k, m = km.split(',')
    meter = int(k)*1000 + int(m)
    var1 = r, meter
    return var

funcConcatCols = udf(lambda x,y,z: x+'_'+y+'_'+z,StringType())
 

In [7]:
ascii_to_int = udf(lambda x : x - 48, ShortType())
df_cleanup1 = df_raw.withColumn('Detector_Number', ascii_to_int('Detector_Number'))


In [8]:
df_cleanup2 = df_cleanup1.withColumn('Ds_Ref_temp1',   split_ds_ref2('Ds_Reference')).withColumn('Ds_Ref_temp2',split_ds_ref3('Ds_Reference'))
df_cleanup3 = df_cleanup2.withColumn('Ds_Ref', funcConcatCols(col('Ds_Ref_temp1'), col('Ds_Ref_temp2'),col('Detector_Number').cast(StringType())))
df_cleanup3.show(2)

df_cleanup4 = df_cleanup3.withColumn('Ds_Reference',split_ds_ref('Ds_Reference'))
df_cleanup4.show(2)

+-------------------+------------+---------------+-----------------+-------+-------------+------------------+------+------------+-----------+--------------+----------------+------------+------------+------------+
|          Timestamp|Ds_Reference|Detector_Number|Traffic_Direction|Flow_In|Average_Speed|Sign_Aid_Det_Comms|Status|Legend_Group|Legend_Sign|Legend_SubSign|Protocol_Version|Ds_Ref_temp1|Ds_Ref_temp2|      Ds_Ref|
+-------------------+------------+---------------+-----------------+-------+-------------+------------------+------+------------+-----------+--------------+----------------+------------+------------+------------+
|2016-06-01 00:00:00| E182N 2,015|              1|               78|      0|          252|                 0|     1|         255|          1|             1|               4|       E182N|        2015|E182N_2015_1|
|2016-06-01 00:00:00| E182N 2,015|              2|               78|      0|          252|                 0|     1|         255|          1|       

In [None]:
#import pandas as pd
#import matplotlib
#import matplotlib.pyplot as plt

In [None]:
# plot data
#speed_histogram = df_cleanup2.select('Status').rdd.flatMap(lambda x: x).histogram(5)
#speed_histogram

In [None]:
#pd.DataFrame(list(zip(list(speed_histogram)[0], list(speed_histogram)[1])), \
#             columns=['Status','Status Distibution']).set_index('Status').plot(kind='bar')

In [None]:
df_cleanup4.write.save('data/trafficData_E4N.parquet', format='parquet')

In [10]:
%%time
df_trafficData_E4N = spark.read.parquet('data/trafficData_E4N.parquet').select('Timestamp', 'Ds_Reference', 'Ds_Ref', 'Detector_Number', 'Flow_In', 'Average_Speed').where('Status == 3 AND Ds_Reference.Road == "E4N"')

CPU times: user 4 ms, sys: 4 ms, total: 8 ms
Wall time: 1.52 s


In [11]:
df_trafficData_E4N.createOrReplaceTempView("NormalTrafficFlow")

In [11]:
df_trafficData_E4N.show(10)

+-------------------+------------+-----------+---------------+-------+-------------+
|          Timestamp|Ds_Reference|     Ds_Ref|Detector_Number|Flow_In|Average_Speed|
+-------------------+------------+-----------+---------------+-------+-------------+
|2016-06-09 01:45:00|[E4N, 47465]|E4N_47465_2|              2|      4|          101|
|2016-06-09 01:45:00|[E4N, 47465]|E4N_47465_3|              3|      7|           84|
|2016-06-09 01:45:00|[E4N, 47800]|E4N_47800_2|              2|      4|           98|
|2016-06-09 01:45:00|[E4N, 47800]|E4N_47800_3|              3|      7|           91|
|2016-06-09 01:45:00|[E4N, 48290]|E4N_48290_2|              2|      2|          104|
|2016-06-09 01:45:00|[E4N, 48290]|E4N_48290_3|              3|      4|           97|
|2016-06-09 01:45:00|[E4N, 48620]|E4N_48620_2|              2|      2|          119|
|2016-06-09 01:45:00|[E4N, 48620]|E4N_48620_3|              3|      3|          105|
|2016-06-09 01:45:00|[E4N, 48935]|E4N_48935_2|              2|   

In [12]:
# Add Density Column
df2_trafficData_E4N = df_trafficData_E4N.withColumn('Density', col('Flow_In')*60/col('Average_Speed'))

w = Window.partitionBy('Ds_Reference', 'Detector_Number').orderBy('Timestamp')
time_diff = unix_timestamp('Timestamp', format='yyyy-MM-dd HH:mm:ss.SSS') - lag(unix_timestamp('Timestamp', format='yyyy-MM-dd HH:mm:ss.SSS')).over(w)                            
df3_trafficData_E4N = df2_trafficData_E4N.withColumn('Time_Lag_Length', time_diff).filter(col('Time_Lag_Length') == 60)
df3_trafficData_E4N.show(20)

+-------------------+------------+-----------+---------------+-------+-------------+------------------+---------------+
|          Timestamp|Ds_Reference|     Ds_Ref|Detector_Number|Flow_In|Average_Speed|           Density|Time_Lag_Length|
+-------------------+------------+-----------+---------------+-------+-------------+------------------+---------------+
|2016-06-01 03:03:00|[E4N, 30710]|E4N_30710_2|              2|     18|          102|10.588235294117647|             60|
|2016-06-01 03:04:00|[E4N, 30710]|E4N_30710_2|              2|     24|           96|              15.0|             60|
|2016-06-01 03:05:00|[E4N, 30710]|E4N_30710_2|              2|     15|           97| 9.278350515463918|             60|
|2016-06-01 03:06:00|[E4N, 30710]|E4N_30710_2|              2|      8|          105| 4.571428571428571|             60|
|2016-06-01 03:07:00|[E4N, 30710]|E4N_30710_2|              2|     10|          100|               6.0|             60|
|2016-06-01 03:08:00|[E4N, 30710]|E4N_30

In [23]:
df_diff.show(20)

+-------------------+------------+-----------+---------------+-------+-------------+------------------+----------------+
|          Timestamp|Ds_Reference|     Ds_Ref|Detector_Number|Flow_In|Average_Speed|           Density|Time_Lag_Length |
+-------------------+------------+-----------+---------------+-------+-------------+------------------+----------------+
|2016-06-01 00:48:00|[E4N, 30710]|E4N_30710_2|              2|      1|           28| 2.142857142857143|            null|
|2016-06-01 02:27:00|[E4N, 30710]|E4N_30710_2|              2|      1|           36|1.6666666666666667|            5940|
|2016-06-01 02:33:00|[E4N, 30710]|E4N_30710_2|              2|      1|           99|0.6060606060606061|             360|
|2016-06-01 03:02:00|[E4N, 30710]|E4N_30710_2|              2|     11|          113|  5.84070796460177|            1740|
|2016-06-01 03:03:00|[E4N, 30710]|E4N_30710_2|              2|     18|          102|10.588235294117647|              60|
|2016-06-01 03:04:00|[E4N, 30710

In [14]:
#df4_trafficData_E4N = df3_trafficData_E4N.drop("Ds_Reference")
df4_trafficData_E4N = df3_trafficData_E4N.select('Timestamp', 'Density', 'Ds_Ref', 'Detector_Number', 'Flow_In', 'Average_Speed').where('Status == 3 AND Ds_Reference.Road == "E4N"')
df4_trafficData_E4N.show(2)

+-------------------+------------------+-----------+---------------+-------+-------------+
|          Timestamp|           Density|     Ds_Ref|Detector_Number|Flow_In|Average_Speed|
+-------------------+------------------+-----------+---------------+-------+-------------+
|2016-06-01 03:03:00|10.588235294117647|E4N_30710_2|              2|     18|          102|
|2016-06-01 03:04:00|              15.0|E4N_30710_2|              2|     24|           96|
+-------------------+------------------+-----------+---------------+-------+-------------+
only showing top 2 rows



In [None]:
#df4_trafficData_E4N.coalesce(1).write.format("com.databricks.spark.csv").save("data/df4_trafficData_E4N.csv") memory error

In [None]:
import pandas as pd
df4_trafficData_E4N.write.save('data/df4_trafficData_E4N.parquet', format='parquet')
df = pd.read_parquet('data/df4_trafficData_E4N.parquet')


In [18]:
df.to_csv('df4_trafficData_E4N.csv')

In [6]:
from pandas import DataFrame
from pandas import Series
from pandas import concat
from pandas import read_csv
from pandas import datetime
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.regularizers import L1L2
from math import sqrt
import matplotlib
import numpy as np
from keras import regularizers
# frame a sequence as a supervised learning problem
def timeseries_to_supervised(data, lag=1):
    df = DataFrame(data)
    columns = [df.shift(i) for i in range(1, lag+1)]
    columns.append(df)
    df = concat(columns, axis=1)
    return df
 
# create a differenced series
def getDifference(dataset, interval=2):
    density_diff = list() 
    for i in range(interval, len(dataset)):
        density_value = dataset[i] - dataset[i - interval]
        density_diff.append(density_value)
    return Series(density_diff)
 
# Reverse values
def reverse_density_difference(history, yhat, interval=1):
    return yhat + history[-interval]
 
# scale data to [-1, 1]
def scaleData(train, test):
    # fit scaler
    min_max_scaler = MinMaxScaler(feature_range=(-1, 1))
    min_max_scaler = min_max_scaler.fit(train)
    # transform train
    # train[train_indices]
    train = train.values.reshape(train.shape[0],train.shape[1])
    
    #train = train.reshape(train.shape[0], train.shape[1])
    train_min_max_scaled_data = min_max_scaler.transform(train)
    # transform test
    test = test.values.reshape(test.shape[0],test.shape[1])
    # test = test.reshape(test.shape[0], test.shape[1])
    test_min_max_scaled_data = min_max_scaler.transform(test)
    return min_max_scaler, train_min_max_scaled_data, test_min_max_scaled_data
 
# Reverse scaling for predicted values
def reverse_scale(scaler, X, yhat):
    create_row = [x for x in X] + [yhat]
    array = numpy.array(create_row)
    array = array.reshape(1, len(array))
    inverted = scaler.inverse_transform(array)
    return inverted[0, -1]
from keras.optimizers import Adam
# fit an LSTM network to training data
def fit_lstm(train, batch_size, epochs, neurons,elasticnet_regularizer):
    X, y = train[:, 0:-1], train[:, -1]
    X = X.reshape(X.shape[0], 1, X.shape[1])
    model = Sequential()
    # model.add(LSTM(neurons, batch_input_shape=(batch_size, X.shape[1], X.shape[2]), stateful=True,recurrent_regularizer=elasticnet_regularizer))
    adam = Adam(lr=0.01, clipvalue=0.5)
    model.add(LSTM(8, batch_input_shape=(batch_size, X.shape[1], X.shape[2]), return_sequences=True,
                   kernel_regularizer=regularizers.l2(0.01), activity_regularizer=regularizers.l2(0.01)))
    model.add(LSTM(4, activation = 'hard_sigmoid', inner_activation = 'hard_sigmoid')) #return_sequences = True

    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')
    print(model.summary())
    for i in range(epochs):
        model.fit(X, y, epochs=1, batch_size=batch_size, verbose=1, shuffle=False)
        model.reset_states()
    return model
 

In [2]:

from sklearn.model_selection import train_test_split
# run recurrent experiment
def experiment(traffic_series, time_lag, recurrence, epochs, batch_size, neurons,elasticnet_regularizer):
    # Convert time series to stationary data
    
    raw_density_values = traffic_series.values
    differenced_density_values = getDifference(raw_density_values, 1)
    
    # Convert time series to supervised learning, prediction at previous timestep to be used current time step
    supervised_data = timeseries_to_supervised(differenced_density_values, time_lag)
   
    # split data into train and test-sets
    train, test = train_test_split(supervised_data, train_size=0.8)
    
    # transform the scale of the data
    scaler, train_scaled, test_scaled = scaleData(train, test)
    
    # run experiment
    error_scores = list()
    for r in range(recurrence):
        # fit the model
        train_trimmed = train_scaled[2:, :]
        model = fit_lstm(train_trimmed, batch_size, epochs, neurons,elasticnet_regularizer)
        
        # make prediction on test dataset
        test_reshaped = test_scaled[:,0:-1]
        test_reshaped = test_reshaped.reshape(len(test_reshaped), 1, 1)
        output = model.predict(test_reshaped, batch_size=batch_size)
        predictions = list()
        # Reverse to original scale before calculating prediction error
        for i in range(len(output)):
            yhat = output[i,0]
            X_input = test_scaled[i, 0:-1]
            # Reverse scaling
            yhat = reverse_scale(scaler, X_input, yhat)
            # Reverse differencing density
            yhat = reverse_density_difference(raw_density_values, yhat, len(test_scaled)+1-i)
            # Save predictions
            predictions.append(yhat)
            # report performance
        rmse = sqrt(mean_squared_error(raw_density_values[:], predictions))
        print('%d) Test RMSE: %.3f' % (r+1, rmse))
        error_scores.append(rmse)
    return error_scores


In [3]:

def time_stamp_parser(time_stamp):
    return datetime.strptime(time_stamp, '%Y-%m-%d %H:%M:%S')

traffic_series = read_csv('data/df4_trafficData_E4N.csv', header=0, parse_dates=[1],
                  squeeze=True, decimal=',', date_parser=time_stamp_parser)

traffic_series.head()
traffic_series.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15156387 entries, 0 to 15156386
Data columns (total 7 columns):
Unnamed: 0         int64
Timestamp          datetime64[ns]
Density            object
Ds_Ref             object
Detector_Number    int64
Flow_In            int64
Average_Speed      int64
dtypes: datetime64[ns](1), int64(4), object(2)
memory usage: 809.4+ MB


In [4]:
traffic_series1 = traffic_series.Density.astype(float)
traffic_series1.head()

0    10.588235
1    15.000000
2     9.278351
3     4.571429
4     6.000000
Name: Density, dtype: float64

In [None]:
import pandas as pd
from pandas import read_csv
#from pandas import datetime
from datetime import datetime
# configure the experiment
def run():
    # load dataset
    #traffic_series = traffic_series1
    #units=128
    # configure the experiment
    time_lag = 1
    runExperiments = 30  # TODO: Run Experiment according to number of sensors  
    epochs = 2 #1000
    batch_size = 6
    neurons = 50
    elasticnet_regularizer = L1L2(l1=0.01, l2=0.01)
    # run the experiment
    results = DataFrame()
    results['results'] = experiment(traffic_series1, time_lag, runExperiments, epochs, batch_size, neurons,elasticnet_regularizer)
    # summarize results
    print(results.describe())
    
run() 



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (6, 1, 8)                 320       
_________________________________________________________________
lstm_2 (LSTM)                (6, 4)                    208       
_________________________________________________________________
dense_1 (Dense)              (6, 1)                    5         
Total params: 533
Trainable params: 533
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/1

In [6]:
traffic_series1 = traffic_series.Density.astype(float)
traffic_series1.head()

0    10.588235
1    15.000000
2     9.278351
3     4.571429
4     6.000000
Name: Density, dtype: float64

In [7]:
import pandas as pd
from pandas import read_csv
#from pandas import datetime
from datetime import datetime
# configure the experiment
def run():
    # load dataset
    #traffic_series = traffic_series1
    #units=128
    # configure the experiment
    time_lag = 1
    runExperiments = 30  # TODO: Run Experiment according to number of sensors  
    epochs = 10 #1000
    batch_size = 32077
    neurons = 50
    elasticnet_regularizer = L1L2(l1=0.01, l2=0.01)
    # run the experiment
    results = DataFrame()
    results['results'] = experiment(traffic_series1, time_lag, runExperiments, epochs, batch_size, neurons,elasticnet_regularizer)
    # summarize results
    print(results.describe())
    
run() 



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (32077, 50)               10400     
_________________________________________________________________
dense_1 (Dense)              (32077, 1)                51        
Total params: 10,451
Trainable params: 10,451
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1


ValueError: In a stateful network, you should only pass inputs with a number of samples that can be divided by the batch size. Found: 3031278 samples. Batch size: 32077.

In [3]:
#def time_stamp_parser(time_stamp):
#    return datetime.strptime(time_stamp, '%Y-%m-%d %H:%M:%S')

#traffic_series = read_csv('data/df4_trafficData_E4N.csv', header=0, parse_dates=[1],
#                  squeeze=True, decimal=',', date_parser=time_stamp_parser)

#traffic_series.head()
#traffic_series.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15156387 entries, 0 to 15156386
Data columns (total 7 columns):
Unnamed: 0         int64
Timestamp          datetime64[ns]
Density            object
Ds_Ref             object
Detector_Number    int64
Flow_In            int64
Average_Speed      int64
dtypes: datetime64[ns](1), int64(4), object(2)
memory usage: 809.4+ MB




                  0          0
13916465  -2.078049  -0.506567
9753022    3.170732  -8.000000
5887555    4.198758  -3.630847
10175463  -5.316456   2.488341
7027018    2.938053  -1.721519
5193790   -2.666667   4.802260
12840706  -3.166667   7.500000
6885462   -1.522118  -0.083857
5038497   -0.097371  -4.518014
8144673    1.212121  -0.599251
12712583  -4.567757   5.339034
7397697   -0.666667   1.926740
11175574   0.865385   0.157343
7516057   -5.597561  -4.147059
9317225    6.610797  -4.542569
15101068  -4.550000   2.234848
6278627   -0.025497   0.030888
13660861 -15.353535  22.091503
11600672   0.000000   0.825200
4385440    5.521669  -6.103896
2404482   -2.684932   3.052632
5933896   -6.095662   5.806452
2462454   -2.103387   5.426945
15091987   9.647059 -20.804954
2749573    0.260445  -1.344450
121336    -0.385488   0.539503
11087262  -6.251217  -5.226516
5982344   -0.453782  -1.100213
1082379    0.090909   6.309091
3432260    0.838235   5.969178
...             ...        ...
4619816 

AttributeError: 'DataFrame' object has no attribute 'reshape'

In [None]:

densDiff = col('Density')- lag('Density', 1).over(w)
 
time_diff = unix_timestamp('Timestamp', format='yyyy-MM-dd HH:mm:ss.SSS') - lag(unix_timestamp('Timestamp', format=timeFmt)).over(w)
                              
df_diff = df_trafficData_E4N.withColumn('Density_Diff', densDiff).withColumn('timeDiff', timeDiff)
df_diff.show(20)

In [None]:
import numpy as np
sensor_ids = spark.sql('select distinct Ds_Ref from NormalTrafficFlow').rdd.map(lambda row: row).collect()
sensor_ids

In [None]:
import numpy as np
sendor_ids = np.array(spark.sql('select distinct Ds_Ref from NormalTrafficFlow').rdd.map(lambda row: row[0]).collect())
sendor_ids

In [None]:
file_ids = spark.sql('select distinct _c1 from NormalTrafficFlow').rdd.map(lambda row: row._c1).collect()

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.callbacks import Callback
from keras.models import Sequential
from keras.layers import LSTM, Dense, Activation
import time


In [None]:
class LossHistory(Callback):
    def on_train_begin(self, logs={}):
        self.losses = []

    def on_batch_end(self, batch, logs={}):
        self.losses.append(logs.get('loss'))

In [None]:
timesteps = 100
dim = 2
lossHistory = LossHistory()
# design network

model = Sequential()
model.add(LSTM(50,input_shape=(timesteps,dim),return_sequences=True))
model.add(Dense(2))
model.compile(loss='mae', optimizer='adam')

def train(data):
    model.fit(data, data, epochs=20, batch_size=72, validation_data=(data, data), verbose=1, shuffle=False,callbacks=[lossHistory])

def score(data):
    yhat =  model.predict(data)
    return yhat

In [9]:
df5_trafficData_E4N = spark.read.parquet('data/df4_trafficData_E4N.parquet').select('Timestamp', 'Density')
df5_trafficData_E4N.createOrReplaceTempView("TransformedNormalTrafficFlow")

In [None]:
import numpy as np

def create_trimmed_recording():
    density_series = np.array(df5_trafficData_E4N.select('Timestamp', 'Density').rdd.map(lambda row: np.array([row.Timestamp,row.Density])).collect())
    print(density_series)
    samples = len(density_series)
    trimmed_samples_of_100s = samples % 100
    trimmed_density_series = density_series[:samples-trimmed_samples_of_100s]
    recording_trimmed.shape = (samples//timesteps,timesteps,dim)
    return recording_trimmed
print(create_trimmed_recording())



In [None]:

def create_trimmed_recording(df,file_id):
    recording = np.array(df.orderBy(df['Timestamp']).where(df['Ds_Ref'] == file_id).select('Density','Density_Diff').rdd.map(lambda row: np.array([row.Density,row.Density_Diff])).collect())  
    
    samples = len(recording)
    print(samples)
    trim = samples % 100
    recording_trimmed = recording[:samples-trim]
    print(recording_trimmed.shape)
    recording_trimmed.shape = (samples//timesteps,timesteps,dim)
    return recording_trimmed

In [None]:
#sensor_ids = spark.sql('select distinct _c1 from df_healthy').rdd.map(lambda row: row._c1).collect()
start = time.time()
for sensor_id in sendor_ids:
    print(sensor_id)
    recording_trimmed = create_trimmed_recording(df_diff,sensor_id)
    print ("Staring training on:" + str(sensor_id))
    train(recording_trimmed)
    print ("Finished training on" + str(sensor_id) + " after " + str(time.time()-start) + " seconds")

print ("Finished job on after " + str(time.time()-start) + " seconds")
healthy_losses = lossHistory.losses

In [None]:
df_E4N = spark.sql('SELECT Timestamp, Ds_Reference,Ds_Ref, Detector_Number, Flow_In, Average_Speed ' 
                  'FROM NormalTrafficFlow WHERE Status == 3 AND Ds_Reference.Road == "E4N"')

In [None]:
#newDF = df_E4N.select("Ds_Reference", "Ds_Reference.*");


In [None]:
#sensor_ids = spark.sql('select Ds_Reference from NormalTrafficFlow').rdd.map(lambda x : x.Ds_Reference ).collect()
#sensor_ids

In [None]:
#df_E4N.count()
df_E4N.show(10)

In [None]:
# Add Density Column
df_E4N_D = df_E4N.withColumn('Density', col('Flow_In')*60/col('Average_Speed'))
df_E4N_D.show(20)

In [None]:

w = Window.partitionBy('Ds_Reference', 'Detector_Number').orderBy('Timestamp')
densDiff = col('Density')- lag('Density', 1).over(w)
timeFmt = 'yyyy-MM-dd HH:mm:ss.SSS'
timeDiff = unix_timestamp('Timestamp', format=timeFmt) - lag(unix_timestamp('Timestamp', format=timeFmt)).over(w)
                              
df_diff = df_E4N_D.withColumn('Density_Diff', densDiff).withColumn('timeDiff', timeDiff)
df_diff.show(20)


In [None]:
import numpy as np
#sensor_ids = np.array(df_diff.select('Ds_Ref').collect())
#file_ids = spark.sql('select distinct _c1 from df_healthy').rdd.map(lambda row: row._c1).collect()
sensor_ids = df_diff.select("Ds_Ref").rdd.flatMap(lambda row : row).collect()

In [None]:
#sensor_ids = spark.sql('select  distinct Ds_Ref from NormalTrafficFlow').rdd.map(lambda x : x._3).collect()
sensor_ids = df_diff.select("Ds_Ref").rdd.flatMap(lambda x: x).collect()

In [None]:
sensor_ids = np.array(df_diff.select('Ds_Ref').rdd.map(lambda row: np.array([row[0]])).collect())

In [None]:
import numpy as np
#sensor_ids = np.array(df_diff.select('Ds_Ref').collect())
#file_ids = spark.sql('select distinct _c1 from df_healthy').rdd.map(lambda row: row._c1).collect()
sensor_ids = df_diff.select("Ds_Ref").rdd.flatMap(lambda row : row).collect()

In [None]:
sensor_ids

In [None]:
sensor_ids = df_diff.select("Ds_Ref").rdd.map(lambda row : row).collect()

In [None]:
def create_trimmed_recording(df,file_id):
    recording = np.array(df.df_diff(df['_c0']).where(df['_c1'] == file_id).select('_c2','_c3').rdd.map(lambda row: np.array([row._c2,row._c3])).collect())  
    samples = len(recording)
    #Ds_Ref
    trim = samples % 100
    recording_trimmed = recording[:samples-trim]
    recording_trimmed.shape = (samples//timesteps,timesteps,dim)
    return recording_trimmed

In [None]:
for sensor_id from sensor_ids:
    trimmed_recording = create_trimmed_record(df_diff, sensor_id)
        recording_trimmed = create_trimmed_recording(df_healthy_read_parquet,file_id)
    print ("Staring training on:" + str(file_id))
    
    train(recording_trimmed)
    print ("Finished training on" + str(file_id) + " after " + str(time.time()-start) + " seconds")

print ("Finished job on after " + str(time.time()-start) + " seconds")
healthy_losses = lossHistory.losses

In [None]:
timesteps = 100
dim = 2
lossHistory = LossHistory()
# design network

model = Sequential()
model.add(LSTM(50,input_shape=(timesteps,dim),return_sequences=True))
model.add(Dense(2))
model.compile(loss='mae', optimizer='adam')

def train(data):
    model.fit(data, data, epochs=20, batch_size=72, validation_data=(data, data), verbose=1, shuffle=False,callbacks=[lossHistory])

def score(data):
    yhat =  model.predict(data)
    return yhat

In [None]:
df_E4N = spark.sql('SELECT Timestamp, Ds_Reference, Detector_Number, Flow_In, Average_Speed ' 
                  'FROM NormalTrafficFlow WHERE Status == 3 AND Ds_Reference.Road == "E4N"')

sensor_ids = spark.sql('select distinct Sensor_ID from df_diff').rdd.map(lambda x : x.Sensor_ID ).collect()


In [None]:
df_raw.withColumn('Ds_Reference', split_ds_ref('Ds_Reference'))

In [None]:
#%%spark -o hist_df
hist_df = spark.createDataFrame(list(zip(list(speed_histogram)[0], list(speed_histogram)[1])), \
             schema=['Status','Status distribution'])

In [None]:
hist_df.set_index('Status').plot(kind='bar')

In [None]:
df_cleanup2.createOrReplaceTempView("FlowData")
df_E4N = spark.sql('SELECT Timestamp, Ds_Reference, Detector_Number, Flow_In, Average_Speed ' 
                  'FROM FlowData WHERE Status == 3 ')
df_E4N = spark.sql('SELECT Timestamp, Ds_Reference, Detector_Number, Flow_In, Average_Speed ' 
                  'FROM FlowData WHERE Status == 1 ')
df_E4N = spark.sql('SELECT Timestamp, Ds_Reference, Detector_Number, Flow_In, Average_Speed ' 
                  'FROM FlowData WHERE Status == 2 ')