In [None]:
import pandas as pd
from sklearn.impute import SimpleImputer
import os
from plotly import graph_objs as go

In [None]:
def clean_data(
        csv_url='https://gitlab.com/creativitylabb/data-lstm/-/raw/main/last_step_pagination_110422.csv',
        start_date='2021-01-01',
        output_path='df_imputed_120422.csv'):
    """
    Load the raw sensor readings, resample them to daily means and impute the gaps.

    Steps:
      1. Read the CSV at ``csv_url`` and drop its ``Unnamed: 0`` column.
      2. Parse ``TimeStamp`` and use it as a chronologically sorted index.
      3. Keep only rows whose timestamp is later than ``start_date``
         (earlier data has too many missing values).
      4. Resample to one row per calendar day (mean of that day's readings);
         days without readings become all-NaN rows.
      5. Print, per column, how many daily rows are missing.
      6. Drop ``LocationLat`` / ``LocationLong`` and fill every remaining NaN with
         the column's most frequent value.
      7. Write the result to ``output_path`` and return it.

    Parameters
    ----------
    csv_url : str
        Location of the raw CSV; anything ``pandas.read_csv`` accepts.
    start_date : str
        Rows with a timestamp at or before this value are discarded.
    output_path : str
        Where the imputed frame is written as CSV.

    Returns
    -------
    pandas.DataFrame
        Daily frame indexed by ``TimeStamp`` that also carries the same
        timestamps in a ``TimeStamp`` column (so the CSV contains both).
    """
    df = pd.read_csv(csv_url)
    df = df.drop('Unnamed: 0', axis=1)

    df['TimeStamp'] = pd.to_datetime(df['TimeStamp'], format="%Y-%m-%d %H:%M:%S")

    # per-column count of non-null raw readings (TimeStamp is still a column here)
    print(df.count())

    # set_index moves TimeStamp out of the columns so that only the measurement
    # columns are averaged below; sort_index returns a NEW frame, so keep the result
    df = df.set_index('TimeStamp').sort_index()

    # remove data before start_date
    df = df[df.index > start_date]

    # add missing days
    df = df.resample('D').mean()

    # check how many rows are missing
    row_count = df.shape[0]
    for c in df.columns:
        m_count = df[c].isna().sum()
        if m_count > 0:
            missing_share = round(100 * float(m_count) / float(row_count), 3)
            print(f'{c} - {m_count} ({missing_share}%) rows missing')

    df = df.drop(['LocationLat', 'LocationLong'], axis=1)

    # TODO check whether 0 or the most frequent value is the better fill for pm2.5, pm1 and pm10
    imp = SimpleImputer(strategy="most_frequent")
    # label the imputed array with the frame's own columns instead of a hard-coded
    # list, so names cannot drift out of sync with the column order of the CSV
    df_imputed = pd.DataFrame(imp.fit_transform(df), columns=df.columns, index=df.index)
    df_imputed['TimeStamp'] = df.index

    df_imputed.to_csv(output_path)
    return df_imputed

In [None]:
df = clean_data()  # load, resample to daily rows and impute the sensor data

TimeStamp       435
pm25            435
pm1             435
pm10            435
co2             425
o3              426
cho2            425
no2             424
so2             367
LocationLat     435
LocationLong    435
dtype: int64
pm25 - 33 (7.143%) rows missing
pm1 - 33 (7.143%) rows missing
pm10 - 33 (7.143%) rows missing
co2 - 39 (8.442%) rows missing
o3 - 38 (8.225%) rows missing
cho2 - 39 (8.442%) rows missing
no2 - 38 (8.225%) rows missing
so2 - 95 (20.563%) rows missing
LocationLat - 33 (7.143%) rows missing
LocationLong - 33 (7.143%) rows missing


In [None]:
df # imputed frame returned by clean_data(); a bare name on the last line of a cell displays it

Unnamed: 0_level_0,pm25,pm1,pm10,co2,o3,cho2,no2,so2,TimeStamp
TimeStamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2021-01-09,10.000000,3.000000,3.000000,433.508333,27.316667,6.8,6.280000,2.401389,2021-01-09
2021-01-10,1.000000,6.450079,6.000000,433.508333,27.316667,6.8,6.280000,2.401389,2021-01-10
2021-01-11,1.000000,1.000000,1.000000,433.508333,27.316667,6.8,6.280000,2.401389,2021-01-11
2021-01-12,1.000000,6.450079,6.000000,433.508333,27.316667,6.8,6.280000,2.401389,2021-01-12
2021-01-13,1.000000,6.450079,6.000000,433.508333,27.316667,6.8,6.280000,2.401389,2021-01-13
...,...,...,...,...,...,...,...,...,...
2022-04-11,12.367347,7.224490,12.067797,618.400000,32.828571,6.6,11.630000,2.366667,2022-04-11
2022-04-12,1.000000,6.450079,6.000000,433.508333,27.316667,6.8,6.280000,2.401389,2022-04-12
2022-04-13,1.000000,6.450079,6.000000,433.508333,27.316667,6.8,6.280000,2.401389,2022-04-13
2022-04-14,1.000000,6.450079,6.000000,433.508333,27.316667,6.8,6.280000,2.401389,2022-04-14


In [None]:
import pandas as pd
import pickle
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import TimeseriesGenerator

In [None]:
def create_model_lstm(df, column_index, sensor_name, epochs):
    """
    Train a stacked LSTM that forecasts one column of ``df`` and save it to disk.

    All columns of ``df`` are min-max scaled and used as input features; the
    scaled column at ``column_index`` is the regression target. The first 80% of
    the rows (in their existing order) are used for training and the last 20% for
    validation. Each sample is a window of ``win_length`` consecutive rows and
    the target is the value of the row that follows the window.

    NOTE(review): the scaler is fitted on the whole frame, i.e. including the
    validation rows, and is not persisted. ``load_model_lstm`` re-fits a scaler
    on the frame it is given, so both must receive the same data for the scaling
    to match.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame, one column per feature, rows in chronological order.
        It is not modified.
    column_index : int
        Position of the column to predict.
    sensor_name : str
        Used only to build the output file name ``lstm_model_<sensor_name>.h5``.
    epochs : int
        Number of training epochs.

    Returns
    -------
    None
        The trained model is written to ``lstm_model_<sensor_name>.h5``.
    """
    scaler = MinMaxScaler()  # scale data
    data_scaled = scaler.fit_transform(df)

    features = data_scaled
    target = data_scaled[:, column_index]  # target sensor to be predicted

    # shuffle=False keeps the chronological order, so the test part is the most recent 20%
    x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.2, shuffle=False)

    win_length = 2  # rows per input window
    batch_size = 128

    num_features = data_scaled.shape[1]  # features used in model

    train_generator = TimeseriesGenerator(x_train, y_train, length=win_length, sampling_rate=1, batch_size=batch_size)
    test_generator = TimeseriesGenerator(x_test, y_test, length=win_length, sampling_rate=1, batch_size=batch_size)

    model = tf.keras.Sequential()
    model.add(tf.keras.layers.LSTM(150, input_shape=(win_length, num_features),
                                   return_sequences=True))
    model.add(tf.keras.layers.LeakyReLU(alpha=0.5))
    model.add(tf.keras.layers.LSTM(50, return_sequences=True))
    model.add(tf.keras.layers.Dropout(0.2))  # make sure not overfit
    model.add(tf.keras.layers.LSTM(60, return_sequences=False))
    model.add(tf.keras.layers.Dense(1))

    # summary() prints by itself and returns None; wrapping it in print() adds a stray "None"
    model.summary()

    # this is a regression, so track mean absolute error; classification
    # "accuracy" is not meaningful for continuous targets
    model.compile(loss=tf.keras.losses.MeanSquaredError(), optimizer=tf.optimizers.Adam(), metrics=["mae"])

    model.fit(train_generator, epochs=epochs, validation_data=test_generator, shuffle=False)

    # evaluate model with test data (the returned loss/metric values are not used);
    # Model.evaluate accepts generators, evaluate_generator is deprecated
    model.evaluate(test_generator, verbose=0)

    model.save('lstm_model_' + str(sensor_name) + '.h5')  # creates a HDF5 file


In [None]:
#prepare data for forecast
df.index = df['TimeStamp']
df = df.drop('TimeStamp', axis=1)

df = df[['pm25', 'pm1', 'pm10']]

In [None]:
# display the frame that is passed to create_model_lstm / load_model_lstm below
df

Unnamed: 0_level_0,pm25,pm1,pm10
TimeStamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2021-01-09,10.000000,3.000000,3.000000
2021-01-10,1.000000,6.450079,6.000000
2021-01-11,1.000000,1.000000,1.000000
2021-01-12,1.000000,6.450079,6.000000
2021-01-13,1.000000,6.450079,6.000000
...,...,...,...
2022-04-11,12.367347,7.224490,12.067797
2022-04-12,1.000000,6.450079,6.000000
2022-04-13,1.000000,6.450079,6.000000
2022-04-14,1.000000,6.450079,6.000000


In [None]:
# Train and save one model per particulate sensor. Every model receives all three
# columns (pm25, pm1, pm10) as input features and predicts the column at column_index.
for column_index, sensor_name in enumerate(['pm25', 'pm1', 'pm10']):
    create_model_lstm(df=df, column_index=column_index, sensor_name=sensor_name, epochs=650)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 2, 150)            92400     
                                                                 
 leaky_re_lu (LeakyReLU)     (None, 2, 150)            0         
                                                                 
 lstm_1 (LSTM)               (None, 2, 50)             40200     
                                                                 
 dropout (Dropout)           (None, 2, 50)             0         
                                                                 
 lstm_2 (LSTM)               (None, 60)                26640     
                                                                 
 dense (Dense)               (None, 1)                 61        
                                                                 
Total params: 159,301
Trainable params: 159,301
Non-trai



Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_3 (LSTM)               (None, 2, 150)            92400     
                                                                 
 leaky_re_lu_1 (LeakyReLU)   (None, 2, 150)            0         
                                                                 
 lstm_4 (LSTM)               (None, 2, 50)             40200     
                                                                 
 dropout_1 (Dropout)         (None, 2, 50)             0         
                                                                 
 lstm_5 (LSTM)               (None, 60)                26640     
                                                                 
 dense_1 (Dense)             (None, 1)                 61        
                                                                 
Total params: 159,301
Trainable params: 159,301
Non-tr



Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_6 (LSTM)               (None, 2, 150)            92400     
                                                                 
 leaky_re_lu_2 (LeakyReLU)   (None, 2, 150)            0         
                                                                 
 lstm_7 (LSTM)               (None, 2, 50)             40200     
                                                                 
 dropout_2 (Dropout)         (None, 2, 50)             0         
                                                                 
 lstm_8 (LSTM)               (None, 60)                26640     
                                                                 
 dense_2 (Dense)             (None, 1)                 61        
                                                                 
Total params: 159,301
Trainable params: 159,301
Non-tr



In [None]:
def plot_fb_data(result, y, yhat, ds):
    """
    Show the observed and the predicted series of ``result`` in one plotly figure.

    Parameters
    ----------
    result : mapping of column name -> values (e.g. a DataFrame)
        Must contain the ``y``, ``yhat`` and ``ds`` columns.
    y : str
        Column holding the actual observations; also used as the trace name.
    yhat : str
        Column holding the predictions; also used as the trace name.
    ds : str
        Column holding the x-axis values shared by both traces.
    """
    fig = go.Figure()

    # one Scatter trace per series, actual first and prediction second
    for series_name in (y, yhat):
        trace = go.Scatter(x=result[ds], y=result[series_name], name=series_name)
        fig.add_trace(trace)

    layout_options = {
        'title_text': 'Predicted vs Actual Observations',
        'xaxis_rangeslider_visible': False,
    }
    fig.layout.update(**layout_options)
    fig.show()

In [None]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from keras.models import load_model


In [None]:
def load_model_lstm(df, selected_sensor):
    """
    Load the saved LSTM for ``selected_sensor`` and evaluate it on the last 20% of ``df``.

    The function re-creates the preprocessing used for training (min-max scaling
    of the pm25/pm1/pm10 columns, chronological 80/20 split, windows of 2 rows),
    predicts the test windows, converts the predictions back to the original
    units, prints and plots them next to the actual values and prints
    MSE, RMSE, R2, MAE and RMSE/MAE.

    NOTE(review): the scaler is re-fitted on ``df`` here rather than restored from
    training, so ``df`` has to be the frame the model was trained on for the
    scaling to match.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the columns ``pm25``, ``pm1`` and ``pm10``.
        It is not modified.
    selected_sensor : str
        One of ``'pm25'``, ``'pm1'``, ``'pm10'``; selects both the model file
        ``lstm_model_<selected_sensor>.h5`` and the target column.

    Raises
    ------
    ValueError
        If ``selected_sensor`` is not one of the three supported names.
    """
    sensor_columns = ['pm25', 'pm1', 'pm10']
    if selected_sensor not in sensor_columns:
        raise ValueError(
            "selected_sensor must be one of " + str(sensor_columns) + ", got " + repr(selected_sensor))

    target_idx = sensor_columns.index(selected_sensor)
    predicted_col = 'predicted ' + str(selected_sensor)

    # load created model
    model = load_model('lstm_model_' + str(selected_sensor) + '.h5')

    df = df[sensor_columns]

    scaler = MinMaxScaler()
    data_scaled = scaler.fit_transform(df)

    features = data_scaled
    target = data_scaled[:, target_idx]

    # shuffle=False keeps the chronological order, so the test part is the most recent 20%
    x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.2, shuffle=False)
    win_length = 2
    batch_size = 128
    test_generator = TimeseriesGenerator(x_test, y_test, length=win_length, sampling_rate=1,
                                         batch_size=batch_size)

    # Model.predict accepts generators; predict_generator is deprecated
    predictions = model.predict(test_generator)

    # The scaler inverts each column with that column's own min/max, so the
    # prediction has to sit in the target's column before inverse_transform.
    # (Putting it in column 0 would rescale pm1/pm10 predictions with pm25's range.)
    # The first win_length test rows have no prediction, hence the offset.
    scaled_rows = x_test[win_length:].copy()
    scaled_rows[:, target_idx] = predictions[:, 0]
    rev_trans = scaler.inverse_transform(scaled_rows)  # invert the scaling

    # copy so that adding columns neither warns (SettingWithCopyWarning) nor touches the input
    df_final = df.iloc[-predictions.shape[0]:].copy()
    df_final[predicted_col] = rev_trans[:, target_idx]
    df_final['TimeStamp'] = df_final.index

    print("DF FINAL\n ", df_final)

    plot_fb_data(df_final, selected_sensor, predicted_col, 'TimeStamp')

    df_final['predicted-actual'] = df_final[predicted_col] - df_final[selected_sensor]

    mse = mean_squared_error(df_final[selected_sensor], df_final[predicted_col])  # not future
    r2 = r2_score(df_final[selected_sensor], df_final[predicted_col])
    mae = mean_absolute_error(df_final[selected_sensor], df_final[predicted_col])
    # square root of the MSE; avoids the `squared=False` argument that newer scikit-learn removed
    rmse = mse ** 0.5

    print("MSE ", mse)
    print("RMSE ", rmse)
    print("R2 ", r2)
    print("MAE ", mae)
    print("RMSE/MAE", rmse / mae)

In [None]:
# evaluate the saved pm25 model on the most recent rows of df
load_model_lstm(df, selected_sensor='pm25')



df_final                   pm25        pm1       pm10  predicted pm25
TimeStamp                                                  
2022-01-15  14.515510   8.202363  15.942284       19.673467
2022-01-16  26.439210  15.942249  28.780206       14.861659
2022-01-17  19.188465   9.050772  24.675833       31.284773
2022-01-18  11.376648   6.633710  11.898477       18.208298
2022-01-19  23.786174  14.188103  25.068641       13.288697
...               ...        ...        ...             ...
2022-04-11  12.367347   7.224490  12.067797        7.393953
2022-04-12   1.000000   6.450079   6.000000       15.172952
2022-04-13   1.000000   6.450079   6.000000        5.851245
2022-04-14   1.000000   6.450079   6.000000        4.856701
2022-04-15  16.520000  10.386667  17.133333        4.856702

[91 rows x 4 columns]
DF FINAL
                   pm25        pm1       pm10  predicted pm25  TimeStamp
TimeStamp                                                             
2022-01-15  14.515510   8.202363  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


MSE  76.7113507713763
RMSE  8.75850162821109
R2  0.42813665555455405
MAE  6.472170967522509
RMSE/MAE 1.353255603438388




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [None]:
# evaluate the saved pm1 model on the most recent rows of df
load_model_lstm(df, selected_sensor='pm1')


`Model.predict_generator` is deprecated and will be removed in a future version. Please use `Model.predict`, which supports generators.



df_final                   pm25        pm1       pm10  predicted pm1
TimeStamp                                                 
2022-01-15  14.515510   8.202363  15.942284      13.868327
2022-01-16  26.439210  15.942249  28.780206      10.615741
2022-01-17  19.188465   9.050772  24.675833      22.491564
2022-01-18  11.376648   6.633710  11.898477      12.736174
2022-01-19  23.786174  14.188103  25.068641       9.307597
...               ...        ...        ...            ...
2022-04-11  12.367347   7.224490  12.067797       8.906435
2022-04-12   1.000000   6.450079   6.000000      11.066638
2022-04-13   1.000000   6.450079   6.000000       6.315905
2022-04-14   1.000000   6.450079   6.000000       9.052395
2022-04-15  16.520000  10.386667  17.133333       9.052395

[91 rows x 4 columns]
DF FINAL
                   pm25        pm1       pm10  predicted pm1  TimeStamp
TimeStamp                                                            
2022-01-15  14.515510   8.202363  15.942284      



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



MSE  42.20437173657022
RMSE  6.496489185442412
R2  0.2304085837728761
MAE  4.609214566916212
RMSE/MAE 1.4094568805871144




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [None]:
# evaluate the saved pm10 model on the most recent rows of df
load_model_lstm(df, selected_sensor='pm10')


`Model.predict_generator` is deprecated and will be removed in a future version. Please use `Model.predict`, which supports generators.



df_final                   pm25        pm1       pm10  predicted pm10
TimeStamp                                                  
2022-01-15  14.515510   8.202363  15.942284       19.931420
2022-01-16  26.439210  15.942249  28.780206       15.459327
2022-01-17  19.188465   9.050772  24.675833       30.726871
2022-01-18  11.376648   6.633710  11.898477       19.693064
2022-01-19  23.786174  14.188103  25.068641       14.976409
...               ...        ...        ...             ...
2022-04-11  12.367347   7.224490  12.067797       10.556253
2022-04-12   1.000000   6.450079   6.000000       15.924246
2022-04-13   1.000000   6.450079   6.000000        7.474972
2022-04-14   1.000000   6.450079   6.000000        8.971886
2022-04-15  16.520000  10.386667  17.133333        8.971886

[91 rows x 4 columns]
DF FINAL
                   pm25        pm1       pm10  predicted pm10  TimeStamp
TimeStamp                                                             
2022-01-15  14.515510   8.202363  



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



MSE  100.09660526596288
RMSE  10.004829097289113
R2  0.3453600338806855
MAE  7.1641751315923745
RMSE/MAE 1.396508169261539




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

