Importing Packages

In [2]:
import pandas as pd
import numpy as np
import math
from math import sqrt
from sklearn.metrics import *
from numpy import concatenate
from matplotlib import pyplot
from pandas import read_csv
from pandas import DataFrame
from pandas import concat
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from keras.models import Sequential
from keras.layers import Dense,LSTM, Dropout
import tensorflow as tf
from datetime import datetime


Mounting Drive and importing data

In [3]:
from google.colab import drive
drive.mount('/content/drive')
!pip install --upgrade openpyxl
reanalysis_org = pd.read_excel('/content/drive/MyDrive/Dataset/reanalysis_Cleaned_new.xlsx')
df = reanalysis_org
df = pd.concat([df['Temperature'],df['Relative_Humidity'], df['Atmospheric_Pressure'],df['Measured_Windspeed']], axis=1)
reanalysis=df

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


The Long Short-Term Memory (LSTM) Neural Network

In [4]:
def imputation(df, missing_value=9999.99):
    """Replace sentinel readings with NaN and fill every gap with its column mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame to clean. It is not modified.
    missing_value : float, optional
        Sentinel value that marks a missing reading (default 9999.99).

    Returns
    -------
    pandas.DataFrame
        float64 frame with the same index and columns as ``df`` in which every
        sentinel/NaN cell holds the mean of the observed values of its column.

    Raises
    ------
    ValueError
        If a column has no observed value at all, so no mean exists to fill it.
    """
    cleaned = df.replace([missing_value], np.nan).astype('float64')
    # DataFrame.mean skips NaN, so each mean is taken over observed values only.
    column_means = cleaned.mean()
    unfillable = column_means.index[column_means.isna()].tolist()
    if unfillable:
        # Fail with a clear message instead of a shape-mismatch error later on.
        raise ValueError(
            "Cannot mean-impute columns with no observed values: %s" % unfillable
        )
    return cleaned.fillna(column_means)


def gen_time_series_df(dataset, lookback=1, prediction_col=3):
    """Build a supervised-learning frame: features at row t, target at row t + lookback.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Time-ordered observations, one row per time step.
    lookback : int, optional
        How many rows ahead of the feature row the target value is taken from
        (default 1). Must be at least 1.
    prediction_col : int, optional
        Positional index of the column to predict (default 3).

    Returns
    -------
    pandas.DataFrame
        All columns of ``dataset`` for the first ``len(dataset) - lookback`` rows,
        followed by a final column named "Predicted" holding the value of
        ``prediction_col`` ``lookback`` rows later. The index is named "id" and
        runs from 1. The target is always the last column.

    Raises
    ------
    ValueError
        If ``lookback`` is smaller than 1.
    """
    if lookback < 1:
        # iloc[:-0] selects nothing, so a zero shift would silently drop every feature row.
        raise ValueError("lookback must be >= 1, got %r" % (lookback,))

    features = dataset.iloc[:-lookback, :].copy()
    # Series.rename gives the target its own label; it cannot be set through `.columns`.
    target = dataset.iloc[lookback:, prediction_col].rename("Predicted")

    # Give both pieces the same 1-based index so concat pairs them row by row.
    sample_ids = pd.RangeIndex(1, len(features) + 1, name="id")
    features.index = sample_ids
    target.index = sample_ids

    return pd.concat([features, target], axis=1)


def scale_df(df):
    """Min-max scale every column of ``df`` into [0, 1].

    Returns a tuple ``(scaled, normaliser)``: ``scaled`` is a new DataFrame of
    the scaled float32 values with default integer row and column labels, and
    ``normaliser`` is the fitted MinMaxScaler, kept so the scaling can be
    inverted later.
    """
    normaliser = MinMaxScaler(feature_range=(0,1))
    raw_values = df.values.astype('float32')
    # Fit and apply in one pass; the scaler remembers each column's min and max.
    scaled_values = normaliser.fit_transform(raw_values)
    return pd.DataFrame(scaled_values), normaliser


def train_test_split(df,training_size):
    """Split ``df`` chronologically into train/test features and targets.

    The first ``int(len(df) * training_size)`` rows form the training set and
    the remaining rows the test set; rows are not shuffled. In each part the
    last column is the target and all other columns are features.

    Returns ``(X_train, Y_train, X_test, Y_test)``; each X has shape
    ``(samples, 1, features)`` and each Y has shape ``(samples,)``.
    """
    values = df.values
    cut = int(len(values) * training_size)

    def _split_xy(block):
        features, target = block[:, :-1], block[:, -1]
        # The LSTM input must be 3-D: [samples, timesteps, features], one timestep here.
        n_samples, n_features = features.shape
        return features.reshape((n_samples, 1, n_features)), target

    X_train, Y_train = _split_xy(values[:cut, :])
    X_test, Y_test = _split_xy(values[cut:, :])
    return X_train, Y_train, X_test, Y_test


def build_model(input_shape=None):
    """Build the (uncompiled) stacked-LSTM regression network.

    Parameters
    ----------
    input_shape : tuple of int, optional
        ``(timesteps, features)`` of one input sample. When omitted, the shape
        is read from the module-level ``X_train`` array, which must already
        exist; this keeps existing ``build_model()`` calls working.

    Returns
    -------
    keras.models.Sequential
        Two sequence-returning LSTM layers of 20 units, dropout of 0.2, an
        LSTM layer of 6 units, and a single-unit Dense output.
    """
    if input_shape is None:
        # Backward-compatible fallback to the global training array.
        input_shape = (X_train.shape[1], X_train.shape[2])

    LSTMN = Sequential()
    LSTMN.add(LSTM(20, activation='tanh', return_sequences=True, input_shape=tuple(input_shape)))
    LSTMN.add(LSTM(20, return_sequences=True))
    LSTMN.add(Dropout(0.2))
    # The last recurrent layer returns only its final state, which feeds the Dense output.
    LSTMN.add(LSTM(6))
    LSTMN.add(Dense(1))
    return LSTMN


def LSTMN_compile(LSTMN,X_train,Y_train,X_test,Y_test, epochs=10, batch_size=64):
    """Compile and fit the network, plot its loss curves, and predict on both splits.

    Parameters
    ----------
    LSTMN : keras model
        Uncompiled model, e.g. the one returned by ``build_model``.
    X_train, Y_train : array-like
        Training inputs and targets.
    X_test, Y_test : array-like
        Held-out inputs and targets.
    epochs : int, optional
        Number of training epochs (default 10).
    batch_size : int, optional
        Mini-batch size (default 64).

    Returns
    -------
    tuple
        ``(trainPredict, testPredict, Y_test)``: the model's predictions on the
        training and test inputs, and ``Y_test`` exactly as it was passed in.
    """
    LSTMN.compile(loss='mse', optimizer='adam')
    LSTMN.summary()
    # shuffle=False keeps samples in time order.
    # NOTE: the test split doubles as validation data, so 'val_loss' is measured
    # on the same rows the final metrics are computed on.
    LSTMN_history = LSTMN.fit(
        X_train,
        Y_train,
        epochs=epochs,
        validation_data=(X_test, Y_test),
        batch_size=batch_size,
        shuffle=False,
    )
    trainPredict = LSTMN.predict(X_train)
    testPredict = LSTMN.predict(X_test)

    pyplot.plot(LSTMN_history.history['loss'], label='LSTMN train')
    pyplot.plot(LSTMN_history.history['val_loss'], label='LSTMN test')
    pyplot.legend()
    pyplot.xlabel('Epoch')
    pyplot.ylabel('Loss')
    pyplot.title('Loss Graph for the LSTMN')
    pyplot.show()

    # Y_test is returned untouched. The previous hard-coded reshape to 92048 rows
    # discarded its result and raised on any test set of a different length.
    return trainPredict,testPredict,Y_test


def produce_stats_analysis(testPredict,Y_test):
    """Print test-set error metrics, one bare number per line.

    Output order: mean squared error, R^2 score, root mean squared error,
    mean absolute error. Nothing is returned.
    """
    mse = mean_squared_error(Y_test, testPredict)
    print(mse)
    print(r2_score(Y_test, testPredict))
    # RMSE is the square root of the MSE computed above.
    print(sqrt(mse))
    print(mean_absolute_error(Y_test, testPredict))

def produce_acc_pred_plt(normaliser):
    """Plot actual vs. predicted wind speed and the prediction error over time.

    Reads the module-level names ``X_train``, ``X_test``, ``trainPredict``,
    ``testPredict``, ``reanalysis_org`` and ``reanalysis``, so the training
    cell must have run first.

    Parameters
    ----------
    normaliser : fitted scaler
        The scaler returned by ``scale_df``; its ``inverse_transform`` maps
        the scaled predictions back to original units.

    Returns
    -------
    pandas.Series
        Predicted wind speed in original units for the train rows followed by
        the test rows, with a default 0-based index.

    Raises
    ------
    ValueError
        If there are more predictions than rows in ``reanalysis_org``.
    """
    # One scaled matrix over the full data: input features followed by the prediction.
    features = np.concatenate([X_train, X_test])
    predictions = np.concatenate([trainPredict, testPredict])
    # Flatten the single-timestep axis; sizes come from the arrays, not from constants.
    features = features.reshape(features.shape[0], -1)
    predictions = predictions.reshape(-1, 1)
    data = np.hstack([features, predictions])
    # Drop the last input column so the prediction takes its place and the matrix
    # has the column count the scaler was fitted on.
    # Assumes the last input column is the scaled wind speed, as built in the load cell.
    data = np.delete(data, -2, 1)
    data = normaliser.inverse_transform(data)
    data = pd.DataFrame(data)
    predicted_windspeed = data.iloc[:, -1]

    timestamps = reanalysis_org['Datet']
    actual_windspeed = reanalysis_org['Measured_Windspeed']

    # Each prediction targets a later row than the features it was made from, so
    # the predictions line up with the LAST len(predicted_windspeed) rows.
    offset = len(timestamps) - len(predicted_windspeed)
    if offset < 0:
        raise ValueError(
            "More predictions (%d) than rows in reanalysis_org (%d)"
            % (len(predicted_windspeed), len(timestamps))
        )
    predicted_timestamps = timestamps.iloc[offset:]

    #true and predicted windspeed plot
    pyplot.plot(timestamps, actual_windspeed, label='Actual')
    pyplot.plot(predicted_timestamps, predicted_windspeed, label='Predicted')
    pyplot.legend()
    pyplot.xlabel('Time')
    pyplot.ylabel('Predicted Wind Speed m/s')
    pyplot.title('Actual and Predicted Windspeed for the years 2002-2016')
    pyplot.show()

    #Prediction error plot
    pyplot.figure()
    reanalysis_imputed = imputation(reanalysis)
    # Subtract positionally against the rows the predictions target; subtracting
    # two Series would pair them by index label and shift every value.
    actual_aligned = reanalysis_imputed['Measured_Windspeed'].to_numpy()[offset:]
    pred_err = predicted_windspeed.to_numpy() - actual_aligned
    pyplot.plot(predicted_timestamps, pred_err, color='tab:orange', label='Prediction error')
    pyplot.legend()
    pyplot.xlabel('Time')
    pyplot.ylabel('Predicted Wind Speed Error m/s')
    pyplot.title('Prediction Error in Windspeed for the years 2002-2016')
    pyplot.show()
    return predicted_windspeed


# Main pipeline: impute -> scale -> build supervised frame -> split -> train -> evaluate -> plot -> save.
# The names X_train, X_test, trainPredict and testPredict must stay module-level:
# build_model() and produce_acc_pred_plt() read them as globals.
if __name__ == '__main__':
  df = reanalysis
  df = imputation(df)
  # The scaler is fitted here, before the target column is appended, so it knows
  # only the four input columns; produce_acc_pred_plt relies on that.
  df,normaliser = scale_df(df)
  df = gen_time_series_df(df,1,3) # USAGE gen_time_series_df(dataset, lookback, prediction_col): pairs each row with the value of column `prediction_col` found `lookback` rows later
  # First 70% of the rows (in time order) train the model; the rest are held out.
  X_train,Y_train,X_test,Y_test = train_test_split(df,0.7)
  LSTMN= build_model()
  trainPredict,testPredict,Y_test = LSTMN_compile(LSTMN,X_train,Y_train,X_test,Y_test)
  produce_stats_analysis(testPredict,Y_test)
  predicted_windspeed = produce_acc_pred_plt(normaliser)

  # Save the predicted wind speeds locally, then copy the file to Google Drive.
  # to_csv returns None when given a path, so pred_csv holds no data.
  pred_csv= pd.DataFrame(predicted_windspeed).to_csv('predicted_windspeed_new.csv')
  # NOTE(review): the path is relative to the current working directory and uses
  # "My Drive", while the data was read from /content/drive/MyDrive - confirm it resolves.
  !cp predicted_windspeed_new.csv "drive/My Drive/Dataset"


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 1, 20)             2000      
                                                                 
 lstm_1 (LSTM)               (None, 1, 20)             3280      
                                                                 
 dropout (Dropout)           (None, 1, 20)             0         
                                                                 
 lstm_2 (LSTM)               (None, 6)                 648       
                                                                 
 dense (Dense)               (None, 1)                 7         
                                                                 
Total params: 5,935
Trainable params: 5,935
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10

KeyboardInterrupt: ignored