In [None]:
import tensorflow as tf
from tensorflow import keras 
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

# Data Set

In [None]:
# Load the per-station daily air-quality records from the Kaggle input folder.
station_day_csv = "/kaggle/input/air-quality-prediction/station_day.csv"
station_day = pd.read_csv(station_day_csv)

In [None]:
# Preview the first three rows of the raw frame.
station_day.head(3)

In [None]:
# Print the frame summary (columns, dtypes, non-null counts).
station_day.info()

# ARIMA Model

ARIMA is a time-series model, which means we can use it to predict how AQI changes over time.

In [None]:
# Keep only the two columns the ARIMA workflow needs. The explicit copy makes
# `arima_df` independent of `station_day`, so the column assignments in the
# following cells neither raise SettingWithCopyWarning nor risk touching the
# raw frame.
arima_df = station_day[['Date','AQI']].copy()

In [None]:
# Parse the Date strings into datetimes. `assign` builds a new frame instead of
# writing into what may be a slice of `station_day`, which avoids
# SettingWithCopyWarning / silently lost writes.
arima_df = arima_df.assign(Date=pd.to_datetime(arima_df["Date"]))

In [None]:
# Fill missing AQI readings with the mean AQI of the same calendar month.
# `transform` returns a result aligned to the original row index; the previous
# `groupby(...).apply(...)` prepends the group key to the index on
# pandas >= 2.0, so its result can no longer be assigned back as a column.
arima_month_key = pd.PeriodIndex(arima_df['Date'], freq="M")
arima_df = arima_df.assign(
    AQI=arima_df.groupby(arima_month_key)['AQI'].transform(
        lambda month_values: month_values.fillna(month_values.mean())
    )
)

In [None]:
# Collapse the daily readings into one mean AQI value per calendar month.
monthly_period = pd.PeriodIndex(arima_df['Date'], freq="M")
monthly_groups = arima_df.groupby(monthly_period)
ts = monthly_groups['AQI'].mean()

In [None]:
# Draw the monthly mean AQI series on a wide canvas.
fig, ax = plt.subplots(figsize=(20, 10))
ts.plot(ax=ax)
plt.show()

**Running the Augmented Dickey-Fuller (ADF) test to check whether the series is stationary**

In [None]:
from statsmodels.tsa.stattools import adfuller
from numpy import log

# Augmented Dickey-Fuller test on the monthly series; the first two entries of
# the returned tuple are reported as the test statistic and the p-value.
result = adfuller(ts)
adf_statistic, adf_p_value = result[0], result[1]
print('ADF Statistic: %f' % adf_statistic)
print('p-value: %f' % adf_p_value)

In [None]:
# Positional hold-out split: first 50 months for training, the rest for testing.
ts_train, ts_test = ts.iloc[:50], ts.iloc[50:]

In [None]:
from statsmodels.tsa.arima.model import ARIMA

In [None]:
import itertools
from sklearn.metrics import mean_squared_error

In [None]:
# Candidate ARIMA hyper-parameters: AR order p and MA order q each span 0..7,
# differencing order d spans 0..1.
p = q = range(8)
d = range(2)

In [None]:
# Every (p, d, q) triple of the candidate grid, with q varying fastest.
pqd_combination = [
    (ar_order, diff_order, ma_order)
    for ar_order in p
    for diff_order in d
    for ma_order in q
]

In [None]:
# Parallel accumulators for the grid search: RMSE values and the matching orders.
error, pqd = [], []

In [None]:
# Fit one ARIMA model per candidate order on the training window and record
# its RMSE against the held-out months.
for order in pqd_combination:
    fitted = ARIMA(ts_train, order=order).fit()
    # Forecast positions len(ts_train) .. len(ts)-1, i.e. the test span.
    predict = fitted.predict(len(ts_train), len(ts) - 1)
    rmse = np.sqrt(mean_squared_error(ts_test, predict))
    pqd.append(order)
    error.append(rmse)

In [None]:
# Select the candidate with the smallest RMSE.
# np.argmin scans every entry (the previous manual loop stopped one short and
# never considered the last candidate) and returns the first minimum on ties.
# It also avoids rebinding the builtin `min` for the rest of the session.
index = int(np.argmin(error))

print(error[index],' => ',pqd[index])

In [None]:
# Refit ARIMA on the training window with the order chosen by the grid search
# and print the fitted-model report.
best_order = pqd[index]
model_ts = ARIMA(ts_train, order=best_order)
model_ts_fit = model_ts.fit()
print(model_ts_fit.summary())

In [None]:
# Forecast exactly the held-out months. `end` is an inclusive position, so the
# last test observation sits at len(ts) - 1 (the same bound the grid search
# uses); len(ts) would forecast one extra step beyond the data.
arima_predict = model_ts_fit.predict(start=len(ts_train), end=len(ts) - 1)

In [None]:
# Compare the train / test series with the forecast of the selected model.
fig, ax = plt.subplots(figsize=(20, 10))
ts_test.plot(ax=ax, label="Test")
ts_train.plot(ax=ax, label="Train")
# Use `arima_predict` (the refitted best model). The name `predict` is only the
# leftover of the grid-search loop, i.e. the forecast of the last order tried.
arima_predict.plot(ax=ax, label='Predict')
ax.legend()
plt.show()

# SARIMA Model 

In [None]:
print ('Results of Dickey-Fuller Test:')
dftest = adfuller(ts, autolag='AIC')
# The first four entries are the summary numbers; entry 4 maps each
# significance level to its critical value.
summary_labels = ['Test Statistic','p-value','#Lags Used','Number of Observations Used']
critical_values = {
    'Critical Value (%s)'%level: threshold
    for level, threshold in dftest[4].items()
}
dfoutput = pd.concat([pd.Series(dftest[0:4], index=summary_labels),
                      pd.Series(critical_values)])
print (dfoutput )

**This time series is not stationary because the p-value is greater than 0.05, so we take the first difference of the series.**

In [None]:
# First difference of the monthly series: each value minus the previous month.
df1_ts = ts.diff(periods=1)

In [None]:
# Discard missing entries (differencing leaves the first month undefined).
df1_ts = df1_ts[df1_ts.notna()]

In [None]:
print ('Results of Dickey-Fuller Test:')
dftest = adfuller(df1_ts, autolag='AIC')
# The first four entries are the summary numbers; entry 4 maps each
# significance level to its critical value.
summary_labels = ['Test Statistic','p-value','#Lags Used','Number of Observations Used']
critical_values = {
    'Critical Value (%s)'%level: threshold
    for level, threshold in dftest[4].items()
}
dfoutput = pd.concat([pd.Series(dftest[0:4], index=summary_labels),
                      pd.Series(critical_values)])
print (dfoutput )

In [None]:
# Draw the differenced monthly series on a wide canvas.
fig, ax = plt.subplots(figsize=(20, 10))
df1_ts.plot(ax=ax)
plt.show()

In [None]:
# Positional hold-out split of the differenced series: first 50 points for
# training, the remainder for testing.
df1_ts_train, df1_ts_test = df1_ts.iloc[:50], df1_ts.iloc[50:]

In [None]:
# Parallel accumulators for the grid search on the differenced series.
df_pdq, df_error = [], []

In [None]:
# Repeat the order grid search on the differenced series, recording the RMSE
# of each candidate against the differenced test window.
for order in pqd_combination:
    fitted = ARIMA(df1_ts_train, order=order).fit()
    # Forecast positions len(df1_ts_train) .. len(df1_ts)-1, i.e. the test span.
    predict = fitted.predict(len(df1_ts_train), len(df1_ts) - 1)
    rmse = np.sqrt(mean_squared_error(df1_ts_test, predict))
    df_pdq.append(order)
    df_error.append(rmse)

In [None]:
# Select the candidate with the smallest RMSE on the differenced series.
# np.argmin scans every entry (the previous manual loop stopped one short and
# never considered the last candidate) and returns the first minimum on ties.
# It also avoids rebinding the builtin `min` for the rest of the session.
index = int(np.argmin(df_error))

print(df_error[index],' => ',df_pdq[index])

In [None]:
import statsmodels.api as sm

In [None]:
# Seasonal ARIMA on the differenced training window: fixed non-seasonal order
# (7, 0, 3), seasonal order (1, 1, 1) with a 12-step season, no trend term.
sarima_order = (7, 0, 3)
sarima_seasonal_order = (1, 1, 1, 12)
sarima_model = sm.tsa.statespace.SARIMAX(
    df1_ts_train,
    order=sarima_order,
    seasonal_order=sarima_seasonal_order,
    trend='n',
)
s_results = sarima_model.fit()
print(s_results.summary())

In [None]:
# Forecast the positions that follow the training window up to the last
# position of the differenced series (len(df1_ts)-1), i.e. the test span.
s_pred = s_results.predict(start = len(df1_ts_train),end = len(df1_ts)-1)

In [None]:
# Overlay the differenced train / test series and the SARIMA forecast.
fig, ax = plt.subplots(figsize=(20, 10))
df1_ts_test.plot(ax=ax, label="Test")
df1_ts_train.plot(ax=ax, label="Train")
s_pred.plot(ax=ax, label='Predict')
ax.legend()
plt.show()

#  FB Prophet

FB Prophet is used for time-series forecasting, so we only need the date column and the AQI column.

In [None]:
from prophet import Prophet

In [None]:
!pip install --upgrade plotly

In [None]:
# Keep only the two columns Prophet needs. The explicit copy makes `alt_fb_df`
# independent of `station_day`, so the column assignments in the following
# cells neither raise SettingWithCopyWarning nor risk touching the raw frame.
alt_fb_df = station_day[['Date','AQI']].copy()

In [None]:
# Parse the Date strings into datetimes. `assign` builds a new frame instead of
# writing into what may be a slice of `station_day`, which avoids
# SettingWithCopyWarning / silently lost writes.
alt_fb_df = alt_fb_df.assign(Date=pd.to_datetime(alt_fb_df["Date"]))

In [None]:
# Fill missing AQI readings with the mean AQI of the same calendar month.
# `transform` returns a result aligned to the original row index; the previous
# `groupby(...).apply(...)` prepends the group key to the index on
# pandas >= 2.0, so its result can no longer be assigned back as a column.
fb_month_key = pd.PeriodIndex(alt_fb_df['Date'], freq="M")
alt_fb_df = alt_fb_df.assign(
    AQI=alt_fb_df.groupby(fb_month_key)['AQI'].transform(
        lambda month_values: month_values.fillna(month_values.mean())
    )
)

In [None]:
# Rename the columns to `ds` (date) and `y` (value), the names used when the
# frame is later passed to Prophet and plotted.
prophet_column_names = {'Date': 'ds', 'AQI': 'y'}
alt_fb_df = alt_fb_df.rename(columns=prophet_column_names)

In [None]:
# Positional hold-out split: the final 26108 rows form the test set and
# everything before them the training set.
fb_test_rows = 26108
alt_fb_df_train = alt_fb_df.iloc[:-fb_test_rows]
alt_fb_df_test = alt_fb_df.iloc[len(alt_fb_df) - fb_test_rows:]

In [None]:
# Build a Prophet model with the daily seasonality component switched on and
# fit it to the training rows (columns `ds` and `y`).
alt_fbp = Prophet(daily_seasonality = True)
alt_fbp.fit(alt_fb_df_train)



In [None]:
# Build the frame of dates to predict on, extended by 365 future periods.
# NOTE(review): period frequency is left at Prophet's default - confirm it is daily.
alt_future = alt_fbp.make_future_dataframe(periods=365)

In [None]:
# Run the fitted model over the extended date frame; the `yhat` column of the
# result is what gets plotted as the prediction below.
alt_forecast = alt_fbp.predict(alt_future)

In [None]:
# Overlay the training rows, the test rows and the Prophet point forecast.
fig, ax = plt.subplots(figsize=(20, 10))
ax.plot(alt_fb_df_train['ds'], alt_fb_df_train['y'], label="Train")
ax.plot(alt_fb_df_test['ds'], alt_fb_df_test['y'], label="Test")
ax.plot(alt_forecast['ds'], alt_forecast['yhat'], label='Predict')
ax.legend()
plt.show()


#  LSTM 

In [None]:
# Take an independent copy: the following cells convert and drop columns on
# `lstm_df`, and a plain `lstm_df = station_day` alias would apply those edits
# to the raw `station_day` frame as well (breaking any re-run of earlier cells).
lstm_df = station_day.copy()

In [None]:
# Parse the Date column into datetimes (written back into the same frame).
lstm_df["Date"]= pd.to_datetime(lstm_df["Date"])

In [None]:
# Remove the categorical label and the station identifier. Rebinding to the
# returned frame (instead of `inplace=True`) leaves any other reference to the
# same data, such as the raw `station_day` frame, untouched.
lstm_df = lstm_df.drop(columns=['AQI_Bucket', 'StationId'])

In [None]:
# Keep only the rows in which every remaining column has a value.
lstm_df = lstm_df.dropna(axis=0, how="any")

In [None]:
# Print the frame summary after dropping incomplete rows.
lstm_df.info()

In [None]:
# Set the Date column aside for plotting and remove it from the model frame.
dates = lstm_df.pop('Date')

In [None]:
from sklearn import preprocessing

# Min-max scale every remaining column of the frame (the AQI target included),
# fitting the scaler on the full frame.
min_max_scaler = preprocessing.MinMaxScaler()
col_names = list(lstm_df.columns)
lstm_df[col_names] = min_max_scaler.fit_transform(lstm_df[col_names])

In [None]:
# Separate the target column (AQI) from the predictor columns.
target_column = 'AQI'
y = lstm_df[target_column]
x = lstm_df.drop(columns=[target_column])

In [None]:
# Summary statistics of the scaled predictor columns.
x.describe()

In [None]:
# Convert the saved dates to a NumPy array (sliced positionally in the
# train/test split below), then display it.
dates = np.array(dates)
dates

In [None]:
# Convert the predictors to an array and append a trailing axis of length 1,
# giving shape (rows, columns, 1).
x = np.array(x)
n_rows, n_columns = x.shape[:2]
x = x.reshape(n_rows, n_columns, 1)

In [None]:
# Convert the target to a flat one-dimensional array.
y = np.array(y).ravel()

In [None]:
# Positional split: the first 1500 rows train the networks, the rest test them.
split_at = 1500
dates_train, dates_test = dates[:split_at], dates[split_at:]
x_train, x_test = x[:split_at], x[split_at:]
y_train, y_test = y[:split_at], y[split_at:]

In [None]:
# Show the shape of the training predictors; the networks below declare an
# input of shape (12, 1) per sample.
x_train.shape

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras.layers import InputLayer

# LSTM regressor: a (12, 1) input per sample, one 50-unit LSTM layer, two ReLU
# dense layers (34 and 15 units) and a single-unit ReLU output.
# NOTE(review): the output layer uses ReLU whereas the GRU model further down
# uses a linear output - confirm which one is intended.
model_lstm = Sequential([
    InputLayer((12, 1)),
    LSTM(50),
    Dense(34, activation='relu'),
    Dense(15, activation='relu'),
    Dense(1, activation='relu'),
])

In [None]:
# Print the layer-by-layer summary of the LSTM network.
model_lstm.summary()

In [None]:
# Train with Adam on mean squared error. The reported metric is mean absolute
# error: the target is a continuous (scaled) AQI value, so classification
# accuracy carries no information for this regression.
model_lstm.compile(
    optimizer = 'adam',
    loss = 'mse',
    metrics = ['mae']
)

In [None]:
# Train for 10 epochs, holding back 33% of the training rows for validation.
lstm_history = model_lstm.fit(
    x_train,
    y_train,
    epochs=10,
    validation_split=0.33,
)

In [None]:
# Predict the (scaled) AQI for every test row with the trained LSTM.
lstm_y_pred = model_lstm.predict(x_test)

In [None]:
# Show the last 100 training targets next to the first 100 test targets and
# the LSTM predictions for those same test rows.
fig, ax = plt.subplots(figsize=(20, 10))
ax.plot(dates_train[-100:], y_train[-100:], label="Train")
ax.plot(dates_test[:100], y_test[:100], label="Test")
ax.plot(dates_test[:100], lstm_y_pred[:100], label='Predict')
ax.legend()
plt.show()


#  GRU 

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import GRU
from keras.layers import Dropout
from keras.layers import InputLayer

# GRU regressor: a (12, 1) input per sample, one 50-unit GRU layer, two ReLU
# dense layers (26 and 10 units) and a single-unit output without activation.
model_gru = Sequential([
    InputLayer((12, 1)),
    GRU(50),
    Dense(26, activation='relu'),
    Dense(10, activation='relu'),
    Dense(1),
])


In [None]:
# Print the layer-by-layer summary of the GRU network.
model_gru.summary()

In [None]:
# Train with Adam on mean squared error. The reported metric is mean absolute
# error: the target is a continuous (scaled) AQI value, so classification
# accuracy carries no information for this regression.
model_gru.compile(
    optimizer = 'adam',
    loss = 'mse',
    metrics = ['mae']
)

In [None]:
# Train for 10 epochs, holding back 33% of the training rows for validation.
gru_history = model_gru.fit(
    x_train,
    y_train,
    epochs=10,
    validation_split=0.33,
)

In [None]:
# Report the compiled loss and metric of the GRU on the held-out test rows.
model_gru.evaluate(x_test, y_test)

In [None]:
# Predict the (scaled) AQI for every test row with the trained GRU.
gru_y_pred = model_gru.predict(x_test)

In [None]:
# Show the last 100 training targets next to the first 100 test targets and
# the GRU predictions for those same test rows.
fig, ax = plt.subplots(figsize=(20, 10))
ax.plot(dates_train[-100:], y_train[-100:], label="Train")
ax.plot(dates_test[:100], y_test[:100], label="Test")
ax.plot(dates_test[:100], gru_y_pred[:100], label='Predict')
ax.legend()
plt.show()
