In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import matplotlib.dates as mdates
import seaborn as sns
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error 
from sklearn.metrics import mean_absolute_error 
from sklearn.metrics import r2_score 
import math
import seaborn as sb
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the (already cleaned) Pakuranga data set.
Pak_data = pd.read_csv("Pakuranga.csv")
# Store timestamps as uniformly formatted strings; later cells re-parse this
# column with pd.to_datetime() whenever they need real datetimes.
Pak_data["date"] = pd.to_datetime(Pak_data["date"]).dt.strftime('%Y-%m-%d %H:%M:%S')

print('Number of rows and columns:', Pak_data.shape)
print('\n  \n**** First 5 instances: \n ')
print(Pak_data.head(5))
# Save it to CSV, then copy the tabular data to your report
Pak_data.head(5).to_csv("Pakuranga5.csv")

# Compute the transposed summary once and reuse it for both print and export.
summary_stats = Pak_data.describe().T
print('\n  \n**** Summary statistics for the numerical features: \n ')
print(summary_stats)
# Save it to CSV, then copy the tabular data to your report
summary_stats.to_csv("Pakuranga_data_summary.csv")

# Visualising the results
fig, ax = plt.subplots()

# "date" holds strings, so matplotlib places the points on a categorical axis
# at integer positions 0..len-1; the tick code below relies on that.
ax.plot(Pak_data["date"], Pak_data["Pak_PM10"], color='blue', label='PM_10')

ax.set_title('Pakuranga PM_10')
ax.set_xlabel('Date')
ax.set_ylabel('PM10 (ug/m3)')
ax.legend()

# set up figure size
fig.set_figheight(9)
fig.set_figwidth(16)

# Label every 100th sample with its date formatted as YYYY-MM-DD
x_ticks_length = np.arange(len(Pak_data['date']))
plt.xticks(x_ticks_length[::100],
           pd.to_datetime(Pak_data['date'][::100]).dt.strftime('%Y-%m-%d'),
           rotation=45)

# I am using 50 as NZ standard for 24 hour PM10 is 50 ug/m3.
# Research about NZ hourly pm10/pm2.5
#
# axhline's xmin/xmax are fractions of the axes width (0 = left edge,
# 1 = right edge), not data coordinates, so the defaults already span the
# full plot; the previous xmax=1200 was outside the documented 0-1 range.
ax.axhline(y=50, color='r')

plt.show()

In [None]:
"""
Just some inspiration.
(Do a proper EDA (exploratory data analysis)!)
"""

# Derive calendar features (as strings) from the date column; the column is
# parsed once and formatted four ways.
parsed_dates = pd.to_datetime(Pak_data['date'])
temporal_formats = {
    'year': '%y',        # two-digit year
    'month': '%b',       # abbreviated month name
    'DayofWeek': '%A',   # full weekday name
    'hour': '%H',        # zero-padded hour of day
}
for column_name, date_format in temporal_formats.items():
    Pak_data[column_name] = parsed_dates.dt.strftime(date_format)

# Four stacked panels, 9 wide by 12 high (inches).
fig, (ax1, ax2, ax3, ax4) = plt.subplots(4, 1, figsize=(9, 12))

# One PM10 box plot per temporal grouping.
for panel, grouping in zip((ax1, ax2, ax3), ('year', 'month', 'DayofWeek')):
    sns.boxplot(x=grouping, y='Pak_PM10', data=Pak_data, ax=panel)
# ax4 is intentionally left empty: this data set is not hourly. With hourly
# data, fill it with:
#sns.boxplot(x='hour', y='Pak_PM10', data=Pak_data, ax=ax4)

plt.show()

# Persist the enriched frame (row index included) for the correlation cell.
Pak_data.to_csv("PakurangaWithTemporal.csv")

In [None]:
# Histogram grid for the columns of Pak_data that DataFrame.hist can plot,
# using 10 bins per panel.
Pak_data.hist(bins=10, stacked=True)
# Optional: add vertical spacing between the panels.
#plt.subplots_adjust(hspace=0.5)
plt.show()

In [None]:
# Plot a correlation heatmap of the numeric columns.
# index_col=0: the file was written by DataFrame.to_csv() with its row index
# as the first column; reading it back as the index keeps it from showing up
# as a spurious "Unnamed: 0" feature in the matrix.
Pak_dataTemp = pd.read_csv("PakurangaWithTemporal.csv", index_col=0)

# Restrict to numeric columns: the frame also holds text columns (date, month
# name, weekday name), and DataFrame.corr() raises on those in pandas >= 2.0.
corrPak_dataTemp = Pak_dataTemp.select_dtypes(include="number").corr()

plt.figure(figsize=(12, 11))
# Reuse the matrix computed above instead of recomputing it.
dataplot = sb.heatmap(corrPak_dataTemp, cmap="YlGnBu", annot=True)
plt.title('Correlation Matrix Heatmap2')
plt.show()

In [None]:
# Outlier treatment: drop every row in which any numeric column lies more than
# 1.5 * (P95 - P05) below the 5th percentile or above the 95th percentile.
# NOTE: despite their names, Q1/Q3/IQR hold the 5th/95th percentiles and their
# spread, not the classical quartiles. Apply it with care and justify it.
#
# Only numeric columns take part: Pak_data also carries string columns (e.g.
# "date"), which cannot be ranked by quantile() or compared with the numeric
# thresholds.
numeric_data = Pak_data.select_dtypes(include="number")
Q1 = numeric_data.quantile(0.05)
Q3 = numeric_data.quantile(0.95)
IQR = Q3 - Q1
outlier_rows = ((numeric_data < (Q1 - 1.5 * IQR)) | (numeric_data > (Q3 + 1.5 * IQR))).any(axis=1)
# pm keeps all original columns (numeric and non-numeric) of the surviving rows.
pm = Pak_data[~outlier_rows]

In [None]:

"""
In our example dataset I only created a lag-1 feature.
In our example I am using a 70% / 30% split (training: 70%, test: 30%).

from sklearn.model_selection import TimeSeriesSplit

You may  want to use TimeSeriesSplit. It ensures  splitting  the data into windows of consecutive samples.
                                      It ensures that the validation/test results are more realistic, 
                                      being evaluated on the data collected after the model was trained.

"""
# Work on columns 1..8 of Pak_data (by position) and remember where each
# column sits.
df = Pak_data.iloc[:, 1:9]
column_indices = dict(zip(df.columns, range(len(df.columns))))

# Chronological split: the first 70% of rows train, the remainder tests.
n = len(df)
split_row = int(n * 0.7)
train_df = df[:split_row]
test_df = df[split_row:]

#num_features = df.shape[1]

# Perform data preparation here if required; this example does not scale the
# data. You might also want to include hour, day, day of week or month.

feature_cols = ['Temp', 'Rain', 'RH', 'WD', 'WS', 'Solar', 'Lag1']

value_col = ['Pak_PM10']

# Convert each partition to plain NumPy arrays. value_col is a list, so the
# target arrays are 2-D with a single column.
train_features, train_y = (train_df[cols].values for cols in (feature_cols, value_col))
test_features, test_y = (test_df[cols].values for cols in (feature_cols, value_col))

In [None]:
# Network with three hidden layers of 100, 50 and 25 neurons.
model = MLPRegressor(hidden_layer_sizes=(100,50,25), max_iter=200,random_state=42)
# train_y is built from a one-column selection, i.e. shape (n, 1). For a single
# target scikit-learn expects a 1-D array, so flatten it explicitly rather
# than relying on the implicit conversion (and its DataConversionWarning).
model.fit(train_features, np.ravel(train_y))

# make predictions
preds = model.predict(test_features)

# Compute evaluation scores on matching 1-D arrays.
y_true = np.ravel(test_y)
mse = mean_squared_error(y_true, preds)
rmse = math.sqrt(mse)
mae = mean_absolute_error(y_true, preds)
r2 = r2_score(y_true, preds)
n_iter = model.n_iter_
print("Root Mean Squared Error: ", rmse)
print("Mean Absoloute Error: ", mae)
print("R squared: ", r2)
print("Number of iterations: ", n_iter)

# This is a very simple model; refer to this tutorial:
# https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPRegressor.html?highlight=mlpregressor
# (Kept as comments: a bare string as the last statement of a cell would be
# echoed as the cell's output.)