# Imports and Datasets

In [25]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

from sklearn.tree import DecisionTreeRegressor 
from sklearn.ensemble import RandomForestRegressor 
from sklearn.svm import SVR 
from sklearn.neighbors import KNeighborsRegressor 
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import SGDRegressor

from sklearn.preprocessing import Imputer
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score, explained_variance_score

import warnings

In [26]:
# Load the raw measurements. low_memory=False makes pandas infer each column's
# dtype from the whole file instead of chunk by chunk; the output recorded for
# this cell looks like the tail of pandas' mixed-type DtypeWarning, which is
# what chunked inference produces.
data = pd.read_csv("./2015_Air_quality_in_northern_Taiwan.csv", low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


# Draw the NOx series of every station on one set of axes.
stations = np.unique(data['station'])

for station in stations:
    # Select the station's rows once, strip the 'x', '#' and '*' characters
    # from the readings, and convert what is left to floats.
    # The builtin float replaces the np.float alias, which NumPy 1.24 removed.
    nox = data.loc[data.station == station, "NOx"]
    nox = nox.replace(to_replace=r'x|#|\*', value='', regex=True).astype(float)
    plt.plot(np.arange(nox.shape[0]), nox, label=station)

plt.xlabel("sample index within station")
plt.ylabel("NOx")
# Anchor the legend at the right edge of the axes so it does not cover the curves.
plt.legend(loc=(1,0))
plt.show()

# Preprocessing

In [27]:
# Record the starting row count first, so that the column listing is the
# cell's last expression and is therefore actually displayed.
old_size = data.shape[0]

# Columns that contain the literal string "NR" in at least one row.
data.columns[(data == "NR").any(axis=0)]

In [28]:
def _report_dropped(rows_before, frame):
    """Print how many rows were removed since the last check; return the new row count."""
    print("dropped {} lines".format(rows_before - frame.shape[0]))
    return frame.shape[0]


# Remove the two identifier columns. The axis is passed by keyword because
# pandas 2.0 no longer accepts it positionally.
data = data.drop(["time", "station"], axis=1)
old_size = _report_dropped(old_size, data)

# Remove every column that contains the literal string "NR" in at least one row.
nr_columns = data.columns[(data == "NR").any(axis=0)]
data = data.drop(nr_columns, axis=1)
old_size = _report_dropped(old_size, data)

# Rows without a NOx value cannot be used, so drop them.
data = data[~data["NOx"].isnull()]
old_size = _report_dropped(old_size, data)

# Strip the 'x', '#' and '*' characters from the readings and make the whole
# frame numeric. The builtin float replaces the np.float alias, which
# NumPy 1.24 removed.
data = data.replace(to_replace=r'x|#|\*', value='', regex=True).astype(float)
old_size = _report_dropped(old_size, data)

# Fill each remaining gap with the mean of its own column. This is the same
# mean imputation the per-column loop performed, done in one vectorised call
# and without sklearn's Imputer class, which was removed in scikit-learn 0.22.
data = data.fillna(data.mean())
old_size = _report_dropped(old_size, data)

dropped 0 lines
dropped 0 lines
dropped 1411 lines
dropped 0 lines
dropped 0 lines


In [29]:
# Display the column labels of `data` at this point in the notebook.
data.columns

Index(['AMB_TEMP', 'CH4', 'CO', 'NMHC', 'NO', 'NO2', 'NOx', 'O3', 'PM10', 'RH',
       'SO2', 'THC', 'UVB', 'WD_HR', 'WIND_DIREC', 'WIND_SPEED', 'WS_HR'],
      dtype='object')

In [30]:
# Display the number of rows in `data`.
data.shape[0]

217229

# Data set

In [31]:
# NOTE(review): `cut` is not read by any cell visible here; the train/test
# split in the next cell uses a 0.4 test fraction and does not refer to it.
# Confirm whether it is used further down or is a leftover.
cut = 0.1

In [32]:
# Share of the rows held out as the test set, and the seed that makes the
# shuffle (and therefore the split) identical on every run.
TEST_FRACTION = 0.4
SPLIT_SEED = 0

# NOx is the regression target. NO and NO2 are removed from the features
# together with it (presumably because NOx is derived from them - confirm).
y = np.array(data['NOx'])
X = np.array(data.drop(['NOx', 'NO', 'NO2'], axis=1))

# Shuffle the row indices with a dedicated seeded generator rather than the
# unseeded global one, so a Restart & Run All reproduces the same split.
idx = np.random.RandomState(SPLIT_SEED).permutation(X.shape[0])
n_test = int(X.shape[0] * TEST_FRACTION)
test_idx = idx[:n_test]
train_idx = idx[n_test:]

X_train = X[train_idx]
y_train = y[train_idx]

X_test = X[test_idx]
y_test = y[test_idx]

In [33]:
def mse(y_true, y_pred):
    """Return the mean squared error between targets and predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values; must broadcast against ``y_true``.

    Returns
    -------
    float
        Mean of the element-wise squared differences.
    """
    # Coerce both inputs to float arrays: plain Python sequences then work,
    # and unsigned-integer arrays no longer wrap around on subtraction.
    residual = np.asarray(y_true, dtype=float) - np.asarray(y_pred, dtype=float)
    return np.mean(residual * residual)

# Hyperparameter search

# Model cross-val

In [34]:
# Display the training feature array.
X_train

array([[  2.33098052e+01,   1.92872425e+00,   1.10000000e-01, ...,
          1.45398063e+02,   2.41644380e+00,   1.99169579e+00],
       [  2.10000000e+01,   1.92872425e+00,   3.00000000e-01, ...,
          1.92000000e+02,   8.00000000e-01,   3.00000000e-01],
       [  2.33098052e+01,   1.92872425e+00,   2.70000000e-01, ...,
          1.45398063e+02,   2.41644380e+00,   1.99169579e+00],
       ..., 
       [  2.80000000e+01,   2.00000000e+00,   1.07000000e+00, ...,
          7.40000000e+01,   1.00000000e+00,   2.00000000e-01],
       [  2.70000000e+01,   1.92872425e+00,   2.50000000e-01, ...,
          7.10000000e+01,   1.00000000e+00,   1.00000000e+00],
       [  3.00000000e+01,   1.92872425e+00,   3.60000000e-01, ...,
          2.81000000e+02,   1.40000000e+00,   7.00000000e-01]])

In [35]:
from sklearn.exceptions import ConvergenceWarning

# One entry per candidate: (label, estimator class, constructor kwargs).
# Keeping the three pieces together means a model cannot be enabled or
# disabled without its label and parameters following along.
model_specs = [
    ("Decision Tree:", DecisionTreeRegressor, {}),
    ("Random Forest:", RandomForestRegressor, {"n_jobs": 4}),
    ("KNN:\t", KNeighborsRegressor, {"n_jobs": 4}),
    ("Neural Net:", MLPRegressor, {"hidden_layer_sizes": (200, 100, 50), "max_iter": 10}),
    # ("SVM:\t", SVR, {}),  # left disabled, as in the recorded run
]

# The original parallel lists are still defined, now with matching lengths,
# in case another cell refers to them by name.
model_name = [label for label, _, _ in model_specs]
models = [estimator for _, estimator, _ in model_specs]
model_param = [kwargs for _, _, kwargs in model_specs]
n_folds = 5

for name, regressor, param in model_specs:
    # Per-fold scores for the current model.
    mse_list = []
    r2_score_list = []
    explained_variance_score_list = []

    # Cross-validate inside the training split only; the test split is not touched here.
    kf = KFold(n_splits=n_folds)
    for train, test in kf.split(X_train, y_train):
        # A fresh estimator per fold, so nothing learned carries over between folds.
        model = regressor(**param)

        # The network is capped at 10 iterations, so a convergence warning is
        # expected on every fold; silence only that category, only while fitting.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=ConvergenceWarning)
            model.fit(X_train[train], y_train[train])

        pred = model.predict(X_train[test])
        y_fold = y_train[test]

        mse_list.append(mse(y_fold, pred))
        r2_score_list.append(r2_score(y_fold, pred))
        explained_variance_score_list.append(explained_variance_score(y_fold, pred))

    # Report mean ± standard deviation across the folds for each metric.
    print("{}\t MSE={:.2f}±{:.2f} \t r-squared={:.2f}±{:.2f} \t explained var={:.2f}±{:.2f}".format(
        name,
        np.mean(mse_list),
        np.std(mse_list),
        np.mean(r2_score_list),
        np.std(r2_score_list),
        np.mean(explained_variance_score_list),
        np.std(explained_variance_score_list)))


Decision Tree:	 MSE=127.93±3.27 	 r-squared=0.82±0.01 	 explained var=0.82±0.01
Random Forest:	 MSE=69.62±1.90 	 r-squared=0.90±0.00 	 explained var=0.90±0.00
KNN:		 MSE=222.81±3.20 	 r-squared=0.69±0.01 	 explained var=0.69±0.01
Neural Net:	 MSE=97.92±13.86 	 r-squared=0.86±0.02 	 explained var=0.87±0.01
