In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import r2_score,mean_squared_error
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn import ensemble
from sklearn.model_selection import GridSearchCV, cross_val_score,cross_validate, train_test_split
from sklearn.dummy import DummyRegressor

# Don't forget to change the working directory.

In [3]:
%cd /Users/datasandwich/Documents/AI
data = pd.read_csv('coursework1.csv')

/Users/datasandwich/Documents/AI


In [4]:
data.describe()

Unnamed: 0,AT,AP,AH,AFDP,GTEP,TIT,TAT,TEY,CDP,NOX
count,15039.0,15039.0,15039.0,15039.0,15039.0,15039.0,15039.0,15039.0,15039.0,15039.0
mean,17.764381,1013.19924,79.124174,4.200294,25.419061,1083.79877,545.396183,134.188464,12.102353,68.190934
std,7.574323,6.41076,13.793439,0.760197,4.173916,16.527806,7.866803,15.829717,1.103196,10.470586
min,0.5223,985.85,30.344,2.0874,17.878,1000.8,512.45,100.17,9.9044,27.765
25%,11.408,1008.9,69.75,3.7239,23.294,1079.6,542.17,127.985,11.622,61.3035
50%,18.186,1012.8,82.266,4.1862,25.082,1088.7,549.89,133.78,12.025,66.601
75%,23.8625,1016.9,90.0435,4.5509,27.184,1096.0,550.06,140.895,12.578,73.9355
max,34.929,1034.2,100.2,7.6106,37.402,1100.8,550.61,174.61,15.081,119.89


In [None]:
sns.displot(data['NOX'],kind='kde')

# Baseline

In [5]:
X = data.drop('NOX',axis=1)
y = data['NOX']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
clf = DummyRegressor()
clf.fit(X_train, y_train)
base_line = clf.score(X_test, y_test)
print("Baseline coefficient of determination: {}".format(base_line))

Baseline coefficient of determination: -7.300965809187154e-05


# Gradient boosting regression

In [6]:
X = data.drop('NOX',axis=1)
y = data['NOX']
parameters = {'learning_rate':[0.1,1], 'n_estimators':[1, 10],'max_depth':[3,13]}
reg = ensemble.GradientBoostingRegressor(**parameters)
clf_gb = GridSearchCV(reg, parameters,cv=5)
clf_gb.fit(X, y)

index=clf_gb.cv_results_['rank_test_score'][0]
n_estimators=clf_gb.cv_results_['param_n_estimators'][index-1]
max_depth=clf_gb.cv_results_['param_max_depth'][index-1]
learning_rate=clf_gb.cv_results_['param_learning_rate'][index-1]
fit_time_gb=clf_gb.cv_results_['mean_fit_time'][index-1]
print(
    "Most successful iteration: {}th, Optimal number of estimators: {}, Optimal tree depth: {}, Optimal learning rate: {}, Fit time: {}".format(
        index,
        n_estimators,
        max_depth,
        learning_rate,
        fit_time_gb)
    )

print("Refitting with optimal parameters!")

parameters = {'learning_rate':learning_rate, 'n_estimators':n_estimators,'max_depth':max_depth}
reg = ensemble.GradientBoostingRegressor(**parameters)
scores = cross_validate(reg, X, y, cv=5,
                        scoring=('r2', 'neg_mean_squared_error'),
                        return_train_score=True)

print("Negative mean squared error (5-fold average): {}".format(scores['test_neg_mean_squared_error'].mean()))

print("Training coefficient of determination (5-fold average): {}".format(scores['train_r2'].mean()))

scores = cross_validate(reg, X, y, cv=5,
                        scoring=('r2'),
                        return_train_score=False)

r2_GB=scores['test_score'].mean()

print("Testing coefficient of determination (5-fold average): {}".format(r2_GB))

print("COMPLETE")

Most successful iteration: 8th, Optimal number of estimators: 10, Optimal tree depth: 13, Optimal learning rate: 1, Fit time: 0.7769324779510498
Refitting with optimal parameters!
Negative mean squared error (5-fold average): -50.484149315188674
Training coefficient of determination (5-fold average): 0.9962596004708548
Testing coefficient of determination (5-fold average): 0.2586750861269104
COMPLETE


# Support vector regression

In [7]:
X = data.drop('NOX',axis=1)
y = data['NOX']
scaler=StandardScaler() 
scaler.fit_transform(X,y)
parameters = { 'C':[1, 10],'epsilon':[0.1,1]}
regr = SVR()
clf_svr = GridSearchCV(regr, parameters,cv=5)
clf_svr.fit(X, y)

index=clf_svr.cv_results_['rank_test_score'][0]
C=clf_svr.cv_results_['param_C'][index-1]
epsilon=clf_svr.cv_results_['param_epsilon'][index-1]
fit_time_svr=clf_svr.cv_results_['mean_fit_time'][index-1]

print(
    "Most successful iteration: {}th, Optimal C: {}, Optimal epsilon: {}, Fit time: {}".format(
    index,
    C,
    epsilon,
    fit_time_svr)
    )

print("Refitting with optimal parameters!")

parameters = { 'C':C,'epsilon':epsilon}
regr = SVR(**parameters)
scores = cross_validate(regr, X, y, cv=3,
                        scoring=('r2', 'neg_mean_squared_error'),
                        return_train_score=True)
print("Negative mean squared error (3-fold average): {}".format(scores['test_neg_mean_squared_error'].mean()))

print("Training coefficient of determination (3-fold average): {}".format(scores['train_r2'].mean()))

scores = cross_validate(regr, X, y, cv=3,
                        scoring=('r2'),
                        return_train_score=False)

r2_SVR=scores['test_score'].mean()

print("Testing coefficient of determination (3-fold average): {}".format(r2_SVR))

print("COMPLETE")

Most successful iteration: 4th, Optimal C: 10, Optimal epsilon: 1, Fit time: 2.683878707885742
Refitting with optimal parameters!
Negative mean squared error (3-fold average): -107.97068864338839
Training coefficient of determination (3-fold average): 0.2828652597114963
Testing coefficient of determination (3-fold average): -0.05363543626938031
COMPLETE


### Insights

In [8]:
fit_time_ratio=fit_time_svr/fit_time_gb
print("Gradient boosting is {} times faster than SVR".format(fit_time_ratio))

Gradient boosting is 3.4544555467210603 times faster than SVR


To save time, it is possible to use a linear kernel instead of an rbf kernel for the SVR. However, with the SVR model currently boasting a testing r^2 score of -0.05, this avenue is not worth exploring.

##### Test set score overview

In [9]:
print("GB: {}".format(r2_GB),
      "SVR: {}".format(r2_SVR),
      "Base line: {}".format(base_line)
     )

GB: 0.2586750861269104 SVR: -0.05363543626938031 Base line: -7.300965809187154e-05
