In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

# Reading data

In [2]:
# Reading data
dummy = pd.read_csv('../data/dummy.csv')
dummy.head()

Unnamed: 0.1,Unnamed: 0,NO2,NO,NH3,slope0,slope1,slope2,slope3,slope4,slope5,...,avg40,avg41,avg42,avg43,avg44,avg45,avg46,avg47,avg48,avg49
0,mix0,50,100,25,0.000261,0.000347,0.000378,0.000342,0.000255,0.000254,...,1.788009e-07,-1e-06,-1.270353e-06,-1.920155e-06,-1.117044e-06,-6.934313e-07,8.882652e-08,4.68777e-07,2.236791e-07,-1.48427e-07
1,mix1,100,25,100,0.000244,0.000334,0.000371,0.000342,0.000256,0.000257,...,-6.433617e-07,2e-06,2.373382e-06,1.108367e-06,1.701933e-06,7.34507e-07,-3.755813e-07,-6.079087e-08,-4.27783e-07,4.461413e-07
2,mix2,25,100,50,0.000258,0.000343,0.000374,0.00034,0.000256,0.000254,...,-3.621577e-06,-3e-06,-9.317178e-07,7.781265e-07,-3.612217e-08,-9.233391e-07,-1.31073e-07,5.655704e-07,6.936261e-08,-1.512365e-07
3,mix3,50,25,100,0.000249,0.000341,0.000375,0.000341,0.000261,0.000261,...,2.51096e-06,3e-06,1.811447e-06,7.319652e-08,4.306713e-07,6.00013e-08,-3.678943e-07,4.973391e-08,-5.947913e-08,-2.831873e-06
4,mix4,100,100,25,0.000256,0.00034,0.000371,0.000332,0.000252,0.000251,...,3.064678e-06,3e-06,9.766817e-07,-7.697217e-07,-3.745022e-07,-4.271674e-07,4.307696e-08,-7.368087e-08,-2.749222e-07,-2.75593e-06


# Linear Regression

In [3]:
# Dependent Variables - gas concentrations
y = np.asarray(dummy.iloc[:, 1:4])

# Independent variables - shape features
X = np.asarray(dummy.iloc[:, 4:])

In [4]:
# Train test split
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [5]:
steps = [('scaler', StandardScaler()),
         ('linreg', LinearRegression())]

from sklearn.pipeline import Pipeline

lr_pipe = Pipeline(steps)
lr_pipe.fit(X_train, y_train)
y_pred = lr_pipe.predict(X_test)

In [6]:
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test, y_pred, squared = False)

77.42353912076364

# Principal Components Regression - PCR

In [7]:
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA

steps = [('scaler', StandardScaler()),
         ('PCA', PCA()),
         ('linreg', LinearRegression())]

params = [{'PCA__n_components':list(range(1,14))}]

scores = ['r2', 'neg_mean_absolute_error']

pcr_pipe = Pipeline(steps)

search = GridSearchCV(pcr_pipe, params, scoring = scores, n_jobs = -1, refit = 'neg_mean_absolute_error')
search.fit(X_train, y_train)
print(search.best_params_)
print(search.best_score_)


y_pred = search.predict(X_test)
mean_squared_error(y_test, y_pred, squared = False)

{'PCA__n_components': 2}
-25.99983606209036


29.71710490312427

In [8]:
search.cv_results_

{'mean_fit_time': array([0.00411854, 0.00272102, 0.00246658, 0.00347152, 0.00342054,
        0.00214963, 0.00246806, 0.00224137, 0.00177698, 0.00169916,
        0.00153499, 0.00144558, 0.00168414]),
 'std_fit_time': array([1.48344309e-03, 2.32620080e-04, 4.44340895e-04, 2.05463340e-03,
        1.69836734e-03, 4.53831253e-05, 2.05423274e-04, 2.43447435e-04,
        2.04218305e-04, 1.84003137e-04, 1.02781074e-04, 1.41351891e-04,
        3.60038497e-04]),
 'mean_score_time': array([0.00135465, 0.00120473, 0.0011158 , 0.00161815, 0.00157986,
        0.00105753, 0.00117159, 0.00113301, 0.00085473, 0.00080762,
        0.00074363, 0.00073094, 0.00081186]),
 'std_score_time': array([2.34525627e-04, 1.13324943e-04, 6.10642066e-05, 1.05965047e-03,
        7.48482154e-04, 2.46892080e-05, 1.69667680e-04, 1.18286798e-04,
        1.56725646e-04, 1.08341411e-04, 1.11553910e-04, 1.07475901e-04,
        2.42237425e-04]),
 'param_PCA__n_components': masked_array(data=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 

# Partial Least Squares Regression - PLSR

In [9]:
from sklearn.cross_decomposition import PLSRegression

steps = [('PLSR', PLSRegression())]

params = [{'PLSR__n_components':list(range(1,14))}]

scores = ['r2', 'neg_mean_absolute_error']

plsr_pipe = Pipeline(steps)

search = GridSearchCV(plsr_pipe, params, scoring = scores, n_jobs = -1, refit = 'neg_mean_absolute_error')
search.fit(X_train, y_train)
print(search.best_params_)
print(search.best_score_)

y_pred = search.predict(X_test)
mean_squared_error(y_test, y_pred, squared = False)

{'PLSR__n_components': 2}
-25.367570152904737


28.111505285999964

In [10]:
search.cv_results_

{'mean_fit_time': array([0.00155435, 0.00208187, 0.00233078, 0.00344887, 0.00470395,
        0.00488658, 0.00583682, 0.00682616, 0.00789046, 0.00822105,
        0.00908585, 0.00985637, 0.01009388]),
 'std_fit_time': array([0.00020301, 0.00021921, 0.00033024, 0.00057765, 0.00029189,
        0.00047359, 0.00047067, 0.00039213, 0.00032242, 0.00056675,
        0.00050602, 0.00092421, 0.00114737]),
 'mean_score_time': array([0.00075722, 0.00069137, 0.00064077, 0.0007967 , 0.00080042,
        0.00076809, 0.00078092, 0.00081439, 0.0008378 , 0.00085692,
        0.00083137, 0.00084472, 0.00073357]),
 'std_score_time': array([9.83285163e-05, 9.86469253e-05, 9.43454100e-05, 1.11169406e-04,
        6.96424547e-05, 9.60234017e-05, 6.89557642e-05, 6.09188910e-05,
        8.34049394e-05, 2.96255208e-05, 6.06901506e-05, 7.80093058e-05,
        1.43296778e-04]),
 'param_PLSR__n_components': masked_array(data=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
              mask=[False, False, False, False, Fa

In [11]:
# define model
model = DecisionTreeRegressor()
# define the evaluation procedure
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate the model and collect the scores
n_scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
# force the scores to be positive
n_scores = absolute(n_scores)
# summarize performance
print('MAE: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

NameError: name 'DecisionTreeRegressor' is not defined