In [1]:
# 
import numpy as np
from numpy import arange
from matplotlib import pyplot
import pandas as pd
from pandas import  set_option
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Regression models
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor

# Stacking ensembles
from mlxtend.regressor import StackingRegressor
from mlxtend.regressor import StackingCVRegressor

# Evaluation metrics
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import mean_absolute_error

In [2]:
# Load the measurements; the first CSV column is used as the row index.
# NOTE(review): this is an absolute, machine-specific path -- the cell only
# runs where the file exists at exactly this location.
csv_path = "E:\\exercise\\202109-12\\kitchen waste.csv"
dataset = pd.read_csv(csv_path, index_col=0, encoding='gbk')
dataset.head()

Unnamed: 0,day,Temperature,MC,pH,Ammonia,Nitrate,TN,TOC,EC,OM,score
0,0,35.46,60.0,5.73,2515.0,325.49,1.81,48.37,32.88,1.78,22.42
1,3,41.87,56.88,7.13,3449.1,302.42,1.92,44.37,39.46,2.44,34.71
2,6,59.13,53.08,7.96,5197.6,243.4,1.84,37.6,30.55,3.01,37.68
3,9,53.12,49.42,8.18,1940.1,203.99,1.79,33.48,63.12,3.16,55.52
4,12,38.96,45.22,8.02,814.4,200.52,1.92,32.68,79.47,3.56,66.67


In [3]:
# Split into inputs/target, then hold out 30% of the rows for testing.
array = dataset.values
X, Y = array[:, :10], array[:, 10]  # columns 0-9 are the inputs, column 10 is the target
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=7,  # fixed seed so the split is repeatable
)

In [4]:
# Evaluation settings
# Number of folds passed to every KFold splitter in this notebook.
num_folds = 5
# scikit-learn scorer names. Error metrics carry the 'neg_' prefix because
# scorers follow a "greater is better" convention.
scoring1 = 'neg_mean_squared_error'
scoring2 = 'r2'
# This used to be a second assignment to scoring1 with 'mean_absolute_error',
# which discarded the MSE scorer and is not a valid scorer name.
scoring3 = 'neg_mean_absolute_error'

In [5]:
# Candidate models. Each one is wrapped in a Pipeline behind a StandardScaler,
# so the scaler is fitted together with the model on whatever data the
# pipeline is fitted on.
# Estimators with internal randomness get a fixed random_state (the same seed
# as the train/test split) so that scores are repeatable between runs.
pipelines = {
    'ScalerLR': Pipeline([('Scaler', StandardScaler()), ('LR', LinearRegression())]),
    'ScalerKNN': Pipeline([('Scaler', StandardScaler()), ('KNN', KNeighborsRegressor())]),
    'ScalerDT': Pipeline([('Scaler', StandardScaler()),
                          ('CART', DecisionTreeRegressor(random_state=7))]),
    'ScalerSVM': Pipeline([('Scaler', StandardScaler()), ('SVM', SVR())]),
    'ScalerRF': Pipeline([('Scaler', StandardScaler()),
                          ('RF', RandomForestRegressor(random_state=7))]),
    # NOTE(review): the MLP runs with default settings; if its score is poor,
    # check for convergence warnings and consider raising max_iter.
    'ScalerDL': Pipeline([('Scaler', StandardScaler()),
                          ('DL', MLPRegressor(random_state=7))]),
}

In [6]:
# Model performance: cross-validated score (scoring2) of every pipeline on the
# training set, printed as "name: mean (std)".
# A single seeded splitter is shared by all models so they are compared on
# identical folds and the numbers are repeatable between runs.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=7)
results = []
for key, pipeline in pipelines.items():
    cv_result = cross_val_score(pipeline, X_train, Y_train, cv=kfold, scoring=scoring2)
    results.append(cv_result)  # per-fold scores, kept in pipeline order
    print('%s: %f (%f)' % (key, cv_result.mean(), cv_result.std()))

ScalerLR: 0.846190 (0.029676)
ScalerKNN: 0.846668 (0.018541)
ScalerDT: 0.835875 (0.054600)
ScalerSVM: 0.574575 (0.062273)
ScalerRF: 0.905730 (0.007396)




ScalerDL: -2.783695 (0.485281)




In [7]:
# Algorithm optimization - KNN
# Grid-search the number of neighbours on standardised training data.
# NOTE(review): the scaler is fitted on all of X_train before the search, so
# every validation fold has already contributed to the scaling statistics.
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)
param_grid = {'n_neighbors': list(range(1, 22, 2))}  # odd values 1..21
model1 = KNeighborsRegressor()
# Seeded so the shuffled folds (and therefore the scores) are repeatable.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=7)
grid = GridSearchCV(estimator=model1, param_grid=param_grid, scoring=scoring2, cv=kfold)
grid_result = grid.fit(X=rescaledX, y=Y_train)
print('Best：%s Use%s' % (grid_result.best_score_, grid_result.best_params_))
# One line per candidate: mean score, std over folds, parameter setting.
cv_results = zip(grid_result.cv_results_['mean_test_score'],
                 grid_result.cv_results_['std_test_score'],
                 grid_result.cv_results_['params'])
for mean, std, param in cv_results:
    print('%f (%f) with %r' % (mean, std, param))

Best：0.877893863543975 Use{'n_neighbors': 3}
0.867092 (0.035341) with {'n_neighbors': 1}
0.877894 (0.031072) with {'n_neighbors': 3}
0.840986 (0.037722) with {'n_neighbors': 5}
0.813244 (0.036953) with {'n_neighbors': 7}
0.793971 (0.032059) with {'n_neighbors': 9}
0.783521 (0.031837) with {'n_neighbors': 11}
0.765648 (0.034443) with {'n_neighbors': 13}
0.746793 (0.036532) with {'n_neighbors': 15}
0.731316 (0.035882) with {'n_neighbors': 17}
0.720385 (0.041106) with {'n_neighbors': 19}
0.716498 (0.046391) with {'n_neighbors': 21}


In [8]:
# Algorithm optimization - DT
# Grid-search the tree's split/leaf/depth limits on standardised training data.
# NOTE(review): the scaler is fitted on all of X_train before the search, so
# every validation fold has already contributed to the scaling statistics.
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)
# 10 x 11 x 14 = 1540 candidate settings, each evaluated on num_folds folds.
param_grid = {'min_samples_split':range(10, 200, 20), 'min_samples_leaf':range(1, 110, 10),'max_depth':range(3, 30, 2)}
# Seeded tree and seeded folds so the search result is repeatable.
model2 = DecisionTreeRegressor(random_state=7)
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=7)
grid = GridSearchCV(estimator=model2, param_grid=param_grid, scoring=scoring2, cv=kfold)
grid_result = grid.fit(X=rescaledX, y=Y_train)
print('Best：%s Use%s' % (grid_result.best_score_, grid_result.best_params_))
# One line per candidate: mean score, std over folds, parameter setting.
cv_results = zip(grid_result.cv_results_['mean_test_score'],
                 grid_result.cv_results_['std_test_score'],
                 grid_result.cv_results_['params'])
for mean, std, param in cv_results:
    print('%f (%f) with %r' % (mean, std, param))

Best：0.8428719054937016 Use{'max_depth': 23, 'min_samples_leaf': 1, 'min_samples_split': 10}
0.804946 (0.050012) with {'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 10}
0.803309 (0.052610) with {'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 30}
0.779407 (0.045356) with {'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 50}
0.756466 (0.053232) with {'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 70}
0.712691 (0.038327) with {'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 90}
0.611351 (0.108732) with {'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 110}
0.576986 (0.078371) with {'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 130}
0.576986 (0.078371) with {'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 150}
0.576986 (0.078371) with {'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 170}
0.576986 (0.078371) with {'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 190}
0.

In [9]:
# Algorithm optimization - SVM
# Grid-search the SVR kernel on standardised training data.
# NOTE(review): the scaler is fitted on all of X_train before the search, so
# every validation fold has already contributed to the scaling statistics.
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)
param_grid = {'kernel':['linear','poly','rbf','sigmoid']}
model3 = SVR()
# Seeded so the shuffled folds (and therefore the scores) are repeatable.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=7)
grid = GridSearchCV(estimator=model3, param_grid=param_grid, scoring=scoring2, cv=kfold)
grid_result = grid.fit(X=rescaledX, y=Y_train)
print('Best：%s Use%s' % (grid_result.best_score_, grid_result.best_params_))
# One line per candidate: mean score, std over folds, parameter setting.
cv_results = zip(grid_result.cv_results_['mean_test_score'],
                 grid_result.cv_results_['std_test_score'],
                 grid_result.cv_results_['params'])
for mean, std, param in cv_results:
    print('%f (%f) with %r' % (mean, std, param))

Best：0.8304244375447587 Use{'kernel': 'linear'}
0.830424 (0.027984) with {'kernel': 'linear'}
0.487479 (0.039399) with {'kernel': 'poly'}
0.582028 (0.039461) with {'kernel': 'rbf'}
0.707601 (0.050273) with {'kernel': 'sigmoid'}


In [13]:
# Algorithm optimization - RF
# Grid-search the number of trees on standardised training data.
# NOTE(review): the scaler is fitted on all of X_train before the search, so
# every validation fold has already contributed to the scaling statistics.
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)
param_grid = {'n_estimators': [10, 20, 50, 100, 150, 200, 250, 300, 400, 500]}
# Seeded forest and seeded folds so the search result is repeatable.
model4 = RandomForestRegressor(random_state=7)
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=7)
grid = GridSearchCV(estimator=model4, param_grid=param_grid, scoring=scoring2, cv=kfold)
grid_result = grid.fit(X=rescaledX, y=Y_train)
print('Best：%s Use%s' % (grid_result.best_score_, grid_result.best_params_))

Best：0.9207321196446803 Use{'n_estimators': 100}
