In [None]:
# Data handling, plotting, preprocessing and model-selection utilities
import numpy as np
from numpy import arange
from matplotlib import pyplot
import pandas as pd
from pandas import  set_option
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

# Regression models compared in this notebook
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.pipeline import Pipeline

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import mean_absolute_error

In [None]:
# Read the dataset from CSV; the first CSV column becomes the row index.
# NOTE(review): this is an absolute path on one specific machine -- adjust it
# before running the notebook anywhere else.
csv_path = '/Users/fuyang/Desktop/chapter2_all_data.csv'
dataset = pd.read_csv(csv_path, index_col=0)
# Last expression of the cell: preview the first rows.
dataset.head()

In [None]:
# Separate the 'Score' target column from the remaining feature columns, then
# hold out 30% of the rows as a test set (seeded so the split is repeatable).
target_column = 'Score'
Y = dataset[target_column]
X = dataset.drop(labels=target_column, axis=1)
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=7,
)

In [None]:
# Cross-validation settings shared by the comparison and tuning cells.
num_folds = 5

# scikit-learn scorer names. Error metrics carry the 'neg_' prefix because
# scorers follow a "greater is better" convention; the bare name
# 'mean_absolute_error' is not accepted by current scikit-learn releases.
scoring1 = 'neg_mean_squared_error'
scoring2 = 'r2'
# The MAE scorer gets its own name so it no longer overwrites the MSE scorer
# stored in scoring1.
scoring3 = 'neg_mean_absolute_error'

In [None]:
# Candidate models for the baseline comparison, keyed by a display name.
# Every estimator is wrapped in a Pipeline behind its own StandardScaler, so
# scaling is fitted together with the model wherever the pipeline is fitted.
# Estimators that draw random numbers (decision tree, random forest, MLP) get a
# fixed random_state so repeated runs report the same scores; 7 matches the
# seed already used for the train/test split.
pipelines = {}
pipelines['ScalerLR'] = Pipeline([('Scaler', StandardScaler()), ('LR', LinearRegression())])
pipelines['ScalerKNN'] = Pipeline([('Scaler', StandardScaler()), ('KNN', KNeighborsRegressor())])
pipelines['ScalerDT'] = Pipeline([('Scaler', StandardScaler()), ('CART', DecisionTreeRegressor(random_state=7))])
pipelines['ScalerSVM'] = Pipeline([('Scaler', StandardScaler()), ('SVM', SVR())])
pipelines['ScalerRF'] = Pipeline([('Scaler', StandardScaler()), ('RF', RandomForestRegressor(random_state=7))])
pipelines['ScalerDL'] = Pipeline([('Scaler', StandardScaler()), ('DL', MLPRegressor(random_state=7))])

In [None]:
# Baseline comparison: cross-validate every pipeline on the training split
# using the scorer named by scoring2, and print mean (std) of the fold scores.
# A single seeded KFold is built once, outside the loop, so that all models are
# evaluated on exactly the same folds and the numbers are repeatable. The fold
# count comes from num_folds instead of a second hard-coded 5.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=7)
results = []
for key, pipeline in pipelines.items():
    cv_result = cross_val_score(pipeline, X_train, Y_train, cv=kfold, scoring=scoring2)
    # Keep the per-fold scores, in the same order as the pipelines dict.
    results.append(cv_result)
    print('%s: %f (%f)' % (key, cv_result.mean(), cv_result.std()))

In [None]:
# Tune KNN: grid-search the number of neighbours on the standardized training set.
# NOTE(review): the scaler is fitted on all of X_train before cross-validation,
# so every validation fold has already influenced the scaling statistics.
# Searching over a Pipeline(StandardScaler, KNeighborsRegressor) would avoid
# that, but it renames the keys of best_params_ (e.g. 'KNN__n_neighbors'), so
# it is left as a follow-up rather than changed here.
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)
param_grid = {'n_neighbors': [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21]}
model1 = KNeighborsRegressor()
# Seeded shuffle: without random_state the folds (and therefore the reported
# best score / best parameters) change on every run.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=7)
grid = GridSearchCV(estimator=model1, param_grid=param_grid, scoring=scoring2, cv=kfold)
grid_result = grid.fit(X=rescaledX, y=Y_train)
print('best：%s select%s' % (grid_result.best_score_, grid_result.best_params_))
# One line per candidate: mean test score, its std across folds, and the parameters.
cv_results = zip(grid_result.cv_results_['mean_test_score'],
                 grid_result.cv_results_['std_test_score'],
                 grid_result.cv_results_['params'])
for mean, std, param in cv_results:
    print('%f (%f) with %r' % (mean, std, param))

In [None]:
# Tune the decision tree: grid-search split/leaf size limits and maximum depth
# on the standardized training set.
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)
# 10 x 11 x 14 = 1540 parameter combinations, each fitted once per fold.
param_grid = {'min_samples_split':range(10, 200, 20), 'min_samples_leaf':range(1, 110, 10),
              'max_depth':range(3, 30, 2)}
# Fixed random_state on the tree and on the fold shuffle, so the search reports
# the same scores and the same winner on every run.
model2 = DecisionTreeRegressor(random_state=7)
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=7)
grid = GridSearchCV(estimator=model2, param_grid=param_grid, scoring=scoring2, cv=kfold)
grid_result = grid.fit(X=rescaledX, y=Y_train)
print('best：%s select%s' % (grid_result.best_score_, grid_result.best_params_))
# One line per candidate: mean test score, its std across folds, and the parameters.
cv_results = zip(grid_result.cv_results_['mean_test_score'],
                 grid_result.cv_results_['std_test_score'],
                 grid_result.cv_results_['params'])
for mean, std, param in cv_results:
    print('%f (%f) with %r' % (mean, std, param))

In [None]:
# Tune the SVR: grid-search the kernel type on the standardized training set.
# NOTE(review): the scaler is fitted on all of X_train before cross-validation,
# so every validation fold has already influenced the scaling statistics.
# Searching over a Pipeline(StandardScaler, SVR) would avoid that, but it
# renames the keys of best_params_ (e.g. 'SVM__kernel'), so it is left as a
# follow-up rather than changed here.
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)
param_grid = {'kernel':['linear','poly','rbf','sigmoid']}
model3 = SVR()
# Seeded shuffle: without random_state the folds (and therefore the reported
# best score / best kernel) change on every run.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=7)
grid = GridSearchCV(estimator=model3, param_grid=param_grid, scoring=scoring2, cv=kfold)
grid_result = grid.fit(X=rescaledX, y=Y_train)
print('best：%s select%s' % (grid_result.best_score_, grid_result.best_params_))
# One line per candidate: mean test score, its std across folds, and the parameters.
cv_results = zip(grid_result.cv_results_['mean_test_score'],
                 grid_result.cv_results_['std_test_score'],
                 grid_result.cv_results_['params'])
for mean, std, param in cv_results:
    print('%f (%f) with %r' % (mean, std, param))

In [None]:
# Tune the random forest: grid-search the number of trees on the standardized
# training set.
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)
param_grid = {'n_estimators': [10, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900]}
# Fixed random_state on the forest and on the fold shuffle, so the search
# reports the same scores and the same winner on every run.
model4 = RandomForestRegressor(random_state=7)
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=7)
grid = GridSearchCV(estimator=model4, param_grid=param_grid, scoring=scoring2, cv=kfold)
grid_result = grid.fit(X=rescaledX, y=Y_train)
# Report the winning configuration, as the KNN / DT / SVM tuning cells do.
print('best：%s select%s' % (grid_result.best_score_, grid_result.best_params_))
# One line per candidate: mean test score, its std across folds, and the parameters.
cv_results = zip(grid_result.cv_results_['mean_test_score'],
                 grid_result.cv_results_['std_test_score'],
                 grid_result.cv_results_['params'])
for mean, std, param in cv_results:
    print('%f (%f) with %r' % (mean, std, param))