In [1]:
#Prevent unnecessary warnings.
import warnings
warnings.filterwarnings("ignore")

#Import the core data-science packages.
import numpy       as np
import matplotlib  as mpl
import matplotlib.pyplot as plt
import pandas      as pd
import scipy.stats as st
import seaborn     as sns

#Import scikit-learn preprocessing, model-selection, and evaluation-metric utilities.
from sklearn.preprocessing   import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics         import r2_score

#Import the Boston dataset used this time.
from sklearn.datasets     import load_boston

#Import the regression algorithms.
from sklearn.linear_model import RidgeCV, LassoCV, LinearRegression, ElasticNet
from sklearn.svm          import SVR
from sklearn.ensemble     import RandomForestRegressor, GradientBoostingRegressor
from xgboost              import XGBRegressor

In [2]:
#Fetch the Boston house-price dataset that ships with scikit-learn.
#NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
#1.2, so this cell needs an older scikit-learn release to run.
boston = load_boston()

#Feature matrix (x) and target vector (y).
x, y = boston.data, boston.target

#Report the feature names and the size of what was loaded.
print('Feature column name')
print(boston.feature_names)
shape_summary = "Sample data volume: %d, number of features: %d"% x.shape
print(shape_summary)
print("Target sample data volume: %d"% len(y))


Feature column name
['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
 'B' 'LSTAT']
Sample data volume: 506, number of features: 13
Target sample data volume: 506


In [3]:
#Wrap the raw feature matrix in a DataFrame labelled with the dataset's feature names.
x = pd.DataFrame(data=boston.data, columns=boston.feature_names)

In [4]:
#Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=28,
)

#Standardize the features: the scaler is fitted on the training rows only and
#the same fitted transformation is then applied to both splits.
ss = StandardScaler()
ss.fit(x_train)
x_train = ss.transform(x_train)
x_test = ss.transform(x_test)

In [5]:
#Display labels, one per entry of `models` below (same order).
#NOTE(review): 'LinerRegression' and 'Random Forrest' are misspelled; they are
#left unchanged here because they are printed verbatim in the results.
names = ['LinerRegression',
          'Ridge',
          'Lasso',
          'Random Forrest',
          'GBDT',
          'Support Vector Regression',
          'ElasticNet',
          'XgBoost']


#Define the models.
# The *CV estimators choose alpha from the given candidates using `cv`-fold
# cross-validation on the data they are fitted on.
# The random forest and gradient boosting estimators get a fixed random_state
# (28, the same value used for the train/test split) so that their scores are
# reproducible across kernel restarts.
models = [LinearRegression(),
         RidgeCV(alphas=(0.001,0.1,1),cv=3),
         LassoCV(alphas=(0.001,0.1,1),cv=5),
         RandomForestRegressor(n_estimators=10, random_state=28),
         GradientBoostingRegressor(n_estimators=30, random_state=28),
         SVR(),
         ElasticNet(alpha=0.001,max_iter=10000),
         XGBRegressor()]
# Output the R2 scores of all regression models.

#R2 scoring helper.
def R2(model, x_train, x_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    Calls ``model.fit(x_train, y_train)``, uses the object returned by ``fit``
    to predict on ``x_test``, and returns ``r2_score(y_test, predictions)``.
    The model passed in is fitted as a side effect of this call.
    """
    predictions = model.fit(x_train, y_train).predict(x_test)
    return r2_score(y_test, predictions)

#Score every model on the same train/test split and print one line per model.
#zip() silently stops at the shorter sequence, so make a label/model mismatch loud.
assert len(names) == len(models), "names and models must have the same length"
for name, model in zip(names, models):
    score = R2(model, x_train, x_test, y_train, y_test)
    # r2_score may return a plain Python float, which has no .mean()/.std();
    # np.mean/np.std accept scalars and print the same values. With a single
    # split there is only one score, so the second number is always 0.0000.
    print("{}: {:.6f}, {:.4f}".format(name, np.mean(score), np.std(score)))


LinerRegression: 0.564115, 0.0000
Ridge: 0.563673, 0.0000
Lasso: 0.564049, 0.0000
Random Forrest: 0.666344, 0.0000
GBDT: 0.738362, 0.0000
Support Vector Regression: 0.517260, 0.0000
ElasticNet: 0.563992, 0.0000
XgBoost: 0.761123, 0.0000
