In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import linear_model
from sklearn import svm
from sklearn import neural_network
from sklearn import neighbors
from sklearn import tree
from sklearn import ensemble
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn import metrics
from sklearn.model_selection import train_test_split
pd.set_option('display.max_columns', 100)

In [2]:
df = pd.read_csv('data/CollectionDatasets/canada_per_capita.csv', encoding='utf-8')
df

Unnamed: 0,year,per capita income (US$)
0,1970,3399.299037
1,1971,3768.297935
2,1972,4251.175484
3,1973,4804.463248
4,1974,5576.514583
5,1975,5998.144346
6,1976,7062.131392
7,1977,7100.12617
8,1978,7247.967035
9,1979,7602.912681


In [3]:
df.dtypes

year                         int64
per capita income (US$)    float64
dtype: object

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47 entries, 0 to 46
Data columns (total 2 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   year                     47 non-null     int64  
 1   per capita income (US$)  47 non-null     float64
dtypes: float64(1), int64(1)
memory usage: 880.0 bytes


In [5]:
df.isna().sum()

year                       0
per capita income (US$)    0
dtype: int64

In [6]:
X = df.drop('per capita income (US$)', axis=1)
y = df['per capita income (US$)']

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)
print(X_train.shape)
print(X_test.shape)

(37, 1)
(10, 1)


In [11]:
models = [linear_model.LinearRegression(),
          neighbors.KNeighborsRegressor(),
          Pipeline([('poly', PolynomialFeatures(degree=2)),('linear', linear_model.LinearRegression())]),
          Pipeline([('poly', PolynomialFeatures(degree=3)),('linear', linear_model.LinearRegression())]),
          Pipeline([('poly', PolynomialFeatures(degree=4)),('linear', linear_model.LinearRegression())]),
          tree.DecisionTreeRegressor(max_depth=3),
          ensemble.RandomForestRegressor(max_depth=3),
          ensemble.GradientBoostingRegressor(n_estimators=100)]

In [12]:
df_model_comp = pd.DataFrame()
for model in models:
    dic = dict()
    dic['モデル名'] = model.__class__.__name__
    model.fit(X_train,y_train)
    y_pred = model.predict(X_train)
    dic['決定係数(訓練)'] = metrics.r2_score(y_train, y_pred)
    dic['平均絶対誤差(訓練)'] = metrics.mean_absolute_error(y_train, y_pred)
    dic['平均二乗誤差(訓練)'] = metrics.mean_squared_error(y_train, y_pred)
    dic['二乗平均平方根誤差(訓練)'] = np.sqrt(metrics.mean_squared_error(y_train, y_pred))
    y_pred = model.predict(X_test)
    dic['決定係数(テスト)'] = metrics.r2_score(y_test, y_pred)
    dic['平均絶対誤差(テスト)'] = metrics.mean_absolute_error(y_test, y_pred)
    dic['平均二乗誤差(テスト)'] = metrics.mean_squared_error(y_test, y_pred)
    dic['二乗平均平方根誤差(テスト)'] = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
    df_model_comp = df_model_comp.append(dic,ignore_index=True)
df_model_comp

Unnamed: 0,モデル名,決定係数(訓練),平均絶対誤差(訓練),平均二乗誤差(訓練),二乗平均平方根誤差(訓練),決定係数(テスト),平均絶対誤差(テスト),平均二乗誤差(テスト),二乗平均平方根誤差(テスト)
0,LinearRegression,0.914463,2646.46311,12087410.0,3476.695022,0.800069,4657.751349,28552750.0,5343.477248
1,KNeighborsRegressor,0.968745,1283.262663,4416718.0,2101.598872,0.97179,1472.692615,4028759.0,2007.176823
2,Pipeline,0.935988,2105.894281,9045693.0,3007.605862,0.890062,3126.231287,15700580.0,3962.395557
3,Pipeline,0.93649,2174.880186,8974728.0,2995.785098,0.880772,3306.879728,17027360.0,4126.421644
4,Pipeline,0.936515,2175.874598,8971155.0,2995.188653,0.880567,3311.564528,17056540.0,4129.956508
5,DecisionTreeRegressor,0.978182,1240.013031,3083177.0,1755.897805,0.966301,1908.731976,4812594.0,2193.762457
6,RandomForestRegressor,0.987159,994.29208,1814534.0,1347.04654,0.980958,1169.140209,2719437.0,1649.071654
7,GradientBoostingRegressor,0.999996,14.408211,629.3695,25.087238,0.984672,1024.070658,2189103.0,1479.56176
