In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn import svm
from sklearn import neural_network
from sklearn import neighbors
from sklearn import tree
from sklearn import ensemble
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn import metrics
from sklearn.model_selection import train_test_split
pd.set_option('display.max_columns', 100)

In [2]:
df = pd.read_csv('data/CollectionDatasets/daily-min-temperatures.csv', encoding='utf-8')
df

Unnamed: 0,Date,Temp
0,1981-01-01,20.7
1,1981-01-02,17.9
2,1981-01-03,18.8
3,1981-01-04,14.6
4,1981-01-05,15.8
...,...,...
3645,1990-12-27,14.0
3646,1990-12-28,13.6
3647,1990-12-29,13.5
3648,1990-12-30,15.7


In [15]:
df['Year'] = df['Date'].apply(lambda x: int(x.split("-")[0]))
df['Month'] = df['Date'].apply(lambda x: float(x.split("-")[1]))
df['Day'] = df['Date'].apply(lambda x: float(x.split("-")[2]))

In [18]:
df

Unnamed: 0,Date,Temp,Year,Month,Day,Month_sin,Month_cos
0,1981-01-01,20.7,1981,1.0,1.0,5.000000e-01,0.866025
1,1981-01-02,17.9,1981,1.0,2.0,5.000000e-01,0.866025
2,1981-01-03,18.8,1981,1.0,3.0,5.000000e-01,0.866025
3,1981-01-04,14.6,1981,1.0,4.0,5.000000e-01,0.866025
4,1981-01-05,15.8,1981,1.0,5.0,5.000000e-01,0.866025
...,...,...,...,...,...,...,...
3645,1990-12-27,14.0,1990,12.0,27.0,-2.449294e-16,1.000000
3646,1990-12-28,13.6,1990,12.0,28.0,-2.449294e-16,1.000000
3647,1990-12-29,13.5,1990,12.0,29.0,-2.449294e-16,1.000000
3648,1990-12-30,15.7,1990,12.0,30.0,-2.449294e-16,1.000000


In [17]:
df['Month_sin'] = np.sin(2*np.pi * df['Month']/12)
df['Month_cos'] = np.cos(2*np.pi * df['Month']/12)

In [19]:
df_g = df.groupby(['Year','Month'])['Day'].count()

In [37]:
df_daysofMonth = df[['Year','Month']].apply(lambda x: df_g[(int(x[0]),int(x[1]))], axis=1)

In [38]:
df['Day_sin'] = np.sin(2*np.pi * df['Day']/df_daysofMonth)
df['Day_cos'] = np.cos(2*np.pi * df['Day']/df_daysofMonth)

In [5]:
df.isna().sum()

year                       0
per capita income (US$)    0
dtype: int64

In [40]:
df

Unnamed: 0,Date,Temp,Year,Month,Day,Month_sin,Month_cos,Day_sin,Day_cos
0,1981-01-01,20.7,1981,1.0,1.0,5.000000e-01,0.866025,2.012985e-01,0.979530
1,1981-01-02,17.9,1981,1.0,2.0,5.000000e-01,0.866025,3.943559e-01,0.918958
2,1981-01-03,18.8,1981,1.0,3.0,5.000000e-01,0.866025,5.712682e-01,0.820763
3,1981-01-04,14.6,1981,1.0,4.0,5.000000e-01,0.866025,7.247928e-01,0.688967
4,1981-01-05,15.8,1981,1.0,5.0,5.000000e-01,0.866025,8.486443e-01,0.528964
...,...,...,...,...,...,...,...,...,...
3645,1990-12-27,14.0,1990,12.0,27.0,-2.449294e-16,1.000000,-7.247928e-01,0.688967
3646,1990-12-28,13.6,1990,12.0,28.0,-2.449294e-16,1.000000,-5.712682e-01,0.820763
3647,1990-12-29,13.5,1990,12.0,29.0,-2.449294e-16,1.000000,-3.943559e-01,0.918958
3648,1990-12-30,15.7,1990,12.0,30.0,-2.449294e-16,1.000000,-2.012985e-01,0.979530


In [41]:
X = df.drop(['Date', 'Temp', 'Month', 'Day'], axis=1)
y = df['Temp']

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)
print(X_train.shape)
print(X_test.shape)

(2920, 5)
(730, 5)


In [43]:
models = [linear_model.LinearRegression(),
          neighbors.KNeighborsRegressor(),
          Pipeline([('poly', PolynomialFeatures(degree=2)),('linear', linear_model.LinearRegression())]),
          Pipeline([('poly', PolynomialFeatures(degree=3)),('linear', linear_model.LinearRegression())]),
          Pipeline([('poly', PolynomialFeatures(degree=4)),('linear', linear_model.LinearRegression())]),
          tree.DecisionTreeRegressor(max_depth=3),
          ensemble.RandomForestRegressor(max_depth=3),
          ensemble.GradientBoostingRegressor(n_estimators=100)]

In [44]:
df_model_comp = pd.DataFrame()
for model in models:
    dic = dict()
    dic['モデル名'] = model.__class__.__name__
    model.fit(X_train,y_train)
    y_pred = model.predict(X_train)
    dic['決定係数(訓練)'] = metrics.r2_score(y_train, y_pred)
    dic['平均絶対誤差(訓練)'] = metrics.mean_absolute_error(y_train, y_pred)
    dic['平均二乗誤差(訓練)'] = metrics.mean_squared_error(y_train, y_pred)
    dic['二乗平均平方根誤差(訓練)'] = np.sqrt(metrics.mean_squared_error(y_train, y_pred))
    y_pred = model.predict(X_test)
    dic['決定係数(テスト)'] = metrics.r2_score(y_test, y_pred)
    dic['平均絶対誤差(テスト)'] = metrics.mean_absolute_error(y_test, y_pred)
    dic['平均二乗誤差(テスト)'] = metrics.mean_squared_error(y_test, y_pred)
    dic['二乗平均平方根誤差(テスト)'] = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
    df_model_comp = df_model_comp.append(dic,ignore_index=True)
df_model_comp

Unnamed: 0,モデル名,決定係数(訓練),平均絶対誤差(訓練),平均二乗誤差(訓練),二乗平均平方根誤差(訓練),決定係数(テスト),平均絶対誤差(テスト),平均二乗誤差(テスト),二乗平均平方根誤差(テスト)
0,LinearRegression,0.518595,2.212065,7.881506,2.807402,0.563581,2.173478,7.588893,2.754795
1,KNeighborsRegressor,0.740256,1.611829,4.252497,2.062158,0.655243,1.926356,5.994994,2.448468
2,Pipeline,0.538232,2.170744,7.560008,2.749547,0.589733,2.139664,7.134136,2.67098
3,Pipeline,0.55331,2.134811,7.313158,2.704285,0.591617,2.121234,7.101381,2.664842
4,Pipeline,0.563585,2.109461,7.144925,2.672999,0.596574,2.110805,7.015189,2.64862
5,DecisionTreeRegressor,0.518653,2.215912,7.880549,2.807232,0.56741,2.181253,7.52232,2.742685
6,RandomForestRegressor,0.528274,2.188313,7.723032,2.779034,0.575504,2.158392,7.381569,2.716904
7,GradientBoostingRegressor,0.596235,2.02691,6.610392,2.571068,0.597977,2.122261,6.99078,2.644008


In [46]:
from sklearn.preprocessing import StandardScaler

scaling_columns = ['Year']
scaler = StandardScaler().fit(X_train[scaling_columns])

scaled_train = pd.DataFrame(scaler.transform(X_train[scaling_columns]), columns=scaling_columns, index=X_train.index)
X_train.update(scaled_train)

scaled_test = pd.DataFrame(scaler.transform(X_test[scaling_columns]), columns=scaling_columns, index=X_test.index)
X_test.update(scaled_test)

In [47]:
df_model_comp = pd.DataFrame()
for model in models:
    dic = dict()
    dic['モデル名'] = model.__class__.__name__
    model.fit(X_train,y_train)
    y_pred = model.predict(X_train)
    dic['決定係数(訓練)'] = metrics.r2_score(y_train, y_pred)
    dic['平均絶対誤差(訓練)'] = metrics.mean_absolute_error(y_train, y_pred)
    dic['平均二乗誤差(訓練)'] = metrics.mean_squared_error(y_train, y_pred)
    dic['二乗平均平方根誤差(訓練)'] = np.sqrt(metrics.mean_squared_error(y_train, y_pred))
    y_pred = model.predict(X_test)
    dic['決定係数(テスト)'] = metrics.r2_score(y_test, y_pred)
    dic['平均絶対誤差(テスト)'] = metrics.mean_absolute_error(y_test, y_pred)
    dic['平均二乗誤差(テスト)'] = metrics.mean_squared_error(y_test, y_pred)
    dic['二乗平均平方根誤差(テスト)'] = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
    df_model_comp = df_model_comp.append(dic,ignore_index=True)
df_model_comp

Unnamed: 0,モデル名,決定係数(訓練),平均絶対誤差(訓練),平均二乗誤差(訓練),二乗平均平方根誤差(訓練),決定係数(テスト),平均絶対誤差(テスト),平均二乗誤差(テスト),二乗平均平方根誤差(テスト)
0,LinearRegression,0.518595,2.212065,7.881506,2.807402,0.563581,2.173478,7.588893,2.754795
1,KNeighborsRegressor,0.736977,1.642267,4.30618,2.075134,0.615483,2.049644,6.686381,2.585804
2,Pipeline,0.538063,2.170564,7.562778,2.750051,0.589315,2.140621,7.141405,2.672341
3,Pipeline,0.551873,2.144819,7.336675,2.70863,0.589472,2.13842,7.138685,2.671832
4,Pipeline,0.565527,2.117891,7.113135,2.667046,0.591332,2.141174,7.106343,2.665772
5,DecisionTreeRegressor,0.518653,2.215912,7.880549,2.807232,0.56741,2.181253,7.52232,2.742685
6,RandomForestRegressor,0.527618,2.189765,7.733771,2.780966,0.574637,2.160124,7.396653,2.719679
7,GradientBoostingRegressor,0.596235,2.02691,6.610392,2.571068,0.597977,2.122261,6.99078,2.644008
