In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
import sys
import os

# Append the directory two levels above the current working directory to the
# module search path.
# NOTE(review): presumably this is what makes the project-local `domain`
# package importable in later cells; it depends on where the kernel was
# started — confirm.
sys.path.append(os.path.join(os.getcwd(), '../..'))

# Enable IPython's autoreload extension (mode 2) so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# データの用意

In [3]:
from sklearn.datasets import load_boston

# Load the Boston house-prices dataset; later cells read its 'DESCR', 'data',
# 'feature_names' and 'target' entries.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell only runs against an older pinned scikit-learn — confirm
# the environment's version before upgrading.
data = load_boston()

In [4]:
# `説明文` is Japanese for "description": pull the dataset's bundled
# descriptive text and print it in full.
説明文 = data['DESCR']
print(説明文)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [15]:
# Assemble the raw table in a single expression: one column per feature name,
# with the regression target appended as the last column, 'PRICE'.
dataset_df = (
    pd.DataFrame(data['data'], columns=data['feature_names'])
    .assign(PRICE=data['target'])
)
dataset_df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


# 特徴量へ変換

In [16]:
from domain.model.feature import FeatureDataFrame, FeatureFactory


class FeatureFactoryImpl(FeatureFactory):
    """Feature factory for this dataset: the features are every column except 'PRICE'."""

    @staticmethod
    def make(dataset_df: pd.DataFrame) -> FeatureDataFrame:
        """Build the feature table from the raw dataset frame.

        Args:
            dataset_df: Raw dataset containing the feature columns plus the
                target column 'PRICE'. It is not modified by this call.

        Returns:
            The result of ``FeatureDataFrame.of`` applied to the frame with
            the 'PRICE' column removed.

        Raises:
            KeyError: If ``dataset_df`` has no 'PRICE' column.
        """
        # drop() without inplace=True returns a new frame, so the caller's
        # DataFrame keeps its 'PRICE' column instead of being silently mutated.
        features_only = dataset_df.drop(['PRICE'], axis=1)
        return FeatureDataFrame.of(features_only)

In [17]:
# Hand the factory a copy so that dataset_df itself keeps its 'PRICE' column
# (it is read again when the teacher series is built), regardless of how
# make() treats its argument.
feature_df = FeatureFactoryImpl.make(dataset_df.copy())
feature_df.head()

Unnamed: 0_level_0,features,features,features,features,features,features,features,features,features,features,features,features,features
Unnamed: 0_level_1,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
index,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
0,0.006,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.027,0.0,7.07,0.0,0.469,6.421,78.9,4.967,2.0,242.0,17.8,396.9,9.14
2,0.027,0.0,7.07,0.0,0.469,7.185,61.1,4.967,2.0,242.0,17.8,392.83,4.03
3,0.032,0.0,2.18,0.0,0.458,6.998,45.8,6.062,3.0,222.0,18.7,394.63,2.94
4,0.069,0.0,2.18,0.0,0.458,7.147,54.2,6.062,3.0,222.0,18.7,396.9,5.33


In [22]:
from domain.model.teacher import RegressionTeacherSeries

# Wrap the 'PRICE' column as the regression target, passing dataset_df's
# index as the second argument.
# NOTE(review): presumably the index keeps the target aligned row-for-row
# with feature_df — confirm against RegressionTeacherSeries.
teacher_ser = RegressionTeacherSeries(dataset_df['PRICE'], dataset_df.index)
teacher_ser.head()

index
0    24.0
1    21.6
2    34.7
3    33.4
4    36.2
Name: RegressionTeacher, dtype: float64

# 学習

In [24]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression


# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# split identical from run to run.
X_train, X_test, Y_train, Y_test = train_test_split(
    feature_df.features,
    teacher_ser,
    test_size=0.3,
    random_state=1,
)

# Linear regression with an intercept term. fit() returns the estimator
# itself, so construction and fitting are chained into one statement.
# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0
# and removed in 1.2 — confirm the pinned version before upgrading.
model = LinearRegression(fit_intercept=True, normalize=True).fit(X_train, Y_train)
model

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)

# 評価

In [28]:
# Score the fitted model on the held-out test split and print it rounded to
# three decimals. The printed label is Japanese for "coefficient of
# determination" (R^2).
r2 = model.score(X_test, Y_test)
print("決定係数 : ", np.round(r2, 3))

決定係数 :  0.784


---
# モデルの有意性検定

In [32]:
# Display the fitted regression coefficients as a raw array.
# NOTE(review): the values presumably follow the column order of X_train —
# confirm before pairing them with feature names.
model.coef_

array([-9.71284334e-02,  6.07284394e-02,  5.96370092e-02,  2.44352809e+00,
       -2.14995617e+01,  2.78993012e+00,  3.66229013e-03, -1.51568850e+00,
        3.06819943e-01, -1.12697744e-02, -1.00652372e+00,  6.56942407e-03,
       -5.69755469e-01])

# 回帰係数の有意性検定