## モデルの評価

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, SGDClassifier
from sklearn.metrics import mean_squared_error, mean_absolute_error # evaluation metrics for regression problems
from  sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix # evaluation metrics for classification problems

### 回帰問題
- 回帰問題で用いる指標は以下の3つ
    - MSE（Mean Squared Error）：平均二乗誤差
    - RMSE（Root Mean Squared Error）：二乗平均平方根誤差
    - MAE（Mean Absolute Error）：平均絶対誤差

In [2]:
# Sample housing data: each column is given as a list with one value per
# property, and the row index is labelled "id".
df_house = pd.DataFrame(
    dict(
        Price=[24.8, 59.5, 7, 7.5, 9.8, 13.5, 14.9, 27, 27, 28, 28.5, 23, 12.9, 18, 23.7, 29.8, 17.8, 5.5, 8.7, 10.3, 14.5, 17.6, 16.8],
        AreaSize=[98.4, 379.8, 58.6, 61.5, 99.6, 76.2, 115.7, 165.2, 215.2, 157.8, 212.9, 137.8, 87.2, 139.6, 172.6, 151.9, 179.5, 50, 105, 132, 174, 176, 168.7],
        HouseSize=[74.2, 163.7, 50.5, 58, 66.4, 66.2, 59.6, 98.6, 87.4, 116.9, 96.9, 82.8, 75.1, 77.9, 125, 85.6, 70.1, 48.7, 66.5, 51.9, 82.3, 86.1, 80.8],
        PassedYear=[4.8, 9.3, 13, 12.8, 14, 6, 14.7, 13.6, 13.3, 6.7, 3.1, 10.3, 11.6, 10.5, 3.8, 5.4, 4.5, 14.6, 13.7, 13, 10.3, 4.4, 12.8],
        Train=[5, 12, 16, 16, 16, 16, 16, 16, 16, 16, 16, 19, 23, 23, 23, 28, 32, 37, 37, 37, 37, 37, 41],
        Walk=[6, 12, 2, 1, 5, 1, 4, 2, 7, 6, 5, 20, 8, 3, 5, 4, 2, 3, 11, 6, 18, 10, 2],
    )
).rename_axis("id")

In [3]:
# Linear regression: fit Price (target) on the two features AreaSize and
# Train, with an intercept term.
X = df_house[["AreaSize", "Train"]].to_numpy()
y = df_house["Price"].to_numpy()
regr = LinearRegression(fit_intercept=True)
regr.fit(X, y)

In [4]:
# Predict with the fitted regressor on the same X it was fitted on.
y_pred = regr.predict(X)

# Compute the three regression metrics first:
#   MSE  - mean squared error
#   MAE  - mean absolute error
#   RMSE - square root of the MSE
mse = mean_squared_error(y, y_pred)
mae = mean_absolute_error(y, y_pred)
rmse = np.sqrt(mse)

# Then report them, each rounded to three decimal places.
print("MSE = %s" % round(mse, 3))
print("MAE = %s" % round(mae, 3))
print("RMSE = %s" % round(rmse, 3))

MSE = 14.885
MAE = 3.057
RMSE = 3.858


### 分類問題
- 分類問題で用いる指標
  - Accuracy
  - Precision
  - Recall
  - F1-score

In [5]:
# Replace Price with the binary classification label: True when the price is
# 20 or more (the "2000万" threshold of this notebook), False otherwise.
# The column is overwritten in place, so binarize only while it is still
# numeric: re-running the cell on the already-boolean column would compare
# True/False against 20 and silently turn every label into False.
if not pd.api.types.is_bool_dtype(df_house['Price']):
    df_house['Price'] = df_house['Price'] >= 20

In [6]:
# Logistic regression trained with stochastic gradient descent, without a
# regularization penalty.
# NOTE: `loss='log_loss'` (scikit-learn >= 1.1) replaces the old `loss='log'`
# alias, which was removed in scikit-learn 1.3, and `penalty=None` replaces
# the string 'none', which recent releases reject during parameter validation.
y = df_house["Price"].values
X = df_house[["AreaSize", "Train"]].values
clf = SGDClassifier(loss='log_loss', penalty=None, max_iter=10000, fit_intercept=True, random_state=1234, tol=1e-3)
clf.fit(X, y)



In [7]:
# Predict class labels with the fitted classifier on the same X.
y_pred = clf.predict(X)

# Per-class precision, recall and F1-score; the fourth returned value is
# discarded.
precision, recall, f1_score, _ = precision_recall_fscore_support(y, y_pred)

# Overall accuracy.
accuracy = accuracy_score(y, y_pred)

# Report everything as percentages with three decimals. Index 1 selects the
# second class, i.e. the "2000万以上" (True) label.
print('正答率（Accuracy） = {:.3f}%'.format(accuracy * 100))
print('適合率（Precision） = {:.3f}%'.format(precision[1] * 100))
print('再現率（Recall） = {:.3f}%'.format(recall[1] * 100))
print('F1値（F1-score） = {:.3f}%'.format(f1_score[1] * 100))

正答率（Accuracy） = 78.261%
適合率（Precision） = 100.000%
再現率（Recall） = 44.444%
F1値（F1-score） = 61.538%


In [8]:
# Confusion matrix: cross-tabulation of true labels (rows) against predicted
# labels (columns), wrapped in a labelled DataFrame for display.
conf_mat = pd.DataFrame(
    confusion_matrix(y, y_pred),
    index=['正解 = 2000万未満', '正解 = 2000万以上'],
    columns=['予測 = 2000万未満', '予測 = 2000万以上'],
)
conf_mat

Unnamed: 0,予測 = 2000万未満,予測 = 2000万以上
正解 = 2000万未満,14,0
正解 = 2000万以上,5,4


In [9]:
# Confusion matrix again, this time after casting both the true and the
# predicted labels to int; rows are true labels, columns are predictions.
conf_mat = pd.DataFrame(
    data=confusion_matrix(y.astype(int), y_pred.astype(int)),
    index=['正解 = 2000万未満', '正解 = 2000万以上'],
    columns=['予測 = 2000万未満', '予測 = 2000万以上'],
)
conf_mat


Unnamed: 0,予測 = 2000万未満,予測 = 2000万以上
正解 = 2000万未満,14,0
正解 = 2000万以上,5,4
