# Chap.12 さまざまな教師あり学習：分類

## 12.1 ロジスティック回帰

In [2]:
# 12-1 分類木の予測結果を確率として出力する

import pandas as pd
from sklearn import tree

df = pd.read_csv('../support/datafiles/KvsT.csv')

x = df.loc[:, '体重':'年代']
t = df['派閥']

model = tree.DecisionTreeClassifier(
    max_depth=1, random_state=0
)
model.fit(x, t)

# 予測用未知データ
data = [[65, 20]]

# 予測派閥
print(model.predict(data))

# 派閥の確率
print(model.predict_proba(data))


['きのこ']
[[0.6 0.4]]




### 12.1.1 ロジスティック回帰の実装

In [3]:
# 12-2 データの読み込み

import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv('../support/datafiles/iris.csv')
df.head()

Unnamed: 0,がく片長さ,がく片幅,花弁長さ,花弁幅,種類
0,0.22,0.63,0.08,0.04,Iris-setosa
1,0.17,0.42,0.35,0.04,Iris-setosa
2,0.11,0.5,0.13,0.04,Iris-setosa
3,0.08,0.46,0.26,0.04,Iris-setosa
4,0.19,0.67,0.44,0.04,Iris-setosa


In [4]:
# 12-3 欠損値を平均値で穴埋めする
df_mean = df.mean()
train2 = df.fillna(df_mean)

# 特徴量と正解データに分割する
x = train2.loc[:, :'花弁幅']
t = train2['種類']

# 特徴量の標準化
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
new = sc.fit_transform(x)

  df_mean = df.mean()


In [7]:
# 12-4 訓練データと検証データに分割する
x_train, x_val, y_train, y_val = train_test_split(
    new, t, test_size=0.2, random_state=0
)

In [8]:
# 12-5 ロジスティック回帰による学習
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(
    C=0.1, random_state=0, multi_class='auto', solver='lbfgs'
)

In [9]:
# 12-6 正解率を確認する
model.fit(x_train, y_train)

print(model.score(x_train, y_train))
print(model.score(x_val, y_val))

0.8666666666666667
0.8333333333333334


In [10]:
# 12-7 係数を確認する
model.coef_

array([[-0.53209541,  0.48584036, -0.52629135, -0.83192326],
       [ 0.09494378, -0.44720771, -0.00110969, -0.04413366],
       [ 0.43715163, -0.03863265,  0.52740105,  0.87605692]])

In [13]:
# 切片を確認する
model.intercept_

array([-0.34434407,  0.48688238, -0.14253831])

In [11]:
# 12-8 新規データで予測する

# 新規データ
x_new = [[1, 2, 3, 4]]
# 新規データで予測
model.predict(x_new)

array(['Iris-virginica'], dtype=object)

In [12]:
# 12-9 確率の予測結果を確認する
model.predict_proba(x_new)

array([[4.03394997e-05, 3.02965489e-03, 9.96930006e-01]])

## 12.2 ランダムフォレスト

### 12.2.1 ランダムフォレストの実装

In [None]:
# 12-10 ライブラリの準備
import pandas as pd
from sklearn.model_selection import train_test_split
%matplotlib inline

In [14]:
# 12-11 Survived.csvを読み込む
df = pd.read_csv('../support/datafiles/Survived.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,male,35.0,0,0,373450,8.05,,S


In [17]:
print(df.isnull().sum())

PassengerId      0
Survived         0
Pclass           0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [18]:
# 12-12 欠損値を穴埋めする

# df['Pclass'] == 1
jo1 = df['Pclass'] == 1
jo2 = df['Survived'] == 0
jo3 = df['Age'].isnull()
df.loc[(jo1) & (jo2) & (jo3), 'Age'] = 43

jo2 = df['Survived'] == 1
df.loc[(jo1) & (jo2) & (jo3), 'Age'] = 35

# df['Pclass'] == 2
jo1 = df['Pclass'] == 2
jo2 = df['Survived'] == 0
jo3 = df['Age'].isnull()
df.loc[(jo1) & (jo2) & (jo3), 'Age'] = 26

jo2 = df['Survived'] == 1
df.loc[(jo1) & (jo2) & (jo3), 'Age'] = 20

# df['Pclass'] == 3
jo1 = df['Pclass'] == 3
jo2 = df['Survived'] == 0
jo3 = df['Age'].isnull()
df.loc[(jo1) & (jo2) & (jo3), 'Age'] = 43

jo2 = df['Survived'] == 1
df.loc[(jo1) & (jo2) & (jo3), 'Age'] = 35

In [19]:
# 12-13 文字列データの列を数値に変換する

# 特徴量として利用する列のリスト
col = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

x = df[col]
t = df['Survived']

# Sex列は文字の列なのでダミー変数化
dummy = pd.get_dummies(df['Sex'], drop_first=True)
x = pd.concat([x, dummy], axis=1)
x.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,male
0,3,22.0,1,0,7.25,1
1,1,38.0,1,0,71.2833,0
2,3,26.0,0,0,7.925,0
3,1,35.0,1,0,53.1,0
4,3,35.0,0,0,8.05,1


In [20]:
# 12-14 ランダムフォレスト

# ランダムフォレストのインポート
from sklearn.ensemble import RandomForestClassifier

x_train, x_test, y_train, y_test = train_test_split(
    x, t, test_size=0.2, random_state=0
)
model = RandomForestClassifier(
    n_estimators=200, random_state=0
)

In [21]:
# 12-15 モデルの学習
model.fit(x_train, y_train)

print(model.score(x_train, y_train))
print(model.score(x_test, y_test))

0.9887640449438202
0.8715083798882681


In [22]:
# 12-16 単純な決定木分類と比較する
from sklearn import tree

model2 = tree.DecisionTreeClassifier(random_state=0)
model2.fit(x_train, y_train)

print(model2.score(x_train, y_train))
print(model2.score(x_test, y_test))

0.9887640449438202
0.8156424581005587


In [24]:
# 12-17 特徴量の重要度を確認する

# 特徴量重要度
importance = model.feature_importances_
importance2 = model2.feature_importances_
# 列との対応がわかりやすいようにシリーズ変換
importance = pd.Series(importance, index=x_train.columns)
importance2 = pd.Series(importance2, index=x_train.columns)

pd.concat([importance, importance2], axis=1)

Unnamed: 0,0,1
Pclass,0.079546,0.095271
Age,0.323012,0.298998
SibSp,0.045682,0.065202
Parch,0.032854,0.02356
Fare,0.265573,0.215564
male,0.253334,0.301405


## 12.3 アダブースト

### 12.3.1 バギングとブースティング

### 12.3.2 アダブーストの概要

### 12.3.3 アダブーストの実装

In [25]:
# 12-18 アダブーストを実装する

# アダブーストのインポート
from sklearn.ensemble import AdaBoostClassifier
# ベースとなるモデル
from sklearn.tree import DecisionTreeClassifier

x_train, x_test, y_train, y_test = train_test_split(
    x, t, test_size=0.2, random_state=0
)

# 最大の深さ5の決定木を何個も作っていく
base_model = DecisionTreeClassifier(
    max_depth=5, random_state=0
)
# 決定木を500個作成
model = AdaBoostClassifier(
    n_estimators=500, random_state=0, base_estimator=base_model
)
# 学習
model.fit(x_train, y_train)

# 訓練データの正解率とテストデータの正解率
print(model.score(x_train, y_train))
print(model.score(x_test, y_test))


0.9887640449438202
0.8547486033519553


### 12.3.4 ランダムフォレストやアダブーストで回帰

In [26]:
# 12-19 ランダムフォレストで回帰モデルを作る

from sklearn.ensemble import RandomForestRegressor

# データの読み込み
df = pd.read_csv('../support/datafiles/cinema.csv')
df = df.fillna(df.mean())

x = df.loc[:, 'SNS1':'original']
t = df['sales']

x_train, x_test, y_train, y_test = train_test_split(
    x, t, test_size=0.2, random_state=0
)

# ランダムフォレスト回帰
# 100個のモデルで並列学習
model = RandomForestRegressor(
    n_estimators=100, random_state=0
)

model.fit(x_train, y_train)
# 決定係数
model.score(x_test, y_test)


0.5563347234627347

In [27]:
# 12-20 アダブーストで回帰モデルを作る

# アダブースト回帰
from sklearn.ensemble import AdaBoostRegressor
# ベースモデルとしての回帰木
from sklearn.tree import DecisionTreeRegressor

base = DecisionTreeRegressor(
    max_depth=3, random_state=0 
)
# 100個のモデルで逐次学習
model = AdaBoostRegressor(
    n_estimators=100, random_state=0, base_estimator=base
)

model.fit(x_train, y_train)
# 決定係数
model.score(x_test, y_test)

0.6748482902800903