# ７章 機械学習モデルを構築する１０本ノック



### ノック６１：フォルダ生成をして機械学習用データを読み込もう

In [None]:
import os

# Base folder for this chapter's data, with one sub-folder for the input CSV
# and one for generated results.
data_dir = '../../downloads/ml_100knock/chapter-7/data/'
input_dir, output_dir = (
    os.path.join(data_dir, sub_dir) for sub_dir in ('0_input', '1_output')
)

# exist_ok=True keeps a re-run of this cell from failing on existing folders.
for folder in (input_dir, output_dir):
    os.makedirs(folder, exist_ok=True)

In [None]:
import pandas as pd

# Load the base table from the input folder and preview the first rows.
ml_data_file = 'ml_base_data.csv'
ml_data_path = os.path.join(input_dir, ml_data_file)
ml_data = pd.read_csv(ml_data_path)
ml_data.head(3)

### ノック６２：カテゴリカル変数の対応をしよう

In [None]:
# One-hot encode the store name; the new columns are named 'store_<value>'.
store_names = ml_data['store_name']
category_data = pd.get_dummies(store_names, prefix='store', prefix_sep='_')
display(category_data.head(3))

In [None]:
# Drop one of the dummy columns (presumably so the remaining store columns
# are not redundant with each other — confirm the intent).
category_data.drop(columns=['store_麻生店'], inplace=True)

# year_month and the original store_name are removed before the dummy
# columns are appended side by side.
for raw_col in ('year_month', 'store_name'):
    ml_data.drop(columns=[raw_col], inplace=True)
ml_data = pd.concat([ml_data, category_data], axis=1)
ml_data.columns

### ノック６３：学習データとテストデータを分割しよう

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as test data; random_state=0 makes the split repeatable.
train_data, test_data = train_test_split(ml_data, test_size=0.3, random_state=0)
print(f'Train：{len(train_data)}件/ Test:{len(test_data)}')

# For each target, report how many rows carry label 0 and label 1 in each split.
for caption, target_col in (('Weekday', 'y_weekday'), ('Weekend', 'y_weekend')):
    for split_caption, split_df in (('Train', train_data), ('Test', test_data)):
        for label in (0, 1):
            n_rows = len(split_df.loc[split_df[target_col] == label])
            print(f'{caption} {split_caption}{label}：{n_rows}件')

### ノック６４：１つのモデルを構築しよう

In [None]:
# Feature columns are every column except the two target columns.
X_cols = train_data.columns.tolist()
for y_col in ('y_weekday', 'y_weekend'):
    X_cols.remove(y_col)

# This first model is trained on the weekday target only.
target_y = 'y_weekday'
X_train, y_train = train_data[X_cols], train_data[target_y]
X_test, y_test = test_data[X_cols], test_data[target_y]

display(y_train.head(3))
display(X_train.head(3))

In [None]:
from sklearn.tree import DecisionTreeClassifier

# fit() returns the estimator itself, so the fitted tree is bound to `model`
# and then echoed as the cell output.
model = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)
model

### ノック６５：評価を実施してみよう

In [None]:
# Predict labels for both splits with the fitted model, then show the test predictions.
y_pred_train, y_pred_test = (
    model.predict(features) for features in (X_train, X_test)
)
y_pred_test

In [None]:
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix

# (true labels, predicted labels) for the train and test splits, in that order.
eval_pairs = ((y_train, y_pred_train), (y_test, y_pred_test))

acc_train, acc_test = (accuracy_score(truth, pred) for truth, pred in eval_pairs)
f1_train, f1_test = (f1_score(truth, pred) for truth, pred in eval_pairs)
recall_train, recall_test = (recall_score(truth, pred) for truth, pred in eval_pairs)
precision_train, precision_test = (precision_score(truth, pred) for truth, pred in eval_pairs)

# Print every metric rounded to two decimals, train next to test.
metric_report = (
    ('正解率', acc_train, acc_test),
    ('F値', f1_train, f1_test),
    ('再現率', recall_train, recall_test),
    ('適合率', precision_train, precision_test),
)
for caption, train_value, test_value in metric_report:
    print(f'【{caption}】Train：{round(train_value, 2)} Test：{round(test_value, 2)}')

In [None]:
# Print the confusion matrix of the train split first, then of the test split.
for truth, pred in ((y_train, y_pred_train), (y_test, y_pred_test)):
    print(confusion_matrix(truth, pred))

In [None]:
# Flatten each confusion matrix and unpack its four cells as (tn, fp, fn, tp).
cm_train = confusion_matrix(y_train, y_pred_train)
cm_test = confusion_matrix(y_test, y_pred_test)
tn_train, fp_train, fn_train, tp_train = cm_train.ravel()
tn_test, fp_test, fn_test, tp_test = cm_test.ravel()

print(f'【混同行列】Train：{tn_train}, {fp_train}, {fn_train}, {tp_train}')
print(f'【混同行列】Test：{tn_test}, {fp_test}, {fn_test}, {tp_test}')

In [None]:
# Column order of the score table; the value lists below follow the same order.
score_columns = ['DataCategory', 'acc', 'f1', 'recall', 'precision', 'tp', 'fn', 'fp', 'tn']
train_values = ['train', acc_train, f1_train, recall_train, precision_train,
                tp_train, fn_train, fp_train, tn_train]
test_values = ['test', acc_test, f1_test, recall_test, precision_test,
               tp_test, fn_test, fp_test, tn_test]

# Each split becomes a one-row frame; the two rows are stacked with a fresh index.
score_train = pd.DataFrame({col: [val] for col, val in zip(score_columns, train_values)})
score_test = pd.DataFrame({col: [val] for col, val in zip(score_columns, test_values)})
score = pd.concat([score_train, score_test], ignore_index=True)
score

### ノック６６：モデルの重要度を確認してみよう

In [None]:
# Pair every feature column with the fitted model's importance value and
# list the largest ones first.
importance = pd.DataFrame(
    {'cols': X_train.columns, 'importance': model.feature_importances_}
).sort_values('importance', ascending=False)
importance.head(10)

### ノック６７：モデル構築から評価までを関数化しよう

In [None]:
def _score_row(data_category, y_true, y_pred):
    """Return a one-row DataFrame of classification scores for one data split."""
    acc = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    # The flattened confusion matrix is unpacked as (tn, fp, fn, tp); this
    # requires exactly four cells, i.e. a two-class problem.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    return pd.DataFrame({'DataCategory': [data_category], 'acc': [acc], 'f1': [f1],
                         'recall': [recall], 'precision': [precision],
                         'tp': [tp], 'fn': [fn], 'fp': [fp], 'tn': [tn]})


def make_model_and_eval(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training data and evaluate it on both splits.

    Args:
        model: Estimator providing ``fit``, ``predict`` and, after fitting,
            a ``feature_importances_`` attribute.
        X_train: Training features; must expose ``columns``.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        A tuple ``(score, importance, model, cols)``:
        ``score`` has one row for 'train' and one for 'test' with the columns
        DataCategory, acc, f1, recall, precision, tp, fn, fp, tn;
        ``importance`` lists each feature column with its importance, sorted
        in descending order; ``model`` is the fitted estimator that was
        passed in; ``cols`` is a one-column frame ('X_cols') of feature names.

    Side effects:
        Fits ``model`` in place and shows ``score`` through ``display``.
    """
    model.fit(X_train, y_train)

    # Both splits are scored by the same helper so the two rows cannot drift apart.
    score_train = _score_row('train', y_train, model.predict(X_train))
    score_test = _score_row('test', y_test, model.predict(X_test))
    score = pd.concat([score_train, score_test], ignore_index=True)

    importance = pd.DataFrame({'cols': X_train.columns, 'importance': model.feature_importances_})
    importance = importance.sort_values('importance', ascending=False)
    cols = pd.DataFrame({'X_cols': X_train.columns})
    display(score)
    return score, importance, model, cols

In [None]:
# Rebuild the decision tree through the helper; the fitted estimator comes
# back as `model` together with its score, importance and column tables.
results = make_model_and_eval(
    DecisionTreeClassifier(random_state=0), X_train, X_test, y_train, y_test
)
score, importance, model, cols = results

### ノック６８：モデルファイルや評価結果を出力しよう

In [None]:
import datetime

# The results folder is named after the current local time at second
# resolution. With exist_ok=True, two runs within the same second share it.
now = f'{datetime.datetime.now():%Y%m%d%H%M%S}'
target_output_dir_name = f'results_{now}'
target_output_dir = os.path.join(output_dir, target_output_dir_name)
os.makedirs(target_output_dir, exist_ok=True)
print(target_output_dir)

In [None]:
import pickle

# File names of the artifacts written to the results folder.
score_name, importance_name, cols_name = 'score.csv', 'importance.csv', 'X_cols.csv'
model_nema = 'model.pickle'

score_path, importance_path, cols_path, model_path = (
    os.path.join(target_output_dir, file_name)
    for file_name in (score_name, importance_name, cols_name, model_nema)
)

# The three tables are written as CSV without the index column.
for frame, csv_path in ((score, score_path), (importance, importance_path), (cols, cols_path)):
    frame.to_csv(csv_path, index=False)

# The fitted model is serialized with pickle protocol 2.
with open(model_path, mode='wb') as f:
    pickle.dump(model, f, protocol=2)

### ノック６９：アルゴリズムを拡張して多角的な評価を実施しよう

In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Candidate algorithms, keyed by the name used in file names and result tables.
models = {
    'tree': DecisionTreeClassifier(random_state=0),
    'RandomForest': RandomForestClassifier(random_state=0),
    'GradientBoostingClassifier': GradientBoostingClassifier(random_state=0),
}

# New timestamped results folder for this comparison run.
now = f'{datetime.datetime.now():%Y%m%d%H%M%S}'
target_output_dir_name = f'results_{now}'
target_output_dir = os.path.join(output_dir, target_output_dir_name)
os.makedirs(target_output_dir, exist_ok=True)
print(target_output_dir)

score_all, importance_all = [], []
for model_name, estimator in models.items():
    print(model_name)
    score, importance, model, cols = make_model_and_eval(
        estimator, X_train, X_test, y_train, y_test
    )
    # Tag both result tables so rows stay attributable after concatenation.
    for table in (score, importance):
        table['model_name'] = model_name

    # One pickle file per algorithm.
    model_nema = f'model_{model_name}.pickle'
    model_path = os.path.join(target_output_dir, model_nema)
    with open(model_path, mode='wb') as f:
        pickle.dump(model, f, protocol=2)

    score_all.append(score)
    importance_all.append(importance)

# Collapse the per-model tables into one table each.
score_all = pd.concat(score_all, ignore_index=True)
importance_all = pd.concat(importance_all, ignore_index=True)
cols = pd.DataFrame({'X_cols': X_train.columns})

score_name, importance_name, cols_name = 'score.csv', 'importance.csv', 'X_cols.csv'
score_path, importance_path, cols_path = (
    os.path.join(target_output_dir, file_name)
    for file_name in (score_name, importance_name, cols_name)
)
for frame, csv_path in ((score_all, score_path), (importance_all, importance_path), (cols, cols_path)):
    frame.to_csv(csv_path, index=False)

### ノック７０：平日/休日モデルを一度で回せるようにしよう

In [None]:
# Every column except the two targets is used as a feature.
X_cols = train_data.columns.tolist()
for y_col in ('y_weekday', 'y_weekend'):
    X_cols.remove(y_col)
targets_y = ['y_weekday', 'y_weekend']

# New timestamped results folder shared by all target/model combinations.
now = f'{datetime.datetime.now():%Y%m%d%H%M%S}'
target_output_dir_name = f'results_{now}'
target_output_dir = os.path.join(output_dir, target_output_dir_name)
os.makedirs(target_output_dir, exist_ok=True)
print(target_output_dir)

score_all, importance_all = [], []

for target_y in targets_y:
    X_train, y_train = train_data[X_cols], train_data[target_y]
    X_test, y_test = test_data[X_cols], test_data[target_y]

    # The estimators are re-created for every target.
    models = {
        'tree': DecisionTreeClassifier(random_state=0),
        'RandomForest': RandomForestClassifier(random_state=0),
        'GradientBoosting': GradientBoostingClassifier(random_state=0),
    }

    for model_name, estimator in models.items():
        print(model_name)
        score, importance, model, cols = make_model_and_eval(
            estimator, X_train, X_test, y_train, y_test
        )
        # Tag both result tables with the algorithm and the target they belong to.
        for table in (score, importance):
            table['model_name'] = model_name
            table['model_target'] = target_y

        # One pickle file per (target, algorithm) pair.
        model_nema = f'model_{target_y}_{model_name}.pickle'
        model_path = os.path.join(target_output_dir, model_nema)
        with open(model_path, mode='wb') as f:
            pickle.dump(model, f, protocol=2)

        score_all.append(score)
        importance_all.append(importance)

# Collapse the per-run tables into one table each.
score_all = pd.concat(score_all, ignore_index=True)
importance_all = pd.concat(importance_all, ignore_index=True)
cols = pd.DataFrame({'X_cols': X_train.columns})

score_name, importance_name, cols_name = 'score.csv', 'importance.csv', 'X_cols.csv'
score_path, importance_path, cols_path = (
    os.path.join(target_output_dir, file_name)
    for file_name in (score_name, importance_name, cols_name)
)
for frame, csv_path in ((score_all, score_path), (importance_all, importance_path), (cols, cols_path)):
    frame.to_csv(csv_path, index=False)

In [None]:
# Scores of every model trained on the weekday target.
weekday_rows = score_all['model_target'] == 'y_weekday'
score_all.loc[weekday_rows]

In [None]:
# Scores of every model trained on the weekend target.
weekend_rows = score_all['model_target'] == 'y_weekend'
score_all.loc[weekend_rows]

In [None]:
# First ten importance rows of the GradientBoosting model for the weekday target.
is_weekday = importance_all['model_target'] == 'y_weekday'
is_gradient_boosting = importance_all['model_name'] == 'GradientBoosting'
importance_all.loc[is_weekday & is_gradient_boosting].head(10)