In [1]:
# 共通で利用するライブラリ
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from dateutil.relativedelta import relativedelta
import glob
import os
import datetime
import pickle
from natsort import natsorted
from IPython.display import display
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import warnings

# Warning suppression is intentionally left disabled; uncomment to silence warnings.
# warnings.filterwarnings('ignore')

# Print numpy arrays without scientific notation, 3 decimal places
np.set_printoptions(suppress=True, precision=3)
# Render pandas floats with 4 decimal places and never truncate columns
pd.options.display.float_format = "{:.4f}".format
pd.set_option("display.max_columns", None)

# The bundled seaborn style sheets were renamed to "seaborn-v0_8*" in
# matplotlib 3.6 and the old names were removed in 3.8, so pick whichever
# name the installed matplotlib actually provides.
plt.style.use("seaborn" if "seaborn" in plt.style.available else "seaborn-v0_8")
# Font settings must come after style.use, which would otherwise override them
plt.rcParams["font.size"] = 14
plt.rcParams["font.family"] = "IPAexGothic"
%matplotlib inline
%load_ext autoreload

# 追加するライブラリ


## Ch.10 機械学習システムのダッシュボードを作成する

- 店舗分析用のダッシュボードの追加
- 機械学習モデルの評価検証用ダッシュボードの追加

データ一覧

- store_monthly_data.csv
- ml_base_data.csv
- score.csv
- importance.csv
- report_pred_YYYYMM.xlsx

### Knock91: 単一データを読み込む

In [2]:
# Load the single-file datasets

# Resolve every data directory from the shared root folder
data_dir = "data"
store_monthly_dir, ml_base_dir, output_ml_result_dir, output_report_dir = (
    os.path.join(data_dir, sub_dir)
    for sub_dir in (
        "01_store_monthly",
        "02_ml_base",
        "10_output_ml_result",
        "11_output_report",
    )
)

# Read the store monthly data
store_monthly_file = "store_monthly_data.csv"
store_monthly_data = pd.read_csv(os.path.join(store_monthly_dir, store_monthly_file))

# Read the ML base data
ml_base_file = "ml_base_data.csv"
ml_base_data = pd.read_csv(os.path.join(ml_base_dir, ml_base_file))

# Preview the first three rows of each table
display(store_monthly_data.head(3), ml_base_data.head(3))

Unnamed: 0,store_name,order,order_fin,order_cancel,order_delivery,order_takeout,order_weekday,order_weekend,order_time_11,order_time_12,order_time_13,order_time_14,order_time_15,order_time_16,order_time_17,order_time_18,order_time_19,order_time_20,order_time_21,delta_avg,year_month
0,あきる野店,1147,945,202,841,306,844,303,91,122,112,101,95,107,106,100,108,109,96,34.1101,201904
1,さいたま南店,1504,1217,287,1105,399,1104,400,130,135,147,143,142,137,130,113,140,132,155,35.3377,201904
2,さいたま緑店,1028,847,181,756,272,756,272,95,91,106,95,102,82,90,93,95,95,84,34.2916,201904


Unnamed: 0,store_name,y_weekday,y_weekend,order,order_fin,order_cancel,order_delivery,order_takeout,order_weekday,order_weekend,order_time_11,order_time_12,order_time_13,order_time_14,order_time_15,order_time_16,order_time_17,order_time_18,order_time_19,order_time_20,order_time_21,delta_avg,year_month
0,あきる野店,1.0,0.0,1147,945,202,841,306,844,303,91,122,112,101,95,107,106,100,108,109,96,34.1101,201904
1,さいたま南店,1.0,1.0,1504,1217,287,1105,399,1104,400,130,135,147,143,142,137,130,113,140,132,155,35.3377,201904
2,さいたま緑店,1.0,1.0,1028,847,181,756,272,756,272,95,91,106,95,102,82,90,93,95,95,84,34.2916,201904


### Knock92: 更新データを読み込んで店舗別データを作成する

In [6]:
# Load the accuracy evaluation results

# os.listdir returns entries in arbitrary order and also lists stray files
# (e.g. .DS_Store), which would make read_csv fail. Keep only visible
# sub-directories and sort them naturally so the result folders are always
# processed in the same ascending order.
ml_results_dirs = natsorted(
    entry
    for entry in os.listdir(output_ml_result_dir)
    if not entry.startswith(".")
    and os.path.isdir(os.path.join(output_ml_result_dir, entry))
)

# Collect one frame per result directory, then concatenate once at the end
score_monthly_list = []
for ml_results_dir in ml_results_dirs:
    score_file_path = os.path.join(output_ml_result_dir, ml_results_dir, "score.csv")
    score_monthly = pd.read_csv(score_file_path)
    # Record which result directory each row came from
    score_monthly["dirs"] = ml_results_dir
    score_monthly_list.append(score_monthly)
score_all = pd.concat(score_monthly_list, ignore_index=True)
display(score_all)


Unnamed: 0,DataCategory,accuracy,recall,precision,f1,tn,fp,fn,tp,model_name,model_target,dirs
0,train,1.0,1.0,1.0,1.0,777,0,0,861,tree,y_weekday,result_202004
1,test,0.812,0.8184,0.8399,0.829,250,61,71,320,tree,y_weekday,result_202004
2,train,1.0,1.0,1.0,1.0,777,0,0,861,RandomForest,y_weekday,result_202004
3,test,0.792,0.8286,0.804,0.8161,232,79,67,324,RandomForest,y_weekday,result_202004
4,train,0.8651,0.9001,0.8516,0.8752,642,135,86,775,GradientBoosting,y_weekday,result_202004
5,test,0.802,0.8363,0.8134,0.8247,236,75,64,327,GradientBoosting,y_weekday,result_202004
6,train,1.0,1.0,1.0,1.0,843,0,0,795,tree,y_weekend,result_202004
7,test,0.7179,0.6974,0.7224,0.7097,262,93,105,242,tree,y_weekend,result_202004
8,train,1.0,1.0,1.0,1.0,843,0,0,795,RandomForest,y_weekend,result_202004
9,test,0.8248,0.732,0.8944,0.8051,325,30,93,254,RandomForest,y_weekend,result_202004


In [9]:
# Extract the year-month from the result directory name

# Split each directory name on "_" into separate columns; column 1 is the
# part after the first underscore (e.g. "result_202004" -> "202004").
dir_name_parts = score_all["dirs"].str.split("_", expand=True)
score_all.loc[:, "year_month"] = dir_name_parts[1]
display(score_all.head())


Unnamed: 0,DataCategory,accuracy,recall,precision,f1,tn,fp,fn,tp,model_name,model_target,dirs,year_month
0,train,1.0,1.0,1.0,1.0,777,0,0,861,tree,y_weekday,result_202004,202004
1,test,0.812,0.8184,0.8399,0.829,250,61,71,320,tree,y_weekday,result_202004,202004
2,train,1.0,1.0,1.0,1.0,777,0,0,861,RandomForest,y_weekday,result_202004,202004
3,test,0.792,0.8286,0.804,0.8161,232,79,67,324,RandomForest,y_weekday,result_202004,202004
4,train,0.8651,0.9001,0.8516,0.8752,642,135,86,775,GradientBoosting,y_weekday,result_202004,202004


In [7]:
# Inspect how each directory name splits on "_": column 0 holds the prefix
# and column 1 holds the year-month part.
score_all["dirs"].str.split(pat="_", expand=True)


Unnamed: 0,0,1
0,result,202004
1,result,202004
2,result,202004
3,result,202004
4,result,202004
5,result,202004
6,result,202004
7,result,202004
8,result,202004
9,result,202004
