In [21]:
# 共通で利用するライブラリ
from natsort import natsorted
from IPython.display import display
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import warnings

# warnings.filterwarnings('ignore')
np.set_printoptions(suppress=True, precision=3)
pd.options.display.float_format = "{:.1f}".format
pd.set_option("display.max_columns", None)
plt.style.use("seaborn")
plt.rcParams["font.size"] = 14
plt.rcParams["font.family"] = "IPAexGothic"
%matplotlib inline
%load_ext autoreload

# 追加するライブラリ
import os
import glob


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Ch.06 機械学習のためのデータ加工をする

- データ一覧

|No.|ファイル名|概要|
|--:|:--|:--|
|1|m_area.csv|地域マスタ。都道府県情報等。|
|2|m_store.csv|店舗マスタ。店舗名等。|
|3|tbl_order_201904.csv ~ <br> tbl_order_202003.csv|2019年度の注文データ。|

- 教師あり学習の分類（二値分類）モデル
  - 前月までのデータから、来月のオーダー数が増減を予測する

- フォルダ構成

```bash
ch06
├── ch06.ipynb
├── source
└── data
    ├── 0_input
    ├── 1_output
    └── 99_master
```

In [2]:
# フォルダ作成

data_dir = "data"
input_dir = os.path.join(data_dir, "0_input")
output_dir = os.path.join(data_dir, "1_output")
master_dir = os.path.join(data_dir, "99_master")

os.makedirs(input_dir, exist_ok=True)
os.makedirs(output_dir, exist_ok=True)
os.makedirs(master_dir, exist_ok=True)

In [13]:
# 注文データパスの取得

tbl_order_file = os.path.join(input_dir, "tbl_order_*.csv")
tbl_order_paths = glob.glob(tbl_order_file)
tbl_order_paths = natsorted(tbl_order_paths)
print(len(tbl_order_paths))
tbl_order_paths

12


['data/0_input/tbl_order_201904.csv',
 'data/0_input/tbl_order_201905.csv',
 'data/0_input/tbl_order_201906.csv',
 'data/0_input/tbl_order_201907.csv',
 'data/0_input/tbl_order_201908.csv',
 'data/0_input/tbl_order_201909.csv',
 'data/0_input/tbl_order_201910.csv',
 'data/0_input/tbl_order_201911.csv',
 'data/0_input/tbl_order_201912.csv',
 'data/0_input/tbl_order_202001.csv',
 'data/0_input/tbl_order_202002.csv',
 'data/0_input/tbl_order_202003.csv']

### Knock52: データの読み込みを行い、加工の方向性を検討する

In [7]:
# マスタデータの読み込み

m_area_file = "m_area.csv"
m_store_file = "m_store.csv"

m_area = pd.read_csv(os.path.join(master_dir, m_area_file))
m_store = pd.read_csv(os.path.join(master_dir, m_store_file))

display(m_area)
display(m_store)

Unnamed: 0,area_cd,wide_area,narrow_area
0,TK,東京,東京
1,KN,神奈川,神奈川
2,CH,千葉,千葉
3,SA,埼玉,埼玉
4,IB,北関東,茨城
5,TO,北関東,栃木
6,GU,北関東,群馬


Unnamed: 0,store_id,store_name,area_cd
0,1,昭島店,TK
1,2,あきる野店,TK
2,3,足立店,TK
3,4,北千住店,TK
4,5,綾瀬店,TK
...,...,...,...
192,193,桐生店,GU
193,194,高崎店,GU
194,195,館林店,GU
195,196,前橋店,GU


In [14]:
# 注文データの読み込み（1か月分）

tbl_order_path = tbl_order_paths[0]
print(f"読み込みデータ: {tbl_order_path}")

order_data = pd.read_csv(tbl_order_path)
print(f"データ件数: {len(order_data)}")
display(order_data)

読み込みデータ: data/0_input/tbl_order_201904.csv
データ件数: 233393


Unnamed: 0,order_id,store_id,customer_id,coupon_cd,sales_detail_id,order_accept_date,delivered_date,takeout_flag,total_amount,status
0,22808272,39,C80973292,40,22222408,2019-04-01 11:00:00,2019-04-01 11:26:00,1,2112,1
1,10902625,63,C94948343,20,79467084,2019-04-01 11:00:00,2019-04-01 11:47:00,0,2154,2
2,5990375,63,C91814442,1,61749935,2019-04-01 11:00:00,2019-04-01 11:10:00,0,3050,2
3,70546136,8,C90141025,91,75134336,2019-04-01 11:00:00,2019-04-01 11:21:00,1,4744,1
4,90362883,22,C97382733,28,52868245,2019-04-01 11:00:00,2019-04-01 11:18:00,0,3120,2
...,...,...,...,...,...,...,...,...,...,...
233388,62745703,45,C66978086,47,7606269,2019-04-30 21:58:58,2019-04-30 22:15:58,0,2732,2
233389,68418671,36,C52055004,19,6151991,2019-04-30 21:58:58,2019-04-30 22:40:58,0,2363,2
233390,63849953,151,C62174984,24,92252968,2019-04-30 21:58:58,2019-04-30 22:13:58,1,1900,9
233391,99732064,174,C80733640,4,14064483,2019-04-30 21:58:58,2019-04-30 22:52:58,1,2238,1


### Knock53: 1か月分のデータの基本的なデータ加工を実施する

In [19]:
# 基本的なデータ加工

# 店舗No.999の除外
order_data = order_data.loc[order_data["store_id"] != 999]

# オーダーデータにマスタデータを結合
order_data = pd.merge(order_data, m_store, on="store_id", how="left")
order_data = pd.merge(order_data, m_area, on="area_cd", how="left")

# テイクアウト名の追加
order_data.loc[order_data["takeout_flag"] == 0, "takeout_name"] = "デリバリー"
order_data.loc[order_data["takeout_flag"] == 1, "takeout_name"] = "お持ち帰り"

# ステイタス名の追加
order_data.loc[order_data["status"] == 0, "status_name"] = "受付"
order_data.loc[order_data["status"] == 1, "status_name"] = "お支払済"
order_data.loc[order_data["status"] == 2, "status_name"] = "お渡し済"
order_data.loc[order_data["status"] == 9, "status_name"] = "キャンセル"

display(order_data)
order_data.dtypes

Unnamed: 0,order_id,store_id,customer_id,coupon_cd,sales_detail_id,order_accept_date,delivered_date,takeout_flag,total_amount,status,store_name_x,area_cd_x,wide_area_x,narrow_area_x,takeout_name,status_name,store_name_y,area_cd_y,store_name,area_cd,wide_area_y,narrow_area_y
0,22808272,39,C80973292,40,22222408,2019-04-01 11:00:00,2019-04-01 11:26:00,1,2112,1,杉並店,TK,東京,東京,お持ち帰り,お支払済,杉並店,TK,杉並店,TK,東京,東京
1,10902625,63,C94948343,20,79467084,2019-04-01 11:00:00,2019-04-01 11:47:00,0,2154,2,西多摩店,TK,東京,東京,デリバリー,お渡し済,西多摩店,TK,西多摩店,TK,東京,東京
2,5990375,63,C91814442,1,61749935,2019-04-01 11:00:00,2019-04-01 11:10:00,0,3050,2,西多摩店,TK,東京,東京,デリバリー,お渡し済,西多摩店,TK,西多摩店,TK,東京,東京
3,70546136,8,C90141025,91,75134336,2019-04-01 11:00:00,2019-04-01 11:21:00,1,4744,1,板橋店,TK,東京,東京,お持ち帰り,お支払済,板橋店,TK,板橋店,TK,東京,東京
4,90362883,22,C97382733,28,52868245,2019-04-01 11:00:00,2019-04-01 11:18:00,0,3120,2,国立店,TK,東京,東京,デリバリー,お渡し済,国立店,TK,国立店,TK,東京,東京
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
232122,62745703,45,C66978086,47,7606269,2019-04-30 21:58:58,2019-04-30 22:15:58,0,2732,2,池尻店,TK,東京,東京,デリバリー,お渡し済,池尻店,TK,池尻店,TK,東京,東京
232123,68418671,36,C52055004,19,6151991,2019-04-30 21:58:58,2019-04-30 22:40:58,0,2363,2,新宿店,TK,東京,東京,デリバリー,お渡し済,新宿店,TK,新宿店,TK,東京,東京
232124,63849953,151,C62174984,24,92252968,2019-04-30 21:58:58,2019-04-30 22:13:58,1,1900,9,八千代店,CH,千葉,千葉,お持ち帰り,キャンセル,八千代店,CH,八千代店,CH,千葉,千葉
232125,99732064,174,C80733640,4,14064483,2019-04-30 21:58:58,2019-04-30 22:52:58,1,2238,1,新座店,SA,埼玉,埼玉,お持ち帰り,お支払済,新座店,SA,新座店,SA,埼玉,埼玉


order_id              int64
store_id              int64
customer_id          object
coupon_cd             int64
sales_detail_id       int64
order_accept_date    object
delivered_date       object
takeout_flag          int64
total_amount          int64
status                int64
store_name_x         object
area_cd_x            object
wide_area_x          object
narrow_area_x        object
takeout_name         object
status_name          object
store_name_y         object
area_cd_y            object
store_name           object
area_cd              object
wide_area_y          object
narrow_area_y        object
dtype: object

In [16]:
# 欠損値の確認

order_data.isna().sum()

order_id             0
store_id             0
customer_id          0
coupon_cd            0
sales_detail_id      0
order_accept_date    0
delivered_date       0
takeout_flag         0
total_amount         0
status               0
store_name           0
area_cd              0
wide_area            0
narrow_area          0
takeout_name         0
status_name          0
dtype: int64

### Knock54: 機械学習に使用する変数を作成する

In [17]:
def calc_delta(t):
    """経過時間を計算する"""
    t1, t2 = t
    delta = t2 - t1
    return delta.total_seconds() / 60

In [22]:
# ピザ提供時間までの時間を計算する
# delta項目の追加

order_data.loc[:, "order_accept_datetime"] = pd.to_datetime(order_data["order_accept_date"])
order_data.loc[:, "delivered_datetime"] = pd.to_datetime(order_data["delivered_date"])
order_data.loc[:, "delta"] = order_data[[
    "order_accept_datetime", "delivered_datetime"]].apply(calc_delta, axis=1)

display(order_data)

Unnamed: 0,order_id,store_id,customer_id,coupon_cd,sales_detail_id,order_accept_date,delivered_date,takeout_flag,total_amount,status,store_name_x,area_cd_x,wide_area_x,narrow_area_x,takeout_name,status_name,store_name_y,area_cd_y,store_name,area_cd,wide_area_y,narrow_area_y,order_accept_datetime,delivered_datetime,delta
0,22808272,39,C80973292,40,22222408,2019-04-01 11:00:00,2019-04-01 11:26:00,1,2112,1,杉並店,TK,東京,東京,お持ち帰り,お支払済,杉並店,TK,杉並店,TK,東京,東京,2019-04-01 11:00:00,2019-04-01 11:26:00,26.0
1,10902625,63,C94948343,20,79467084,2019-04-01 11:00:00,2019-04-01 11:47:00,0,2154,2,西多摩店,TK,東京,東京,デリバリー,お渡し済,西多摩店,TK,西多摩店,TK,東京,東京,2019-04-01 11:00:00,2019-04-01 11:47:00,47.0
2,5990375,63,C91814442,1,61749935,2019-04-01 11:00:00,2019-04-01 11:10:00,0,3050,2,西多摩店,TK,東京,東京,デリバリー,お渡し済,西多摩店,TK,西多摩店,TK,東京,東京,2019-04-01 11:00:00,2019-04-01 11:10:00,10.0
3,70546136,8,C90141025,91,75134336,2019-04-01 11:00:00,2019-04-01 11:21:00,1,4744,1,板橋店,TK,東京,東京,お持ち帰り,お支払済,板橋店,TK,板橋店,TK,東京,東京,2019-04-01 11:00:00,2019-04-01 11:21:00,21.0
4,90362883,22,C97382733,28,52868245,2019-04-01 11:00:00,2019-04-01 11:18:00,0,3120,2,国立店,TK,東京,東京,デリバリー,お渡し済,国立店,TK,国立店,TK,東京,東京,2019-04-01 11:00:00,2019-04-01 11:18:00,18.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
232122,62745703,45,C66978086,47,7606269,2019-04-30 21:58:58,2019-04-30 22:15:58,0,2732,2,池尻店,TK,東京,東京,デリバリー,お渡し済,池尻店,TK,池尻店,TK,東京,東京,2019-04-30 21:58:58,2019-04-30 22:15:58,17.0
232123,68418671,36,C52055004,19,6151991,2019-04-30 21:58:58,2019-04-30 22:40:58,0,2363,2,新宿店,TK,東京,東京,デリバリー,お渡し済,新宿店,TK,新宿店,TK,東京,東京,2019-04-30 21:58:58,2019-04-30 22:40:58,42.0
232124,63849953,151,C62174984,24,92252968,2019-04-30 21:58:58,2019-04-30 22:13:58,1,1900,9,八千代店,CH,千葉,千葉,お持ち帰り,キャンセル,八千代店,CH,八千代店,CH,千葉,千葉,2019-04-30 21:58:58,2019-04-30 22:13:58,15.0
232125,99732064,174,C80733640,4,14064483,2019-04-30 21:58:58,2019-04-30 22:52:58,1,2238,1,新座店,SA,埼玉,埼玉,お持ち帰り,お支払済,新座店,SA,新座店,SA,埼玉,埼玉,2019-04-30 21:58:58,2019-04-30 22:52:58,54.0


In [23]:
# ピザが注文された時間帯、平日/休日を追加

order_data.loc[:, "order_accept_hour"] = order_data["order_accept_datetime"].dt.hour
order_data.loc[:, "order_accept_weekday"] = order_data["order_accept_datetime"].dt.weekday
order_data.loc[order_data["order_accept_weekday"] >= 5, "weekday_info"] = "休日"
order_data.loc[order_data["order_accept_weekday"] < 5, "weekday_info"] = "平日"

display(order_data)

Unnamed: 0,order_id,store_id,customer_id,coupon_cd,sales_detail_id,order_accept_date,delivered_date,takeout_flag,total_amount,status,store_name_x,area_cd_x,wide_area_x,narrow_area_x,takeout_name,status_name,store_name_y,area_cd_y,store_name,area_cd,wide_area_y,narrow_area_y,order_accept_datetime,delivered_datetime,delta,order_accept_hour,order_accept_weekday,weekday_info
0,22808272,39,C80973292,40,22222408,2019-04-01 11:00:00,2019-04-01 11:26:00,1,2112,1,杉並店,TK,東京,東京,お持ち帰り,お支払済,杉並店,TK,杉並店,TK,東京,東京,2019-04-01 11:00:00,2019-04-01 11:26:00,26.0,11,0,平日
1,10902625,63,C94948343,20,79467084,2019-04-01 11:00:00,2019-04-01 11:47:00,0,2154,2,西多摩店,TK,東京,東京,デリバリー,お渡し済,西多摩店,TK,西多摩店,TK,東京,東京,2019-04-01 11:00:00,2019-04-01 11:47:00,47.0,11,0,平日
2,5990375,63,C91814442,1,61749935,2019-04-01 11:00:00,2019-04-01 11:10:00,0,3050,2,西多摩店,TK,東京,東京,デリバリー,お渡し済,西多摩店,TK,西多摩店,TK,東京,東京,2019-04-01 11:00:00,2019-04-01 11:10:00,10.0,11,0,平日
3,70546136,8,C90141025,91,75134336,2019-04-01 11:00:00,2019-04-01 11:21:00,1,4744,1,板橋店,TK,東京,東京,お持ち帰り,お支払済,板橋店,TK,板橋店,TK,東京,東京,2019-04-01 11:00:00,2019-04-01 11:21:00,21.0,11,0,平日
4,90362883,22,C97382733,28,52868245,2019-04-01 11:00:00,2019-04-01 11:18:00,0,3120,2,国立店,TK,東京,東京,デリバリー,お渡し済,国立店,TK,国立店,TK,東京,東京,2019-04-01 11:00:00,2019-04-01 11:18:00,18.0,11,0,平日
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
232122,62745703,45,C66978086,47,7606269,2019-04-30 21:58:58,2019-04-30 22:15:58,0,2732,2,池尻店,TK,東京,東京,デリバリー,お渡し済,池尻店,TK,池尻店,TK,東京,東京,2019-04-30 21:58:58,2019-04-30 22:15:58,17.0,21,1,平日
232123,68418671,36,C52055004,19,6151991,2019-04-30 21:58:58,2019-04-30 22:40:58,0,2363,2,新宿店,TK,東京,東京,デリバリー,お渡し済,新宿店,TK,新宿店,TK,東京,東京,2019-04-30 21:58:58,2019-04-30 22:40:58,42.0,21,1,平日
232124,63849953,151,C62174984,24,92252968,2019-04-30 21:58:58,2019-04-30 22:13:58,1,1900,9,八千代店,CH,千葉,千葉,お持ち帰り,キャンセル,八千代店,CH,八千代店,CH,千葉,千葉,2019-04-30 21:58:58,2019-04-30 22:13:58,15.0,21,1,平日
232125,99732064,174,C80733640,4,14064483,2019-04-30 21:58:58,2019-04-30 22:52:58,1,2238,1,新座店,SA,埼玉,埼玉,お持ち帰り,お支払済,新座店,SA,新座店,SA,埼玉,埼玉,2019-04-30 21:58:58,2019-04-30 22:52:58,54.0,21,1,平日
