In [60]:
# 共通で利用するライブラリ
from natsort import natsorted
from IPython.display import display
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import warnings

# warnings.filterwarnings('ignore')
np.set_printoptions(suppress=True, precision=3)
pd.options.display.float_format = "{:.1f}".format
pd.set_option("display.max_columns", None)
plt.style.use("seaborn")
plt.rcParams["font.size"] = 14
plt.rcParams["font.family"] = "IPAexGothic"
%matplotlib inline
%load_ext autoreload

# 追加するライブラリ
import os
import glob
from dateutil.relativedelta import relativedelta


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Ch.06 機械学習のためのデータ加工をする

- データ一覧

|No.|ファイル名|概要|
|--:|:--|:--|
|1|m_area.csv|地域マスタ。都道府県情報等。|
|2|m_store.csv|店舗マスタ。店舗名等。|
|3|tbl_order_201904.csv ~ <br> tbl_order_202003.csv|2019年度の注文データ。|

- 教師あり学習の分類（二値分類）モデル
  - 前月までのデータから、来月のオーダー数が増減を予測する

- フォルダ構成

```bash
ch06
├── ch06.ipynb
├── source
└── data
    ├── 0_input
    ├── 1_output
    └── 99_master
```

In [35]:
# フォルダ作成

data_dir = "data"
input_dir = os.path.join(data_dir, "0_input")
output_dir = os.path.join(data_dir, "1_output")
master_dir = os.path.join(data_dir, "99_master")

os.makedirs(input_dir, exist_ok=True)
os.makedirs(output_dir, exist_ok=True)
os.makedirs(master_dir, exist_ok=True)

In [36]:
# 注文データパスの取得

tbl_order_file = os.path.join(input_dir, "tbl_order_*.csv")
tbl_order_paths = glob.glob(tbl_order_file)
tbl_order_paths = natsorted(tbl_order_paths)
print(len(tbl_order_paths))
tbl_order_paths

12


['data/0_input/tbl_order_201904.csv',
 'data/0_input/tbl_order_201905.csv',
 'data/0_input/tbl_order_201906.csv',
 'data/0_input/tbl_order_201907.csv',
 'data/0_input/tbl_order_201908.csv',
 'data/0_input/tbl_order_201909.csv',
 'data/0_input/tbl_order_201910.csv',
 'data/0_input/tbl_order_201911.csv',
 'data/0_input/tbl_order_201912.csv',
 'data/0_input/tbl_order_202001.csv',
 'data/0_input/tbl_order_202002.csv',
 'data/0_input/tbl_order_202003.csv']

### Knock52: データの読み込みを行い、加工の方向性を検討する

In [37]:
# マスタデータの読み込み

m_area_file = "m_area.csv"
m_store_file = "m_store.csv"

m_area = pd.read_csv(os.path.join(master_dir, m_area_file))
m_store = pd.read_csv(os.path.join(master_dir, m_store_file))

display(m_area)
display(m_store)

Unnamed: 0,area_cd,wide_area,narrow_area
0,TK,東京,東京
1,KN,神奈川,神奈川
2,CH,千葉,千葉
3,SA,埼玉,埼玉
4,IB,北関東,茨城
5,TO,北関東,栃木
6,GU,北関東,群馬


Unnamed: 0,store_id,store_name,area_cd
0,1,昭島店,TK
1,2,あきる野店,TK
2,3,足立店,TK
3,4,北千住店,TK
4,5,綾瀬店,TK
...,...,...,...
192,193,桐生店,GU
193,194,高崎店,GU
194,195,館林店,GU
195,196,前橋店,GU


In [38]:
# 注文データの読み込み（1か月分）

tbl_order_path = tbl_order_paths[0]
print(f"読み込みデータ: {tbl_order_path}")

order_data = pd.read_csv(tbl_order_path)
print(f"データ件数: {len(order_data)}")
display(order_data)

読み込みデータ: data/0_input/tbl_order_201904.csv
データ件数: 233393


Unnamed: 0,order_id,store_id,customer_id,coupon_cd,sales_detail_id,order_accept_date,delivered_date,takeout_flag,total_amount,status
0,22808272,39,C80973292,40,22222408,2019-04-01 11:00:00,2019-04-01 11:26:00,1,2112,1
1,10902625,63,C94948343,20,79467084,2019-04-01 11:00:00,2019-04-01 11:47:00,0,2154,2
2,5990375,63,C91814442,1,61749935,2019-04-01 11:00:00,2019-04-01 11:10:00,0,3050,2
3,70546136,8,C90141025,91,75134336,2019-04-01 11:00:00,2019-04-01 11:21:00,1,4744,1
4,90362883,22,C97382733,28,52868245,2019-04-01 11:00:00,2019-04-01 11:18:00,0,3120,2
...,...,...,...,...,...,...,...,...,...,...
233388,62745703,45,C66978086,47,7606269,2019-04-30 21:58:58,2019-04-30 22:15:58,0,2732,2
233389,68418671,36,C52055004,19,6151991,2019-04-30 21:58:58,2019-04-30 22:40:58,0,2363,2
233390,63849953,151,C62174984,24,92252968,2019-04-30 21:58:58,2019-04-30 22:13:58,1,1900,9
233391,99732064,174,C80733640,4,14064483,2019-04-30 21:58:58,2019-04-30 22:52:58,1,2238,1


### Knock53: 1か月分のデータの基本的なデータ加工を実施する

In [39]:
# 基本的なデータ加工

# 店舗No.999の除外
order_data = order_data.loc[order_data["store_id"] != 999]

# オーダーデータにマスタデータを結合
order_data = pd.merge(order_data, m_store, on="store_id", how="left")
order_data = pd.merge(order_data, m_area, on="area_cd", how="left")

# テイクアウト名の追加
order_data.loc[order_data["takeout_flag"] == 0, "takeout_name"] = "デリバリー"
order_data.loc[order_data["takeout_flag"] == 1, "takeout_name"] = "お持ち帰り"

# ステイタス名の追加
order_data.loc[order_data["status"] == 0, "status_name"] = "受付"
order_data.loc[order_data["status"] == 1, "status_name"] = "お支払済"
order_data.loc[order_data["status"] == 2, "status_name"] = "お渡し済"
order_data.loc[order_data["status"] == 9, "status_name"] = "キャンセル"

display(order_data)
order_data.dtypes

Unnamed: 0,order_id,store_id,customer_id,coupon_cd,sales_detail_id,order_accept_date,delivered_date,takeout_flag,total_amount,status,store_name,area_cd,wide_area,narrow_area,takeout_name,status_name
0,22808272,39,C80973292,40,22222408,2019-04-01 11:00:00,2019-04-01 11:26:00,1,2112,1,杉並店,TK,東京,東京,お持ち帰り,お支払済
1,10902625,63,C94948343,20,79467084,2019-04-01 11:00:00,2019-04-01 11:47:00,0,2154,2,西多摩店,TK,東京,東京,デリバリー,お渡し済
2,5990375,63,C91814442,1,61749935,2019-04-01 11:00:00,2019-04-01 11:10:00,0,3050,2,西多摩店,TK,東京,東京,デリバリー,お渡し済
3,70546136,8,C90141025,91,75134336,2019-04-01 11:00:00,2019-04-01 11:21:00,1,4744,1,板橋店,TK,東京,東京,お持ち帰り,お支払済
4,90362883,22,C97382733,28,52868245,2019-04-01 11:00:00,2019-04-01 11:18:00,0,3120,2,国立店,TK,東京,東京,デリバリー,お渡し済
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
232122,62745703,45,C66978086,47,7606269,2019-04-30 21:58:58,2019-04-30 22:15:58,0,2732,2,池尻店,TK,東京,東京,デリバリー,お渡し済
232123,68418671,36,C52055004,19,6151991,2019-04-30 21:58:58,2019-04-30 22:40:58,0,2363,2,新宿店,TK,東京,東京,デリバリー,お渡し済
232124,63849953,151,C62174984,24,92252968,2019-04-30 21:58:58,2019-04-30 22:13:58,1,1900,9,八千代店,CH,千葉,千葉,お持ち帰り,キャンセル
232125,99732064,174,C80733640,4,14064483,2019-04-30 21:58:58,2019-04-30 22:52:58,1,2238,1,新座店,SA,埼玉,埼玉,お持ち帰り,お支払済


order_id              int64
store_id              int64
customer_id          object
coupon_cd             int64
sales_detail_id       int64
order_accept_date    object
delivered_date       object
takeout_flag          int64
total_amount          int64
status                int64
store_name           object
area_cd              object
wide_area            object
narrow_area          object
takeout_name         object
status_name          object
dtype: object

In [40]:
# 欠損値の確認

order_data.isna().sum()

order_id             0
store_id             0
customer_id          0
coupon_cd            0
sales_detail_id      0
order_accept_date    0
delivered_date       0
takeout_flag         0
total_amount         0
status               0
store_name           0
area_cd              0
wide_area            0
narrow_area          0
takeout_name         0
status_name          0
dtype: int64

### Knock54: 機械学習に使用する変数を作成する

In [41]:
def calc_delta(t):
    """経過時間を計算する"""
    t1, t2 = t
    delta = t2 - t1
    return delta.total_seconds() / 60

In [42]:
# ピザ提供までの時間を計算する
# delta項目の追加

order_data.loc[:, "order_accept_datetime"] = pd.to_datetime(order_data["order_accept_date"])
order_data.loc[:, "delivered_datetime"] = pd.to_datetime(order_data["delivered_date"])
order_data.loc[:, "delta"] = order_data[[
    "order_accept_datetime", "delivered_datetime"]].apply(calc_delta, axis=1)

display(order_data)

Unnamed: 0,order_id,store_id,customer_id,coupon_cd,sales_detail_id,order_accept_date,delivered_date,takeout_flag,total_amount,status,store_name,area_cd,wide_area,narrow_area,takeout_name,status_name,order_accept_datetime,delivered_datetime,delta
0,22808272,39,C80973292,40,22222408,2019-04-01 11:00:00,2019-04-01 11:26:00,1,2112,1,杉並店,TK,東京,東京,お持ち帰り,お支払済,2019-04-01 11:00:00,2019-04-01 11:26:00,26.0
1,10902625,63,C94948343,20,79467084,2019-04-01 11:00:00,2019-04-01 11:47:00,0,2154,2,西多摩店,TK,東京,東京,デリバリー,お渡し済,2019-04-01 11:00:00,2019-04-01 11:47:00,47.0
2,5990375,63,C91814442,1,61749935,2019-04-01 11:00:00,2019-04-01 11:10:00,0,3050,2,西多摩店,TK,東京,東京,デリバリー,お渡し済,2019-04-01 11:00:00,2019-04-01 11:10:00,10.0
3,70546136,8,C90141025,91,75134336,2019-04-01 11:00:00,2019-04-01 11:21:00,1,4744,1,板橋店,TK,東京,東京,お持ち帰り,お支払済,2019-04-01 11:00:00,2019-04-01 11:21:00,21.0
4,90362883,22,C97382733,28,52868245,2019-04-01 11:00:00,2019-04-01 11:18:00,0,3120,2,国立店,TK,東京,東京,デリバリー,お渡し済,2019-04-01 11:00:00,2019-04-01 11:18:00,18.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
232122,62745703,45,C66978086,47,7606269,2019-04-30 21:58:58,2019-04-30 22:15:58,0,2732,2,池尻店,TK,東京,東京,デリバリー,お渡し済,2019-04-30 21:58:58,2019-04-30 22:15:58,17.0
232123,68418671,36,C52055004,19,6151991,2019-04-30 21:58:58,2019-04-30 22:40:58,0,2363,2,新宿店,TK,東京,東京,デリバリー,お渡し済,2019-04-30 21:58:58,2019-04-30 22:40:58,42.0
232124,63849953,151,C62174984,24,92252968,2019-04-30 21:58:58,2019-04-30 22:13:58,1,1900,9,八千代店,CH,千葉,千葉,お持ち帰り,キャンセル,2019-04-30 21:58:58,2019-04-30 22:13:58,15.0
232125,99732064,174,C80733640,4,14064483,2019-04-30 21:58:58,2019-04-30 22:52:58,1,2238,1,新座店,SA,埼玉,埼玉,お持ち帰り,お支払済,2019-04-30 21:58:58,2019-04-30 22:52:58,54.0


In [43]:
# ピザが注文された時間帯、平日/休日を追加

order_data.loc[:, "order_accept_hour"] = order_data["order_accept_datetime"].dt.hour
order_data.loc[:, "order_accept_weekday"] = order_data["order_accept_datetime"].dt.weekday
order_data.loc[order_data["order_accept_weekday"] >= 5, "weekday_info"] = "休日"
order_data.loc[order_data["order_accept_weekday"] < 5, "weekday_info"] = "平日"

display(order_data)

Unnamed: 0,order_id,store_id,customer_id,coupon_cd,sales_detail_id,order_accept_date,delivered_date,takeout_flag,total_amount,status,store_name,area_cd,wide_area,narrow_area,takeout_name,status_name,order_accept_datetime,delivered_datetime,delta,order_accept_hour,order_accept_weekday,weekday_info
0,22808272,39,C80973292,40,22222408,2019-04-01 11:00:00,2019-04-01 11:26:00,1,2112,1,杉並店,TK,東京,東京,お持ち帰り,お支払済,2019-04-01 11:00:00,2019-04-01 11:26:00,26.0,11,0,平日
1,10902625,63,C94948343,20,79467084,2019-04-01 11:00:00,2019-04-01 11:47:00,0,2154,2,西多摩店,TK,東京,東京,デリバリー,お渡し済,2019-04-01 11:00:00,2019-04-01 11:47:00,47.0,11,0,平日
2,5990375,63,C91814442,1,61749935,2019-04-01 11:00:00,2019-04-01 11:10:00,0,3050,2,西多摩店,TK,東京,東京,デリバリー,お渡し済,2019-04-01 11:00:00,2019-04-01 11:10:00,10.0,11,0,平日
3,70546136,8,C90141025,91,75134336,2019-04-01 11:00:00,2019-04-01 11:21:00,1,4744,1,板橋店,TK,東京,東京,お持ち帰り,お支払済,2019-04-01 11:00:00,2019-04-01 11:21:00,21.0,11,0,平日
4,90362883,22,C97382733,28,52868245,2019-04-01 11:00:00,2019-04-01 11:18:00,0,3120,2,国立店,TK,東京,東京,デリバリー,お渡し済,2019-04-01 11:00:00,2019-04-01 11:18:00,18.0,11,0,平日
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
232122,62745703,45,C66978086,47,7606269,2019-04-30 21:58:58,2019-04-30 22:15:58,0,2732,2,池尻店,TK,東京,東京,デリバリー,お渡し済,2019-04-30 21:58:58,2019-04-30 22:15:58,17.0,21,1,平日
232123,68418671,36,C52055004,19,6151991,2019-04-30 21:58:58,2019-04-30 22:40:58,0,2363,2,新宿店,TK,東京,東京,デリバリー,お渡し済,2019-04-30 21:58:58,2019-04-30 22:40:58,42.0,21,1,平日
232124,63849953,151,C62174984,24,92252968,2019-04-30 21:58:58,2019-04-30 22:13:58,1,1900,9,八千代店,CH,千葉,千葉,お持ち帰り,キャンセル,2019-04-30 21:58:58,2019-04-30 22:13:58,15.0,21,1,平日
232125,99732064,174,C80733640,4,14064483,2019-04-30 21:58:58,2019-04-30 22:52:58,1,2238,1,新座店,SA,埼玉,埼玉,お持ち帰り,お支払済,2019-04-30 21:58:58,2019-04-30 22:52:58,54.0,21,1,平日


### Knock55: 店舗単位に集計して変数を作成する

In [47]:
# 店舗単位のオーダー数の集計

# 総オーダー数
store_data = order_data.groupby(["store_name"]).count()[["order_id"]]
# 完了しているオーダー数
store_f = order_data.loc[
    (order_data["status_name"] == "お渡し済") | (order_data["status"] == "お支払済")
].groupby(["store_name"]).count()[["order_id"]]
# キャンセル数
store_c = order_data.loc[
    order_data["status_name"] == "キャンセル"
].groupby(["store_name"]).count()[["order_id"]]
# デリバリー数
store_d = order_data.loc[
    order_data["takeout_name"] == "デリバリー"
].groupby(["store_name"]).count()[["order_id"]]
# お持ち帰り数
store_t = order_data.loc[
    order_data["takeout_name"] == "お持ち帰り"
].groupby(["store_name"]).count()[["order_id"]]
# 平日オーダー数
store_weekday = order_data.loc[
    order_data["weekday_info"] == "平日"
].groupby(["store_name"]).count()[["order_id"]]
# 休日オーダー数
store_weekend = order_data.loc[
    order_data["weekday_info"] == "休日"
].groupby(["store_name"]).count()[["order_id"]]


In [48]:
# 時間帯別オーダー数の集計

times = order_data["order_accept_hour"].unique()
store_time = []
for time in times:
    time_tmp = order_data.loc[
        order_data["order_accept_hour"] == time
    ].groupby(["store_name"]).count()[["order_id"]]
    time_tmp.columns = [f"order_time_{time}"]
    store_time.append(time_tmp)

store_time = pd.concat(store_time, axis=1)
display(store_time)


Unnamed: 0_level_0,order_time_11,order_time_12,order_time_13,order_time_14,order_time_15,order_time_16,order_time_17,order_time_18,order_time_19,order_time_20,order_time_21
store_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
あきる野店,91,122,112,101,95,107,106,100,108,109,96
さいたま南店,130,135,147,143,142,137,130,113,140,132,155
さいたま緑店,95,91,106,95,102,82,90,93,95,95,84
さいたま西店,122,101,110,117,105,112,103,112,96,108,98
つくば店,122,119,105,102,128,117,110,107,100,132,125
...,...,...,...,...,...,...,...,...,...,...,...
高津店,102,109,88,107,113,97,99,100,100,111,91
高田馬場店,92,83,78,91,88,103,109,113,105,93,99
鴻巣店,108,89,110,102,110,97,96,100,97,105,94
鶴見店,124,146,125,136,133,148,120,123,133,147,139


In [49]:
# ピザ提供までの時間の集計と集計結果を結合する

# ピザ提供までの時間の平均時間
store_delta = order_data.loc[
    order_data["status_name"] != "キャンセル"
].groupby(["store_name"]).mean()[["delta"]]

# 各データの項目設定
store_data.columns = ["order"]
store_f.columns = ["order_fin"]
store_c.columns = ["order_cancel"]
store_d.columns = ["order_delivery"]
store_t.columns = ["order_takeout"]
store_weekday.columns = ["order_weekly"]
store_weekend.columns = ["order_weekend"]
store_delta.columns = ["delta_avg"]

# 各データを結合
store_data = pd.concat([
    store_data, store_f, store_c, store_d, store_t,
    store_weekday, store_weekend, store_time, store_delta
], axis=1)

display(store_data)

Unnamed: 0_level_0,order,order_fin,order_cancel,order_delivery,order_takeout,order_weekly,order_weekend,order_time_11,order_time_12,order_time_13,order_time_14,order_time_15,order_time_16,order_time_17,order_time_18,order_time_19,order_time_20,order_time_21,delta_avg
store_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
あきる野店,1147,699,202,841,306,844,303,91,122,112,101,95,107,106,100,108,109,96,34.1
さいたま南店,1504,916,287,1105,399,1104,400,130,135,147,143,142,137,130,113,140,132,155,35.3
さいたま緑店,1028,642,181,756,272,756,272,95,91,106,95,102,82,90,93,95,95,84,34.3
さいたま西店,1184,708,204,852,332,870,314,122,101,110,117,105,112,103,112,96,108,98,34.6
つくば店,1267,785,209,928,339,936,331,122,119,105,102,128,117,110,107,100,132,125,34.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
高津店,1117,674,198,809,308,818,299,102,109,88,107,113,97,99,100,100,111,91,34.4
高田馬場店,1054,684,178,814,240,774,280,92,83,78,91,88,103,109,113,105,93,99,36.0
鴻巣店,1108,656,226,805,303,811,297,108,89,110,102,110,97,96,100,97,105,94,34.1
鶴見店,1474,904,248,1072,402,1081,393,124,146,125,136,133,148,120,123,133,147,139,34.2


### Knock56: データの加工と店舗別集計を関数で実行する

In [62]:
# 1か月分データから店舗別に集計する

def data_processing(order_data, m_store, m_area):
    """
    データの加工関数
    店舗ごとに集計する
    """

    # 店舗No.999の除外
    order_data = order_data.loc[order_data["store_id"] != 999]

    # オーダーデータにマスタデータを結合
    order_data = pd.merge(order_data, m_store, on="store_id", how="left")
    order_data = pd.merge(order_data, m_area, on="area_cd", how="left")

    # テイクアウト名の追加
    order_data.loc[order_data["takeout_flag"] == 0, "takeout_name"] = "デリバリー"
    order_data.loc[order_data["takeout_flag"] == 1, "takeout_name"] = "お持ち帰り"

    # ステイタス名の追加
    order_data.loc[order_data["status"] == 0, "status_name"] = "受付"
    order_data.loc[order_data["status"] == 1, "status_name"] = "お支払済"
    order_data.loc[order_data["status"] == 2, "status_name"] = "お渡し済"
    order_data.loc[order_data["status"] == 9, "status_name"] = "キャンセル"

    # ピザ提供までの時間
    order_data.loc[:, "order_accept_datetime"] = pd.to_datetime(
        order_data["order_accept_date"])
    order_data.loc[:, "delivered_datetime"] = pd.to_datetime(
        order_data["delivered_date"])
    order_data.loc[:, "delta"] = order_data[[
        "order_accept_datetime", "delivered_datetime"]].apply(calc_delta, axis=1)

    # ピザが注文された時間帯、平日/休日を追加
    order_data.loc[:, "order_accept_hour"] = order_data["order_accept_datetime"].dt.hour
    order_data.loc[:, "order_accept_weekday"] = order_data["order_accept_datetime"].dt.weekday
    order_data.loc[order_data["order_accept_weekday"] >= 5, "weekday_info"] = "休日"
    order_data.loc[order_data["order_accept_weekday"] < 5, "weekday_info"] = "平日"

    # 店舗単位の集計
    # 総オーダー数
    store_data = order_data.groupby(["store_name"]).count()[["order_id"]]
    # 完了しているオーダー数
    store_f = order_data.loc[
        (order_data["status_name"] == "お渡し済") | (order_data["status"] == "お支払済")
    ].groupby(["store_name"]).count()[["order_id"]]
    # キャンセル数
    store_c = order_data.loc[
        order_data["status_name"] == "キャンセル"
    ].groupby(["store_name"]).count()[["order_id"]]
    # デリバリー数
    store_d = order_data.loc[
        order_data["takeout_name"] == "デリバリー"
    ].groupby(["store_name"]).count()[["order_id"]]
    # お持ち帰り数
    store_t = order_data.loc[
        order_data["takeout_name"] == "お持ち帰り"
    ].groupby(["store_name"]).count()[["order_id"]]
    # 平日オーダー数
    store_weekday = order_data.loc[
        order_data["weekday_info"] == "平日"
    ].groupby(["store_name"]).count()[["order_id"]]
    # 休日オーダー数
    store_weekend = order_data.loc[
        order_data["weekday_info"] == "休日"
    ].groupby(["store_name"]).count()[["order_id"]]

    # 時間帯別オーダー数の集計
    times = order_data["order_accept_hour"].unique()
    store_time = []
    for time in times:
        time_tmp = order_data.loc[
            order_data["order_accept_hour"] == time
        ].groupby(["store_name"]).count()[["order_id"]]
        time_tmp.columns = [f"order_time_{time}"]
        store_time.append(time_tmp)
    store_time = pd.concat(store_time, axis=1)

    # ピザ提供までの時間の平均時間
    store_delta = order_data.loc[
        order_data["status_name"] != "キャンセル"
    ].groupby(["store_name"]).mean()[["delta"]]

    # 各データの項目設定
    store_data.columns = ["order"]
    store_f.columns = ["order_fin"]
    store_c.columns = ["order_cancel"]
    store_d.columns = ["order_delivery"]
    store_t.columns = ["order_takeout"]
    store_weekday.columns = ["order_weekday"]
    store_weekend.columns = ["order_weekend"]
    store_delta.columns = ["delta_avg"]

    # 各データを結合
    store_data = pd.concat([
        store_data, store_f, store_c, store_d, store_t,
        store_weekday, store_weekend, store_time, store_delta
    ], axis=1)

    return store_data


In [63]:
# データ加工関数の実行

tbl_order_path = tbl_order_paths[0]
print(f"読み込みデータ: {tbl_order_path}")
order_data = pd.read_csv(tbl_order_path)
store_data = data_processing(order_data, m_store, m_area)

display(store_data)

読み込みデータ: data/0_input/tbl_order_201904.csv


Unnamed: 0_level_0,order,order_fin,order_cancel,order_delivery,order_takeout,order_weekday,order_weekend,order_time_11,order_time_12,order_time_13,order_time_14,order_time_15,order_time_16,order_time_17,order_time_18,order_time_19,order_time_20,order_time_21,delta_avg
store_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
あきる野店,1147,699,202,841,306,844,303,91,122,112,101,95,107,106,100,108,109,96,34.1
さいたま南店,1504,916,287,1105,399,1104,400,130,135,147,143,142,137,130,113,140,132,155,35.3
さいたま緑店,1028,642,181,756,272,756,272,95,91,106,95,102,82,90,93,95,95,84,34.3
さいたま西店,1184,708,204,852,332,870,314,122,101,110,117,105,112,103,112,96,108,98,34.6
つくば店,1267,785,209,928,339,936,331,122,119,105,102,128,117,110,107,100,132,125,34.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
高津店,1117,674,198,809,308,818,299,102,109,88,107,113,97,99,100,100,111,91,34.4
高田馬場店,1054,684,178,814,240,774,280,92,83,78,91,88,103,109,113,105,93,99,36.0
鴻巣店,1108,656,226,805,303,811,297,108,89,110,102,110,97,96,100,97,105,94,34.1
鶴見店,1474,904,248,1072,402,1081,393,124,146,125,136,133,148,120,123,133,147,139,34.2


### Knock57: 全データの読み込みとデータ加工を行う

In [64]:
# 全データの読み込み
# ピザ提供までの時間の集計と集計結果の結合

store_all = []
for tbl_order_path in tbl_order_paths:
    print(f"読み込みデータ: {tbl_order_path}")
    # YYYYmm
    target_ym = tbl_order_path.split("_")[-1][:6]
    order_data = pd.read_csv(tbl_order_path)
    store_data = data_processing(order_data, m_store, m_area)
    store_data.loc[:, "year_month"] = target_ym
    store_data.reset_index(drop=False, inplace=True)
    store_all.append(store_data)

# データフレーム化
store_all = pd.concat(store_all, ignore_index=True)

# データの保存
store_monthly_name = "store_monthly_data.csv"
store_all.to_csv(os.path.join(output_dir, store_monthly_name), index=False)

# データ表示
display(store_all)

# 2m

読み込みデータ: data/0_input/tbl_order_201904.csv
読み込みデータ: data/0_input/tbl_order_201905.csv
読み込みデータ: data/0_input/tbl_order_201906.csv
読み込みデータ: data/0_input/tbl_order_201907.csv
読み込みデータ: data/0_input/tbl_order_201908.csv
読み込みデータ: data/0_input/tbl_order_201909.csv
読み込みデータ: data/0_input/tbl_order_201910.csv
読み込みデータ: data/0_input/tbl_order_201911.csv
読み込みデータ: data/0_input/tbl_order_201912.csv
読み込みデータ: data/0_input/tbl_order_202001.csv
読み込みデータ: data/0_input/tbl_order_202002.csv
読み込みデータ: data/0_input/tbl_order_202003.csv


Unnamed: 0,store_name,order,order_fin,order_cancel,order_delivery,order_takeout,order_weekday,order_weekend,order_time_11,order_time_12,order_time_13,order_time_14,order_time_15,order_time_16,order_time_17,order_time_18,order_time_19,order_time_20,order_time_21,delta_avg,year_month
0,あきる野店,1147,699,202,841,306,844,303,91,122,112,101,95,107,106,100,108,109,96,34.1,201904
1,さいたま南店,1504,916,287,1105,399,1104,400,130,135,147,143,142,137,130,113,140,132,155,35.3,201904
2,さいたま緑店,1028,642,181,756,272,756,272,95,91,106,95,102,82,90,93,95,95,84,34.3,201904
3,さいたま西店,1184,708,204,852,332,870,314,122,101,110,117,105,112,103,112,96,108,98,34.6,201904
4,つくば店,1267,785,209,928,339,936,331,122,119,105,102,128,117,110,107,100,132,125,34.7,201904
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2335,高津店,1155,711,220,864,291,821,334,95,120,107,113,103,108,102,91,104,116,96,34.4,202003
2336,高田馬場店,1093,612,236,791,302,777,316,91,115,114,103,87,100,99,93,97,85,109,35.0,202003
2337,鴻巣店,1153,715,202,863,290,821,332,105,107,111,119,89,108,99,106,107,101,101,34.6,202003
2338,鶴見店,1514,896,278,1093,421,1071,443,144,138,118,137,161,152,130,136,138,129,131,34.8,202003


### Knock58: 目的変数を作成する

- 目的変数
  - 前月に対してオーダー数が上がったか下がったか
- 平日・休日で別モデルを作る

In [65]:
# 1ヶ月前の日付の作成

y = store_all[["store_name", "year_month", "order_weekday", "order_weekend"]].copy()
y.loc[:, "one_month_ago"] = pd.to_datetime(y["year_month"], format="%Y%m")
y.loc[:, "one_month_ago"] = y["one_month_ago"].map(lambda x: x - relativedelta(months=1))
y.loc[:, "one_month_ago"] = y["one_month_ago"].dt.strftime("%Y%m")

display(y)


Unnamed: 0,store_name,year_month,order_weekday,order_weekend,one_month_ago
0,あきる野店,201904,844,303,201903
1,さいたま南店,201904,1104,400,201903
2,さいたま緑店,201904,756,272,201903
3,さいたま西店,201904,870,314,201903
4,つくば店,201904,936,331,201903
...,...,...,...,...,...
2335,高津店,202003,821,334,202002
2336,高田馬場店,202003,777,316,202002
2337,鴻巣店,202003,821,332,202002
2338,鶴見店,202003,1071,443,202002


In [66]:
# 1か月前のオーダー数の作成

y_one_month_ago = y.copy()
y_one_month_ago.rename(
    columns={
        "order_weekday": "order_weekday_one_month_ago",
        "order_weekend": "order_weekend_one_month_ago",
        "year_month": "year_month_for_join"
    },
    inplace=True
)
y = pd.merge(
    y,
    y_one_month_ago[[
        "store_name", "year_month_for_join",
        "order_weekday_one_month_ago", "order_weekend_one_month_ago"
    ]],
    left_on=["store_name", "one_month_ago"],
    right_on=["store_name", "year_month_for_join"],
    how="left"
)

In [67]:
y.loc[y["store_name"] == "あきる野店"]

Unnamed: 0,store_name,year_month,order_weekday,order_weekend,one_month_ago,year_month_for_join,order_weekday_one_month_ago,order_weekend_one_month_ago
0,あきる野店,201904,844,303,201903,,,
195,あきる野店,201905,883,302,201904,201904.0,844.0,303.0
390,あきる野店,201906,764,384,201905,201905.0,883.0,302.0
585,あきる野店,201907,882,308,201906,201906.0,764.0,384.0
780,あきる野店,201908,835,343,201907,201907.0,882.0,308.0
975,あきる野店,201909,802,347,201908,201908.0,835.0,343.0
1170,あきる野店,201910,880,309,201909,201909.0,802.0,347.0
1365,あきる野店,201911,796,341,201910,201910.0,880.0,309.0
1560,あきる野店,201912,844,345,201911,201911.0,796.0,341.0
1755,あきる野店,202001,881,305,201912,201912.0,844.0,345.0


In [68]:
# 欠損値の除去

y.dropna(inplace=True)

# オーダー数増減のフラグ項目作成

# オーダー数前月差プラス: 1（平日モデル）
y.loc[
    y["order_weekday"] - y["order_weekday_one_month_ago"] > 0, "y_weekday"
] = 1
# オーダー数前月差マイナス: 0（平日モデル）
y.loc[
    y["order_weekday"] - y["order_weekday_one_month_ago"] <= 0, "y_weekday"
] = 0
# オーダー数前月差プラス: 1（休日モデル）
y.loc[
    y["order_weekend"] - y["order_weekend_one_month_ago"] > 0, "y_weekend"
] = 1
# オーダー数前月差マイナス: 0（休日モデル）
y.loc[
    y["order_weekend"] - y["order_weekend_one_month_ago"] <= 0, "y_weekend"
] = 0

display(y)

Unnamed: 0,store_name,year_month,order_weekday,order_weekend,one_month_ago,year_month_for_join,order_weekday_one_month_ago,order_weekend_one_month_ago,y_weekday,y_weekend
195,あきる野店,201905,883,302,201904,201904,844.0,303.0,1.0,0.0
196,さいたま南店,201905,1152,401,201904,201904,1104.0,400.0,1.0,1.0
197,さいたま緑店,201905,796,274,201904,201904,756.0,272.0,1.0,1.0
198,さいたま西店,201905,902,312,201904,201904,870.0,314.0,1.0,0.0
199,つくば店,201905,967,338,201904,201904,936.0,331.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...
2335,高津店,202003,821,334,202002,202002,741.0,339.0,1.0,0.0
2336,高田馬場店,202003,777,316,202002,202002,709.0,318.0,1.0,0.0
2337,鴻巣店,202003,821,332,202002,202002,745.0,329.0,1.0,1.0
2338,鶴見店,202003,1071,443,202002,202002,985.0,439.0,1.0,1.0
