In [46]:
# 共通で利用するライブラリ
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display

# Silence every warning so cell output stays compact.
# NOTE(review): this also hides pandas/matplotlib deprecation warnings — consider narrowing the filter.
warnings.filterwarnings('ignore')
# NumPy: no scientific notation, three decimal places
np.set_printoptions(suppress=True, precision=3)
# pandas: floats rendered with three decimals, never truncate the column list
pd.options.display.float_format = '{:.3f}'.format
pd.set_option("display.max_columns", None)
# Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8*" and later
# releases dropped the old names, so use whichever name this installation ships.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
plt.rcParams["font.size"] = 14
# IPAexGothic is requested so that Japanese labels render (the font must be installed)
plt.rcParams['font.family'] = 'IPAexGothic'

# 追加するライブラリ
import os
import glob
from natsort import natsorted

## Ch.01 分析に向けた準備を行う
## <span style='color:Yellow'> Ch.01 分析に向けた準備を行う </span>
## <span style='background:yellow'> Ch.01 分析に向けた準備を行う </span>
- マスタデータ
  - 値が一意になる固定的なデータ
- トランザクションデータ
  - 流動的なデータ

前提条件

- m_area.csv
  - 地域マスタ
  - 都道府県情報等
- m_store.csv
  - 店舗マスタ
  - 店舗名等
- tbl_order_202004.csv
  - 注文データ
  - 4月分
- tbl_order_202005.csv
  - 注文データ
  - 5月分
- tbl_order_202006.csv
  - 注文データ
  - 6月分

「顧客マスタ」、「注文詳細データ」は除外

### Knock01: データをすべて読み込む

In [47]:
# Load the store master (m_store.csv), then show its shape and contents

store_master_path = "../support/本章/1章/m_store.csv"
m_store = pd.read_csv(store_master_path)
print(m_store.shape)
display(m_store)

(197, 3)


Unnamed: 0,store_id,store_name,area_cd
0,1,昭島店,TK
1,2,あきる野店,TK
2,3,足立店,TK
3,4,北千住店,TK
4,5,綾瀬店,TK
...,...,...,...
192,193,桐生店,GU
193,194,高崎店,GU
194,195,館林店,GU
195,196,前橋店,GU


In [48]:
# Load the area master (m_area.csv), then show its shape and contents

area_master_path = "../support/本章/1章/m_area.csv"
m_area = pd.read_csv(area_master_path)
print(m_area.shape)
display(m_area)

(7, 3)


Unnamed: 0,area_cd,wide_area,narrow_area
0,TK,東京,東京
1,KN,神奈川,神奈川
2,CH,千葉,千葉
3,SA,埼玉,埼玉
4,IB,北関東,茨城
5,TO,北関東,栃木
6,GU,北関東,群馬


In [49]:
# Load the three monthly order CSVs (2020-04, 2020-05, 2020-06)

tbl_order_4 = pd.read_csv("../support/本章/1章/tbl_order_202004.csv")
tbl_order_5 = pd.read_csv("../support/本章/1章/tbl_order_202005.csv")
tbl_order_6 = pd.read_csv("../support/本章/1章/tbl_order_202006.csv")

# Print the shape and render each monthly table, in month order
for monthly_orders in (tbl_order_4, tbl_order_5, tbl_order_6):
    print(monthly_orders.shape)
    display(monthly_orders)


(233260, 10)


Unnamed: 0,order_id,store_id,customer_id,coupon_cd,sales_detail_id,order_accept_date,delivered_date,takeout_flag,total_amount,status
0,79339111,49,C26387220,50,67393872,2020-04-01 11:00:00,2020-04-01 11:18:00,1,4144,1
1,18941733,85,C48773811,26,91834983,2020-04-01 11:00:00,2020-04-01 11:22:00,0,2877,2
2,56217880,76,C66287421,36,64409634,2020-04-01 11:00:00,2020-04-01 11:15:00,0,2603,2
3,28447783,190,C41156423,19,73032165,2020-04-01 11:00:00,2020-04-01 11:16:00,0,2732,2
4,32576156,191,C54568117,71,23281182,2020-04-01 11:00:00,2020-04-01 11:53:00,0,2987,2
...,...,...,...,...,...,...,...,...,...,...
233255,25546308,3,C24617924,46,3700904,2020-04-30 21:58:58,2020-04-30 22:19:58,0,2615,2
233256,91693481,45,C68997855,6,37852755,2020-04-30 21:58:58,2020-04-30 22:13:58,0,2732,2
233257,57908119,36,C16123702,81,90372382,2020-04-30 21:58:58,2020-04-30 22:47:58,0,2363,2
233258,7145625,174,C80733640,75,48265986,2020-04-30 21:58:58,2020-04-30 22:32:58,0,2238,2


(241139, 10)


Unnamed: 0,order_id,store_id,customer_id,coupon_cd,sales_detail_id,order_accept_date,delivered_date,takeout_flag,total_amount,status
0,24742472,22,C90731046,22,80308413,2020-05-01 11:00:00,2020-05-01 11:22:00,1,2328,1
1,11342919,56,C68678177,12,35725624,2020-05-01 11:00:00,2020-05-01 11:50:00,0,2987,2
2,76451864,56,C79630016,46,2609869,2020-05-01 11:00:00,2020-05-01 11:32:00,0,2603,2
3,28392539,192,C83895707,80,17111162,2020-05-01 11:00:00,2020-05-01 11:19:00,0,2603,2
4,71706591,121,C36584528,93,22800835,2020-05-01 11:00:00,2020-05-01 11:17:00,0,2877,2
...,...,...,...,...,...,...,...,...,...,...
241134,47039360,154,C39765066,90,27121351,2020-05-31 21:58:58,2020-05-31 22:42:58,1,2363,1
241135,61054188,94,C19465010,41,61513298,2020-05-31 21:58:58,2020-05-31 22:40:58,0,2112,2
241136,45288914,11,C23883838,40,65359139,2020-05-31 21:58:58,2020-05-31 22:38:58,0,3838,2
241137,41608372,149,C48350551,43,42138977,2020-05-31 21:58:58,2020-05-31 22:52:58,0,1899,2


(233301, 10)


Unnamed: 0,order_id,store_id,customer_id,coupon_cd,sales_detail_id,order_accept_date,delivered_date,takeout_flag,total_amount,status
0,16514392,64,C44271355,49,56527968,2020-06-01 11:00:00,2020-06-01 11:55:00,0,2732,2
1,28342394,12,C98307866,98,46924304,2020-06-01 11:00:00,2020-06-01 11:52:00,0,2064,2
2,29799587,174,C59484037,45,47054474,2020-06-01 11:00:00,2020-06-01 11:25:00,0,3900,2
3,96960753,23,C57278332,70,60848267,2020-06-01 11:00:00,2020-06-01 11:45:00,0,2750,2
4,52147209,7,C36890849,80,20633593,2020-06-01 11:00:00,2020-06-01 11:23:00,0,2328,2
...,...,...,...,...,...,...,...,...,...,...
233296,90872494,191,C35992970,46,51884378,2020-06-30 21:58:58,2020-06-30 22:43:58,1,2112,1
233297,30167637,35,C53126526,88,6295273,2020-06-30 21:58:58,2020-06-30 22:36:58,0,4462,2
233298,85345862,118,C25099070,32,15733308,2020-06-30 21:58:58,2020-06-30 22:42:58,0,3865,2
233299,73038887,100,C27421314,84,67608099,2020-06-30 21:58:58,2020-06-30 22:21:58,0,3319,2


- 分析する際は、なるべく粒度の細かいデータを基準にする


- 注文データをベースに考える場合
  - 月ごとに分割された「tbl_order」を縦に結合する「ユニオン」
  - 「tbl_order」に「m_store」と「m_area」を横に結合する「ジョイン」

### Knock02: データを結合（ユニオン）する

In [50]:
# Stack the April and May order tables vertically (union)

april_may_frames = [tbl_order_4, tbl_order_5]
order_all = pd.concat(april_may_frames, axis=0, ignore_index=True)
print(order_all.shape)
display(order_all)
# Sanity check: the combined row count equals the sum of the inputs' row counts
len(order_all) == sum(len(frame) for frame in april_may_frames)

(474399, 10)


Unnamed: 0,order_id,store_id,customer_id,coupon_cd,sales_detail_id,order_accept_date,delivered_date,takeout_flag,total_amount,status
0,79339111,49,C26387220,50,67393872,2020-04-01 11:00:00,2020-04-01 11:18:00,1,4144,1
1,18941733,85,C48773811,26,91834983,2020-04-01 11:00:00,2020-04-01 11:22:00,0,2877,2
2,56217880,76,C66287421,36,64409634,2020-04-01 11:00:00,2020-04-01 11:15:00,0,2603,2
3,28447783,190,C41156423,19,73032165,2020-04-01 11:00:00,2020-04-01 11:16:00,0,2732,2
4,32576156,191,C54568117,71,23281182,2020-04-01 11:00:00,2020-04-01 11:53:00,0,2987,2
...,...,...,...,...,...,...,...,...,...,...
474394,47039360,154,C39765066,90,27121351,2020-05-31 21:58:58,2020-05-31 22:42:58,1,2363,1
474395,61054188,94,C19465010,41,61513298,2020-05-31 21:58:58,2020-05-31 22:40:58,0,2112,2
474396,45288914,11,C23883838,40,65359139,2020-05-31 21:58:58,2020-05-31 22:38:58,0,3838,2
474397,41608372,149,C48350551,43,42138977,2020-05-31 21:58:58,2020-05-31 22:52:58,0,1899,2


True

### Knock03: フォルダ内のファイル名を一覧化する

In [51]:
# Capture the kernel's current working directory and show it

current_dir = os.getcwd()
current_dir

'/Users/takeru/Library/CloudStorage/OneDrive-個人用/Learn/Python/python-practical-machine-learning-system-100knocks/ch01'

In [52]:
# List the entries of the current working directory

os.listdir(current_dir)

['ch01.ipynb']

In [53]:
# Build the glob pattern that matches every monthly order CSV.
# os.path.join only does something when given more than one component, so the
# directory and the file-name pattern are passed separately.

order_dir = "../support/本章/1章"
tbl_order_file = os.path.join(order_dir, "tbl_order_*.csv")
tbl_order_file

'../support/本章/1章/tbl_order_*.csv'

In [54]:
# Expand the pattern into the list of matching order-file paths and show it

tbl_order_files = glob.glob(tbl_order_file)
tbl_order_files

['../support/本章/1章/tbl_order_202006.csv',
 '../support/本章/1章/tbl_order_202004.csv',
 '../support/本章/1章/tbl_order_202005.csv']

In [55]:
# glob does not guarantee any ordering, so re-sort the paths into natural
# (file-number) order with natsorted before they are read one by one

tbl_order_files = natsorted(tbl_order_files)
tbl_order_files

['../support/本章/1章/tbl_order_202004.csv',
 '../support/本章/1章/tbl_order_202005.csv',
 '../support/本章/1章/tbl_order_202006.csv']

### Knock04: 複数データを結合（ユニオン）する

In [56]:
# Dry run of the union step using only the first file of the sorted list

file = tbl_order_files[0]
order_data = pd.read_csv(file)
row_count = len(order_data)
print(f"{file}: {row_count}行")
# Append the file's rows to an initially empty table, renumbering the index
order_all = pd.concat([pd.DataFrame(), order_data], axis=0, ignore_index=True)
display(order_all)

../support/本章/1章/tbl_order_202004.csv: 233260行


Unnamed: 0,order_id,store_id,customer_id,coupon_cd,sales_detail_id,order_accept_date,delivered_date,takeout_flag,total_amount,status
0,79339111,49,C26387220,50,67393872,2020-04-01 11:00:00,2020-04-01 11:18:00,1,4144,1
1,18941733,85,C48773811,26,91834983,2020-04-01 11:00:00,2020-04-01 11:22:00,0,2877,2
2,56217880,76,C66287421,36,64409634,2020-04-01 11:00:00,2020-04-01 11:15:00,0,2603,2
3,28447783,190,C41156423,19,73032165,2020-04-01 11:00:00,2020-04-01 11:16:00,0,2732,2
4,32576156,191,C54568117,71,23281182,2020-04-01 11:00:00,2020-04-01 11:53:00,0,2987,2
...,...,...,...,...,...,...,...,...,...,...
233255,25546308,3,C24617924,46,3700904,2020-04-30 21:58:58,2020-04-30 22:19:58,0,2615,2
233256,91693481,45,C68997855,6,37852755,2020-04-30 21:58:58,2020-04-30 22:13:58,0,2732,2
233257,57908119,36,C16123702,81,90372382,2020-04-30 21:58:58,2020-04-30 22:47:58,0,2363,2
233258,7145625,174,C80733640,75,48265986,2020-04-30 21:58:58,2020-04-30 22:32:58,0,2238,2


In [57]:
# Union every monthly order file into one table.
# The frames are collected in a list and concatenated once: calling pd.concat
# inside the loop re-copies all previously accumulated rows on every iteration.

monthly_frames = []
row_num = 0
for file in tbl_order_files:
    order_data = pd.read_csv(file)
    row_num += len(order_data)
    print(f"{file}: {len(order_data)} 行")
    monthly_frames.append(order_data)

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# when no file was found.
order_all = (
    pd.concat(monthly_frames, axis=0, ignore_index=True)
    if monthly_frames
    else pd.DataFrame()
)

# Sanity check: the combined row count must equal the sum of the per-file counts
print(len(order_all) == row_num)
display(order_all)

../support/本章/1章/tbl_order_202004.csv: 233260 行
../support/本章/1章/tbl_order_202005.csv: 241139 行
../support/本章/1章/tbl_order_202006.csv: 233301 行
True


Unnamed: 0,order_id,store_id,customer_id,coupon_cd,sales_detail_id,order_accept_date,delivered_date,takeout_flag,total_amount,status
0,79339111,49,C26387220,50,67393872,2020-04-01 11:00:00,2020-04-01 11:18:00,1,4144,1
1,18941733,85,C48773811,26,91834983,2020-04-01 11:00:00,2020-04-01 11:22:00,0,2877,2
2,56217880,76,C66287421,36,64409634,2020-04-01 11:00:00,2020-04-01 11:15:00,0,2603,2
3,28447783,190,C41156423,19,73032165,2020-04-01 11:00:00,2020-04-01 11:16:00,0,2732,2
4,32576156,191,C54568117,71,23281182,2020-04-01 11:00:00,2020-04-01 11:53:00,0,2987,2
...,...,...,...,...,...,...,...,...,...,...
707695,90872494,191,C35992970,46,51884378,2020-06-30 21:58:58,2020-06-30 22:43:58,1,2112,1
707696,30167637,35,C53126526,88,6295273,2020-06-30 21:58:58,2020-06-30 22:36:58,0,4462,2
707697,85345862,118,C25099070,32,15733308,2020-06-30 21:58:58,2020-06-30 22:42:58,0,3865,2
707698,73038887,100,C27421314,84,67608099,2020-06-30 21:58:58,2020-06-30 22:21:58,0,3319,2


### Knock05: データの統計量を確認する

In [58]:
# Count missing values per column of the combined order table

order_all.isnull().sum()

order_id             0
store_id             0
customer_id          0
coupon_cd            0
sales_detail_id      0
order_accept_date    0
delivered_date       0
takeout_flag         0
total_amount         0
status               0
dtype: int64

In [59]:
# Summary statistics of the combined order table's numeric columns

order_all.describe()

Unnamed: 0,order_id,store_id,coupon_cd,sales_detail_id,takeout_flag,total_amount,status
count,707700.0,707700.0,707700.0,707700.0,707700.0,707700.0,707700.0
mean,50009886.448,103.935,49.514,50003347.665,0.26,2960.652,3.083
std,28892257.233,86.374,28.883,28855751.054,0.439,954.379,2.836
min,22.0,1.0,0.0,46.0,0.0,698.0,1.0
25%,25012471.75,51.0,25.0,25045019.0,0.0,2308.0,2.0
50%,49996549.0,99.0,49.0,50045057.5,0.0,2808.0,2.0
75%,75080144.75,148.0,75.0,74997067.5,1.0,3617.0,2.0
max,99999799.0,999.0,99.0,99999647.0,1.0,5100.0,9.0


In [60]:
# Summary statistics of the total_amount column only

order_all["total_amount"].describe()

count   707700.000
mean      2960.652
std        954.379
min        698.000
25%       2308.000
50%       2808.000
75%       3617.000
max       5100.000
Name: total_amount, dtype: float64

In [61]:
# Print the earliest and latest value of each date column
# (order acceptance first, then delivery)

for date_col in ("order_accept_date", "delivered_date"):
    print(order_all[date_col].min())
    print(order_all[date_col].max())


2020-04-01 11:00:00
2020-06-30 21:58:58
2020-04-01 11:10:00
2020-06-30 22:55:56


### Knock06: 不要なデータを除外する

In [62]:
# Drop the rows whose store_id is 999 and keep everything else

keep_mask = order_all["store_id"].ne(999)
order_data = order_all.loc[keep_mask]
display(order_data)

Unnamed: 0,order_id,store_id,customer_id,coupon_cd,sales_detail_id,order_accept_date,delivered_date,takeout_flag,total_amount,status
0,79339111,49,C26387220,50,67393872,2020-04-01 11:00:00,2020-04-01 11:18:00,1,4144,1
1,18941733,85,C48773811,26,91834983,2020-04-01 11:00:00,2020-04-01 11:22:00,0,2877,2
2,56217880,76,C66287421,36,64409634,2020-04-01 11:00:00,2020-04-01 11:15:00,0,2603,2
3,28447783,190,C41156423,19,73032165,2020-04-01 11:00:00,2020-04-01 11:16:00,0,2732,2
4,32576156,191,C54568117,71,23281182,2020-04-01 11:00:00,2020-04-01 11:53:00,0,2987,2
...,...,...,...,...,...,...,...,...,...,...
707695,90872494,191,C35992970,46,51884378,2020-06-30 21:58:58,2020-06-30 22:43:58,1,2112,1
707696,30167637,35,C53126526,88,6295273,2020-06-30 21:58:58,2020-06-30 22:36:58,0,4462,2
707697,85345862,118,C25099070,32,15733308,2020-06-30 21:58:58,2020-06-30 22:42:58,0,3865,2
707698,73038887,100,C27421314,84,67608099,2020-06-30 21:58:58,2020-06-30 22:21:58,0,3319,2


In [63]:
# Summary statistics of the filtered order table's numeric columns

order_data.describe()

Unnamed: 0,order_id,store_id,coupon_cd,sales_detail_id,takeout_flag,total_amount,status
count,703880.0,703880.0,703880.0,703880.0,703880.0,703880.0,703880.0
mean,50016732.753,99.077,49.516,50000035.672,0.26,2959.949,3.083
std,28893389.525,55.943,28.883,28854636.667,0.439,954.367,2.836
min,22.0,1.0,0.0,46.0,0.0,698.0,1.0
25%,25015701.5,51.0,25.0,25040344.75,0.0,2308.0,2.0
50%,50009303.0,99.0,49.0,50042187.5,0.0,2808.0,2.0
75%,75086124.25,147.0,75.0,74991725.0,1.0,3617.0,2.0
max,99999799.0,196.0,99.0,99999647.0,1.0,5100.0,9.0


In [64]:
# Summary statistics of total_amount in the filtered order table

order_data["total_amount"].describe()

count   703880.000
mean      2959.949
std        954.367
min        698.000
25%       2308.000
50%       2808.000
75%       3617.000
max       5100.000
Name: total_amount, dtype: float64

In [65]:
# Print the earliest and latest value of each date column in the filtered table
# (order acceptance first, then delivery)

for date_col in ("order_accept_date", "delivered_date"):
    print(order_data[date_col].min())
    print(order_data[date_col].max())

2020-04-01 11:00:00
2020-06-30 21:58:58
2020-04-01 11:10:00
2020-06-30 22:55:56


### Knock07: マスタデータを結合（ジョイン）する

In [66]:
# Left-join the store master onto the orders by store_id.
# validate="many_to_one" makes pandas raise MergeError if store_id is not unique
# in m_store; without it a duplicated master key would silently multiply order rows.

order_data = pd.merge(
    order_data, m_store,
    on="store_id",
    how="left",
    validate="many_to_one",
)
display(order_data)

Unnamed: 0,order_id,store_id,customer_id,coupon_cd,sales_detail_id,order_accept_date,delivered_date,takeout_flag,total_amount,status,store_name,area_cd
0,79339111,49,C26387220,50,67393872,2020-04-01 11:00:00,2020-04-01 11:18:00,1,4144,1,浅草店,TK
1,18941733,85,C48773811,26,91834983,2020-04-01 11:00:00,2020-04-01 11:22:00,0,2877,2,目黒店,TK
2,56217880,76,C66287421,36,64409634,2020-04-01 11:00:00,2020-04-01 11:15:00,0,2603,2,本郷店,TK
3,28447783,190,C41156423,19,73032165,2020-04-01 11:00:00,2020-04-01 11:16:00,0,2732,2,栃木店,TO
4,32576156,191,C54568117,71,23281182,2020-04-01 11:00:00,2020-04-01 11:53:00,0,2987,2,伊勢崎店,GU
...,...,...,...,...,...,...,...,...,...,...,...,...
703875,90872494,191,C35992970,46,51884378,2020-06-30 21:58:58,2020-06-30 22:43:58,1,2112,1,伊勢崎店,GU
703876,30167637,35,C53126526,88,6295273,2020-06-30 21:58:58,2020-06-30 22:36:58,0,4462,2,代々木店,TK
703877,85345862,118,C25099070,32,15733308,2020-06-30 21:58:58,2020-06-30 22:42:58,0,3865,2,磯子店,KN
703878,73038887,100,C27421314,84,67608099,2020-06-30 21:58:58,2020-06-30 22:21:58,0,3319,2,中原店,KN


In [67]:
# Left-join the area master onto the orders by area_cd.
# validate="many_to_one" makes pandas raise MergeError if area_cd is not unique
# in m_area; without it a duplicated master key would silently multiply order rows.

order_data = pd.merge(
    order_data, m_area,
    on="area_cd",
    how="left",
    validate="many_to_one",
)
display(order_data)

Unnamed: 0,order_id,store_id,customer_id,coupon_cd,sales_detail_id,order_accept_date,delivered_date,takeout_flag,total_amount,status,store_name,area_cd,wide_area,narrow_area
0,79339111,49,C26387220,50,67393872,2020-04-01 11:00:00,2020-04-01 11:18:00,1,4144,1,浅草店,TK,東京,東京
1,18941733,85,C48773811,26,91834983,2020-04-01 11:00:00,2020-04-01 11:22:00,0,2877,2,目黒店,TK,東京,東京
2,56217880,76,C66287421,36,64409634,2020-04-01 11:00:00,2020-04-01 11:15:00,0,2603,2,本郷店,TK,東京,東京
3,28447783,190,C41156423,19,73032165,2020-04-01 11:00:00,2020-04-01 11:16:00,0,2732,2,栃木店,TO,北関東,栃木
4,32576156,191,C54568117,71,23281182,2020-04-01 11:00:00,2020-04-01 11:53:00,0,2987,2,伊勢崎店,GU,北関東,群馬
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
703875,90872494,191,C35992970,46,51884378,2020-06-30 21:58:58,2020-06-30 22:43:58,1,2112,1,伊勢崎店,GU,北関東,群馬
703876,30167637,35,C53126526,88,6295273,2020-06-30 21:58:58,2020-06-30 22:36:58,0,4462,2,代々木店,TK,東京,東京
703877,85345862,118,C25099070,32,15733308,2020-06-30 21:58:58,2020-06-30 22:42:58,0,3865,2,磯子店,KN,神奈川,神奈川
703878,73038887,100,C27421314,84,67608099,2020-06-30 21:58:58,2020-06-30 22:21:58,0,3319,2,中原店,KN,神奈川,神奈川


### Knock08: マスタが存在しないコードに名称を設定する

- takeout_flag
  - 0: デリバリー
  - 1: お持ち帰り
- status
  - 0: 受付
  - 1: お支払い済
  - 2: お渡し済
  - 9: キャンセル

In [68]:
# Add a takeout_name column holding the label for each takeout_flag code

takeout_labels = {0: "デリバリー", 1: "お持ち帰り"}
for flag, label in takeout_labels.items():
    order_data.loc[order_data["takeout_flag"] == flag, "takeout_name"] = label
display(order_data)

Unnamed: 0,order_id,store_id,customer_id,coupon_cd,sales_detail_id,order_accept_date,delivered_date,takeout_flag,total_amount,status,store_name,area_cd,wide_area,narrow_area,takeout_name
0,79339111,49,C26387220,50,67393872,2020-04-01 11:00:00,2020-04-01 11:18:00,1,4144,1,浅草店,TK,東京,東京,お持ち帰り
1,18941733,85,C48773811,26,91834983,2020-04-01 11:00:00,2020-04-01 11:22:00,0,2877,2,目黒店,TK,東京,東京,デリバリー
2,56217880,76,C66287421,36,64409634,2020-04-01 11:00:00,2020-04-01 11:15:00,0,2603,2,本郷店,TK,東京,東京,デリバリー
3,28447783,190,C41156423,19,73032165,2020-04-01 11:00:00,2020-04-01 11:16:00,0,2732,2,栃木店,TO,北関東,栃木,デリバリー
4,32576156,191,C54568117,71,23281182,2020-04-01 11:00:00,2020-04-01 11:53:00,0,2987,2,伊勢崎店,GU,北関東,群馬,デリバリー
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
703875,90872494,191,C35992970,46,51884378,2020-06-30 21:58:58,2020-06-30 22:43:58,1,2112,1,伊勢崎店,GU,北関東,群馬,お持ち帰り
703876,30167637,35,C53126526,88,6295273,2020-06-30 21:58:58,2020-06-30 22:36:58,0,4462,2,代々木店,TK,東京,東京,デリバリー
703877,85345862,118,C25099070,32,15733308,2020-06-30 21:58:58,2020-06-30 22:42:58,0,3865,2,磯子店,KN,神奈川,神奈川,デリバリー
703878,73038887,100,C27421314,84,67608099,2020-06-30 21:58:58,2020-06-30 22:21:58,0,3319,2,中原店,KN,神奈川,神奈川,デリバリー


In [69]:
# Add a status_name column holding the label for each status code

status_labels = {0: "受付", 1: "お支払い済", 2: "お渡し済", 9: "キャンセル"}
for code, label in status_labels.items():
    order_data.loc[order_data["status"] == code, "status_name"] = label
display(order_data)

Unnamed: 0,order_id,store_id,customer_id,coupon_cd,sales_detail_id,order_accept_date,delivered_date,takeout_flag,total_amount,status,store_name,area_cd,wide_area,narrow_area,takeout_name,status_name
0,79339111,49,C26387220,50,67393872,2020-04-01 11:00:00,2020-04-01 11:18:00,1,4144,1,浅草店,TK,東京,東京,お持ち帰り,お支払い済
1,18941733,85,C48773811,26,91834983,2020-04-01 11:00:00,2020-04-01 11:22:00,0,2877,2,目黒店,TK,東京,東京,デリバリー,お渡し済
2,56217880,76,C66287421,36,64409634,2020-04-01 11:00:00,2020-04-01 11:15:00,0,2603,2,本郷店,TK,東京,東京,デリバリー,お渡し済
3,28447783,190,C41156423,19,73032165,2020-04-01 11:00:00,2020-04-01 11:16:00,0,2732,2,栃木店,TO,北関東,栃木,デリバリー,お渡し済
4,32576156,191,C54568117,71,23281182,2020-04-01 11:00:00,2020-04-01 11:53:00,0,2987,2,伊勢崎店,GU,北関東,群馬,デリバリー,お渡し済
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
703875,90872494,191,C35992970,46,51884378,2020-06-30 21:58:58,2020-06-30 22:43:58,1,2112,1,伊勢崎店,GU,北関東,群馬,お持ち帰り,お支払い済
703876,30167637,35,C53126526,88,6295273,2020-06-30 21:58:58,2020-06-30 22:36:58,0,4462,2,代々木店,TK,東京,東京,デリバリー,お渡し済
703877,85345862,118,C25099070,32,15733308,2020-06-30 21:58:58,2020-06-30 22:42:58,0,3865,2,磯子店,KN,神奈川,神奈川,デリバリー,お渡し済
703878,73038887,100,C27421314,84,67608099,2020-06-30 21:58:58,2020-06-30 22:21:58,0,3319,2,中原店,KN,神奈川,神奈川,デリバリー,お渡し済


### Knock09: 分析基礎テーブルを出力する

In [75]:
# Create the output directory one level above the notebook's directory

output_dir = os.path.join("../", "output_data")
# exist_ok=True -> avoids FileExistsError when the directory already exists
os.makedirs(output_dir, exist_ok=True)

In [76]:
# Write the table to order_data.csv in the output directory
# (index=False: the row index is not written to the file)

output_file = os.path.join(output_dir, "order_data.csv")
order_data.to_csv(output_file, index=False)

### Knock10: セルを整理して使いやすくする

- これまでの処理
  - 読み込み
  - 加工
  - 出力

一連の処理を整理する

In [88]:
# Consolidated pipeline: load -> transform -> export.
# Reads the masters and every monthly order file, drops store_id 999, joins the
# masters, adds label columns for the coded fields and writes one CSV.

# Library imports
import pandas as pd
import os
import glob
from natsort import natsorted

# Directory settings
input_data_path = "../support/本章/1章"
output_data_path = "../"

# Code -> label lookups for columns that have no master table
takeout_names = {0: "デリバリー", 1: "お持ち帰り"}
status_names = {0: "受付", 1: "お支払い済", 2: "お渡し済", 9: "キャンセル"}

# Load the master tables (paths built the same way as the order-file pattern below)
m_store = pd.read_csv(os.path.join(input_data_path, "m_store.csv"))
m_area = pd.read_csv(os.path.join(input_data_path, "m_area.csv"))

# Locate the order files; glob gives no ordering guarantee, so sort naturally
tbl_order_file = os.path.join(input_data_path, "tbl_order_*.csv")
tbl_order_files = natsorted(glob.glob(tbl_order_file))
if not tbl_order_files:
    # Fail with a clear message instead of an obscure KeyError further down
    raise FileNotFoundError(f"no order files match: {tbl_order_file}")

# Read every file, then concatenate once: pd.concat inside the loop would
# re-copy all previously accumulated rows on each iteration
monthly_frames = []
for file in tbl_order_files:
    order_data = pd.read_csv(file)
    print(f"{file}: {len(order_data)} 行")
    monthly_frames.append(order_data)
order_all = pd.concat(monthly_frames, axis=0, ignore_index=True)

# Exclude unneeded rows (store_id 999)
order_data = order_all.loc[order_all["store_id"] != 999]

# Join the masters; validate="many_to_one" raises MergeError if a master key is
# duplicated, which would otherwise silently multiply order rows
order_data = pd.merge(
    order_data, m_store, on="store_id", how="left", validate="many_to_one"
)
order_data = pd.merge(
    order_data, m_area, on="area_cd", how="left", validate="many_to_one"
)

# Label columns: codes missing from a lookup become NaN
order_data["takeout_name"] = order_data["takeout_flag"].map(takeout_names)
order_data["status_name"] = order_data["status"].map(status_names)

# Write the result (index=False: the row index is not written)
output_dir = os.path.join(output_data_path, "output_data")
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "order_data.csv")
order_data.to_csv(output_file, index=False)

../support/本章/1章/tbl_order_202004.csv: 233260 行
../support/本章/1章/tbl_order_202005.csv: 241139 行
../support/本章/1章/tbl_order_202006.csv: 233301 行


In [85]:
# Show the current list of matched order-file paths
tbl_order_files


['../support/本章/1章/tbl_order_202004.csv',
 '../support/本章/1章/tbl_order_202005.csv',
 '../support/本章/1章/tbl_order_202006.csv']