In [92]:
import os
import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display

# Notebook-wide display and plotting configuration.

# Silence library warnings so they do not clutter the rendered output.
# NOTE(review): this hides every warning category, deprecations included.
warnings.filterwarnings('ignore')

# Numeric display: no scientific notation, four decimals, never elide columns.
np.set_printoptions(suppress=True, precision=4)
pd.options.display.float_format = '{:.4f}'.format
pd.set_option("display.max_columns", None)

# Newer matplotlib releases ship the bundled seaborn style sheet under the
# name "seaborn-v0_8" and no longer accept plain "seaborn"; older releases only
# know the plain name. Pick whichever one this installation provides.
_seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(_seaborn_style)
plt.rcParams["font.size"] = 14
# Font with Japanese glyphs for figure text (must be installed on the host).
plt.rcParams['font.family'] = 'IPAexGothic'

# ２章　小売店のデータでデータ加工を行う１０本ノック

ある小売店の売上履歴と顧客台帳データを用いて、データ分析の素地となる「データの加工」を習得する。
実際の現場データは手入力のExcel等、決して綺麗なデータではない事が多いため、
データの揺れや整合性の担保など、汚いデータを取り扱うデータ加工を主体に進めていく。

### Knock11: データを読み込む

- uriage.csv
  - 売上履歴
  - 2019-01~2019-07
- kokyaku_daicho.xlsx
  - 手入力で店舗が管理している顧客台帳

In [93]:
# Load the raw inputs: the sales history (CSV) and the customer ledger (Excel).

uriage_csv_path = "../support/2章/uriage.csv"
kokyaku_xlsx_path = "../support/2章/kokyaku_daicho.xlsx"

uriage_data = pd.read_csv(uriage_csv_path)
kokyaku_data = pd.read_excel(kokyaku_xlsx_path)

In [94]:
# Preview each dataset: its shape, then the first and last rows.

# Order matters for the rendered output: sales history first, customer ledger second.
for frame in (uriage_data, kokyaku_data):
    print(frame.shape)
    display(frame.head())
    display(frame.tail())

(2999, 4)


Unnamed: 0,purchase_date,item_name,item_price,customer_name
0,2019-06-13 18:02:34,商品A,100.0,深井菜々美
1,2019-07-13 13:05:29,商 品 S,,浅田賢二
2,2019-05-11 19:42:07,商 品 a,,南部慶二
3,2019-02-12 23:40:45,商品Z,2600.0,麻生莉緒
4,2019-04-22 03:09:35,商品a,,平田鉄二


Unnamed: 0,purchase_date,item_name,item_price,customer_name
2994,2019-02-15 02:56:39,商品Y,2500.0,福島友也
2995,2019-06-22 04:03:43,商品M,1300.0,大倉晃司
2996,2019-03-29 11:14:05,商品Q,,尾形小雁
2997,2019-07-14 12:56:49,商品H,,芦田博之
2998,2019-07-21 00:31:36,商品D,400.0,石田郁恵


(200, 5)


Unnamed: 0,顧客名,かな,地域,メールアドレス,登録日
0,須賀ひとみ,すが ひとみ,H市,suga_hitomi@example.com,2018/01/04
1,岡田 敏也,おかだ としや,E市,okada_toshiya@example.com,42782
2,芳賀 希,はが のぞみ,A市,haga_nozomi@example.com,2018/01/07
3,荻野 愛,おぎの あい,F市,ogino_ai@example.com,42872
4,栗田 憲一,くりた けんいち,E市,kurita_kenichi@example.com,43127


Unnamed: 0,顧客名,かな,地域,メールアドレス,登録日
195,川上 りえ,かわかみ りえ,G市,kawakami_rie@example.com,2017/06/20
196,小松 季衣,こまつ としえ,E市,komatsu_toshie@example.com,2018/06/20
197,白鳥 りえ,しらとり りえ,F市,shiratori_rie@example.com,2017/04/29
198,大西 隆之介,おおにし りゅうのすけ,H市,oonishi_ryuunosuke@example.com,2019/04/19
199,福井 美希,ふくい みき,D市,fukui_miki1@example.com,2019/04/23


### Knock12: データの揺れをみる

In [95]:
# Inconsistencies in the data (product name)

item_names = uriage_data["item_name"]
print(item_names.head())
print(item_names.tail())

0      商品A
1    商 品 S
2    商 品 a
3      商品Z
4      商品a
Name: item_name, dtype: object
2994    商品Y
2995    商品M
2996    商品Q
2997    商品H
2998    商品D
Name: item_name, dtype: object


In [96]:
# Inconsistencies in the data (product price)

item_prices = uriage_data["item_price"]
print(item_prices.head())
print(item_prices.tail())

0    100.0000
1         NaN
2         NaN
3   2600.0000
4         NaN
Name: item_price, dtype: float64
2994   2500.0000
2995   1300.0000
2996         NaN
2997         NaN
2998    400.0000
Name: item_price, dtype: float64


### Knock13: データに揺れがあるまま集計する

In [97]:
# Convert the purchase timestamp to datetime and add a year-month ("YYYYMM") column.

purchase_ts = pd.to_datetime(uriage_data["purchase_date"])
uriage_data["purchase_date"] = purchase_ts
uriage_data["purchase_month"] = purchase_ts.dt.strftime("%Y%m")

In [98]:
# Aggregation before any cleaning:
# number of sales records per month (rows) and product name (columns).

count_spec = dict(
    index='purchase_month',
    columns='item_name',
    aggfunc='size',
    fill_value=0,
)
res = uriage_data.pivot_table(**count_spec)
print(res.shape)
display(res)

(7, 99)


item_name,商品W,商 品 n,商品E,商品M,商品P,商品S,商品W,商品X,商 品O,商 品Q,商 品T,商 品V,商 品 S,商 品 a,商 品 q,商 品 s,商 品A,商 品C,商 品D,商 品E,商 品F,商 品G,商 品H,商 品I,商 品K,商 品M,商 品N,商 品O,商 品P,商 品T,商 品U,商 品V,商 品X,商 品Y,商 品s,商品 A,商品 B,商品 E,商品 F,商品 H,商品 I,商品 J,商品 K,商品 M,商品 O,商品 Q,商品 R,商品 S,商品 T,商品 V,商品 X,商品 a,商品 g,商品 o,商品 v,商品 w,商品A,商品B,商品C,商品D,商品E,商品F,商品G,商品H,商品I,商品J,商品K,商品L,商品M,商品N,商品O,商品P,商品Q,商品R,商品S,商品T,商品U,商品V,商品W,商品X,商品Y,商品Z,商品a,商品c,商品d,商品e,商品g,商品i,商品j,商品k,商品l,商品o,商品p,商品r,商品s,商品t,商品v,商品x,商品y
purchase_month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1
201901,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,1,0,0,2,0,1,0,0,0,0,0,1,0,0,1,1,0,0,1,0,1,0,0,0,0,0,0,18,13,18,17,18,15,11,15,17,17,19,18,18,15,21,15,17,21,18,16,7,21,13,12,10,0,0,1,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0
201902,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,1,0,0,19,13,26,20,16,14,13,17,11,13,16,11,15,19,18,19,22,21,21,22,19,21,24,14,11,1,0,0,0,0,0,0,1,0,0,0,0,0,1,1,1,0,0
201903,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,17,21,20,16,8,26,14,18,12,15,14,20,21,13,11,20,23,16,20,12,23,17,16,21,16,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
201904,1,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,3,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,19,24,20,17,15,13,11,18,13,14,15,11,20,14,15,19,20,15,15,11,14,13,16,17,0,1,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0
201905,0,0,0,0,0,1,0,0,0,0,0,0,0,2,0,1,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,1,0,0,1,0,0,1,0,0,1,0,0,0,1,20,14,15,14,19,17,23,15,16,10,16,12,17,19,18,20,12,22,16,15,16,8,20,16,19,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1
201906,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,23,12,10,19,13,18,13,12,18,22,15,15,17,16,14,17,15,16,21,12,17,19,16,14,13,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0
201907,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,20,20,17,17,10,17,19,18,18,23,12,17,11,15,19,26,14,17,22,20,13,26,16,18,12,0,0,0,0,0,0,1,0,0,0,1,0,2,0,0,0,0,0


In [99]:
# Aggregation before any cleaning:
# summed sales amount per month (rows) and product name (columns).

amount_spec = dict(
    index='purchase_month',
    columns='item_name',
    values='item_price',
    aggfunc='sum',
    fill_value=0,
)
res = uriage_data.pivot_table(**amount_spec)
print(res.shape)
display(res)


(7, 99)


item_name,商品W,商 品 n,商品E,商品M,商品P,商品S,商品W,商品X,商 品O,商 品Q,商 品T,商 品V,商 品 S,商 品 a,商 品 q,商 品 s,商 品A,商 品C,商 品D,商 品E,商 品F,商 品G,商 品H,商 品I,商 品K,商 品M,商 品N,商 品O,商 品P,商 品T,商 品U,商 品V,商 品X,商 品Y,商 品s,商品 A,商品 B,商品 E,商品 F,商品 H,商品 I,商品 J,商品 K,商品 M,商品 O,商品 Q,商品 R,商品 S,商品 T,商品 V,商品 X,商品 a,商品 g,商品 o,商品 v,商品 w,商品A,商品B,商品C,商品D,商品E,商品F,商品G,商品H,商品I,商品J,商品K,商品L,商品M,商品N,商品O,商品P,商品Q,商品R,商品S,商品T,商品U,商品V,商品W,商品X,商品Y,商品Z,商品a,商品c,商品d,商品e,商品g,商品i,商品j,商品k,商品l,商品o,商品p,商品r,商品s,商品t,商品v,商品x,商品y
purchase_month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1
201901,0,1400,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,800,0,0,0,0,1500,1600,2000,0,0,4800,0,1900,0,0,0,0,0,900,0,0,1300,1500,0,0,1900,0,2200,0,0,0,0,0,0,1500,1600,5100,6000,6500,7800,7000,10400,13500,16000,16500,19200,19500,21000,27000,20800,25500,36000,28500,28000,12600,41800,27600,24000,20000,0,0,300,0,0,0,0,0,1100,1200,1500,0,0,0,0,0,0,0
201902,0,0,0,0,0,0,0,2400,0,0,0,0,0,0,0,0,0,0,400,0,0,0,0,900,0,0,1400,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1800,0,0,0,2400,0,700,1500,0,0,1700,2400,6000,7600,5500,6600,7000,12800,8100,11000,14300,13200,16900,23800,25500,30400,30600,32400,32300,40000,37800,39600,43700,31200,20000,2600,0,0,0,0,0,0,1000,0,0,0,0,0,1900,2000,2200,0,0
201903,0,0,500,1300,1600,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,600,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1300,4200,5100,6000,4000,11400,7700,14400,10800,13000,14300,21600,26000,16800,15000,32000,34000,27000,26600,22000,46200,35200,34500,38400,35000,0,0,0,400,0,0,0,0,0,0,0,0,0,0,0,0,0,0
201904,2300,0,0,0,0,0,0,0,0,1700,2000,2200,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1500,0,0,0,0,0,7500,0,0,0,0,1200,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1400,2200,6000,7200,8000,7800,9100,7200,14400,13000,12100,15600,14300,22400,18000,22400,28900,30600,28500,28000,16800,19800,20700,38400,32500,0,0,0,0,500,700,0,0,0,0,0,0,0,1900,0,0,0,0
201905,0,0,0,0,0,1900,0,0,0,0,0,0,0,100,0,1900,100,0,0,0,0,0,0,0,1100,1300,0,0,0,0,0,2200,0,0,0,0,0,0,600,0,0,1000,1100,0,0,1700,0,0,2000,0,0,0,0,0,0,2300,1900,2600,3900,5200,9000,9000,14000,12000,13500,8000,15400,12000,20800,23800,24000,28800,15300,32400,20900,26000,33600,8800,36800,31200,45000,0,0,300,0,0,0,0,0,0,1200,0,0,0,0,0,0,0,2500
201906,0,0,0,0,0,0,2300,0,0,0,0,0,0,0,0,0,0,300,0,0,0,700,0,0,0,0,0,1500,0,0,2100,0,0,0,0,100,0,0,0,800,0,0,0,0,0,0,0,0,0,0,0,0,700,0,0,0,2000,2400,2400,7200,6500,9600,7700,8800,12600,20000,15400,14400,19500,15400,19500,24000,23800,27000,34200,22000,29400,33000,25300,24000,30000,0,0,0,0,0,0,900,0,0,0,0,1600,0,0,0,0,2400,0
201907,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1700,0,0,0,0,500,0,0,800,0,0,0,0,1500,0,0,0,2200,0,0,0,0,0,500,0,0,0,0,0,0,0,0,0,0,2000,2200,0,0,0,0,0,0,1600,3600,4500,6000,4500,9600,11900,12800,13500,17000,9900,20400,13000,18200,25500,38400,18700,25200,38000,34000,23100,52800,32200,38400,27500,0,0,0,0,0,0,900,0,0,0,1500,0,1800,0,0,0,0,0


### Knock14: 商品名の揺れを補正する

In [100]:
# Check how many distinct product-name spellings exist

distinct_item_names = pd.unique(uriage_data["item_name"])
print(len(distinct_item_names))
# 99

99


In [101]:
# Fix the inconsistent product names in one pass:
#   1. upper-case the alphabetic part
#   2. drop full-width spaces
#   3. drop half-width spaces
uriage_data["item_name"] = (
    uriage_data["item_name"]
    .str.upper()
    .str.replace("　", "")
    .str.replace(" ", "")
)

# Show the rows ordered by product name (display only; the result is not stored).
uriage_data.sort_values("item_name")

Unnamed: 0,purchase_date,item_name,item_price,customer_name,purchase_month
0,2019-06-13 18:02:34,商品A,100.0000,深井菜々美,201906
1748,2019-05-19 20:22:22,商品A,100.0000,松川綾女,201905
223,2019-06-25 08:13:20,商品A,100.0000,板橋隆,201906
1742,2019-06-13 16:03:17,商品A,100.0000,小平陽子,201906
1738,2019-02-10 00:28:43,商品A,100.0000,松田浩正,201902
...,...,...,...,...,...
2880,2019-04-22 00:36:52,商品Y,,田辺光洋,201904
2881,2019-04-30 14:21:09,商品Y,,高原充則,201904
1525,2019-01-24 10:27:23,商品Y,2500.0000,五十嵐春樹,201901
1361,2019-05-28 13:45:32,商品Y,2500.0000,大崎ヒカル,201905


In [102]:
# 修正データの確認

print(len(pd.unique(uriage_data["item_name"])))
print(pd.unique(uriage_data["item_name"]))

26
['商品A' '商品S' '商品Z' '商品V' '商品O' '商品U' '商品L' '商品C' '商品I' '商品R' '商品X' '商品G'
 '商品P' '商品Q' '商品Y' '商品N' '商品W' '商品E' '商品K' '商品B' '商品F' '商品D' '商品M' '商品H'
 '商品T' '商品J']


### Knock15: 金額の欠損値の補完

In [103]:
# Check which columns contain at least one missing value

uriage_data.isna().any()

purchase_date     False
item_name         False
item_price         True
customer_name     False
purchase_month    False
dtype: bool

In [104]:
# Fill in missing prices.
#
# A missing price is taken from rows of the same product whose price is
# recorded (the largest recorded value is used).
# NOTE(review): this assumes a product has a single price — confirm.

# Rows whose item_price is missing
flg_is_null = uriage_data["item_price"].isnull()

# Reference price for every row: the largest recorded price of that row's
# product. "max" skips NaN, so only recorded prices contribute; a product with
# no recorded price at all yields NaN and therefore stays missing.
# Computed in one grouped pass instead of re-scanning the frame per product.
reference_price = uriage_data.groupby("item_name")["item_price"].transform("max")

# Overwrite only the rows that were missing; recorded prices are left as-is.
uriage_data.loc[flg_is_null, "item_price"] = reference_price[flg_is_null]

display(uriage_data.head())
display(uriage_data.tail())

Unnamed: 0,purchase_date,item_name,item_price,customer_name,purchase_month
0,2019-06-13 18:02:34,商品A,100.0,深井菜々美,201906
1,2019-07-13 13:05:29,商品S,1900.0,浅田賢二,201907
2,2019-05-11 19:42:07,商品A,100.0,南部慶二,201905
3,2019-02-12 23:40:45,商品Z,2600.0,麻生莉緒,201902
4,2019-04-22 03:09:35,商品A,100.0,平田鉄二,201904


Unnamed: 0,purchase_date,item_name,item_price,customer_name,purchase_month
2994,2019-02-15 02:56:39,商品Y,2500.0,福島友也,201902
2995,2019-06-22 04:03:43,商品M,1300.0,大倉晃司,201906
2996,2019-03-29 11:14:05,商品Q,1700.0,尾形小雁,201903
2997,2019-07-14 12:56:49,商品H,800.0,芦田博之,201907
2998,2019-07-21 00:31:36,商品D,400.0,石田郁恵,201907


In [105]:
# Re-check which columns contain at least one missing value

uriage_data.isna().any()

purchase_date     False
item_name         False
item_price        False
customer_name     False
purchase_month    False
dtype: bool

In [106]:
# Check that every product's price was filled in correctly:
# print the largest and smallest price per product, in product-name order.

for item in list(uriage_data["item_name"].sort_values().unique()):
    prices = uriage_data.loc[uriage_data["item_name"] == item, "item_price"]
    highest = str(prices.max())
    # skipna=False makes the minimum NaN if any price of this product is still missing
    lowest = str(prices.min(skipna=False))
    print("".join([item, "の最大額: ", highest, " 最小値: ", lowest]))

商品Aの最大額: 100.0 最小値: 100.0
商品Bの最大額: 200.0 最小値: 200.0
商品Cの最大額: 300.0 最小値: 300.0
商品Dの最大額: 400.0 最小値: 400.0
商品Eの最大額: 500.0 最小値: 500.0
商品Fの最大額: 600.0 最小値: 600.0
商品Gの最大額: 700.0 最小値: 700.0
商品Hの最大額: 800.0 最小値: 800.0
商品Iの最大額: 900.0 最小値: 900.0
商品Jの最大額: 1000.0 最小値: 1000.0
商品Kの最大額: 1100.0 最小値: 1100.0
商品Lの最大額: 1200.0 最小値: 1200.0
商品Mの最大額: 1300.0 最小値: 1300.0
商品Nの最大額: 1400.0 最小値: 1400.0
商品Oの最大額: 1500.0 最小値: 1500.0
商品Pの最大額: 1600.0 最小値: 1600.0
商品Qの最大額: 1700.0 最小値: 1700.0
商品Rの最大額: 1800.0 最小値: 1800.0
商品Sの最大額: 1900.0 最小値: 1900.0
商品Tの最大額: 2000.0 最小値: 2000.0
商品Uの最大額: 2100.0 最小値: 2100.0
商品Vの最大額: 2200.0 最小値: 2200.0
商品Wの最大額: 2300.0 最小値: 2300.0
商品Xの最大額: 2400.0 最小値: 2400.0
商品Yの最大額: 2500.0 最小値: 2500.0
商品Zの最大額: 2600.0 最小値: 2600.0


### Knock16: 顧客名の揺れを補正

In [107]:
# Inspect the customer names in the customer ledger

ledger_names = kokyaku_data["顧客名"]
display(ledger_names.head())
display(ledger_names.tail())

0     須賀ひとみ
1    岡田　 敏也
2      芳賀 希
3     荻野  愛
4     栗田 憲一
Name: 顧客名, dtype: object

195     川上 りえ
196     小松 季衣
197     白鳥 りえ
198    大西 隆之介
199     福井 美希
Name: 顧客名, dtype: object

In [108]:
# Inspect the customer names in the sales history

sales_names = uriage_data["customer_name"]
display(sales_names.head())
display(sales_names.tail())

0    深井菜々美
1     浅田賢二
2     南部慶二
3     麻生莉緒
4     平田鉄二
Name: customer_name, dtype: object

2994    福島友也
2995    大倉晃司
2996    尾形小雁
2997    芦田博之
2998    石田郁恵
Name: customer_name, dtype: object

In [109]:
# Remove spaces from the customer names in the ledger:
# full-width spaces first, then half-width spaces.
kokyaku_data["顧客名"] = (
    kokyaku_data["顧客名"]
    .str.replace("　", "")
    .str.replace(" ", "")
)

for preview in (kokyaku_data["顧客名"].head(), kokyaku_data["顧客名"].tail()):
    display(preview)

0    須賀ひとみ
1     岡田敏也
2      芳賀希
3      荻野愛
4     栗田憲一
Name: 顧客名, dtype: object

195     川上りえ
196     小松季衣
197     白鳥りえ
198    大西隆之介
199     福井美希
Name: 顧客名, dtype: object

### Knock17: 日付の揺れを補正

In [110]:
# Bring the registration dates into one format — step 1: find the outliers.

# Flag the entries that were read in as bare numbers such as "42782"
# rather than as date strings.
registered_text = kokyaku_data["登録日"].astype("str")
flg_is_serial = registered_text.str.isdigit()

# Number of affected rows
flg_is_serial.sum()

22

In [111]:
# Convert the numeric entries into dates.

# The numeric values are treated as a day count relative to 1900-01-01.
# NOTE(review): the offset of 2 presumably compensates for Excel numbering
# 1900-01-01 as day 1 and counting a non-existent 1900-02-29 — confirm.
serial_days = kokyaku_data.loc[flg_is_serial, "登録日"].astype("float")
base_date = pd.to_datetime("1900/1/1")

fromSerial = pd.to_timedelta(serial_days - 2, unit="D") + base_date
fromSerial

1     2017-02-16
3     2017-05-17
4     2018-01-27
21    2017-07-04
27    2017-06-15
47    2017-01-06
49    2017-07-13
53    2017-04-08
76    2018-03-29
80    2018-01-10
99    2017-05-30
114   2018-06-03
118   2018-01-29
122   2018-04-16
139   2017-05-25
143   2017-03-24
155   2017-01-19
172   2018-03-22
179   2017-01-08
183   2017-07-24
186   2018-07-13
192   2018-06-08
Name: 登録日, dtype: datetime64[ns]

In [112]:
# Convert the entries that were read in as date strings to datetime,
# so they share one representation with the converted numeric entries.

date_strings = kokyaku_data.loc[~flg_is_serial, "登録日"]
fromString = pd.to_datetime(date_strings)
fromString

0     2018-01-04
2     2018-01-07
5     2017-06-20
6     2018-06-11
7     2017-05-19
         ...    
195   2017-06-20
196   2018-06-20
197   2017-04-29
198   2019-04-19
199   2019-04-23
Name: 登録日, Length: 178, dtype: datetime64[ns]

In [113]:
# Write the converted dates back into the ledger.

# The two pieces keep their original row labels, so the column assignment
# puts every date back on its own row regardless of concatenation order.
registration_dates = pd.concat([fromSerial, fromString])
kokyaku_data["登録日"] = registration_dates
display(kokyaku_data)

Unnamed: 0,顧客名,かな,地域,メールアドレス,登録日
0,須賀ひとみ,すが ひとみ,H市,suga_hitomi@example.com,2018-01-04
1,岡田敏也,おかだ としや,E市,okada_toshiya@example.com,2017-02-16
2,芳賀希,はが のぞみ,A市,haga_nozomi@example.com,2018-01-07
3,荻野愛,おぎの あい,F市,ogino_ai@example.com,2017-05-17
4,栗田憲一,くりた けんいち,E市,kurita_kenichi@example.com,2018-01-27
...,...,...,...,...,...
195,川上りえ,かわかみ りえ,G市,kawakami_rie@example.com,2017-06-20
196,小松季衣,こまつ としえ,E市,komatsu_toshie@example.com,2018-06-20
197,白鳥りえ,しらとり りえ,F市,shiratori_rie@example.com,2017-04-29
198,大西隆之介,おおにし りゅうのすけ,H市,oonishi_ryuunosuke@example.com,2019-04-19


In [114]:
# Derive the registration month ("YYYYMM") from the registration date and aggregate.

registration_month = kokyaku_data["登録日"].dt.strftime("%Y%m")
kokyaku_data["登録年月"] = registration_month

# Registrations per month (non-null customer names in each month)
result = kokyaku_data.groupby("登録年月")["顧客名"].count()
print(result)
# The total should match the ledger's row count printed next
print(result.sum())
print(kokyaku_data.shape)

登録年月
201701    15
201702    11
201703    14
201704    15
201705    14
201706    13
201707    17
201801    13
201802    15
201803    17
201804     5
201805    19
201806    13
201807    17
201904     2
Name: 顧客名, dtype: int64
200
(200, 6)


In [115]:
# Check whether any purely numeric registration dates remain

# NOTE(review): this rebinds flg_is_serial, replacing the mask computed before
# the conversion.
registered_text = kokyaku_data["登録日"].astype("str")
flg_is_serial = registered_text.str.isdigit()
flg_is_serial.sum()

0

### Knock18: 顧客名をキーに2つのデータを結合（ジョイン）する

In [116]:
# Join the sales history with the customer ledger, matching
# uriage_data["customer_name"] against kokyaku_data["顧客名"].
# A left join keeps every sales row.
merged = uriage_data.merge(
    kokyaku_data,
    how="left",
    left_on="customer_name",
    right_on="顧客名",
)

# Drop the customer_name column; the ledger's name column stays
join_data = merged.drop(columns="customer_name")

display(join_data)

Unnamed: 0,purchase_date,item_name,item_price,purchase_month,顧客名,かな,地域,メールアドレス,登録日,登録年月
0,2019-06-13 18:02:34,商品A,100.0000,201906,深井菜々美,ふかい ななみ,C市,fukai_nanami@example.com,2017-01-26,201701
1,2019-07-13 13:05:29,商品S,1900.0000,201907,浅田賢二,あさだ けんじ,C市,asada_kenji@example.com,2018-04-07,201804
2,2019-05-11 19:42:07,商品A,100.0000,201905,南部慶二,なんぶ けいじ,A市,nannbu_keiji@example.com,2018-06-19,201806
3,2019-02-12 23:40:45,商品Z,2600.0000,201902,麻生莉緒,あそう りお,D市,asou_rio@example.com,2018-07-22,201807
4,2019-04-22 03:09:35,商品A,100.0000,201904,平田鉄二,ひらた てつじ,D市,hirata_tetsuji@example.com,2017-06-07,201706
...,...,...,...,...,...,...,...,...,...,...
2994,2019-02-15 02:56:39,商品Y,2500.0000,201902,福島友也,ふくしま ともや,B市,fukushima_tomoya@example.com,2017-07-01,201707
2995,2019-06-22 04:03:43,商品M,1300.0000,201906,大倉晃司,おおくら こうじ,E市,ookura_kouji@example.com,2018-03-31,201803
2996,2019-03-29 11:14:05,商品Q,1700.0000,201903,尾形小雁,おがた こがん,B市,ogata_kogan@example.com,2017-03-15,201703
2997,2019-07-14 12:56:49,商品H,800.0000,201907,芦田博之,あしだ ひろゆき,E市,ashida_hiroyuki@example.com,2018-07-13,201807


### Knock19: クレンジングしたデータを出力（ダンプ）する

In [117]:
# Choose the columns to export and put them in the desired order

dump_columns = [
    "purchase_date",
    "purchase_month",
    "item_name",
    "item_price",
    "顧客名",
    "かな",
    "地域",
    "メールアドレス",
    "登録日",
]
dump_data = join_data[dump_columns]
display(dump_data)

Unnamed: 0,purchase_date,purchase_month,item_name,item_price,顧客名,かな,地域,メールアドレス,登録日
0,2019-06-13 18:02:34,201906,商品A,100.0000,深井菜々美,ふかい ななみ,C市,fukai_nanami@example.com,2017-01-26
1,2019-07-13 13:05:29,201907,商品S,1900.0000,浅田賢二,あさだ けんじ,C市,asada_kenji@example.com,2018-04-07
2,2019-05-11 19:42:07,201905,商品A,100.0000,南部慶二,なんぶ けいじ,A市,nannbu_keiji@example.com,2018-06-19
3,2019-02-12 23:40:45,201902,商品Z,2600.0000,麻生莉緒,あそう りお,D市,asou_rio@example.com,2018-07-22
4,2019-04-22 03:09:35,201904,商品A,100.0000,平田鉄二,ひらた てつじ,D市,hirata_tetsuji@example.com,2017-06-07
...,...,...,...,...,...,...,...,...,...
2994,2019-02-15 02:56:39,201902,商品Y,2500.0000,福島友也,ふくしま ともや,B市,fukushima_tomoya@example.com,2017-07-01
2995,2019-06-22 04:03:43,201906,商品M,1300.0000,大倉晃司,おおくら こうじ,E市,ookura_kouji@example.com,2018-03-31
2996,2019-03-29 11:14:05,201903,商品Q,1700.0000,尾形小雁,おがた こがん,B市,ogata_kogan@example.com,2017-03-15
2997,2019-07-14 12:56:49,201907,商品H,800.0000,芦田博之,あしだ ひろゆき,E市,ashida_hiroyuki@example.com,2018-07-13


In [118]:
# Write the cleaned data to a CSV file (without the row index)

dump_path = "dump_data.csv"
dump_data.to_csv(dump_path, index=False)

### Knock20: データを集計する

In [119]:
# Read the dump file back in

dump_path = "dump_data.csv"
import_data = pd.read_csv(dump_path)
display(import_data)

Unnamed: 0,purchase_date,purchase_month,item_name,item_price,顧客名,かな,地域,メールアドレス,登録日
0,2019-06-13 18:02:34,201906,商品A,100.0000,深井菜々美,ふかい ななみ,C市,fukai_nanami@example.com,2017-01-26
1,2019-07-13 13:05:29,201907,商品S,1900.0000,浅田賢二,あさだ けんじ,C市,asada_kenji@example.com,2018-04-07
2,2019-05-11 19:42:07,201905,商品A,100.0000,南部慶二,なんぶ けいじ,A市,nannbu_keiji@example.com,2018-06-19
3,2019-02-12 23:40:45,201902,商品Z,2600.0000,麻生莉緒,あそう りお,D市,asou_rio@example.com,2018-07-22
4,2019-04-22 03:09:35,201904,商品A,100.0000,平田鉄二,ひらた てつじ,D市,hirata_tetsuji@example.com,2017-06-07
...,...,...,...,...,...,...,...,...,...
2994,2019-02-15 02:56:39,201902,商品Y,2500.0000,福島友也,ふくしま ともや,B市,fukushima_tomoya@example.com,2017-07-01
2995,2019-06-22 04:03:43,201906,商品M,1300.0000,大倉晃司,おおくら こうじ,E市,ookura_kouji@example.com,2018-03-31
2996,2019-03-29 11:14:05,201903,商品Q,1700.0000,尾形小雁,おがた こがん,B市,ogata_kogan@example.com,2017-03-15
2997,2019-07-14 12:56:49,201907,商品H,800.0000,芦田博之,あしだ ひろゆき,E市,ashida_hiroyuki@example.com,2018-07-13


In [123]:
# Number of sales records per month (rows) and product (columns)

item_count_spec = dict(
    index="purchase_month",
    columns="item_name",
    aggfunc="size",
    fill_value=0,
)
byItem = import_data.pivot_table(**item_count_spec)
print(byItem.shape)
display(byItem)

(7, 26)


item_name,商品A,商品B,商品C,商品D,商品E,商品F,商品G,商品H,商品I,商品J,商品K,商品L,商品M,商品N,商品O,商品P,商品Q,商品R,商品S,商品T,商品U,商品V,商品W,商品X,商品Y,商品Z
purchase_month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
201901,18,13,19,17,18,15,11,16,18,17,20,19,19,16,24,16,17,21,20,17,7,22,13,14,10,0
201902,19,14,26,21,16,14,14,17,12,14,16,11,15,20,19,19,22,22,22,23,19,22,24,16,11,1
201903,17,21,20,17,9,27,14,18,12,16,14,20,22,13,11,21,23,16,20,12,23,18,16,21,16,0
201904,17,19,24,20,18,17,14,11,18,13,14,15,11,20,15,15,20,20,16,16,11,15,14,16,20,0
201905,24,14,16,14,19,18,23,15,16,11,18,13,18,19,18,20,13,22,18,16,16,9,21,16,20,0
201906,24,12,11,19,13,18,15,13,19,22,15,15,17,16,15,18,15,16,21,12,18,20,17,15,13,0
201907,20,20,17,17,12,17,19,19,19,23,12,17,11,15,22,26,15,19,23,21,13,28,16,18,12,0


In [125]:
# Summed sales amount per month (rows) and product (columns)

item_amount_spec = dict(
    index="purchase_month",
    columns="item_name",
    values="item_price",
    aggfunc="sum",
    fill_value=0,
)
byPrice = import_data.pivot_table(**item_amount_spec)
print(byPrice.shape)
display(byPrice)

(7, 26)


item_name,商品A,商品B,商品C,商品D,商品E,商品F,商品G,商品H,商品I,商品J,商品K,商品L,商品M,商品N,商品O,商品P,商品Q,商品R,商品S,商品T,商品U,商品V,商品W,商品X,商品Y,商品Z
purchase_month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
201901,1800,2600,5700,6800,9000,9000,7700,12800,16200,17000,22000,22800,24700,22400,36000,25600,28900,37800,38000,34000,14700,48400,29900,33600,25000,0
201902,1900,2800,7800,8400,8000,8400,9800,13600,10800,14000,17600,13200,19500,28000,28500,30400,37400,39600,41800,46000,39900,48400,55200,38400,27500,2600
201903,1700,4200,6000,6800,4500,16200,9800,14400,10800,16000,15400,24000,28600,18200,16500,33600,39100,28800,38000,24000,48300,39600,36800,50400,40000,0
201904,1700,3800,7200,8000,9000,10200,9800,8800,16200,13000,15400,18000,14300,28000,22500,24000,34000,36000,30400,32000,23100,33000,32200,38400,50000,0
201905,2400,2800,4800,5600,9500,10800,16100,12000,14400,11000,19800,15600,23400,26600,27000,32000,22100,39600,34200,32000,33600,19800,48300,38400,50000,0
201906,2400,2400,3300,7600,6500,10800,10500,10400,17100,22000,16500,18000,22100,22400,22500,28800,25500,28800,39900,24000,37800,44000,39100,36000,32500,0
201907,2000,4000,5100,6800,6000,10200,13300,15200,17100,23000,13200,20400,14300,21000,33000,41600,25500,34200,43700,42000,27300,61600,36800,43200,30000,0


In [126]:
# Number of purchases per month (rows) and customer (columns)

customer_count_spec = dict(
    index="purchase_month",
    columns="顧客名",
    aggfunc="size",
    fill_value=0,
)
byCustomer = import_data.pivot_table(**customer_count_spec)
print(byCustomer.shape)
display(byCustomer)

(7, 199)


顧客名,さだ千佳子,中仁晶,中田美智子,丸山光臣,久保田倫子,亀井一徳,五十嵐春樹,井上桃子,井口寛治,井川真悠子,井川里穂,井本マサカズ,井村俊二,今茜,佐藤慶二,八木雅彦,内村まさみ,内田聡,南部慶二,原口俊二,古川信吾,合田光,吉岡サダヲ,吉村愛梨,和泉直人,唐沢景子,唐沢涼,土屋朝陽,城戸芳正,堀サンタマリア,堀内聡,堀北雅彦,堀江佑,外山広司,大倉晃司,大地礼子,大城ケンイチ,大山咲,大崎ヒカル,大滝麗奈,大西隆之介,奥光洋,宇野秀樹,小口豊,小川美菜,小平陽子,小松季衣,小松隼士,小栗正義,小町瞬,尾上勝久,尾形小雁,山口法子,山西花,岡慶太,岡村希,岡田敏也,岩井莉緒,岩佐孝太郎,岩城徹平,岩沢那奈,島孝太郎,島崎礼子,島本研二,島英嗣,島袋友以乃,川上りえ,川島友以乃,市田寿明,平田鉄二,平賀一哉,影山輝信,徳重優,志村サダヲ,戸塚美幸,手塚進,手塚雅之,新村丈史,新村美月,新谷智花,日比野徹,日野夏希,明石家明,星野美嘉,有馬徹平,望月真悠子,本多フミヤ,本橋直人,杉下悟志,杉田将也,村山知世,東光博,松元翔太,松居満,松岡ノブヒコ,松川綾女,松村聡,松沢育二,松田浩正,松谷愛子,板橋隆,林勇,栗田憲一,根岸仁晶,根岸莉央,根本博明,桑原桃子,梅村秀樹,梅沢麻緒,梅津淳子,森岡季衣,植木沙知絵,植村遥,楠哲平,楠高史,榊原しぼり,榎本薫,横田遥,水野メイサ,沖遥,河内さとみ,河村由樹,浅田賢二,浅見広司,浜田未華子,深井照生,深井菜々美,深沢ひろ子,深田信輔,清水佑,清水裕次郎,熊井憲史,熊倉明日,熊倉綾,片瀬長利,牧田玲那,田上美佐子,田崎菜々美,田畑正敏,田辺きみまろ,田辺光洋,白井俊二,白鳥りえ,相原ひとり,相川良介,矢沢恵梨香,矢部夏空,矢部惇,矢部美幸,石崎幸子,石川まさみ,石渡小雁,石田佑,石田花,石田郁恵,石野仁,磯野希,神原美嘉,福島友也,福本美幸,秋葉あき,稲田将也,立石茜,笹原しぼり,笹川照生,篠山雅功,米沢仁晶,綾瀬俊介,芦田博之,芳賀希,若杉徹,荻野愛,荻野愛菜,菅原誠治,藤広之,藤木一恵,西原未華子,西島知世,西脇礼子,谷本愛梨,赤木だん吉,赤木愛梨,進藤瞬,那須蒼甫,野本仁晶,金森なつみ,鈴木一哉,青山鉄洋,須賀ひとみ,香椎優一,高原充則,高梨結衣,高沢美咲,高田さんま,鳥居広司,鶴岡薫,麻生莉緒,黄川田博之,黒谷長利
purchase_month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 
100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1,Unnamed: 163_level_1,Unnamed: 164_level_1,Unnamed: 165_level_1,Unnamed: 166_level_1,Unnamed: 167_level_1,Unnamed: 168_level_1,Unnamed: 169_level_1,Unnamed: 170_level_1,Unnamed: 171_level_1,Unnamed: 172_level_1,Unnamed: 173_level_1,Unnamed: 174_level_1,Unnamed: 175_level_1,Unnamed: 176_level_1,Unnamed: 177_level_1,Unnamed: 178_level_1,Unnamed: 179_level_1,Unnamed: 180_level_1,Unnamed: 181_level_1,Unnamed: 182_level_1,Unnamed: 183_level_1,Unnamed: 184_level_1,Unnamed: 185_level_1,Unnamed: 186_level_1,Unnamed: 187_level_1,Unnamed: 188_level_1,Unnamed: 189_level_1,Unnamed: 190_level_1,Unnamed: 191_level_1,Unnamed: 192_level_1,Unnamed: 193_level_1,Unnamed: 194_level_1,Unnamed: 
195_level_1,Unnamed: 196_level_1,Unnamed: 197_level_1,Unnamed: 198_level_1,Unnamed: 199_level_1
201901,3,1,4,2,2,0,5,3,3,1,4,1,4,3,4,4,2,1,0,4,2,1,1,2,3,1,0,1,2,2,3,1,4,6,0,4,1,3,2,3,0,2,2,2,3,5,0,5,1,2,3,2,4,4,3,1,2,0,1,2,1,0,1,2,2,5,2,6,3,1,3,0,0,4,5,1,3,1,2,3,1,3,0,3,4,1,3,1,3,4,2,2,1,1,1,1,3,3,2,3,3,1,2,0,1,2,0,2,2,2,0,1,1,1,2,2,2,1,0,2,2,2,3,0,4,3,1,1,3,2,1,0,1,2,1,4,3,2,1,1,2,2,3,1,1,1,1,2,1,3,4,1,3,3,3,3,2,0,4,1,1,2,4,0,1,1,4,4,2,1,2,3,2,4,4,0,7,1,0,2,5,3,2,2,2,2,4,2,2,0,1,1,1,5,2,0,2,2,5
201902,9,1,2,2,1,4,2,1,0,4,2,3,1,1,3,2,3,2,4,2,1,2,0,1,5,1,1,3,1,3,3,1,3,1,2,2,1,1,1,1,1,3,0,2,1,3,6,1,4,3,2,5,2,2,5,5,4,4,4,1,0,0,0,3,3,1,5,2,3,4,2,2,3,1,5,1,5,1,0,1,5,2,0,0,1,1,2,4,0,0,2,4,5,4,1,4,3,2,3,2,1,4,3,2,1,0,3,1,4,1,3,1,1,2,1,1,3,0,4,3,5,0,2,3,3,3,3,1,0,2,7,4,1,1,3,5,3,2,4,2,0,4,0,2,3,2,0,1,2,3,5,5,3,2,2,5,3,4,2,2,5,2,1,0,1,2,2,3,2,1,1,2,2,7,1,1,2,1,1,1,3,3,2,1,2,4,1,4,3,4,0,3,2,0,1,2,4,0,1
201903,1,2,1,6,1,4,3,3,2,2,1,2,0,3,2,2,3,2,1,5,1,4,2,2,0,3,4,1,0,1,5,2,5,0,1,2,1,3,2,3,0,2,0,4,0,1,0,2,2,0,2,3,3,1,2,5,3,2,2,0,1,1,4,2,3,3,2,2,2,3,0,5,1,3,5,1,2,2,4,2,2,1,4,0,3,0,1,1,1,2,1,5,2,2,0,2,5,6,2,1,0,5,5,2,4,2,0,2,0,2,2,0,3,2,3,5,2,3,0,4,5,1,2,3,4,2,4,2,5,2,1,3,2,2,4,1,4,2,2,1,1,3,2,1,2,3,4,2,3,2,4,0,2,0,4,3,3,3,2,2,2,1,1,2,5,5,1,2,1,1,0,1,2,1,5,3,1,3,2,1,3,1,1,5,2,0,1,0,2,3,1,6,2,4,2,4,2,2,1
201904,0,3,1,2,0,2,2,0,3,2,2,1,4,1,3,1,1,4,5,3,1,2,1,3,4,4,4,6,0,0,1,4,3,1,2,0,2,2,2,1,2,4,1,1,2,2,3,2,2,1,3,4,3,0,1,0,3,3,1,1,1,2,0,3,5,5,3,3,1,3,1,1,2,2,4,2,1,2,1,2,3,1,3,0,1,3,1,1,3,4,1,2,2,1,1,2,4,0,5,3,2,4,1,4,1,1,4,4,0,5,0,1,5,0,6,1,0,3,1,1,5,2,4,4,3,0,1,2,1,4,3,1,2,1,2,5,1,1,2,0,4,2,1,0,4,0,3,1,4,2,0,0,3,1,1,0,1,1,1,2,2,0,2,1,5,2,4,2,1,2,2,4,2,3,1,1,1,4,0,1,1,3,4,3,0,1,3,2,3,2,4,2,3,4,3,2,1,2,0
201905,3,2,5,2,4,1,2,1,3,3,0,3,1,1,1,3,2,1,2,1,3,1,0,6,0,0,4,3,0,2,1,1,3,0,1,0,1,2,4,4,0,2,1,1,3,1,2,1,1,1,3,2,1,2,3,3,2,2,2,2,3,5,2,1,6,3,0,0,2,2,3,2,3,4,1,3,4,0,5,0,5,6,1,4,3,0,5,4,0,0,2,3,3,2,3,5,2,4,5,1,0,2,2,3,2,4,2,2,1,4,1,2,1,2,0,2,5,1,2,1,1,1,3,1,1,3,0,1,1,1,1,3,1,1,2,2,3,1,3,1,0,2,3,1,4,3,3,2,6,1,6,2,4,1,2,2,3,5,4,6,4,0,2,1,1,2,1,4,2,1,2,4,1,1,0,5,1,2,1,4,2,4,2,0,1,1,6,2,2,1,1,1,0,2,2,3,4,4,1
201906,1,3,0,4,1,1,1,2,2,3,2,1,1,1,2,4,0,5,2,0,1,0,2,3,1,2,3,1,0,0,2,4,4,2,3,3,4,4,1,2,1,4,2,1,2,2,2,0,4,2,1,1,0,6,1,1,4,2,4,2,4,2,1,2,0,2,1,3,3,3,1,2,1,2,1,1,1,2,3,3,3,0,1,4,2,0,5,2,3,0,1,2,0,3,2,2,3,2,0,2,3,1,1,3,3,0,6,2,1,4,1,2,2,2,3,1,4,3,1,4,5,3,2,1,5,0,5,4,2,1,2,5,3,1,1,0,0,2,2,2,1,1,3,2,0,0,1,4,1,2,3,0,2,0,2,2,2,2,2,6,0,2,1,3,1,3,2,4,1,1,4,3,2,1,5,2,0,1,2,5,1,3,3,1,2,0,3,3,4,7,3,0,2,1,0,2,1,2,4
201907,3,0,3,2,5,3,5,2,5,5,6,2,0,2,1,1,3,3,0,1,2,2,2,1,2,2,2,1,1,2,0,1,1,2,1,2,1,1,1,1,3,1,4,0,3,2,2,3,1,1,1,2,2,2,0,3,1,1,6,3,0,4,0,0,0,7,3,2,4,3,3,2,5,4,3,4,1,2,5,2,5,2,2,4,4,2,0,0,5,1,3,3,1,0,3,1,1,1,2,2,4,3,5,4,0,4,4,2,5,1,4,2,2,2,1,1,3,4,1,1,1,0,2,1,2,6,3,2,2,2,1,1,4,1,1,2,2,3,1,2,5,1,3,2,4,1,6,0,9,1,3,4,2,3,3,2,0,1,3,4,3,3,1,1,1,2,2,0,2,3,4,2,1,0,1,2,1,1,4,3,4,2,3,4,3,2,0,0,4,2,4,4,2,0,2,4,3,4,1


In [127]:
# Number of sales records per month (rows) and region (columns)

region_count_spec = dict(
    index="purchase_month",
    columns="地域",
    aggfunc="size",
    fill_value=0,
)
byRegion = import_data.pivot_table(**region_count_spec)
print(byRegion.shape)
display(byRegion)

(7, 8)


地域,A市,B市,C市,D市,E市,F市,G市,H市
purchase_month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
201901,59,55,72,34,49,57,49,42
201902,71,46,65,48,61,52,43,63
201903,64,52,57,43,52,59,51,59
201904,64,48,54,45,48,58,40,52
201905,57,52,68,48,59,65,35,43
201906,53,47,61,30,51,51,58,58
201907,76,53,61,42,54,64,47,54


In [128]:
# Are there customers who never purchased anything?

# A right join keeps every ledger row; customers with no matching sale
# end up with a missing purchase_date.
away_data = uriage_data.merge(
    kokyaku_data,
    how="right",
    left_on="customer_name",
    right_on="顧客名",
)
never_purchased = away_data["purchase_date"].isnull()
away_data.loc[never_purchased, ["顧客名", "メールアドレス", "登録日"]]

Unnamed: 0,顧客名,メールアドレス,登録日
2999,福井美希,fukui_miki1@example.com,2019-04-23
