In [1]:
# GCP authentication is required beforehand; how to authenticate depends on the environment.
# On Colab:
#   Run the following in a cell:
#     from google.colab import auth
#     auth.authenticate_user()
#     %env GCLOUD_PROJECT=<your GCP project ID>
# On a local environment such as a PC:
#   First time only: install the gcloud CLI from https://cloud.google.com/sdk/docs/install-sdk and run gcloud init
# Load the BigQuery IPython extension; the query cells below start with the %%bigquery cell magic.
%load_ext google.cloud.bigquery

# 7章 結合


## 7-1 1対1または多対1の関係のテーブルの結合
### Q: ビジネスホテルかつ宿泊人数が1名の予約履歴の抽出
#### Not Awesome

In [2]:
%%bigquery
-- "Not Awesome" example: the two tables are joined in full first and only
-- filtered afterwards, and select * carries every column of both tables along.
with
-- (1) Inner-join reservation and hotel using the hotel_id column as the key
reservation_and_hotel as (
    select *
    from example.reservation
    inner join example.hotel using (hotel_id)
)

select *
from reservation_and_hotel
where
    -- (2) From the joined data, keep only rows for business hotels with a single guest
    hotel_type = "ビジネスホテル"
    and people_num = 1

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,hotel_id,reservation_id,customer_id,reserved_at,checkin_date,checkout_date,length_of_stay,people_num,total_price,status,...,tag_021,tag_022,tag_023,tag_024,tag_025,tag_026,tag_027,tag_028,tag_029,tag_030
0,2706,72,100573,2014-01-08 05:32:58+00:00,2014-12-31 15:00:00+00:00,2015-01-01 15:00:00+00:00,1,1,8400,reserved,...,0,0,0,0,0,0,0,0,0,0
1,1801,190,4819,2014-01-14 04:47:22+00:00,2014-12-31 15:00:00+00:00,2015-01-01 15:00:00+00:00,1,1,5900,reserved,...,0,0,0,0,0,,0,,0,0
2,3422,191,53534,2014-01-14 05:22:49+00:00,2015-01-03 15:00:00+00:00,2015-01-04 15:00:00+00:00,1,1,8400,reserved,...,0,0,1,0,0,0,1,0,0,
3,521,194,138201,2014-01-14 06:39:34+00:00,2015-01-06 15:00:00+00:00,2015-01-07 15:00:00+00:00,1,1,12100,reserved,...,0,0,0,0,1,0,0,0,0,0
4,3062,289,25934,2014-01-17 23:57:06+00:00,2015-01-05 15:00:00+00:00,2015-01-06 15:00:00+00:00,1,1,13600,reserved,...,0,0,0,,0,0,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101023,397,1997854,370872,2019-11-23 20:16:13+00:00,2019-12-22 15:00:00+00:00,2019-12-25 15:00:00+00:00,3,1,36600,reserved,...,0,0,0,0,0,0,0,0,0,0
101024,845,1997960,35905,2019-11-24 13:59:56+00:00,2019-12-20 15:00:00+00:00,2019-12-23 15:00:00+00:00,3,1,32100,reserved,...,0,0,0,0,1,,0,0,,0
101025,3152,1998074,161523,2019-11-25 12:40:21+00:00,2019-12-14 15:00:00+00:00,2019-12-17 15:00:00+00:00,3,1,84000,reserved,...,0,0,0,1,0,,0,0,0,0
101026,3504,1999436,162920,2019-12-10 14:16:07+00:00,2019-12-24 15:00:00+00:00,2019-12-27 15:00:00+00:00,3,1,36600,reserved,...,0,0,0,0,,,0,1,0,0


#### Awesome

In [3]:
%%bigquery
-- Narrow the hotel master down to business hotels (join key only) before
-- joining, then keep the single-guest reservations.
with business_hotel as (
    -- (1) Keep only the hotel_id of business hotels
    select hotel_id
    from example.hotel
    where hotel_type = "ビジネスホテル"
)

select *
from example.reservation
-- (2) Inner-join the business-hotel-only master on hotel_id
inner join business_hotel using (hotel_id)
where
    -- (3) Keep only reservations whose people_num is 1
    people_num = 1

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,hotel_id,reservation_id,customer_id,reserved_at,checkin_date,checkout_date,length_of_stay,people_num,total_price,status,canceled_at
0,521,194,138201,2014-01-14 06:39:34+00:00,2015-01-06 15:00:00+00:00,2015-01-07 15:00:00+00:00,1,1,12100,reserved,NaT
1,1270,508,191782,2014-01-23 11:50:00+00:00,2015-01-17 15:00:00+00:00,2015-01-18 15:00:00+00:00,1,1,13300,reserved,NaT
2,1636,599,282585,2014-01-25 14:16:27+00:00,2015-01-02 15:00:00+00:00,2015-01-03 15:00:00+00:00,1,1,16500,reserved,NaT
3,811,1093,186407,2014-02-03 03:29:08+00:00,2015-01-19 15:00:00+00:00,2015-01-20 15:00:00+00:00,1,1,12100,reserved,NaT
4,2931,1236,431908,2014-02-05 07:09:44+00:00,2015-01-05 15:00:00+00:00,2015-01-06 15:00:00+00:00,1,1,10700,reserved,NaT
...,...,...,...,...,...,...,...,...,...,...,...
101023,910,1987611,183166,2019-09-25 02:55:05+00:00,2019-10-18 15:00:00+00:00,2019-10-19 15:00:00+00:00,1,1,8700,reserved,NaT
101024,606,1992562,468850,2019-10-19 11:10:10+00:00,2019-12-22 15:00:00+00:00,2019-12-23 15:00:00+00:00,1,1,8700,reserved,NaT
101025,646,1994008,126081,2019-10-27 12:28:20+00:00,2019-12-26 15:00:00+00:00,2019-12-27 15:00:00+00:00,1,1,8700,reserved,NaT
101026,3141,1994656,262424,2019-10-31 13:07:37+00:00,2019-11-17 15:00:00+00:00,2019-11-18 15:00:00+00:00,1,1,8700,reserved,NaT


#### Awesome

In [4]:
%%bigquery
-- Same extraction as a join on an explicit ON condition, listing the
-- reservation columns one by one instead of using select *.
with business_hotel as (
    -- (1) Keep only the hotel_id of business hotels
    select hotel_id
    from example.hotel
    where hotel_type = "ビジネスホテル"
)

select
    r.reservation_id,
    r.customer_id,
    r.reserved_at,
    r.checkin_date,
    r.checkout_date,
    r.length_of_stay,
    r.people_num,
    r.total_price,
    r.status,
    r.canceled_at,
    r.hotel_id
from example.reservation as r
-- (2) Inner-join the business-hotel-only master on hotel_id
inner join business_hotel as h
    on r.hotel_id = h.hotel_id
where
    -- (3) Keep only reservations whose people_num is 1
    r.people_num = 1

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,reservation_id,customer_id,reserved_at,checkin_date,checkout_date,length_of_stay,people_num,total_price,status,canceled_at,hotel_id
0,194,138201,2014-01-14 06:39:34+00:00,2015-01-06 15:00:00+00:00,2015-01-07 15:00:00+00:00,1,1,12100,reserved,NaT,521
1,508,191782,2014-01-23 11:50:00+00:00,2015-01-17 15:00:00+00:00,2015-01-18 15:00:00+00:00,1,1,13300,reserved,NaT,1270
2,599,282585,2014-01-25 14:16:27+00:00,2015-01-02 15:00:00+00:00,2015-01-03 15:00:00+00:00,1,1,16500,reserved,NaT,1636
3,1093,186407,2014-02-03 03:29:08+00:00,2015-01-19 15:00:00+00:00,2015-01-20 15:00:00+00:00,1,1,12100,reserved,NaT,811
4,1236,431908,2014-02-05 07:09:44+00:00,2015-01-05 15:00:00+00:00,2015-01-06 15:00:00+00:00,1,1,10700,reserved,NaT,2931
...,...,...,...,...,...,...,...,...,...,...,...
101023,1987611,183166,2019-09-25 02:55:05+00:00,2019-10-18 15:00:00+00:00,2019-10-19 15:00:00+00:00,1,1,8700,reserved,NaT,910
101024,1992562,468850,2019-10-19 11:10:10+00:00,2019-12-22 15:00:00+00:00,2019-12-23 15:00:00+00:00,1,1,8700,reserved,NaT,606
101025,1994008,126081,2019-10-27 12:28:20+00:00,2019-12-26 15:00:00+00:00,2019-12-27 15:00:00+00:00,1,1,8700,reserved,NaT,646
101026,1994656,262424,2019-10-31 13:07:37+00:00,2019-11-17 15:00:00+00:00,2019-11-18 15:00:00+00:00,1,1,8700,reserved,NaT,3141


## 7-2 1対多の関係のテーブルの結合
### Q: ホテルマスタにホテルの売上と予約数を付与
#### Not Awesome

In [5]:
%%bigquery
-- "Not Awesome" example: hotel is joined to the raw reservation rows and the
-- aggregation is done afterwards over the joined result.
-- NOTE(review): the where clause tests reservation columns after the left join.
-- For a hotel with no matching reservation those columns are NULL, the
-- predicate is not TRUE, and the hotel is dropped from the result despite the
-- left join.
select
    hotel_id,
    hotel_name,
    hotel_type,
    address_prefecture,
    address_city,
    address_town,
    address_zipcode,
    unit_price,
    user_rating,
    -- (4) Aggregate the sum of total_price and the row count
    sum(total_price) as sales,
    count(*) as reservation_cnt
from example.hotel
-- (1) Left-outer-join reservation using the hotel_id column as the join key
left join example.reservation using (hotel_id)
-- (2) Keep only rows that are not canceled and whose checkout_date year is 2019
where
    status != "canceled"
    and format_timestamp("%Y", checkout_date, "Asia/Tokyo") = "2019"
-- (3) Group by every non-aggregated column that is kept in the result
group by
    hotel_id,
    hotel_name,
    hotel_type,
    address_prefecture,
    address_city,
    address_town,
    address_zipcode,
    unit_price,
    user_rating

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,hotel_id,hotel_name,hotel_type,address_prefecture,address_city,address_town,address_zipcode,unit_price,user_rating,sales,reservation_cnt
0,2014,東長島ホテル,リゾートホテル,三重県,北牟婁郡紀北町,東長島,519-3204,6000,1.514291,3474000,142
1,1562,芸濃町萩野民宿,民宿,三重県,津市,芸濃町萩野,514-2214,15400,4.917227,2032800,36
2,3189,小下町ペンション,民宿,三重県,亀山市,小下町,519-0113,9800,4.144539,4811800,126
3,3289,大原ホテル,ビジネスホテル,三重県,北牟婁郡紀北町,大原,519-3202,10200,,469200,14
4,3544,美濃田町民宿,民宿,三重県,松阪市,美濃田町,515-2344,15200,2.554096,7858400,143
...,...,...,...,...,...,...,...,...,...,...,...
4522,1053,久富木温泉旅館,旅館,鹿児島県,薩摩郡さつま町,久富木,895-1722,12900,2.133328,580500,11
4523,3058,名瀬有屋旅館,旅館,鹿児島県,奄美市,名瀬有屋,894-0002,4300,,193500,12
4524,2738,山川小川民宿,民宿,鹿児島県,指宿市,山川小川,891-0515,7700,4.986512,1678600,54
4525,2515,郡元温泉ホテル,リゾートホテル,鹿児島県,鹿児島市,郡元,890-8580,10100,,3636000,91


#### Awesome

In [6]:
%%bigquery
-- Aggregate reservation down to one row per hotel first, then attach that
-- summary to the hotel master.
with reservation_summary as (
    -- (1) Over reservations that are not canceled and whose checkout_date
    --     year (Asia/Tokyo) is 2019, aggregate per hotel_id
    select
        hotel_id,
        sum(total_price) as sales,
        count(*) as reservation_cnt
    from example.reservation
    where
        format_timestamp("%Y", checkout_date, "Asia/Tokyo") = "2019"
        and status != "canceled"
    group by hotel_id
)

select
    -- (3)-1 Every column of hotel
    h.*,
    -- (3)-2 Hotels without a summary row get 0 instead of NULL
    coalesce(s.sales, 0) as sales,
    coalesce(s.reservation_cnt, 0) as reservation_cnt
from example.hotel as h
-- (2) Left-outer-join reservation_summary using hotel_id as the join key
left join reservation_summary as s using (hotel_id)

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,hotel_id,hotel_name,hotel_type,address_prefecture,address_city,address_town,address_zipcode,unit_price,user_rating,tag_001,...,tag_023,tag_024,tag_025,tag_026,tag_027,tag_028,tag_029,tag_030,sales,reservation_cnt
0,1675,北郷五条旅館,旅館,北海道,札幌市白石区,北郷五条,003-0835,5600,3.198398,0,...,0,0,0,0,0,0,0,0,733600,33
1,1929,清田六条温泉旅館,旅館,北海道,札幌市清田区,清田六条,004-0846,12700,3.795193,0,...,0,0,0,0,1,0,,0,2603500,42
2,4852,天王旅館,旅館,秋田県,潟上市,天王,010-0101,6100,3.477558,,...,0,0,,0,1,1,0,0,0,0
3,625,川尻上野町温泉旅館,旅館,秋田県,秋田市,川尻上野町,010-0947,10400,2.156002,0,...,0,0,0,1,0,0,0,0,1206400,30
4,1553,沖鶴旅館,旅館,秋田県,湯沢市,沖鶴,012-0037,6400,1.675748,0,...,0,,0,,0,0,0,0,1216000,47
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,1951,八日町温泉ホテル,リゾートホテル,山形県,上山市,八日町,999-3133,14200,3.342355,0,...,0,0,0,0,0,0,0,0,3379600,66
4996,737,楢下ホテル,リゾートホテル,山形県,上山市,楢下,999-3225,12600,4.851125,0,...,0,0,0,,0,0,0,0,2104200,44
4997,140,細谷ホテル,リゾートホテル,山形県,上山市,細谷,999-3236,12600,4.350231,1,...,1,0,0,0,,0,0,0,4019400,91
4998,2429,溝延温泉ホテル,リゾートホテル,山形県,西村山郡河北町,溝延,999-3522,13400,2.744401,0,...,1,0,,0,0,0,0,0,2224400,47


## 7-3 多対多の関係のテーブルの結合
### Q: 顧客マスタに対して、顧客のホテル種別ごとの予約数を付与


#### Awesome

In [7]:
%%bigquery
-- Collapse the reservation x hotel rows to one row per customer before
-- attaching them to the customer master.
with customer_summary as (
    select
        customer_id,
        -- (2) Per customer_id, count the reservations of each hotel type
        countif(hotel_type = "旅館") as ryokan_cnt,
        countif(hotel_type = "リゾートホテル") as resort_hotel_cnt,
        countif(hotel_type = "ビジネスホテル") as business_hotel_cnt,
        countif(hotel_type = "民宿") as minsyuku_cnt
    from example.reservation
    -- (1) Left-outer-join hotel onto reservation
    left join example.hotel using (hotel_id)
    group by customer_id
)

select
    -- (4)-1 Every column of customer
    c.*,
    -- (4)-2 Customers without a summary row get 0 instead of NULL
    coalesce(s.ryokan_cnt, 0) as ryokan_cnt,
    coalesce(s.resort_hotel_cnt, 0) as resort_hotel_cnt,
    coalesce(s.business_hotel_cnt, 0) as business_hotel_cnt,
    coalesce(s.minsyuku_cnt, 0) as minsyuku_cnt
from example.customer as c
-- (3) Left-outer-join customer_summary onto customer
left join customer_summary as s using (customer_id)

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,customer_id,name,age,sex,address_prefecture,address_city,address_town,address_zipcode,ryokan_cnt,resort_hotel_cnt,business_hotel_cnt,minsyuku_cnt
0,144083,渡辺 淳,90,,北海道,札幌市北区,北十条西,001-0010,0,0,1,0
1,405190,鈴木 充,42,,北海道,札幌市北区,北十条西,001-0010,0,0,0,0
2,58445,佐藤 太一,35,,北海道,札幌市北区,北十一条西,001-0011,2,5,4,2
3,176436,佐藤 知実,99,,北海道,札幌市北区,北十一条西,001-0011,0,1,0,1
4,464358,佐藤 太郎,58,,北海道,札幌市北区,北十二条西,001-0012,0,0,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...
499995,372772,森 七夏,67,,山形県,飽海郡遊佐町,増穂,999-8434,2,1,1,1
499996,19747,近藤 桃子,26,,山形県,飽海郡遊佐町,岩川,999-8436,5,1,0,1
499997,238905,田中 陽子,36,,山形県,飽海郡遊佐町,吹浦,999-8521,2,1,3,3
499998,241391,後藤 治,100,,山形県,飽海郡遊佐町,直世,999-8525,1,0,2,2


## 7-4 すべての結合の組み合わせの生成
### Q: 顧客ごとの月別の売上を計算（売上のない月も出力）
#### Awesome

In [8]:
%%bigquery
-- Build a customer x month frame with a cross join, then left-join the
-- monthly sales onto it so that months without sales are output as 0.
with
month_list as (
    -- (2) Convert each date to a "YYYY-MM" string
    select format_date("%Y-%m", month_start) as month
    -- (1) Generate an array of dates at one-month intervals and expand it into rows with unnest
    from unnest(
        generate_date_array("2019-01-01", "2019-12-01", interval 1 month)
    ) as month_start
),

frame as (
    -- (3) Cross-join the month list onto customer to produce a row for every month per customer_id
    select
        customer_id,
        month
    from example.customer
    cross join month_list
),

active_reservation as (
    -- (4) Keep the reservations that are not canceled and convert checkout_date to a year-month string
    select
        customer_id,
        format_timestamp("%Y-%m", checkout_date, "Asia/Tokyo") as month,
        total_price
    from example.reservation
    where status != "canceled"
),

monthly_customer_summary as (
    -- (6) Sum total_price per customer_id and month
    select
        customer_id,
        month,
        sum(total_price) as sales
    from active_reservation
    -- (5) Keep only the rows in the target period
    where month between "2019-01" and "2019-12"
    group by
        customer_id,
        month
)

select
    customer_id,
    month,
    -- (8) Frame rows without a summary row get 0 instead of NULL
    coalesce(sales, 0) as sales
from frame
-- (7) Left-outer-join monthly_customer_summary onto frame
left join monthly_customer_summary using (customer_id, month)

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,customer_id,month,sales
0,327208,2019-10,0
1,327208,2019-03,0
2,327208,2019-02,0
3,327208,2019-01,0
4,327208,2019-11,0
...,...,...,...
5999995,238905,2019-04,0
5999996,238905,2019-08,47400
5999997,238905,2019-12,0
5999998,238905,2019-09,0


## 7-5 不等式条件での結合
### Q: 予約履歴データにキャンペーン情報を付与
#### Awesome


In [9]:
%%bigquery
select
    r.*,
    c.campaign_name
from example.reservation as r
-- Range join: pair each reservation with the campaign rows whose
-- starts_at..ends_at range contains the reservation's reserved_at.
-- Because this is a left join, reservations that match no campaign are kept.
left join example.campaign as c
    on r.reserved_at between c.starts_at and c.ends_at

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,reservation_id,hotel_id,customer_id,reserved_at,checkin_date,checkout_date,length_of_stay,people_num,total_price,status,canceled_at,campaign_name
0,697832,4395,17929,2016-04-03 13:46:02+00:00,2016-11-12 15:00:00+00:00,2016-11-13 15:00:00+00:00,1,1,8100,reserved,NaT,GWキャンペーン
1,718647,973,412587,2016-04-22 16:38:12+00:00,2016-11-19 15:00:00+00:00,2016-11-20 15:00:00+00:00,1,1,18200,reserved,NaT,GWキャンペーン
2,1681531,2531,109999,2018-09-18 08:17:29+00:00,2019-08-04 15:00:00+00:00,2019-08-06 15:00:00+00:00,2,1,17200,canceled,2019-01-13 13:24:39+00:00,オータムキャンペーン
3,1696491,3871,59412,2018-10-01 20:53:38+00:00,2019-05-25 15:00:00+00:00,2019-05-28 15:00:00+00:00,3,1,26400,reserved,NaT,オータムキャンペーン
4,971464,110,388844,2016-12-09 08:45:18+00:00,2017-08-21 15:00:00+00:00,2017-08-22 15:00:00+00:00,1,1,8000,reserved,NaT,冬休みキャンペーン
...,...,...,...,...,...,...,...,...,...,...,...,...
1999995,1247358,775,18355,2017-08-18 03:19:46+00:00,2018-07-11 15:00:00+00:00,2018-07-12 15:00:00+00:00,1,6,46800,reserved,NaT,夏休みキャンペーン
1999996,1249900,4685,268465,2017-08-20 09:30:43+00:00,2017-08-21 15:00:00+00:00,2017-08-22 15:00:00+00:00,1,6,31200,reserved,NaT,夏休みキャンペーン
1999997,1232156,1069,241872,2017-08-04 09:28:48+00:00,2017-10-21 15:00:00+00:00,2017-10-23 15:00:00+00:00,2,6,264000,reserved,NaT,夏休みキャンペーン
1999998,1713321,862,410722,2018-10-17 06:21:33+00:00,2018-11-17 15:00:00+00:00,2018-11-19 15:00:00+00:00,2,6,116400,reserved,NaT,オータムキャンペーン
