In [None]:
# GCP authentication is required beforehand; how to authenticate depends on the environment.
# On Colab:
#   Run the following in a cell
#     from google.colab import auth
#     auth.authenticate_user()
#     %env GCLOUD_PROJECT=<your GCP project ID>
# On a local environment such as a PC:
#   First time only: install the gcloud CLI from https://cloud.google.com/sdk/docs/install-sdk and run gcloud init
# Load the IPython extension so that the %%bigquery cell magic used by the cells below is available.
%load_ext google.cloud.bigquery

# 14章 ウィンドウ関数
## 14-1 順序のないウィンドウ関数
### Q: ホテルマスタに対してホテルのある市区町村の平均価格の付与
#### Not Awesome

In [None]:
%%bigquery
with
-- (1) Average unit_price for each (address_prefecture, address_city) pair
city_avg as (
    select
        address_prefecture,
        address_city,
        avg(unit_price) as avg_price_within_city
    from example.hotel
    group by address_prefecture, address_city
)

-- (2) Attach the per-city average back onto every hotel row with a left join.
-- `using` keeps a single copy of the join keys in the `select *` output.
select *
from example.hotel
left join city_avg
    using (address_prefecture, address_city)

#### Awesome

In [None]:
%%bigquery
select
    *,
    -- Average unit_price over all hotels that share this row's prefecture and city
    avg(unit_price) over same_city as avg_price_within_city
from example.hotel
-- Named window: one partition per (address_prefecture, address_city) pair
window same_city as (partition by address_prefecture, address_city)

### Q: 都道府県ごとのホテル数の構成比の算出
#### Not Awesome

In [None]:
%%bigquery
with
-- (1) Count every hotel in the table (always a single row)
overall as (
    select count(*) as total_cnt
    from example.hotel
),

-- (2) Count the hotels within each address_prefecture
per_prefecture as (
    select
        address_prefecture,
        count(*) as hotel_cnt
    from example.hotel
    group by address_prefecture
)

select
    per_prefecture.address_prefecture,
    per_prefecture.hotel_cnt,
    -- (4) Share of each prefecture = its hotel count divided by the overall hotel count
    cast(per_prefecture.hotel_cnt as float64) / overall.total_cnt as ratio
from per_prefecture
-- (3) `overall` has exactly one row, so the cross join appends total_cnt to every prefecture row
cross join overall

#### Awesome

In [None]:
%%bigquery
select
    address_prefecture,
    count(*) as hotel_cnt,
    -- count(*) is the per-prefecture hotel count produced by the group by.
    -- sum(count(*)) over () adds those counts across all result rows (= total hotels),
    -- so the quotient is the share of each prefecture.
    cast(count(*) as float64)
        / sum(count(*)) over ()
        as ratio
from example.hotel
group by address_prefecture

## 14-2 順序のあるウィンドウ関数
### Q: 各顧客の一つ前と二つ前の予約の予約金額の取得
#### Awesome

In [None]:
%%bigquery
select
    *,
    -- total_price of the customer's immediately preceding reservation (default offset of 1)
    lag(total_price) over customer_history as total_price_at_prev,
    -- total_price of the customer's reservation two rows back
    lag(total_price, 2) over customer_history as total_price_at_2prev
from example.reservation
-- Shared window: each customer's reservations in ascending reserved_at order.
-- lag yields NULL when the customer has no row that far back.
window customer_history as (partition by customer_id order by reserved_at)

### Q: 顧客ごとに予約順を付与
#### Awesome

In [None]:
%%bigquery
select
    *,
    -- 1-based sequence number of the reservation within its customer, oldest first
    row_number() over customer_history as reservation_no
from example.reservation
-- partition by customer_id: numbering restarts for every customer
-- order by reserved_at: numbers follow the reservation timestamp, ascending
window customer_history as (partition by customer_id order by reserved_at)

## 14-3 範囲のあるウィンドウ関数
### Q: 売上の移動平均の算出
#### Awesome

In [None]:
%%bigquery
with
-- (1)-1 Drop canceled reservations and derive a "YYYY-MM" label from checkout_date
active_reservation as (
    select
        total_price,
        format_timestamp("%Y-%m", checkout_date, "Asia/Tokyo") as month
    from example.reservation
    where status != "canceled"
),

-- (1)-2 Total sales (sum of total_price) per month label
monthly_summary as (
    select
        month,
        sum(total_price) as sales
    from active_reservation
    group by month
)

-- (2) Average sales over the current row and the two rows before it in month order.
-- The frame counts rows, not calendar months: a month without any reservation has
-- no row, and the first two rows are averaged over fewer than three rows.
select
    month,
    sales,
    avg(sales) over trailing_rows as moving_average_sales
from monthly_summary
window trailing_rows as (
    order by month
    rows between 2 preceding and current row
)

### Q: 顧客の過去の総予約金額を付与
#### Awesome

In [None]:
%%bigquery
select
    *,
    -- Sum of total_price over the customer's earlier (non-canceled) reservations.
    -- The frame is empty for a customer's first row, so the result is NULL there.
    sum(total_price) over prior_reservations as past_total_spent
from example.reservation
where status != "canceled"
-- Per customer, in reserved_at order: every row before the current one, excluding the current row
window prior_reservations as (
    partition by customer_id
    order by reserved_at
    rows between unbounded preceding and 1 preceding
)

### Q: 顧客の過去90日間の総予約金額を付与
#### Awesome

In [None]:
%%bigquery
select
    *,
    -- Sum of total_price over the customer's (non-canceled) reservations made
    -- between 90 days and 1 day before this reservation's date; NULL when there are none.
    sum(total_price) over previous_90_days as total_spent_last_90days
from example.reservation
where status != "canceled"
-- The order key is the reservation date (Asia/Tokyo) expressed as a day count,
-- so the range bounds 90 / 1 are measured in days; same-day reservations are excluded.
window previous_90_days as (
    partition by customer_id
    order by unix_date(date(reserved_at, "Asia/Tokyo"))
    range between 90 preceding and 1 preceding
)

## 14-4 指定列が最小／最大となる行の取得
### Q: 顧客の初回予約時の情報の取得


In [None]:
%%bigquery
select
    outer_r.customer_id,
    outer_r.reserved_at,
    outer_r.hotel_id
from example.reservation as outer_r
where
    -- Keep only the rows whose reserved_at equals the earliest reserved_at
    -- among all reservations of the same customer (correlated subquery).
    outer_r.reserved_at = (
        select min(inner_r.reserved_at)
        from example.reservation as inner_r
        where inner_r.customer_id = outer_r.customer_id
    )

In [None]:
%%bigquery
with
-- (1) Number each customer's reservations in ascending reserved_at order
numbered_reservation as (
    select
        customer_id,
        reserved_at,
        hotel_id,
        row_number() over (partition by customer_id order by reserved_at) as rn
    from example.reservation
)

-- (2) Keep only the first reservation of every customer
select
    customer_id,
    reserved_at,
    hotel_id
from numbered_reservation
where rn = 1

## 14-5 層別サンプリング
### Q: 都道府県別に層別サンプリング
#### Not Awesome

In [None]:
%%bigquery
-- Simple random sampling: a row is kept only when its rand() draw (0 <= value < 1) is below 0.1
select h.*
from example.hotel as h
where rand() < 0.1

#### Awesome

In [None]:
%%bigquery
-- (3) Drop the helper columns rand_order and cnt from the output
select * except (rand_order, cnt)
from (
    select
        *,
        -- (1)-1 Number of hotels in this row's address_prefecture
        count(*) over (partition by address_prefecture) as cnt,
        -- (1)-2 Random position of the row within its address_prefecture.
        -- row_number() (rather than rank()) uses each position 1..cnt exactly once
        -- even if rand() returns equal values, so the number of sampled rows per
        -- prefecture is always exactly floor(cnt * 0.1).
        row_number() over (partition by address_prefecture order by rand())
            as rand_order
    from example.hotel
)
-- (2) Keep the rows whose random position falls within the first 10% of the prefecture.
-- A prefecture with fewer than 10 hotels contributes no rows, because rand_order >= 1 > cnt * 0.1.
where rand_order <= cnt * 0.1

## 14-6 グループ単位のランダムサンプリング
### Q: 顧客単位で予約履歴をランダムサンプリング
#### Not Awesome

In [None]:
%%bigquery
-- Row-level random sampling: a row is kept only when its rand() draw (0 <= value < 1) is below 0.01
select r.*
from example.reservation as r
where rand() < 0.01

#### Awesome

In [None]:
%%bigquery
with
-- (1) One random value per customer_id: rand() is evaluated for the rows and
-- any_value over the customer's partition picks one of those values.
-- NOTE(review): relies on every row of a partition receiving the same picked value — confirm.
reservation_with_rand as (
    select
        *,
        any_value(rand()) over (partition by customer_id) as rand_value
    from example.reservation
)

-- (3) Hide the helper column rand_value from the output
select * except (rand_value)
from reservation_with_rand
-- (2) Keep only the rows of customers whose random value is below 0.01
where rand_value < 0.01