In [None]:
import pathlib
import pandas as pd

# The sample data is read from a "data" directory that is a sibling of the
# current working directory.
datadir = pathlib.Path.cwd().parent.joinpath("data")

# Load each sample table from its parquet file.
pd_production = pd.read_parquet(datadir.joinpath("production.parquet"))
pd_production_missing_num = pd.read_parquet(
    datadir.joinpath("production_missing_num.parquet")
)
pd_reservation = pd.read_parquet(datadir.joinpath("reservation.parquet"))
pd_customer = pd.read_parquet(datadir.joinpath("customer.parquet"))

# 10章 数値
## 10-1 数値型への変換
### Q: さまざまな数値型への変換


#### Awesome

In [None]:
# Build a one-row frame, cast v1 and v2 to several numeric dtypes, then divide
# v1 by v2 within each dtype so the resulting dtypes can be compared.
pd_numeric_demo = (
    pd.DataFrame({"v1": [40000], "v2": [3]})
    .assign(
        # (1)-1 Convert to int64. The width is spelled out because "int" maps
        # to NumPy's platform default integer (32-bit on Windows with NumPy 1.x).
        v1_int64=lambda df: df.v1.astype("int64"),
        v2_int64=lambda df: df.v2.astype("int64"),
        # (1)-2 Convert to uint64 (explicit width, same reason as above)
        v1_uint64=lambda df: df.v1.astype("uint64"),
        v2_uint64=lambda df: df.v2.astype("uint64"),
        # (1)-3 Convert to float64
        v1_float64=lambda df: df.v1.astype("float64"),
        v2_float64=lambda df: df.v2.astype("float64"),
        # (1)-4 Convert to Int64 (pandas nullable integer)
        v1_Int64=lambda df: df.v1.astype("Int64"),
        v2_Int64=lambda df: df.v2.astype("Int64"),
        # (2)-1 Division between int64 columns
        res_int64=lambda df: df.v1_int64 / df.v2_int64,
        # (2)-2 Division between uint64 columns
        res_uint64=lambda df: df.v1_uint64 / df.v2_uint64,
        # (2)-3 Division between float64 columns
        res_float64=lambda df: df.v1_float64 / df.v2_float64,
        # (2)-4 Division between Int64Dtype columns
        res_int64dtype=lambda df: df.v1_Int64 / df.v2_Int64,
    )
)
# Display the result as the cell output.
pd_numeric_demo

## 10-2 数値の欠損処理
### Q: `thickness`が欠損しているレコードの削除


#### Awesome 1

In [None]:
# Drop every row whose thickness is missing.
pd_production_missing_num.dropna(subset=["thickness"])

#### Awesome 2

In [None]:
# Keep only the rows whose thickness is present (boolean-mask selection).
pd_production_missing_num[pd_production_missing_num["thickness"].notna()]

### Q: 欠損している`thickness`を定数で補完


#### Awesome

In [None]:
# Replace missing thickness values with the constant 1; other columns are untouched.
pd_production_missing_num.fillna(value={"thickness": 1})

### Q: 欠損している`thickness`を平均値で補完


#### Awesome

In [None]:
# Replace missing thickness values with the mean of the thickness column.
pd_production_missing_num.pipe(
    lambda df: df.fillna(value={"thickness": df["thickness"].mean()})
)

## 10-3 数値の外れ値除去
### Q: `thickness`の外れ値を四分位数ベースの外れ値検出で除去


#### Awesome

In [None]:
# (1) Compute Q1, Q3 and the interquartile range of thickness
q1 = pd_production["thickness"].quantile(q=0.25)
q3 = pd_production["thickness"].quantile(q=0.75)
iqr = q3 - q1

# (2) Keep only rows with thickness inside [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]
pd_production[
    pd_production["thickness"].between(left=q1 - 1.5 * iqr, right=q3 + 1.5 * iqr)
]

## 10-4 数値変換
### Q: 予約の合計金額の標準化


#### Awesome 1

In [None]:
# Standardize total_price in place of the original column:
# subtract the column mean, then divide by the column standard deviation.
pd_reservation.assign(
    total_price=lambda df: df["total_price"]
    .sub(df["total_price"].mean())
    .div(df["total_price"].std())
)

#### Awesome 2

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer

# (1) Build the transformer: StandardScaler on total_price, all other columns
#     passed through, feature names left unprefixed, output as pandas
transformer = make_column_transformer(
    (StandardScaler(), ["total_price"]),
    remainder="passthrough",
    verbose_feature_names_out=False,
).set_output(transform="pandas")

# (2) Fit on the reservation data and apply the standardization
transformer.fit_transform(pd_reservation)

### Tips: scikit-learnの機械学習モデルや`Transformer`をデータに`fit`した後のオブジェクトの保存と再利用


In [None]:
import pickle

# Serialize the transformer and write the bytes to transformer.pkl
with open("transformer.pkl", mode="wb") as pkl_file:
    pkl_file.write(pickle.dumps(transformer))

In [None]:
# Read transformer.pkl back and deserialize its contents.
# NOTE: unpickling can execute arbitrary code; only load pickle files you trust.
with open("transformer.pkl", mode="rb") as pkl_file:
    transformer_loaded = pickle.loads(pkl_file.read())

# Transform the data with the restored object
transformer_loaded.transform(pd_reservation)

### Q: 予約の合計金額の対数変換


#### Awesome 1

In [None]:
import numpy as np

# Replace total_price with its natural logarithm.
pd_reservation.assign(total_price=lambda df: df["total_price"].pipe(np.log))

#### Awesome 2

In [None]:
import numpy as np
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import make_column_transformer

# (1) Build the transformer: np.log applied to total_price, all other columns
#     passed through, feature names left unprefixed, output as pandas
transformer = make_column_transformer(
    (FunctionTransformer(func=np.log), ["total_price"]),
    remainder="passthrough",
    verbose_feature_names_out=False,
).set_output(transform="pandas")

# (2) Fit on the reservation data and apply the log transform
transformer.fit_transform(pd_reservation)

## 10-5 数値のカテゴリ化
### Q: 顧客の年齢のカテゴリ化


#### Awesome

In [None]:
# Add age_cat: age divided by 10, cast to an integer, stored as a category.
pd_customer.assign(
    age_cat=lambda df: df["age"].div(10).astype(int).astype("category")
)