In [1]:
import pandas as pd
import numpy as np
import os

# 自作パッケージのインポート
import sys
from pathlib import Path

sys.path.append("../")
from mypoc import *

PATH_DATA: str = "../input"

### データのロード

In [2]:
path = Path("../input") / "data.pkl"

Splitter.split_data(path, "trial", "regression", target="visitors")

In [3]:
import lightgbm as lgb
from typing import Tuple


def my_make_loss(df: pd.DataFrame):
    latest_date = df["visit_date"].max()
    days_diff = (latest_date - df["visit_date"]).dt.total_seconds() / (24 * 60 * 60)
    weight = 0.99**days_diff

    def loss_func(
        preds: np.ndarray,
        train_data: lgb.Dataset,
    ) -> Tuple[np.ndarray, np.ndarray]:
        y = train_data.get_label()
        grad = weight * (preds - y)
        hess = weight
        return grad, hess

    return loss_func


run_name = "trial"
task = "regression"
target = "visitors"
features = [
    "is_Monday",
    "is_Tuesday",
    "is_Wednesday",
    "is_Thursday",
    "is_Friday",
    "is_Saturday",
    "is_Sunday",
    "is_holiday",
    "visitors_last_week",
    "is_holiday_last_week",
    "visitors_2_week_ago",
    "is_holiday_2_week_ago",
    "visitors_3_week_ago",
    "is_holiday_3_week_ago",
    "is_holiday_2_week_ago",
    "visitors_4_week_ago",
    "is_holiday_4_week_ago",
]
recipes = [
    {"model_class": ModelLGB, "model_name": "lgb", "make_loss_func": my_make_loss}
]
additions = ["visit_date"]

var = Validator(run_name, task, target, features, recipes, additions)
var.validate_attributes()

Parameter: preds, Expected type: <class 'numpy.ndarray'>
Expected return type: typing.Tuple[numpy.ndarray, numpy.ndarray]
Parameter: train_data, Expected type: <class 'lightgbm.basic.Dataset'>
Expected return type: typing.Tuple[numpy.ndarray, numpy.ndarray]




In [4]:
var

<mypoc.validator.Validator object at 0x0000022A602F9720>
run_name: trial
task: regression
target: visitors
features: ['is_Monday', 'is_Tuesday', 'is_Wednesday', 'is_Thursday', 'is_Friday', 'is_Saturday', 'is_Sunday', 'is_holiday', 'visitors_last_week', 'is_holiday_last_week', 'visitors_2_week_ago', 'is_holiday_2_week_ago', 'visitors_3_week_ago', 'is_holiday_3_week_ago', 'is_holiday_2_week_ago', 'visitors_4_week_ago', 'is_holiday_4_week_ago']
recipes: [{'model_class': <class 'mypoc.model_lgb.ModelLGB'>, 'model_name': 'lgb', 'fixed_params': None, 'make_loss_func': <function my_make_loss at 0x0000022A602F5BD0>, 'make_eval_func': None}]
additions: ['visit_date']
n_fold: 5
description: 
_is_available: True
models: [<mypoc.model_lgb.ModelLGB object at 0x0000022A2EB23BE0>]

In [5]:
rn = Runner(var)
rn.run_train_cv()

2023-06-07 23:26:41,257 - general - INFO - [2023-06-07 23:26:41] - trial - start training cv
Processing folds:   0%|          | 0/5 [00:00<?, ?it/s]2023-06-07 23:26:41,263 - general - INFO - [2023-06-07 23:26:41] - trial fold 0 - start training
[32m[I 2023-06-07 23:26:41,275][0m A new study created in memory with name: no-name-7cbdb74a-bc3d-4aeb-94a3-fc71cfd7422e[0m
[33m[W 2023-06-07 23:26:41,277][0m Trial 0 failed with parameters: {} because of the following error: TypeError("argument of type 'NoneType' is not iterable").[0m
Traceback (most recent call last):
  File "c:\Users\sumitaka.fujita\.pyenv\pyenv-win\versions\3.10.5\lib\site-packages\optuna\study\_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "c:\Users\sumitaka.fujita\Documents\GitHub\regression\notebook\..\mypoc\runner.py", line 192, in <lambda>
    lambda trial: self._objective(
  File "c:\Users\sumitaka.fujita\Documents\GitHub\regression\notebook\..\mypoc\runner.py", line 268, in _obje

Optimization for model 0 failed with exception:
argument of type 'NoneType' is not iterable
Optimization for model 0 failed with exception:
argument of type 'NoneType' is not iterable
Optimization for model 0 failed with exception:
argument of type 'NoneType' is not iterable
Optimization for model 0 failed with exception:
argument of type 'NoneType' is not iterable
Optimization for model 0 failed with exception:
argument of type 'NoneType' is not iterable


In [10]:
# air_visit_data.csv を pandas の DataFrame の形式で読み込む
air_visit = pd.read_csv(os.path.join(PATH_DATA, "air_visit_data.csv"))

# 店舗ごとに総来店客数を算出
visitor_counts_by_store = (
    air_visit.groupby("air_store_id")["visitors"].sum().reset_index()
)

# 総来店客数が多い上位５店舗だけ表示
visitor_counts_by_store.sort_values("visitors", ascending=False).reset_index(
    drop=True
).head(5)

Unnamed: 0,air_store_id,visitors
0,air_399904bdb7685ca0,18717
1,air_f26f36ec4dc5adb0,18577
2,air_e55abd740f93ecc4,18101
3,air_99157b6163835eec,18097
4,air_5c817ef28f236bdf,18009


In [44]:
from workalendar.asia import Japan

# 　店舗 air_399904bdb7685ca0 の日ごとの来店客数を全量格納
air_visit_max = air_visit.query('air_store_id == "air_399904bdb7685ca0"').reset_index(
    drop=True
)

air_visit_max["visit_date"] = pd.to_datetime(air_visit_max["visit_date"])

air_visit_max["is"] = air_visit_max["visit_date"].dt.dayofweek

# 必要に応じて0-6の整数を文字列の曜日に変換
days = {
    0: "Monday",
    1: "Tuesday",
    2: "Wednesday",
    3: "Thursday",
    4: "Friday",
    5: "Saturday",
    6: "Sunday",
}
air_visit_max["is"] = air_visit_max["is"].apply(lambda x: days[x])

cal = Japan()

# 'day_of_week'カラムが休日であるかどうかを示す新しいカラムを作成
air_visit_max["is_holiday"] = air_visit_max["visit_date"].apply(
    lambda x: int(cal.is_holiday(x))
)

air_visit_max = pd.get_dummies(air_visit_max, columns=["is"])

air_visit_max["visitors_last_week"] = air_visit_max["visitors"].shift(7)
air_visit_max["is_holiday_last_week"] = air_visit_max["is_holiday"].shift(7)
air_visit_max["visitors_2_week_ago"] = air_visit_max["visitors"].shift(14)
air_visit_max["is_holiday_2_week_ago"] = air_visit_max["is_holiday"].shift(14)
air_visit_max["visitors_3_week_ago"] = air_visit_max["visitors"].shift(21)
air_visit_max["is_holiday_3_week_ago"] = air_visit_max["is_holiday"].shift(21)
air_visit_max["visitors_4_week_ago"] = air_visit_max["visitors"].shift(28)
air_visit_max["is_holiday_4_week_ago"] = air_visit_max["is_holiday"].shift(28)
air_visit_max.head(50)

column_order = [
    "visit_date",
    "visitors",
    "is_Monday",
    "is_Tuesday",
    "is_Wednesday",
    "is_Thursday",
    "is_Friday",
    "is_Saturday",
    "is_Sunday",
    "is_holiday",
    "visitors_last_week",
    "is_holiday_last_week",
    "visitors_2_week_ago",
    "is_holiday_2_week_ago",
    "visitors_3_week_ago",
    "is_holiday_3_week_ago",
    "is_holiday_2_week_ago",
    "visitors_4_week_ago",
    "is_holiday_4_week_ago",
]
df = air_visit_max[column_order]

# df.head(30)
df = df[df["visit_date"] > "2016-02-01"]

df.to_pickle(os.path.join(PATH_DATA, "data.pkl"))

In [46]:
dff = pd.read_pickle(os.path.join(PATH_DATA, "data.pkl"))
dff

Unnamed: 0,visit_date,visitors,is_Monday,is_Tuesday,is_Wednesday,is_Thursday,is_Friday,is_Saturday,is_Sunday,is_holiday,visitors_last_week,is_holiday_last_week,visitors_2_week_ago,is_holiday_2_week_ago,visitors_3_week_ago,is_holiday_3_week_ago,is_holiday_2_week_ago.1,visitors_4_week_ago,is_holiday_4_week_ago
28,2016-02-02,21,0,1,0,0,0,0,0,0,52.0,0.0,23.0,0.0,43.0,0.0,0.0,4.0,0.0
29,2016-02-03,26,0,0,1,0,0,0,0,0,48.0,0.0,34.0,0.0,20.0,0.0,0.0,15.0,0.0
30,2016-02-04,53,0,0,0,1,0,0,0,0,49.0,0.0,43.0,0.0,4.0,0.0,0.0,26.0,0.0
31,2016-02-05,49,0,0,0,0,1,0,0,0,59.0,0.0,93.0,0.0,116.0,0.0,0.0,75.0,0.0
32,2016-02-06,63,0,0,0,0,0,1,0,0,18.0,0.0,123.0,0.0,51.0,0.0,0.0,91.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
452,2017-04-18,31,0,1,0,0,0,0,0,0,4.0,0.0,17.0,0.0,16.0,0.0,0.0,51.0,0.0
453,2017-04-19,17,0,0,1,0,0,0,0,0,23.0,0.0,25.0,0.0,34.0,0.0,0.0,27.0,1.0
454,2017-04-20,24,0,0,0,1,0,0,0,0,16.0,0.0,15.0,0.0,61.0,0.0,0.0,16.0,0.0
455,2017-04-21,20,0,0,0,0,1,0,0,0,44.0,0.0,7.0,0.0,22.0,0.0,0.0,47.0,0.0


In [50]:
from typing import Any, Callable, Union

def check_param_type(param, name, expected_type):
    if not isinstance(param, expected_type):
        raise TypeError(f"Parameter '{name}' must be of type {expected_type}.")

def enclosure(df: pd.DataFrame):
    def closure(val: float):
        pass
    
    return closure``

        
type = Callable[[pd.DataFrame], Callable]
b
check_param_type(enclosure, "alpha", type)

TypeError: Subscripted generics cannot be used with class and instance checks

## 損失関数の設計

In [55]:
from inspect import signature, Parameter
import lightgbm as lgb
from typing import Tuple


def enclosure(
    df: pd.DataFrame,
) -> Callable[[np.ndarray, lgb.Dataset], Tuple[np.ndarray, np.ndarray]]:
    def closure(
        pred: np.ndarray, train_data: lgb.Dataset
    ) -> Tuple[np.ndarray, np.ndarray]:
        return a, b

    return closure


sig = signature(enclosure)
params = sig.parameters


for param in sig.parameters.values():
    print(f"Parameter: {param.name}, Expected type: {param.annotation}")
    print(f"Expected return type: {sig.return_annotation}")
# if len(params) != 2:
#     raise ValueError(
#         "The custom loss function must have exactly two arguments"
#     )

Parameter: df, Expected type: <class 'pandas.core.frame.DataFrame'>
Expected return type: typing.Callable[[numpy.ndarray, lightgbm.basic.Dataset], typing.Tuple[numpy.ndarray, numpy.ndarray]]


In [None]:
# RunnerParams
from typing import Callable
import lightgbm as lgb


def loss_for_LGB_example(
    X: pd.DataFrame,
    time_col: str,
    r: float,
) -> Callable:
    """
    LightGBM の損失関数のエンクロージャ。

    最新日時との差分から、重みを計算する。

    Parameters
    ----------
    X : pd.DataFrame
        データフレーム

    time_col : str
        タイムスタンプのカラム名

    r : float
        忘却率

    Returns
    -------
    Callable
    """
    # 忘却率に基づいた重みを設定する
    df[time_col] = pd.to_datetime(df[time_col])
    latest_time = df[time_col].max()
    df["time_difference"] = latest_time - df[time_col]
    weight = r ** df["time_difference"]

    def loss_func(preds: np.ndarray, train_data: lgb.Dataset):
        """
        LightGBM の損失関数

        Parameters
        ----------
        preds : np.ndarray
            予測値

        train_data : lgb.Dataset
            実測値

        Returns
        -------
        Tuple(np.ndarray, np.ndarray)
        """
        y = train_data.get_label()
        grad = weight * (preds - y)
        hess = weight * np.ones(len(y))

        return grad, hess

    return loss_func

In [3]:
params_dict = {
    "run_name": "first",
    "task": "regression",
    "target": "target",
    "features": list(X.columns),
    "model_recipes": model_recipes,
    "model_classes": [ModelLGB],
    "fixed_params": [{}],
    "make_loss_funcs": [loss_for_LGB],
    "make_eval_funcs": [loss_for_LGB],
    "description": "trial",
}
rp = RunnerParams(**params_dict)

# len([loss_for_LGB])

In [4]:
DataSplitter.split_data("data.pkl", "first", "regression")

In [5]:
# xgboostによる学習・予測
rr = Runner(rp)
rr.run_train_cv()
# runner = Runner('xgb1', ModelLGB, features, params_xgb)
# runner.run_train_cv()
# runner.run_predict_cv()
# Submission.create_submission('xgb1')

[2023-05-30 17:26:54] - first - start training cv
Processing folds:   0%|          | 0/5 [00:00<?, ?it/s][2023-05-30 17:26:54] - first fold 0 - start training
[32m[I 2023-05-30 17:26:54,758][0m A new study created in memory with name: no-name-1e47e93d-9d71-4204-86bf-0b5f615c2bf4[0m
[33m[W 2023-05-30 17:26:54,767][0m Trial 0 failed with parameters: {'reg_alpha': 0.01008701813668789, 'reg_lambda': 0.0005735101632254607, 'num_leaves': 5, 'colsample_bytree': 0.8070734651702838, 'subsample': 0.627175178502248, 'subsample_freq': 0, 'min_child_samples': 5} because of the following error: ValueError('not enough values to unpack (expected 3, got 2)').[0m
Traceback (most recent call last):
  File "c:\Users\sumitaka.fujita\.pyenv\pyenv-win\versions\3.10.5\lib\site-packages\optuna\study\_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "c:\Users\sumitaka.fujita\Documents\GitHub\regression\notebook\..\mypoc\runner.py", line 230, in <lambda>
    lambda trial: self

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 514
[LightGBM] [Info] Number of data points in the train set: 282, number of used features: 10


ValueError: not enough values to unpack (expected 3, got 2)

In [7]:
params = {
            "n_splits": 5,
            "shuffle": True,
            "random_state": 10,
        }
**params

SyntaxError: invalid syntax (461170021.py, line 6)