# 提出までの流れ

1. 環境の導入
   - lifelinesライブラリのインストール
   - score関数の定義
2. データの整形
3. 学習
4. データの提出

---

## 1. 環境の導入

### lifelinesライブラリのインストール

In [1]:
# To install lifelines in an offline (no-internet) notebook, attach the output of the
# "pip install lifelines" notebook as an Input; the packages are then installed from local files.
# https://www.kaggle.com/code/cdeotte/pip-install-lifelines/notebook
# NOTE(review): the order presumably installs lifelines' dependencies first — confirm before reordering.
!pip install /kaggle/input/pip-install-lifelines/autograd-1.7.0-py3-none-any.whl
!pip install /kaggle/input/pip-install-lifelines/autograd-gamma-0.5.0.tar.gz
!pip install /kaggle/input/pip-install-lifelines/interface_meta-1.3.0-py3-none-any.whl
!pip install /kaggle/input/pip-install-lifelines/formulaic-1.0.2-py3-none-any.whl
!pip install /kaggle/input/pip-install-lifelines/lifelines-0.30.0-py3-none-any.whl

Processing /kaggle/input/pip-install-lifelines/autograd-1.7.0-py3-none-any.whl
Installing collected packages: autograd
Successfully installed autograd-1.7.0
Processing /kaggle/input/pip-install-lifelines/autograd-gamma-0.5.0.tar.gz
  Preparing metadata (setup.py) ... [?25l- done
Building wheels for collected packages: autograd-gamma
  Building wheel for autograd-gamma (setup.py) ... [?25l- \ done
[?25h  Created wheel for autograd-gamma: filename=autograd_gamma-0.5.0-py3-none-any.whl size=4030 sha256=a798a3bb9250e4c2f9e7db57000ae90f3a455083d3b08d1014ca55f1469885bb
  Stored in directory: /root/.cache/pip/wheels/6b/b5/e0/4c79e15c0b5f2c15ecf613c720bb20daab20a666eb67135155
Successfully built autograd-gamma
Installing collected packages: autograd-gamma
Successfully installed autograd-gamma-0.5.0
Processing /kaggle/input/pip-install-lifelines/interface_meta-1.3.0-py3-none-any.whl
Installing collected packages: interface-meta
Successfully installed interface-meta-1.3.0


### score関数の定義

この関数も配布物を使用します

[Discussion: How To Get Started - Understanding the Metric](https://www.kaggle.com/competitions/equity-post-HCT-survival-predictions/discussion/550003)

[Notebook: eefs_concordance_index](https://www.kaggle.com/code/metric/eefs-concordance-index)

In [2]:
import pandas as pd
import pandas.api.types
import numpy as np
from lifelines.utils import concordance_index

class ParticipantVisibleError(Exception):
    """Raised when a submission is malformed in a way that should be reported to the participant."""


def score(solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str) -> float:
    """Stratified concordance index used as the competition metric.

    Cloned from ANDREW's Notebook (eefs_concordance_index).

    A concordance index is computed separately for every ``race_group`` from
    ``efs_time`` (duration), ``efs`` (event indicator) and the negated
    ``prediction``; the result is the mean of the per-group values minus their
    (population) standard deviation.

    Because the prediction is negated before being handed to
    ``concordance_index``, a larger ``prediction`` must correspond to a
    shorter ``efs_time`` to obtain a high score.

    Args:
        solution: Frame holding ``row_id_column_name``, ``efs``, ``efs_time``
            and ``race_group``.
        submission: Frame holding ``row_id_column_name`` and ``prediction``.
            The two frames are joined column-wise on their index.
        row_id_column_name: Name of the ID column, which is ignored for scoring.

    Returns:
        The stratified concordance index as a Python float.

    Raises:
        ParticipantVisibleError: If any non-ID submission column is not numeric.
        KeyError: If ``row_id_column_name`` is missing from either frame.
    """
    # ``drop`` returns new frames, so the caller's DataFrames are left untouched
    # (the previous ``del`` removed the ID column from the caller's objects).
    solution = solution.drop(columns=[row_id_column_name])
    submission = submission.drop(columns=[row_id_column_name])

    event_label = 'efs'
    interval_label = 'efs_time'
    prediction_label = 'prediction'
    for col in submission.columns:
        if not pandas.api.types.is_numeric_dtype(submission[col]):
            raise ParticipantVisibleError(f'Submission column {col} must be a number')

    # Column-wise join of solution and submission (aligned on the index).
    merged_df = pd.concat([solution, submission], axis=1).reset_index(drop=True)

    metric_list = []
    for _, race_rows in merged_df.groupby('race_group'):
        # Concordance index restricted to one race group.
        c_index_race = concordance_index(
            race_rows[interval_label],
            -race_rows[prediction_label],
            race_rows[event_label])
        metric_list.append(c_index_race)

    # Penalise uneven performance across groups: mean minus standard deviation.
    return float(np.mean(metric_list) - np.std(metric_list))

## 2. データの整形

ここでは各データ型の定義のみを行います

参考: [XGBoost CatBoost Baseline - \[CV 668 LB 668\]](https://www.kaggle.com/code/cdeotte/xgboost-catboost-baseline-cv-668-lb-668)

In [3]:
import warnings
from pathlib import Path
warnings.filterwarnings('ignore')

import numpy as np
import polars as pl
import pandas as pd

import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error as mse

In [4]:
def load_data(path, batch_size=32768):
    """Read a competition CSV and normalise its column types.

    Columns listed in ``num_cols`` have nulls filled with -1 and are cast to
    Float32; every other column has nulls filled with ``'Unknown'`` and is
    cast to String. ``ID`` is finally cast to Int32.

    Args:
        path: Path of the CSV file to read.
        batch_size: Forwarded to ``pl.read_csv``.

    Returns:
        A tuple ``(df, cat_cols)`` where ``df`` is a pandas DataFrame and
        ``cat_cols`` lists the names of the string (categorical) columns in
        column order.
    """
    df = pl.read_csv(path, batch_size=batch_size)
    num_cols = {
        'hla_match_c_high', 'hla_high_res_8', 'hla_low_res_6', 'hla_high_res_6', 'hla_high_res_10',
        'hla_match_dqb1_high', 'hla_nmdp_6', 'hla_match_c_low', 'hla_match_drb1_low', 'hla_match_dqb1_low',
        'year_hct', 'hla_match_a_high', 'donor_age', 'hla_match_b_low', 'age_at_hct',
        'hla_match_a_low', 'hla_match_b_high', 'comorbidity_score', 'karnofsky_score', 'hla_low_res_8',
        'hla_match_drb1_high', 'hla_low_res_10', 'efs', 'efs_time'}

    # One expression per column, applied in a single pass instead of
    # rebuilding the frame once per column.
    casts = [
        pl.col(col).fill_null(-1).cast(pl.Float32)
        if col in num_cols
        else pl.col(col).fill_null('Unknown').cast(pl.String)
        for col in df.columns
    ]
    df = df.with_columns(casts)
    df = df.with_columns(pl.col('ID').cast(pl.Int32))

    # Decide which columns are categorical while the frame is still a polars
    # frame: comparing a pandas/numpy dtype with ``pl.String`` after
    # ``to_pandas()`` mixes two type systems and only matches incidentally.
    cat_cols = [name for name, dtype in df.schema.items() if dtype == pl.String]

    return df.to_pandas(), cat_cols

In [5]:
from lifelines import KaplanMeierFitter
def transform_survival_probability(df, time_col='efs_time', event_col='efs'):
    """Evaluate a Kaplan-Meier survival curve at every row's own duration.

    A ``KaplanMeierFitter`` is fitted on ``df[time_col]`` (durations) and
    ``df[event_col]`` (event indicator); the fitted survival function is then
    evaluated at each value of ``df[time_col]``.

    Returns:
        The ``.values`` array of the survival estimates, one per row of ``df``.
    """
    durations = df[time_col]
    km_curve = KaplanMeierFitter().fit(durations, df[event_col])
    return km_curve.survival_function_at_times(durations).values
    
def get_target(data):
    """Build the training target for the competition.

    The target is the Kaplan-Meier survival estimate produced by
    ``transform_survival_probability`` for each row of ``data``.

    Args:
        data: Frame containing the columns ``transform_survival_probability``
            reads by default (``efs_time`` and ``efs``).

    Returns:
        A pandas Series named ``'target'`` sharing ``data``'s index.
    """
    # Build the Series directly instead of writing a temporary 'target' column
    # into ``data``: the previous version left that column behind in the
    # caller's frame, because its follow-up ``drop`` only rebound a local name.
    survival = transform_survival_probability(data)
    return pd.Series(survival, index=data.index, name='target')

In [6]:
def feature_engineering(data, cat_cols):
    """Split a loaded frame into model features ``X`` and target ``y``.

    Args:
        data: Frame returned by ``load_data``.
        cat_cols: Names of the columns to convert to pandas ``category`` dtype.

    Returns:
        A tuple ``(X, y)``. ``X`` is ``data`` without the ``ID``, ``efs``,
        ``efs_time`` and ``target`` columns (whichever are present), with
        ``cat_cols`` cast to ``category``. ``y`` is the result of
        ``get_target(data)`` when both ``efs`` and ``efs_time`` are present,
        otherwise ``None`` (e.g. for test data).
    """
    # The target needs both the event flag and the event time.
    has_y = 'efs' in data.columns and 'efs_time' in data.columns
    y = get_target(data) if has_y else None

    cols_except = ['ID', 'efs', 'efs_time', 'target']
    cols_drop = [col for col in data.columns if col in cols_except]
    # ``drop`` returns a new frame, so the casts below do not touch ``data``.
    X = data.drop(cols_drop, axis=1)

    for col in cat_cols:
        X[col] = X[col].astype('category')

    return X, y

import category_encoders as ce
def apply_ce(X, cat_col, transformer=None):
    """Encode categorical columns as integers with an ordinal encoder.

    Args:
        X: Feature frame to encode.
        cat_col: Names of the categorical columns to encode. Only used when a
            new encoder is created.
        transformer: An already fitted encoder. When given, it is applied with
            ``transform``; when ``None``, a new ``ce.OrdinalEncoder`` is
            fitted on ``X``.

    Returns:
        The encoded frame. Note that a newly fitted encoder is not returned;
        to reuse one encoding across frames, fit the encoder outside and pass
        it via ``transformer``.
    """
    if transformer is None:
        # Use the argument rather than a module-level ``cat_cols`` variable.
        transformer = ce.OrdinalEncoder(cols=cat_col, handle_unknown='impute')
        return transformer.fit_transform(X)

    return transformer.transform(X)

In [7]:
# Locations of the competition files inside the Kaggle input directory.
train_path, test_path, subm_path = (
    Path('/kaggle/input/equity-post-HCT-survival-predictions/train.csv'),
    Path('/kaggle/input/equity-post-HCT-survival-predictions/test.csv'),
    Path('/kaggle/input/equity-post-HCT-survival-predictions/sample_submission.csv'),
)

# Categorical column names are taken from the training file only.
(train_data, cat_cols), (test_data, _) = load_data(train_path), load_data(test_path)

# Training features / target, then integer-encode the categorical columns.
X, y = feature_engineering(train_data, cat_cols)
X = apply_ce(X, cat_cols)

## 3. 学習

LightGBMで学習を行います

In [8]:
n_splits = 5
params = {
    'objective': 'regression',
    'boosting_type': 'gbdt',
    }

# One fitted model and one validation RMSE per fold.
models, fold_scores = [], []

cv = KFold(n_splits=n_splits, shuffle=True, random_state=42)
# Out-of-fold predictions: each row is predicted by the model that did not train on it.
oof_preds = np.zeros(len(X))

for fold, (train_index, valid_index) in enumerate(cv.split(X, y), 1):
    X_train = X.iloc[train_index]
    X_valid = X.iloc[valid_index]

    y_train = y.iloc[train_index]
    y_valid = y.iloc[valid_index]

    model = lgb.LGBMRegressor(**params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_valid, y_valid)],
        eval_metric='rmse',
        callbacks=[lgb.early_stopping(1000, verbose=0),
                   lgb.log_evaluation(0)]
    )

    models.append(model)
    oof_preds[valid_index] = model.predict(X_valid)

    # RMSE taken as sqrt(MSE) explicitly; the ``squared=False`` shortcut of
    # ``mean_squared_error`` is deprecated in recent scikit-learn releases.
    fold_score = float(np.sqrt(mse(y_valid, oof_preds[valid_index])))
    fold_scores.append(fold_score)

y_true = train_data[['ID', 'efs', 'efs_time', 'race_group']].copy()
y_pred = train_data[['ID']].copy()
# The target is the Kaplan-Meier survival estimate at each row's own efs_time,
# which is large for short times, so the model output already ranks rows the
# way ``score`` expects (larger prediction = shorter efs_time). It is therefore
# passed as-is, exactly like the test predictions written to the submission;
# negating it here would invert the ranking and report a misleading C-index.
y_pred['prediction'] = oof_preds

c_index_score = score(y_true.copy(), y_pred.copy(), "ID")

print(f'\nOverall C-Index for lightGBM: {c_index_score:.3f}\n')

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005376 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 865
[LightGBM] [Info] Number of data points in the train set: 23040, number of used features: 57
[LightGBM] [Info] Start training from score 0.606473
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003792 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 865
[LightGBM] [Info] Number of data points in the train set: 23040, number of used features: 57
[LightGBM] [Info] Start training from score 0.605167
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003875 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough,

In [9]:
# Validation RMSE of each fold, in fold order.
fold_scores

[0.15674363940163843,
 0.1581887224222684,
 0.15696357887371534,
 0.1582094279827603,
 0.15824950691196402]

## 4. データの提出

提出データを作成します

In [10]:
X_test, _ = feature_engineering(test_data, cat_cols)

# The test features must be encoded with the category -> integer mapping learnt
# on the training features. Calling ``apply_ce`` without a transformer would fit
# a fresh encoder on the test set, giving codes that do not match the ones the
# models were trained on. The encoder is therefore fitted on the un-encoded
# training features here and handed to ``apply_ce`` explicitly.
X_train_raw, _ = feature_engineering(train_data, cat_cols)
encoder = ce.OrdinalEncoder(cols=cat_cols, handle_unknown='impute')
encoder.fit(X_train_raw)
X_test = apply_ce(X_test, cat_cols, transformer=encoder)

# Average the predictions of the per-fold models.
preds = np.mean([model.predict(X_test) for model in models], axis=0)

# NOTE(review): assumes sample_submission.csv lists rows in the same order as test.csv — confirm.
subm_data = pd.read_csv(subm_path)
subm_data['prediction'] = preds

subm_data.to_csv('submission.csv', index=False)