# Common settings

In [None]:
# Add any library you want to install here.
# !pip install --target /content/drive/MyDrive/Colab\ Notebooks/my-modules kanjize
# !pip install --target /content/drive/MyDrive/Colab\ Notebooks/my-modules optuna
# !pip install --target /content/drive/MyDrive/Colab\ Notebooks/my-modules optuna-integration
# !pip install --target /content/drive/MyDrive/Colab\ Notebooks/my-modules shap
# !pip install --target /content/drive/MyDrive/Colab\ Notebooks/my-modules signate
# !pip install --target /content/drive/MyDrive/Colab\ Notebooks/my-modules imblearn
# !pip install --target /content/drive/MyDrive/Colab\ Notebooks/my-modules umap-learn
import sys
# Put the Drive-backed "my-modules" directory (the --target of the pip commands
# above) on the import path so packages installed there can be imported.
sys.path.append("/content/drive/MyDrive/Colab Notebooks/my-modules")

# Settings for signate

In [None]:
from googleapiclient.discovery import build
import io
import os
from googleapiclient.http import MediaIoBaseDownload
from google.colab import auth

# Download the Signate API token (signate.json) from Google Drive to the local
# path below and restrict its permissions to the owner.

# Authenticate the Google account inside Colab so the Drive API can be called.
auth.authenticate_user()
# Client for Google Drive API v3.
drive_service = build('drive', 'v3')
# Search Drive for files named 'signate.json'; fields="files(id)" limits the
# response to the file IDs.
results = drive_service.files().list(q="name = 'signate.json'", fields="files(id)").execute()
# signate_api_key: list of matching files (each entry carries an 'id').
signate_api_key = results.get('files', [])
if not signate_api_key:
    # Fail with an explicit message instead of an IndexError on signate_api_key[0].
    raise FileNotFoundError("signate.json was not found in Google Drive.")
# filename: local path the token is saved to.
filename = "/root/.signate/signate.json"
# Create the parent directory if it does not exist yet.
os.makedirs(os.path.dirname(filename), exist_ok=True)
# Request for the content of the first matching file.
request = drive_service.files().get_media(fileId=signate_api_key[0]['id'])
# Open the target in binary write mode; the context manager closes (and thereby
# flushes) the handle even if a chunk download raises.
with io.FileIO(filename, 'wb') as fh:
    downloader = MediaIoBaseDownload(fh, request)
    # Download chunk by chunk, printing the progress after every chunk.
    done = False
    while not done:
        status, done = downloader.next_chunk()
        print("Download %d%%." % int(status.progress() * 100))
# Owner read/write only. The mode must be the octal literal 0o600: the decimal
# value 600 equals 0o1130 and would set a different permission pattern.
os.chmod(filename, 0o600)

In [None]:
# Signate competition this notebook works on.
competition_id = 1387
# !pip install signate
# !signate list
# NOTE(review): the two commands below pass the literal text "competition_id" to the
# shell; presumably they were meant to use the Python variable above, which IPython
# expands only when written as {competition_id} or $competition_id — confirm before
# uncommenting.
# !signate files --competition-id=competition_id
# !signate download --competition-id=competition_id --path=/content/drive/MyDrive/Colab\ Notebooks/Signate_50th_for_beginner/data/input

# Library

In [None]:
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
plt.style.use("ggplot")
import seaborn as sns
import unicodedata
from kanjize import kanji2number
import warnings
# Silence UserWarning messages coming from matplotlib modules.
warnings.filterwarnings('ignore', category=UserWarning, module='matplotlib')
# Seed NumPy's and the standard library's random generators with 1234.
np.random.seed(1234)
random.seed(1234)

# Function

In [3]:
# Summarise a DataFrame column by column: (1) number of missing values,
# (2) number of distinct values, (3) dtype, and (4) a dict mapping every
# distinct value to how often it occurs.
def info(df):
    """Return a per-column summary table of ``df``.

    The result has one row per column of ``df`` and the columns
    ``null_count``, ``unique_count``, ``dtype`` and ``value_counts``.
    """
    missing_per_column = df.isnull().sum()
    distinct_per_column = df.nunique()
    dtype_per_column = df.dtypes
    # One {value: frequency} dict per column.
    frequencies_per_column = df.apply(lambda column: column.value_counts().to_dict())

    summary = pd.DataFrame({
        "null_count": missing_per_column,
        "unique_count": distinct_per_column,
        "dtype": dtype_per_column,
        "value_counts": frequencies_per_column,
    })
    return summary


# Draw, for every numeric column, a histogram stacked by the target variable
# (target_variable is expected to be an object-typed column).
def draw_histogram_by_target(df, target_variable, binwidth=0.5):
    """Plot one stacked histogram per numeric column of ``df``, split by target.

    Args:
        df: DataFrame holding the numeric columns and the target column.
        target_variable: Name of the column used as ``hue``; it is skipped if it
            is itself numeric.
        binwidth: Bin width handed to ``sns.histplot``.

    Each figure also marks the column's 1st and 99th percentiles with dashed
    red vertical lines. The function shows the figures and returns ``None``.
    """
    for num in df.select_dtypes(include='number').columns:
        if num == target_variable:
            continue

        plt.figure(figsize=(2, 1.5))

        percentile_1 = df[num].quantile(0.01)
        percentile_99 = df[num].quantile(0.99)
        plt.axvline(x=percentile_1, color='red', linestyle='--', linewidth=1)
        plt.axvline(x=percentile_99, color='red', linestyle='--', linewidth=1)

        # Plot the DataFrame that was passed in. Reading the notebook-level
        # `all` here would ignore the argument (and fall back to the builtin
        # `all` function when that global has not been created yet).
        sns.histplot(data=df, x=num, hue=target_variable, multiple='stack', binwidth=binwidth)
        plt.title(f'{num} by {target_variable}')
        plt.xlabel(num)
        plt.ylabel('Frequency')
        plt.grid(False)
        plt.show()


# Draw, for every object-typed (categorical) column, a bar plot of the binary
# target variable per category.
# NOTE(review): the previous comment asked for an object-typed target, but the
# y axis is labelled as a mean, which presumably needs a numeric 0/1 target — confirm.
def draw_barplot_by_target(df, target_variable):
    """Show one bar plot of ``target_variable`` per object-dtype column of ``df``.

    Raises:
        ValueError: If ``target_variable`` does not have exactly two distinct values.
    """
    # Guard: the target has to be binary.
    distinct_targets = df[target_variable].nunique()
    if distinct_targets != 2:
        raise ValueError(f"{target_variable} is not a binary variable.")

    categorical_columns = [
        name
        for name in df.select_dtypes(include='object').columns
        if name != target_variable
    ]
    for cat in categorical_columns:
        plt.figure(figsize=(2, 1.5))
        sns.barplot(data=df, x=cat, y=target_variable)
        plt.title(f'{target_variable} by {cat}')
        plt.xlabel(cat)
        plt.ylabel(f'Mean of {target_variable} (proportion)')
        plt.xticks(rotation=45)
        plt.grid(True)
        plt.ylim((0, 1))
        plt.show()

# Analysis before preprocessing

In [4]:
# Read the train and test CSVs and stack them (train rows first) into one frame
# with a fresh 0..n-1 index.
# NOTE: `dir` and `all` shadow Python builtins; the names are kept because the
# following cells refer to `all`.
dir = "/content/drive/MyDrive/Colab Notebooks/Signate_50th_for_beginner/data"
train = pd.read_csv(f"{dir}/input/train.csv")
test = pd.read_csv(f"{dir}/input/test.csv")
all = pd.concat([train, test], sort=False, ignore_index=True)
# all = all.drop(["id"], axis=1)

NameError: name 'pd' is not defined

In [None]:
# Print the per-column summary built by info(), then leave the combined frame as
# the cell's last expression so the notebook displays it.
print(info(all))
all

In [None]:
# draw_histogram_by_target(all, "disease")

In [None]:
# draw_barplot_by_target(all, "disease")

# Analysis after preprocessing

In [None]:
# all["Elders"] = all["Age"].apply(lambda x: 1 if x >= 65 else 0)
# all["Elders"] = all["Elders"].astype("category")

# bins = list(range(0, 101, 10))  # 0, 10, 20, ..., 100
# labels = [i for i in range(0, len(bins) - 1)]
# print(bins)
# print(labels)
# all['Age_group'] = pd.cut(all['Age'], bins=bins, labels=labels)
# all['Age_group'] = all['Age_group'].astype(str)
# print(all['Age_group'].value_counts())
# all['Age_group_Gender'] = all['Age_group'] + '_' + all['Gender']
# all["Age_group_Gender_mean"] = all[["Age_group_Gender", "disease"]].groupby("Age_group_Gender").transform("mean")
# all = all.drop(["Age_group", "Age_group_Gender"], axis = 1)
# plt.figure(figsize=(8, 4))
# sns.barplot(data=all, x="Age_group_Gender", y="disease")
# plt.title("disease by Age_group_Gender")
# plt.xlabel("Age_group_Gender")
# plt.ylabel("disease")
# plt.xticks(rotation=45)
# plt.show()

In [None]:
# # カテゴリ変数と数値変数の組み合わせで平均値を作成
# for cat_col in all.select_dtypes(include=object).columns:
#     for num_col in all.select_dtypes(include='number').columns:
#         if num_col != 'disease':
#             all[f'{num_col}_mean_by_{cat_col}'] = all.groupby(cat_col)[num_col].transform('mean')

In [None]:
# Add ratio / product features built from the existing columns, then drop the
# two source columns "Alb" and "D_Bil".
all = all.assign(**{
    "AST_GOT/ALT": all["AST_GOT"] / all["ALT_GPT"],
    "T_Bil*AST_GOT": all["T_Bil"] * all["AST_GOT"],
    "D_Bil/Alb": all["D_Bil"] / all["Alb"],
})
# all["D_Bil/T_Bil"] = all["D_Bil"] / all["T_Bil"]
all = all.drop(columns=["Alb", "D_Bil"])
# # all["T_Bil*ALT_GPT"] = all["T_Bil"] * all["ALT_GPT"]

In [None]:
# from itertools import combinations

# num_columns = all.select_dtypes(include='number').drop(["disease"], axis = 1).columns

# # 全列のペアの組み合わせを生成
# for col1, col2 in combinations(num_columns, 2):
#     all[f'{col1}/{col2}'] = all[col1] / all[col2]
#     all[f'{col1}*{col2}'] = all[col1] * all[col2]

In [None]:
# Clip every numeric feature (the target 'disease' excluded) to the range between
# its own 1st and 90th percentile.
# NOTE(review): the upper bound is the 90th percentile, whereas the histogram
# helper marks the 99th — confirm that 0.90 is intended.
num_columns = all.select_dtypes(include='number').columns.drop('disease')
numeric_part = all[num_columns]
p01 = numeric_part.quantile(0.01)
p90 = numeric_part.quantile(0.90)
all[num_columns] = numeric_part.clip(lower=p01, upper=p90, axis=1)

# 1. 標準化
# 2. PCA と LDA (Linear Discriminant Analysis: 線形判別分析)

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every numeric feature (the target 'disease' excluded); the scaler
# is fitted on the same rows it transforms.
columns = all.select_dtypes(include='number').columns.drop('disease')
scaler = StandardScaler().fit(all[columns])
all[columns] = scaler.transform(all[columns])

In [None]:
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
# !pip install umap-learn | tail -n 1
import umap

# Rows whose target is known; LDA and UMAP are fitted on these rows only and
# then applied to every row.
labeled = all['disease'].notna()
features = all[columns]

# PCA: keep the first three principal components, fitted on all rows.
pca = PCA(n_components=3)
principal_components = pca.fit_transform(features)
pca_df = pd.DataFrame(principal_components, columns=['PC1', 'PC2', 'PC3'])

# LDA: a single discriminant axis.
lda = LDA(n_components=1)
lda.fit(features[labeled], all.loc[labeled, 'disease'])
lda_components = lda.transform(features)
lda_df = pd.DataFrame(lda_components, columns=['LDA'])

# UMAP embedding, stored as two columns.
um = umap.UMAP(random_state=1234)
um.fit(features[labeled])
um_components = um.transform(features)
um_df = pd.DataFrame(um_components, columns=['UMAP1', 'UMAP2'])

# Append the new feature columns in the order PCA, LDA, UMAP.
all = pd.concat([all, pca_df, lda_df, um_df], axis=1)

# from sklearn.cluster import MiniBatchKMeans
# kmeans = MiniBatchKMeans(n_clusters=3, random_state=1234)
# kmeans.fit(all.loc[~all['disease'].isnull(), columns])
# kmeans_distances = kmeans.transform(all[columns])
# kmeans_df = pd.DataFrame(data=kmeans_distances, columns=['kmeans1', 'kmeans2', 'kmeans3'])
# all = pd.concat([all, kmeans_df], axis=1)

# all = all.drop(['kmeans2', 'kmeans3'], axis = 1)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Label-encode the listed columns (only those still of object dtype) and store
# the integer codes with the 'category' dtype.
# columns = ["Gender", "Age_group", "Age_group_Gender"]
columns = ["Gender"]
for col in columns:
    if all[col].dtype != 'object':
        continue
    le = LabelEncoder()
    encoded = le.fit_transform(all[col])
    all[col] = pd.Series(encoded, index=all.index).astype('category')

# モデリング

In [None]:
import lightgbm as lgb
from lightgbm import early_stopping
from lightgbm import log_evaluation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

In [None]:
# Rows with a known 'disease' value form the training set, the remaining rows the
# test set; each part gets a fresh 0..n-1 index.
has_target = all['disease'].notna()
train_X = all[has_target].drop(columns='disease').reset_index(drop=True)
train_Y = all.loc[has_target, 'disease'].reset_index(drop=True)
test_X = all[~has_target].drop(columns='disease').reset_index(drop=True)

In [None]:
# plt.figure(figsize=(10,10))
# options = {'square':True, 'annot':True, 'fmt':'0.2f', 'xticklabels':all.columns, 'yticklabels':all.columns, 'annot_kws':{'size':8}, 'vmin':-1,'vmax':1,'center':0, 'cbar':False}
# ax = sns.heatmap(all.corr(), **options)
# ax.tick_params(axis='x', labelsize=12)
# ax.tick_params(axis='y', labelsize=12)


### モデリング（ハイパーパラメーターの最適化）

In [None]:
# import optuna
# from optuna.integration import LightGBMPruningCallback

In [None]:
# # https://note.nkmk.me/python-sklearn-train-test-split/
# X_train, X_valid, y_train, y_valid = train_test_split(train_X, train_Y, test_size=0.2, random_state=1234, stratify=train_Y)

# pos_weight = (len(y_train) - sum(y_train)) / sum(y_train)
# # 目的関数
# def objective(trial):
#     params = {
#         "objective": "binary",
#         "seed": 1234,
#         "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1, log=True),
#         "num_leaves": trial.suggest_int("num_leaves", 10, 256),
#         'n_estimators': trial.suggest_int("n_estimators", 50, 1000),
#         "verbosity": -1,
#         "boosting_type": "gbdt",
#         "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 5, 100),
#         "max_depth": trial.suggest_int("max_depth", 5, 20),
#         "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
#         "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
#         "min_gain_to_split": trial.suggest_float("min_gain_to_split", 1e-8, 15.0, log=True),
#         "bagging_fraction": trial.suggest_float("bagging_fraction", 0.2, 1.0),
#         "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
#         "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 1.0),
#         "num_iterations": 100,
#         "early_stopping_round": 20,
# #         "scale_pos_weight": pos_weight,
#         "verbose": -1,
#         "metric": "auc"
#     }

#     lgb_train = lgb.Dataset(X_train, y_train)
#     lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)
#     model_lgb = lgb.train(
#         params,
#         lgb_train,
#         valid_sets=lgb_eval,
#         num_boost_round=100,
#         callbacks=[early_stopping(stopping_rounds=20),
#                    LightGBMPruningCallback(trial, "auc")]
#     )
#     y_pred = model_lgb.predict(X_valid, num_iteration=model_lgb.best_iteration)
#     score = roc_auc_score(y_valid, y_pred)
#     return score

# # Optunaでの最適化
# study = optuna.create_study(sampler=optuna.samplers.RandomSampler(seed=0), direction="maximize")
# study.optimize(objective, n_trials=50)

# print("Best parameters:", study.best_params)
# print("Best score:", study.best_value)

In [None]:
from imblearn.over_sampling import SMOTE

# Stratified 10-fold cross-validation of a LightGBM binary classifier. In every
# fold only the training part is oversampled with SMOTE; the validation part is
# left untouched and is used for early stopping and for the fold AUC.
# NOTE(review): SMOTE creates synthetic rows by interpolation, so a
# category-typed column (e.g. the label-encoded Gender) may presumably receive
# values outside its categories — consider SMOTENC; confirm.
smote = SMOTE(random_state=1234)

folds = 10
kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=1234)

models_lgb = []
# Out-of-fold predictions, filled at the validation indices of each fold.
oof_lgb = np.zeros(len(train_X))

# The number of boosting rounds is deliberately NOT listed here: 'n_estimators'
# is an alias of num_iterations, and when such an alias is present in params
# LightGBM uses it and ignores the num_boost_round argument of lgb.train (with a
# warning). The round count is given once, through num_boost_round below.
params_lgb = {
    "objective" : "binary",  # default : "regression"
    "random_seed" : 1234,  # default : None
    # 'learning_rate': 0.04642658243447123,  # default : 0.1
    'num_leaves': 175,  # default : 31
    # 'min_data_in_leaf': 17,  # default : 20
    # 'max_depth': 10,  # default : -1 (no limit)
    # 'lambda_l1': 1.8767090076326167e-05,  # default : 0.0
    # 'lambda_l2': 0.0013544748136111338,  # default : 0.0
    # 'min_gain_to_split': 0.00010583943526512414,  # default : 0.0
    # 'bagging_fraction': 0.990699070447381,  # default : 1.0
    # 'bagging_freq': 1,  # default : 0 (disabled)
    # 'feature_fraction': 0.3671014048758678,  # default : 1.0
    "metric" : "auc",  # default : None (based on objective)
    "verbosity": -1  # default : 0
}

for train_index, val_index in kf.split(train_X, train_Y):
    X_train = train_X.iloc[train_index]
    X_valid = train_X.iloc[val_index]
    y_train = train_Y.iloc[train_index]
    y_valid = train_Y.iloc[val_index]

    # Oversample the training part of this fold only.
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
    train_lgb = lgb.Dataset(X_train_resampled, y_train_resampled)
    eval_lgb = lgb.Dataset(X_valid, y_valid, reference=train_lgb)
    # train_lgb = lgb.Dataset(X_train, y_train)
    # eval_lgb = lgb.Dataset(X_valid, y_valid, reference=train_lgb)

    model_lgb = lgb.train(
        params_lgb,
        train_lgb,
        valid_sets=[eval_lgb],
        # 500 is the value that was effectively in use before (it came from the
        # 'n_estimators' entry that used to sit in params_lgb).
        num_boost_round=500,
        callbacks=[early_stopping(stopping_rounds=40)]
        # callbacks=[early_stopping(stopping_rounds=20), log_evaluation(period=20)]
    )
    y_pred_lgb = model_lgb.predict(X_valid, num_iteration=model_lgb.best_iteration)
    # Fold AUC: roc_auc_score(true labels, predicted scores); the booster's
    # predictions are passed directly as the scores.
    score = roc_auc_score(y_valid, y_pred_lgb)
    print(score)

    models_lgb.append(model_lgb)
    oof_lgb[val_index] = y_pred_lgb

In [None]:
from sklearn.metrics import roc_curve

# Out-of-fold predictions per model name. The lookups are wrapped in lambdas so
# that an array is only read when its model is actually evaluated (oof_xgb and
# oof_rf are not created by the cells shown here).
oof_by_model = {
    "lgb": lambda: oof_lgb,
    "xgb": lambda: oof_xgb,
    "rf": lambda: oof_rf,
}

# Print the AUC of the out-of-fold predictions for every listed model.
for model_name in ["lgb"]:
    print(model_name)
    oof = oof_by_model[model_name]()
    # fpr, tpr, thresholds = roc_curve(train_Y, oof)
    # plt.figure(figsize=(2, 1.5))
    # plt.plot([0, 1], [0, 1], 'k--')
    # plt.plot(fpr,tpr,label=model_name)
    # plt.xlabel('False Positive Rate')
    # plt.ylabel('True Positive Rate')
    # plt.title('ROC Curve')
    # plt.show()
    oof_auc = roc_auc_score(train_Y, oof)
    print(f"Score: {oof_auc}\n")

# print("ensemble")
# oof_ens = oof_lgb * 0.4 + oof_xgb * 0.4 + oof_rf * 0.2
# fpr, tpr, thresholds = roc_curve(train_Y, oof_ens)
# plt.figure(figsize=(2, 1.5))
# plt.plot([0, 1], [0, 1], 'k--')
# plt.plot(fpr,tpr,label="ensemble oof")
# plt.xlabel('False Positive Rate')
# plt.ylabel('True Positive Rate')
# plt.title('ROC Curve')
# plt.show()
# print(f"Score: {roc_auc_score(train_Y, oof_ens)}\n")

In [None]:
# Predict the test set with every fold model (each at its best iteration) and
# average the predictions across models.
preds_lgb = [
    booster.predict(test_X, num_iteration=booster.best_iteration)
    for booster in models_lgb
]
preds_lgb_array = np.array(preds_lgb)
preds_lgb_mean = preds_lgb_array.mean(axis=0)

In [None]:
# import shap
# shap.initjs()

# explainer = shap.TreeExplainer(model_lgb)
# shap_values = explainer(X_train)
# shap.summary_plot(shap_values, X_train)

# for col in X_train.columns:
#     shap.dependence_plot(col, shap_values.values, X_train)
#     # interaction_values = shap_values.values[:, :, 1]  # ここで「1」は対象の特徴量のインデックス
#     # print("Interaction values for feature 'A':")
#     # print(interaction_values)
#     plt.show()


# # 特徴量の寄与（SHAP値の絶対値の平均）を計算
# mean_abs_shap = np.abs(shap_values.values).mean(axis=0)

# # データフレーム化して、寄与の小さい順にソート
# shap_importance_df = pd.DataFrame({
#     'feature': X_train.columns,
#     'mean_abs_shap': mean_abs_shap
# })

# # 寄与が小さい特徴量を抽出（例：寄与が小さい順に上位10個）
# low_contributing_features = shap_importance_df.sort_values(by='mean_abs_shap').head(10)['feature'].tolist()

# # 寄与が小さい特徴量をリストで表示
# print(low_contributing_features)

# Submit predictions to signate

In [None]:
import os
import pandas as pd
from datetime import datetime

# Timestamp that makes every submission file name unique (minute resolution).
dt = datetime.now()
datetime_str = dt.strftime("%Y-%m-%d_%H:%M")

# The sample submission has no header row; column 1 holds the prediction.
submit = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Signate_50th_for_beginner/data/input/sample_submit.csv", header=None)
submit[1] = preds_lgb_mean

# Nothing else in the notebook creates the output directory, so make sure it
# exists before writing.
output_dir = "/content/drive/MyDrive/Colab Notebooks/Signate_50th_for_beginner/data/output"
os.makedirs(output_dir, exist_ok=True)
# Save the submission (required format: no index column and no header row).
# header=False is the documented way to suppress the header row.
submit.to_csv(f"{output_dir}/sample_submit_{datetime_str}.csv", index=False, header=False)

In [None]:
# !pip install signate | tail -n 1
# !signate submit --competition-id=1387 "/content/drive/MyDrive/Colab Notebooks/Signate_50th_for_beginner/data/output/sample_submit_{datetime_str}.csv"