In [1]:
import os
import sys
from pathlib import Path

import numpy as np
import pandas as pd
from iblm import IBLMClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from tqdm import tqdm

sys.path.append("..")

## evaluation

In [2]:
def code_model_evaluation(model_file: str, test_df: pd.DataFrame, target_colname: str) -> dict:
    """Import one generated code model and score it on a test frame.

    The file at ``model_file`` is imported as the module
    ``models.<parent directory name>.<file stem>`` and its module-level
    ``predict`` is called with ``test_df`` minus the target column.

    Args:
        model_file: Path to the model's ``.py`` file.
        test_df: Test data holding both the feature columns and the target column.
        target_colname: Name of the target column in ``test_df``.

    Returns:
        A dict with the keys ``model_name``, ``status``, ``comment``, ``acc``,
        ``auc`` and ``code`` (in that order). ``status`` is ``"SUCCEEDED"`` or
        ``"FAILED"``; ``comment`` is one of ``"0_succeeded"``,
        ``"1_import_error"``, ``"2_predict_method_error"`` or
        ``"3_invalid_pred_value"``. ``acc`` and ``auc`` are ``None`` on failure.

    Raises:
        OSError: If ``model_file`` cannot be read.
        KeyboardInterrupt: Propagated unchanged so a long run can be interrupted.
    """
    import importlib

    model_file = Path(model_file)
    model_name = model_file.stem
    code = model_file.read_text()

    def _result(status, comment, acc=None, auc=None):
        # Key order is kept stable because it fixes the column order downstream.
        return dict(
            model_name=model_name,
            status=status,
            comment=comment,
            acc=acc,
            auc=auc,
            code=code,
        )

    # NOTE(security): importing the module executes the model file's code.
    # SystemExit is recorded as a failure so a model that calls exit() cannot
    # abort the whole evaluation; KeyboardInterrupt is deliberately not caught.
    try:
        codemodel = importlib.import_module(f"models.{model_file.parent.name}.{model_name}")
    except (Exception, SystemExit):
        return _result("FAILED", "1_import_error")

    try:
        x_test = test_df.drop(target_colname, axis=1)
        y_test = test_df[target_colname]
        y_proba = codemodel.predict(x_test)
        # Thresholding first means a non-array return value (for example a plain
        # list) is reported as a predict error rather than as an invalid value.
        y_pred = (y_proba > 0.5).astype(int)
        if np.any(y_proba < 0) or np.any(y_proba > 1):
            # Predictions outside [0, 1] are rejected.
            return _result("FAILED", "3_invalid_pred_value")
        roc_auc = roc_auc_score(y_test, y_proba)
        accuracy = round(accuracy_score(y_test, y_pred), 4)
    except (Exception, SystemExit):
        return _result("FAILED", "2_predict_method_error")

    return _result("SUCCEEDED", "0_succeeded", acc=accuracy, auc=roc_auc)

In [3]:
def evaluate_all_code_models(
    model_name: str,
    seeds: list,
    train_nums: list,
    target_colname: str,
    n_trials: int = 30,
    test_size: int = 300,
) -> pd.DataFrame:
    """Evaluate every code model of a dataset that has one test set per seed.

    Intended for the ``moon``, ``pseudodata`` and ``titanic`` datasets.

    For each seed the test set
    ``../data/<model_name>/<model_name>_<seed>_<test_size>_test.csv`` is read, and
    for each train size and trial the model file
    ``../models/<model_name>/<model_name>_<seed>_<train_num>_<trial>.py`` is scored
    with ``code_model_evaluation``.

    Args:
        model_name: Dataset name used in the data and model paths.
        seeds: Seeds to iterate over.
        train_nums: Training-set sizes to iterate over.
        target_colname: Name of the target column in the test CSV.
        n_trials: Number of trials per (seed, train_num); trials are numbered
            from 1 to ``n_trials`` inclusive.
        test_size: Size tag that appears in the test CSV file name.

    Returns:
        One row per (seed, train_num, trial) with ``seed``, ``train_num`` and the
        keys returned by ``code_model_evaluation``.
    """
    results = []
    for seed in tqdm(seeds):
        test_df = pd.read_csv(f"../data/{model_name}/{model_name}_{seed}_{test_size}_test.csv")
        for train_num in tqdm(train_nums, leave=False):
            for trial in range(1, n_trials + 1):
                model_file = f"../models/{model_name}/{model_name}_{seed}_{train_num}_{trial}.py"
                evaluation = code_model_evaluation(model_file, test_df, target_colname)
                results.append(dict(seed=seed, train_num=train_num, **evaluation))
    print("Done!")
    return pd.DataFrame(results)


def evaluate_all_code_models_circle(
    model_name: str,
    seeds: list,
    train_nums: list,
    target_colname: str,
    n_trials: int = 30,
    test_size: int = 300,
) -> pd.DataFrame:
    """Evaluate every code model of a dataset with a single shared test set.

    Intended for the ``circle`` dataset.

    The test set ``../data/<model_name>/<model_name>_<test_size>_test.csv`` and the
    model files ``../models/<model_name>/<model_name>_<trial>.py`` depend on neither
    seed nor train size, so ``seeds`` and ``train_nums`` only label the output rows;
    every (seed, train_num) pair re-scores the same model files.

    Args:
        model_name: Dataset name used in the data and model paths.
        seeds: Seed labels to iterate over.
        train_nums: Training-size labels to iterate over.
        target_colname: Name of the target column in the test CSV.
        n_trials: Number of trials; trials are numbered from 1 to ``n_trials``
            inclusive.
        test_size: Size tag that appears in the test CSV file name.

    Returns:
        One row per (seed, train_num, trial) with ``seed``, ``train_num`` and the
        keys returned by ``code_model_evaluation``.
    """
    results = []
    for seed in tqdm(seeds):
        test_df = pd.read_csv(f"../data/{model_name}/{model_name}_{test_size}_test.csv")
        for train_num in tqdm(train_nums, leave=False):
            for trial in range(1, n_trials + 1):
                model_file = f"../models/{model_name}/{model_name}_{trial}.py"
                evaluation = code_model_evaluation(model_file, test_df, target_colname)
                results.append(dict(seed=seed, train_num=train_num, **evaluation))
    print("Done!")
    return pd.DataFrame(results)


def evaluate_all_code_models_text(
    model_name: str,
    seeds: list,
    train_nums: list,
    target_colname: str,
    n_trials: int = 30,
    test_size: int = 25,
) -> pd.DataFrame:
    """Evaluate every code model of a dataset with a single shared test set.

    Intended for the ``text`` dataset.

    The test set ``../data/<model_name>/<model_name>_<test_size>_test.csv`` and the
    model files ``../models/<model_name>/<model_name>_<trial>.py`` depend on neither
    seed nor train size, so ``seeds`` and ``train_nums`` only label the output rows;
    every (seed, train_num) pair re-scores the same model files.

    Args:
        model_name: Dataset name used in the data and model paths.
        seeds: Seed labels to iterate over.
        train_nums: Training-size labels to iterate over.
        target_colname: Name of the target column in the test CSV.
        n_trials: Number of trials; trials are numbered from 1 to ``n_trials``
            inclusive.
        test_size: Size tag that appears in the test CSV file name.

    Returns:
        One row per (seed, train_num, trial) with ``seed``, ``train_num`` and the
        keys returned by ``code_model_evaluation``.
    """
    results = []
    for seed in tqdm(seeds):
        test_df = pd.read_csv(f"../data/{model_name}/{model_name}_{test_size}_test.csv")
        for train_num in tqdm(train_nums, leave=False):
            for trial in range(1, n_trials + 1):
                model_file = f"../models/{model_name}/{model_name}_{trial}.py"
                evaluation = code_model_evaluation(model_file, test_df, target_colname)
                results.append(dict(seed=seed, train_num=train_num, **evaluation))
    print("Done!")
    return pd.DataFrame(results)

## dataframe processing

In [4]:
def metrics_summary(df: pd.DataFrame) -> pd.DataFrame:
    """Aggregate per-trial results into one row per (seed, train_num).

    Args:
        df: Per-trial results with at least the columns ``seed``, ``train_num``,
            ``status``, ``acc`` and ``auc``.

    Returns:
        A frame with the columns ``seed``, ``train_num``, ``n_trials`` (count of
        ``status``), ``n_succeeses`` (count of non-null ``acc``), ``acc_mean``,
        ``acc_min``, ``acc_max``, ``auc_mean``, ``auc_min`` and ``auc_max``.
    """
    summary_df = df.groupby(["seed", "train_num"], as_index=False).agg(
        dict(
            status=["count"],
            acc=["count", "mean", "min", "max"],
            auc=["mean", "min", "max"],
        )
    )

    # Flatten the (column, aggregation) MultiIndex into "column_aggregation";
    # the group keys have an empty second level, hence the strip.
    summary_df.columns = ["_".join(levels).strip("_") for levels in summary_df.columns]

    # "n_succeeses" (sic) is kept as-is: it is the column name written to the
    # saved TSV, so correcting the spelling would change the output schema.
    return summary_df.rename(columns=dict(status_count="n_trials", acc_count="n_succeeses"))


def code_model_execution_trial_count_summary(df: pd.DataFrame) -> pd.DataFrame:
    """Count trials per outcome ``comment`` over the whole result frame.

    Returns a single-row frame: ``n_trials`` (sum of all counts) first, then one
    column per distinct ``comment`` value in sorted order.
    """
    ordered_comments = sorted(df.comment.unique())
    counts_row = df.comment.value_counts().to_frame().T
    counts_row = counts_row.loc[:, ordered_comments].reset_index(drop=True)
    total_trials = counts_row.sum(axis=1).values[0]
    counts_row.insert(0, "n_trials", total_trials)
    return counts_row


def code_model_execution_trial_count_each_case(df: pd.DataFrame) -> pd.DataFrame:
    """Count trials per outcome ``comment`` for every (seed, train_num) pair.

    Args:
        df: Per-trial results with at least the columns ``seed``, ``train_num``,
            ``comment`` and ``model_name``.

    Returns:
        One row per (seed, train_num) with the columns ``seed``, ``train_num``,
        ``n_trials`` (that row's total) and one integer column per distinct
        ``comment`` value in sorted order; absent outcomes count as 0.
    """
    counts = (
        df.groupby(["seed", "train_num", "comment"], as_index=False)
        .agg(dict(model_name="count"))
        .rename(columns=dict(model_name="n_events"))
        .pivot(index=["seed", "train_num"], columns="comment", values="n_events")
        .fillna(0)  # outcomes that never occurred for a pair
        .astype(int)
        .loc[:, sorted(df.comment.unique())]
    )
    # Row-wise total: each (seed, train_num) gets its own trial count instead of
    # the first row's total being broadcast to every row.
    counts.insert(0, "n_trials", counts.sum(axis=1))
    return counts.reset_index()

## save dataframe

In [5]:
def save_raw_results(df: pd.DataFrame, output_dir: str) -> None:
    """Write the per-trial results to ``raw_results.tsv`` under ``output_dir``.

    Rows are ordered by ``seed`` and ``train_num`` ascending, then by ``auc``
    descending. The input frame is not modified.
    """
    sort_keys = ["seed", "train_num", "auc"]
    sort_ascending = [True, True, False]
    ordered = df.sort_values(by=sort_keys, ascending=sort_ascending)
    destination = "{}/raw_results.tsv".format(output_dir)
    ordered.to_csv(destination, sep="\t", index=False)


def save_metrics_summary(df: pd.DataFrame, output_dir: str) -> None:
    """Write ``df`` to ``metrics_summary.tsv`` under ``output_dir`` (tab-separated, no index)."""
    destination = "{}/metrics_summary.tsv".format(output_dir)
    df.to_csv(destination, index=False, sep="\t")


def save_code_model_execution_trial_count_summary(df: pd.DataFrame, output_dir: str) -> None:
    """Write ``df`` to ``code_model_execution_trial_count_summary.tsv`` under ``output_dir``.

    The file is tab-separated and the index is not written.
    """
    destination = "{}/code_model_execution_trial_count_summary.tsv".format(output_dir)
    df.to_csv(destination, index=False, sep="\t")


def save_code_model_execution_trial_count_each_case(df: pd.DataFrame, output_dir: str) -> None:
    """Write ``df`` to ``code_model_execution_trial_count_each_case.tsv`` under ``output_dir``.

    The file is tab-separated and the index is not written.
    """
    destination = "{}/code_model_execution_trial_count_each_case.tsv".format(output_dir)
    df.to_csv(destination, index=False, sep="\t")

## main

### pseudodata

In [6]:
# Run configuration for the pseudodata dataset.
model_name = "pseudodata"
seeds = [3655, 3656, 3657]
train_nums = [10, 20, 30, 40, 50, 100, 200, 300]
target_colname = "target"

# All reports for this dataset are written below this directory.
output_dir = Path("../data/code_model_evaluation") / model_name
output_dir.mkdir(parents=True, exist_ok=True)

# Score every generated model, one row per (seed, train_num, trial).
df = evaluate_all_code_models(
    model_name=model_name,
    seeds=seeds,
    train_nums=train_nums,
    target_colname=target_colname,
)

# Derive the three summary tables from the raw per-trial results.
metrics_summary_df = metrics_summary(df=df)
code_model_execution_trial_count_summary_df = code_model_execution_trial_count_summary(df=df)
code_model_execution_trial_count_each_case_df = code_model_execution_trial_count_each_case(df=df)

# Persist the raw results and the summaries as TSV files.
save_raw_results(df=df, output_dir=output_dir)
save_metrics_summary(df=metrics_summary_df, output_dir=output_dir)
save_code_model_execution_trial_count_summary(
    df=code_model_execution_trial_count_summary_df, output_dir=output_dir
)
save_code_model_execution_trial_count_each_case(
    df=code_model_execution_trial_count_each_case_df, output_dir=output_dir
)

  0%|          | 0/3 [00:00<?, ?it/s]

[0.9 0.1 0.9 0.9 0.9 0.9 0.1 0.1 0.9 0.1]




[0.1 0.9 0.1 0.9 0.9 0.1 0.9 0.1 0.9 0.1]
[-0.775   0.1375  0.29    0.46    0.035   0.0525  0.495   0.0775  0.005
  0.36  ]
[0.1 0.9 0.1 0.9 0.9]




[0.9 0.1 0.1 0.1 0.9]


100%|██████████| 3/3 [00:11<00:00,  3.86s/it]

Done!





### moon

In [7]:
# Run configuration for the moon dataset.
model_name = "moon"
seeds = [3655, 3656, 3657]
train_nums = [10, 20, 30, 40, 50, 100, 200, 300]
target_colname = "target"

# All reports for this dataset are written below this directory.
output_dir = Path("../data/code_model_evaluation") / model_name
output_dir.mkdir(parents=True, exist_ok=True)

# Score every generated model, one row per (seed, train_num, trial).
df = evaluate_all_code_models(
    model_name=model_name,
    seeds=seeds,
    train_nums=train_nums,
    target_colname=target_colname,
)

# Derive the three summary tables from the raw per-trial results.
metrics_summary_df = metrics_summary(df=df)
code_model_execution_trial_count_summary_df = code_model_execution_trial_count_summary(df=df)
code_model_execution_trial_count_each_case_df = code_model_execution_trial_count_each_case(df=df)

# Persist the raw results and the summaries as TSV files.
save_raw_results(df=df, output_dir=output_dir)
save_metrics_summary(df=metrics_summary_df, output_dir=output_dir)
save_code_model_execution_trial_count_summary(
    df=code_model_execution_trial_count_summary_df, output_dir=output_dir
)
save_code_model_execution_trial_count_each_case(
    df=code_model_execution_trial_count_each_case_df, output_dir=output_dir
)

  0%|          | 0/3 [00:00<?, ?it/s]

[1 0 0 0 1 0 1 1 1 1]
[0 1 0 0 1 0 1 1 0 1]
[0 1 0 0 1 0 1 1 0 1]
[1 0 0 0 1 0 1 1 1 1]
[0 1 0 0 1 0 1 1 0 1]
[0 1 0 0 1 0 1 1 0 1]
[0 1 0 0 1 0 1 1 0 1]
[0 1 0 0 1 0 1 1 0 1]
[0 1 0 0 1 0 1 1 0 1]
[1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1]
[0.5577 0.7497 0.5702 0.366  0.9298 0.539  1.     0.8448 0.5689 0.8568
 0.5572 0.6231 0.5372 0.4642 0.5375 0.2235 0.552  0.3165 1.     0.8081]
[1 0 0 0 1 0 1 1 1 1 1 0 1 0 1 0 1 0 1 0]
[1 0 0 0 1 0 1 1 1 1 1 0 1 0 1 0 1 0 1 0]
[1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1]
[1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1]
[0 1 0 0 1 0 1 1 0 1 0 0 0 0 0 0 0 0 1 1]
[1 0 0 0 1 0 1 1 1 1 1 0 1 0 1 0 1 0 1 0]
[1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1]
[1 0 0 0 1 0 1 1 1 1 1 0 1 0 1 0 1 0 1 0]
[1 0 0 0 1]
[0 1 0 0 1 0 1 1 0 1 0 0 0 0 0 0 0 0 1 1 0 0 1 0 1 1 0 0 0 0]
[0 1 0 0 1 0 1 1 0 1 0 0 0 0 0 0 0 0 1 1 0 0 1 0 1 1 0 0 0 0]
[0.9 0.1 0.1 0.1 0.9]
[0 1 0 0 1 0 1 1 0 1 0 0 0 0 0 0 0 0 1 1 0 0 1 0 1 1 0 0 0 0]
[1 0 0 0 1 0 1 1 1 1]


 33%|███▎      | 1/3 [00:00<00:00,  3.28it/s]

[0.985  1.     0.7388 0.4397 1.     0.8485 1.     0.9687 1.     0.413 ]
[0.985  1.     0.7388 0.4397 1.     0.8485 1.     0.9687 1.     0.413 ]
[0.985  1.     0.7388 0.4397 1.     0.8485 1.     0.9687 1.     0.413 ]
[0.63353216 0.75121447 0.5367337  0.527572   0.68851802 0.54800171
 0.74453986 0.60956879 0.68477421 0.54673815]




[1 0 0 0 1 0 0 1 1 0]
[1 0 0 0 1 0 0 1 1 0]
[1 0 0 0 1 0 0 1 1 0]
[0.72412177 0.84097493 0.63830156 0.6630688  0.77643314 0.63876318
 0.83623812 0.6983599  0.77381857 0.68836788]
[0.985  1.     0.7388 0.4397 1.     0.8485 1.     0.9687 1.     0.413 ]
[1 0 0 0 1 0 0 1 1 0]
[0.9 0.1 0.1 0.1 0.9 0.1 0.1 0.9 0.9 0.1]
[0.985  1.     0.7388 0.4397 1.     0.8485 1.     0.9687 1.     0.413 ]
[0.9 0.1 0.1 0.1 0.9]
[0.63353216 0.75121447 0.5367337  0.527572   0.68851802 0.54800171
 0.74453986 0.60956879 0.68477421 0.54673815 0.55840714 0.52403147
 0.65443692 0.55492743 0.53414676 0.66991486 0.62024777 0.75134527
 0.59124149 0.51929042]
[1 0 0 0 1]
[0.985  1.     0.7388 0.4397 1.    ]
[0.985  1.     0.7388 0.4397 1.    ]
[1 1 1 1 1]
[0.70620016 0.70339139 0.61396266 0.4610789  0.72829712]
[0.985  1.     0.7388 0.4397 1.     0.8485 1.     0.9687 1.     0.413
 0.8065 0.598  1.     0.8273 0.462  1.     0.9836 1.     0.8784 0.5087]
[0.985  1.     0.7388 0.4397 1.     0.8485 1.     0.9687 1.     0.413



[0.985  1.     0.7388 0.4397 1.     0.8485 1.     0.9687 1.     0.413
 0.8065 0.598  1.     0.8273 0.462  1.     0.9836 1.     0.8784 0.5087]
[0.985  1.     0.7388 0.4397 1.    ]
[0.80780341 0.81663297 0.73582878 0.61711011 0.82662582 0.77236258
 0.81105808 0.80478914 0.82289706 0.60348325 0.75766128 0.68360742
 0.8201258  0.76489386 0.62611832 0.82822509 0.80833074 0.8110121
 0.77871609 0.64746161]
[0.985  1.     0.7388 0.4397 1.     0.8485 1.     0.9687 1.     0.413 ]
[0.9 0.1 0.1 0.1 0.9]
[0.985  1.     0.7388 0.4397 1.    ]
[0.985  1.     0.7388 0.4397 1.    ]




[1 0 1 0 1]
[1 0 1 0 1]




[1 0 1 0 1]
[1 1 1 1 1]


 67%|██████▋   | 2/3 [00:01<00:00,  1.45it/s]

[0.9517 0.9828 1.     0.5555 0.9606 0.4271 0.5682 0.8743 0.5058 0.938 ]
[0.9517 0.9828 1.     0.5555 0.9606 0.4271 0.5682 0.8743 0.5058 0.938 ]
[0.68664916 0.71818978 0.84110862 0.64106741 0.70286954 0.64702781
 0.64611374 0.66273361 0.63887854 0.69423634]
[1 1 0 1 1 0 1 1 0 1]
[0.569 0.871 1.    0.16  0.722 0.212 0.204 0.351 0.141 0.64 ]
[1 1 0 0 1 0 0 1 0 1]
[0.80064013 0.80755488 0.81932781 0.66674507 0.80221562 0.61308534
 0.67141679 0.77897446 0.6467537  0.79621439]
[0.68664916 0.71818978 0.84110862 0.64106741 0.70286954 0.64702781
 0.64611374 0.66273361 0.63887854 0.69423634]


100%|██████████| 3/3 [00:01<00:00,  1.88it/s]

[1 1 1 1 1 1 1 1 1 1]
[1 1 1 1 1 1 1 1 1 1]
[0.9517 0.9828 1.     0.5555 0.9606]
Done!





### titanic

In [8]:
# Run configuration for the titanic dataset.
model_name = "titanic"
seeds = [3655, 3656, 3657]
train_nums = [6, 8, 10, 20, 30, 40, 50]
target_colname = "survived"

# All reports for this dataset are written below this directory.
output_dir = Path("../data/code_model_evaluation") / model_name
output_dir.mkdir(parents=True, exist_ok=True)

# Score every generated model, one row per (seed, train_num, trial).
df = evaluate_all_code_models(
    model_name=model_name,
    seeds=seeds,
    train_nums=train_nums,
    target_colname=target_colname,
)

# Derive the three summary tables from the raw per-trial results.
metrics_summary_df = metrics_summary(df=df)
code_model_execution_trial_count_summary_df = code_model_execution_trial_count_summary(df=df)
code_model_execution_trial_count_each_case_df = code_model_execution_trial_count_each_case(df=df)

# Persist the raw results and the summaries as TSV files.
save_raw_results(df=df, output_dir=output_dir)
save_metrics_summary(df=metrics_summary_df, output_dir=output_dir)
save_code_model_execution_trial_count_summary(
    df=code_model_execution_trial_count_summary_df, output_dir=output_dir
)
save_code_model_execution_trial_count_each_case(
    df=code_model_execution_trial_count_each_case_df, output_dir=output_dir
)

100%|██████████| 3/3 [00:06<00:00,  2.30s/it]

Done!





### circle

In [9]:
# Run configuration for the circle dataset.
# The single empty-string entries give one pass with blank seed/train_num labels.
model_name = "circle"
seeds = [""]
train_nums = [""]
target_colname = "Target"

# All reports for this dataset are written below this directory.
output_dir = Path("../data/code_model_evaluation") / model_name
output_dir.mkdir(parents=True, exist_ok=True)

# Score every generated model, one row per trial.
df = evaluate_all_code_models_circle(
    model_name=model_name,
    seeds=seeds,
    train_nums=train_nums,
    target_colname=target_colname,
)

# Derive the three summary tables from the raw per-trial results.
metrics_summary_df = metrics_summary(df=df)
code_model_execution_trial_count_summary_df = code_model_execution_trial_count_summary(df=df)
code_model_execution_trial_count_each_case_df = code_model_execution_trial_count_each_case(df=df)

# Persist the raw results and the summaries as TSV files.
save_raw_results(df=df, output_dir=output_dir)
save_metrics_summary(df=metrics_summary_df, output_dir=output_dir)
save_code_model_execution_trial_count_summary(
    df=code_model_execution_trial_count_summary_df, output_dir=output_dir
)
save_code_model_execution_trial_count_each_case(
    df=code_model_execution_trial_count_each_case_df, output_dir=output_dir
)

100%|██████████| 1/1 [00:00<00:00,  5.39it/s]

Done!





### text

In [10]:
# Run configuration for the text dataset.
# The single empty-string entries give one pass with blank seed/train_num labels.
model_name = "text"
seeds = [""]
train_nums = [""]
target_colname = "Target"

# All reports for this dataset are written below this directory.
output_dir = Path("../data/code_model_evaluation") / model_name
output_dir.mkdir(parents=True, exist_ok=True)

# Score every generated model, one row per trial.
df = evaluate_all_code_models_text(
    model_name=model_name,
    seeds=seeds,
    train_nums=train_nums,
    target_colname=target_colname,
)

# Derive the three summary tables from the raw per-trial results.
metrics_summary_df = metrics_summary(df=df)
code_model_execution_trial_count_summary_df = code_model_execution_trial_count_summary(df=df)
code_model_execution_trial_count_each_case_df = code_model_execution_trial_count_each_case(df=df)

# Persist the raw results and the summaries as TSV files.
save_raw_results(df=df, output_dir=output_dir)
save_metrics_summary(df=metrics_summary_df, output_dir=output_dir)
save_code_model_execution_trial_count_summary(
    df=code_model_execution_trial_count_summary_df, output_dir=output_dir
)
save_code_model_execution_trial_count_each_case(
    df=code_model_execution_trial_count_each_case_df, output_dir=output_dir
)

100%|██████████| 1/1 [02:21<00:00, 141.07s/it]

Done!



