# Pseudo dataset

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/fuyu-quant/IBLM/blob/main/examples/iblbagging/pseudodata_openai.ipynb)

In [1]:
%%capture
%pip install iblm --upgrade

In [2]:
import pandas as pd
import numpy as np

from iblm import IBLBaggingModel

from sklearn.datasets import make_classification

### Preparing the dataset

In [3]:
# --- Dataset configuration ---------------------------------------------------
num_train_data = 10000  # training rows (an older note here read "50-300"; TODO confirm the intended range)
num_test_data = 1000  # held-out test rows
sample = num_train_data + num_test_data  # total rows to generate
n_informative = 2
n_redundant = 0
n_features = n_informative + n_redundant
weights = [0.5, 0.5]
flip_y = 0
seed = 3655  # other seeds noted by the author: 3655, 3656, 3657

# train_test_split is deliberately not used, so that the test set always
# contains exactly num_test_data rows; the split is a plain positional slice.
x, y = make_classification(
    n_samples=sample,  # number of rows
    n_features=n_features,  # number of features
    n_informative=n_informative,  # features that carry signal for the label
    n_redundant=n_redundant,  # redundant features
    weights=weights,  # class proportions for labels [0, 1]
    flip_y=flip_y,  # fraction of samples whose label is assigned at random
    random_state=seed,
)

# Round the features to three decimals and wrap them in a DataFrame.
x = pd.DataFrame(np.round(x, decimals=3))
y = y.astype(int)

# The first num_test_data rows are the test set; everything after is training data.
x_test, x_train = x.iloc[:num_test_data], x.iloc[num_test_data:]
y_test, y_train = y[:num_test_data], y[num_test_data:]

### IBLBagging settings

In [6]:
# Build the bagging model that is fitted and evaluated in the cells below.
# NOTE(review): the meaning of num_model / max_sample / min_sample is inferred
# from their names only (ensemble size and per-model sample-size bounds,
# presumably) -- confirm against the iblm documentation.
iblbagging = IBLBaggingModel(
    api_type="openai",
    model_name="gpt-4-0125-preview",
    objective="binary",
    num_model=20,
    max_sample=2000,
    min_sample=300,
)

In [7]:
# Fit on the training split; the value returned by fit is kept in code_models
# so the generated models can be inspected in the next cell.
code_models = iblbagging.fit(x_train, y_train, temperature=0.2)

2024-03-10 18:41:38,941 [iblm.ibl][INFO] (ibl:ibl.py:fit:153)
2024-03-10 18:41:47,975 [iblm.ibl][INFO] (ibl:ibl.py:fit:153)
2024-03-10 18:42:01,886 [iblm.ibl][INFO] (ibl:ibl.py:fit:153)
2024-03-10 18:42:13,703 [iblm.ibl][INFO] (ibl:ibl.py:fit:153)
2024-03-10 18:42:28,133 [iblm.ibl][INFO] (ibl:ibl.py:fit:153)
2024-03-10 18:42:46,786 [iblm.ibl][INFO] (ibl:ibl.py:fit:153)
2024-03-10 18:43:00,170 [iblm.ibl][INFO] (ibl:ibl.py:fit:153)
2024-03-10 18:43:10,694 [iblm.ibl][INFO] (ibl:ibl.py:fit:153)
2024-03-10 18:43:20,536 [iblm.ibl][INFO] (ibl:ibl.py:fit:153)
2024-03-10 18:43:39,094 [iblm.ibl][INFO] (ibl:ibl.py:fit:153)
2024-03-10 18:43:49,498 [iblm.ibl][INFO] (ibl:ibl.py:fit:153)
2024-03-10 18:44:00,119 [iblm.ibl][INFO] (ibl:ibl.py:fit:153)
2024-03-10 18:44:15,677 [iblm.ibl][INFO] (ibl:ibl.py:fit:153)
2024-03-10 18:44:26,336 [iblm.ibl][INFO] (ibl:ibl.py:fit:153)
2024-03-10 18:44:40,737 [iblm.ibl][INFO] (ibl:ibl.py:fit:153)
2024-03-10 18:45:06,749 [iblm.ibl][INFO] (ibl:ibl.py:fit:153)
2024-03-

In [8]:
# Display what fit returned. Per the saved output this is a list of
# (model name, info dict) pairs whose 'code_model' entry holds the generated
# Python source of a predict(x) function.
code_models

[('model_2',
  {'code_model': '#########\ndef predict(x):\n    import numpy as np\n\n    df = x.copy()\n    output = []\n    for index, row in df.iterrows():\n        # Do not change the code before this point.\n        # Please describe the process required to make the prediction below.\n        \n        # Simple logistic regression coefficients, manually estimated from the dataset\n        # These values are placeholders and should ideally be determined using a logistic regression model fitting\n        # However, as per the instructions, we are not using any existing machine learning models for predictions.\n        intercept = 0.5  # Intercept (bias)\n        coef_0 = 0.25  # Coefficient for the first feature\n        coef_1 = 0.75  # Coefficient for the second feature\n        \n        # Logistic regression formula to estimate probability\n        # p = 1 / (1 + e^-(intercept + coef_0*x0 + coef_1*x1))\n        log_odds = intercept + coef_0*row[0] + coef_1*row[1]\n        probabi

In [9]:
# Evaluate the ensemble on the test split for every setting i = 1 .. top_model.
# NOTE(review): i is presumably the number of best-ranked models combined by
# predict_ -- confirm against IBLBaggingModel.predict_.
top_model = 20  # largest setting to evaluate (equal to num_model used above)

# range() excludes its stop value, so the stop must be top_model + 1;
# the previous range(1, top_model) silently skipped the top_model-th setting.
for i in range(1, top_model + 1):
    y_pred = iblbagging.predict_(x_test, i)
    print(iblbagging.evaluate(y_test, y_pred))

{'roc_auc': 0.90496, 'pr_auc': 0.878409, 'accuracy': 0.811, 'recall': 0.943907, 'precision': 0.753086, 'f1_score': 0.837768}
{'roc_auc': 0.900645, 'pr_auc': 0.86965, 'accuracy': 0.76, 'recall': 0.959381, 'precision': 0.693706, 'f1_score': 0.805195}
{'roc_auc': 0.897666, 'pr_auc': 0.864396, 'accuracy': 0.73, 'recall': 0.967118, 'precision': 0.664011, 'f1_score': 0.787402}
{'roc_auc': 0.895583, 'pr_auc': 0.861315, 'accuracy': 0.7, 'recall': 0.972921, 'precision': 0.637516, 'f1_score': 0.770291}
{'roc_auc': 0.894134, 'pr_auc': 0.85927, 'accuracy': 0.691, 'recall': 0.974855, 'precision': 0.63, 'f1_score': 0.765376}
{'roc_auc': 0.894578, 'pr_auc': 0.86045, 'accuracy': 0.801, 'recall': 0.941973, 'precision': 0.742378, 'f1_score': 0.83035}
{'roc_auc': 0.893893, 'pr_auc': 0.859456, 'accuracy': 0.781, 'recall': 0.94971, 'precision': 0.717836, 'f1_score': 0.817652}
{'roc_auc': 0.893317, 'pr_auc': 0.858581, 'accuracy': 0.765, 'recall': 0.953578, 'precision': 0.700284, 'f1_score': 0.807535}
{'roc_