# Pseudo dataset

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/fuyu-quant/IBLM/blob/main/examples/iblbagging/pseudodata_claude.ipynb)

In [1]:
%%capture
%pip install iblm --upgrade

In [2]:
import pandas as pd
import numpy as np

from iblm import IBLBaggingModel

from sklearn.datasets import make_classification

### Preparing data sets

In [3]:
# --- Dataset configuration ---------------------------------------------------
num_train_data = 10000  # NOTE(review): an earlier note here read "50-300"; confirm the intended range
num_test_data = 1000
sample = num_train_data + num_test_data  # total number of rows to generate
n_informative = 2
n_redundant = 0
n_features = n_informative + n_redundant
weights = [0.5, 0.5]
flip_y = 0
seed = 3655  # seeds listed by the author: 3655, 3656, 3657

# train_test_split is deliberately not used so that the number of test rows stays fixed.
x, y = make_classification(
    n_samples=sample,  # number of rows
    n_features=n_features,  # number of feature columns
    n_informative=n_informative,  # features that carry signal for the label
    n_redundant=n_redundant,  # redundant features
    weights=weights,  # class proportions for labels [0, 1]
    flip_y=flip_y,  # fraction of samples whose label is randomly reassigned (label noise)
    random_state=seed,
)

# Round the features to 3 decimals and wrap them in a DataFrame; labels become plain ints.
x = pd.DataFrame(np.round(x, decimals=3))
y = y.astype(int)

# The first `num_test_data` rows form the test split; everything after is the training split.
x_test, x_train = x.iloc[:num_test_data], x.iloc[num_test_data:]
y_test, y_train = y[:num_test_data], y[num_test_data:]

### IBLBagging settings

In [7]:
# Configure the bagging model: Claude API backend, binary objective, five models.
# NOTE(review): max_sample / min_sample look like bounds on the rows sampled per model — confirm against the iblm docs.
iblbagging = IBLBaggingModel(
    api_type="claude", model_name="claude-3-opus-20240229", objective="binary",
    num_model=5, max_sample=2000, min_sample=300,
)

In [8]:
# Fit on the training split with temperature 0.2 and keep the returned value for inspection.
code_models = iblbagging.fit(x_train, y_train, temperature=0.2)

2024-03-10 20:23:17,492 [iblm.ibl][INFO] (ibl:ibl.py:fit:167)
2024-03-10 20:23:36,506 [iblm.ibl][INFO] (ibl:ibl.py:fit:167)
2024-03-10 20:24:06,756 [iblm.ibl][INFO] (ibl:ibl.py:fit:167)
2024-03-10 20:24:25,459 [iblm.ibl][INFO] (ibl:ibl.py:fit:167)
2024-03-10 20:25:09,084 [iblm.ibl][INFO] (ibl:ibl.py:fit:167)


In [9]:
# Display the fit result; the recorded output starts with a ('model_3', {'code_model': '<predict source>' ...}) tuple.
code_models

[('model_3',
  {'code_model': 'def predict(x):\n    import numpy as np\n\n    df = x.copy()\n    output = []\n    for index, row in df.iterrows():\n        # Do not change the code before this point.\n        # Please describe the process required to make the prediction below.\n        x1 = row[0]\n        x2 = row[1]\n\n        # Check if point is in upper right quadrant \n        if x1 > 0 and x2 > 0:\n            # Points in this region are very likely label 1\n            y = 0.85\n        # Check if point is in lower right quadrant\n        elif x1 > 0 and x2 < 0:\n            # Points in this region are somewhat likely label 0 \n            y = 0.3\n        # Check if point is far in the lower left quadrant\n        elif x1 < -1.5 and x2 < -1.5:\n            # Points in this region are very likely label 0\n            y = 0.05\n        # Check if point is in the upper left quadrant\n        elif x1 < 0 and x2 > 1:\n            # Points in this region are very likely label 1\n    

In [14]:
top_model = 6

# Call predict_ on the test split with second argument 1 .. top_model - 1 and print one metrics dict per call.
# NOTE(review): the second argument presumably selects how many top models are combined — confirm in iblm.
for k in range(1, top_model):
    y_pred = iblbagging.predict_(x_test, k)
    metrics = iblbagging.evaluate(y_test, y_pred)
    print(metrics)

{'roc_auc': 0.858282, 'pr_auc': 0.840579, 'accuracy': 0.819, 'recall': 0.727273, 'precision': 0.903846, 'f1_score': 0.806002}
{'roc_auc': 0.879683, 'pr_auc': 0.845724, 'accuracy': 0.873, 'recall': 0.880077, 'precision': 0.875, 'f1_score': 0.877531}
{'roc_auc': 0.881293, 'pr_auc': 0.838539, 'accuracy': 0.873, 'recall': 0.880077, 'precision': 0.875, 'f1_score': 0.877531}
{'roc_auc': 0.898699, 'pr_auc': 0.897651, 'accuracy': 0.875, 'recall': 0.88588, 'precision': 0.874046, 'f1_score': 0.879923}
{'roc_auc': 0.899256, 'pr_auc': 0.896666, 'accuracy': 0.848, 'recall': 0.814313, 'precision': 0.8826, 'f1_score': 0.847082}
