# Pseudo dataset

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/fuyu-quant/IBLM/blob/main/examples/iblmodel/pseudodat_openai.ipynb)

In [1]:
%%capture
# Install or upgrade iblm into the kernel's environment; %%capture hides pip's output.
# NOTE(review): the version is unpinned, so results may differ between iblm releases.
%pip install iblm --upgrade

In [2]:
import os
import pandas as pd
import numpy as np

from iblm import IBLModel

from sklearn.datasets import make_classification

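# An OpenAI API key is presumably required (api_type="openai" is used below).
# Uncomment and replace the placeholder to set it here; never commit a real key.
#os.environ["OPENAI_API_KEY"] = "YOUR_API_KEY"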

### Preparing the dataset

In [3]:
# --- Synthetic binary-classification problem --------------------------------
num_train_data = 500  # NOTE(review): an earlier note suggested a 50-300 range
num_test_data = 1000
sample = num_train_data + num_test_data
n_informative = 2
n_redundant = 0
n_features = n_informative + n_redundant
weights = [0.5, 0.5]
flip_y = 0
seed = 3655  # alternatives noted by the author: 3656, 3657

x, y = make_classification(
    n_samples=sample,              # total number of rows
    n_features=n_features,         # number of feature columns
    n_informative=n_informative,   # features that carry label signal
    n_redundant=n_redundant,       # redundant features
    weights=weights,               # class proportions for labels [0, 1]
    flip_y=flip_y,                 # fraction of labels flipped to the other class
    random_state=seed,
)

# Round to three decimals, then wrap in a DataFrame (columns are 0..n_features-1).
x = pd.DataFrame(np.round(x, decimals=3))

# The first `num_test_data` rows form the test split; the remainder is for training.
x_test, x_train = x.iloc[:num_test_data], x.iloc[num_test_data:]
y_test, y_train = y[:num_test_data], y[num_test_data:]

### GPT-4 Turbo

In [4]:
# Build an IBLModel that uses the OpenAI API with the "gpt-4-0125-preview"
# model and a binary objective.
iblm = IBLModel(
    api_type="openai",
    model_name="gpt-4-0125-preview",
    objective="binary"
    )

* Learning

In [5]:
# Fit on the training split. Reuse the notebook-level `seed` (set with the
# dataset parameters) so the data seed and the fit seed cannot drift apart
# when the seed is changed.
model = iblm.fit(x_train, y_train, seed=seed)

2024-03-10 18:36:25,035 [iblm.ibl][INFO] (ibl:ibl.py:fit:153)


In [6]:
# Show what fit returned; the saved output shows the source of a generated predict(x) function.
print(model)

#########
def predict(x):
    import numpy as np

    df = x.copy()
    output = []
    for index, row in df.iterrows():
        # Do not change the code before this point.
        # Please describe the process required to make the prediction below.
        
        # Simple logistic regression coefficients derived from the dataset manually or through a simple algorithm
        # These coefficients are placeholders and should ideally be calculated using a logistic regression model
        # trained on the dataset provided. This is a simplistic approach for demonstration purposes.
        intercept = 0.5  # Intercept (bias) of the logistic regression model
        coef_1 = 0.7    # Coefficient for the first feature
        coef_2 = 0.3    # Coefficient for the second feature
        
        # Logistic regression model equation
        log_odds = intercept + coef_1 * row[0] + coef_2 * row[1]
        
        # Sigmoid function to convert log odds to probability
        y = 1 / (1 + np.e

* Prediction

In [7]:
# Predict on the held-out test features with the fitted model.
y_pred = iblm.predict(x_test)

* Evaluation

In [8]:
# Score the predictions against the true test labels; the saved output is a dict of metrics.
iblm.evaluate(y_test, y_pred)

{'roc_auc': 0.975932,
 'pr_auc': 0.976062,
 'accuracy': 0.857,
 'recall': 0.996032,
 'precision': 0.780715,
 'f1_score': 0.875327}