# Pseudo data set

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/fuyu-quant/IBLM/blob/main/examples/iblmodel/iblmodel_pseudodata.ipynb)

In [1]:
%%capture
!pip install iblm --upgrade

In [2]:
import numpy as np
import pandas as pd
import string
from langchain.llms import OpenAI
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.datasets import make_classification

from iblm import IBLModel

import os
#os.environ["OPENAI_API_KEY"] = "OPENAI_API_KEY"

### Preparing data sets

In [3]:
# Configuration for the synthetic data set.
columns = 4                  # number of feature columns
train_data = 40              # total number of training rows (both classes together)
sampling = train_data // 2   # rows drawn per class, so the training set is balanced
seed = 3656                  # shared seed for data generation and for sampling

# Generate a synthetic classification problem; features are rounded to two decimals.
# y already holds integer class labels, so it needs no rounding.
X, y = make_classification(n_samples=1000, n_features=columns, random_state=seed)
X = np.round(X, 2)

# Name the feature columns 'a', 'b', 'c', ...
column_name = list(string.ascii_lowercase[:columns])

df = pd.DataFrame(X, columns=column_name)
df['target'] = y

# Draw the same number of rows from each class.
df_1 = df[df['target'] == 1].sample(n=sampling, random_state=seed)
df_0 = df[df['target'] == 0].sample(n=sampling, random_state=seed)

# Interleave the two samples (1, 0, 1, 0, ...) with a single label lookup instead
# of growing a DataFrame with pd.concat inside a loop, which is quadratic.
train_index = [label for pair in zip(df_1.index, df_0.index) for label in pair]
df_train = df.loc[train_index].copy()
df_train['target'] = df_train['target'].astype(int)

# Every row that was not sampled for training becomes part of the test set.
df_test = df.drop(df_train.index)
assert len(df_train) + len(df_test) == len(df), "train/test split lost or duplicated rows"

df_train.head()

Unnamed: 0,a,b,c,d,target
755,-0.21,-0.63,0.57,0.98,1
384,0.78,0.32,-0.6,-1.0,0
692,0.48,-0.3,-0.0,0.03,1
650,1.05,-0.09,-0.43,-0.67,0
117,-0.73,-0.21,0.5,0.82,1


In [4]:
# Separate the label column from the feature columns of the training set.
y_train = df_train['target']
x_train = df_train.drop(columns='target')

### Training

In [5]:
# Settings handed to IBLModel.
params = {'columns_name': True, 'objective': 'classification'}

# LLM wrapper that is passed to IBLModel.
llm_model = OpenAI(model_name='gpt-4-0613', temperature=0)

iblm = IBLModel(params=params, llm_model=llm_model)

In [11]:
# Disabled alternative: call fit() with an explicit model name and file path.
# NOTE(review): presumably this also stores the generated model under file_path —
# confirm against the iblm documentation before enabling.
#file_path = '/content/'

#model = iblm.fit(x_train, y_train, model_name = 'pseudodata', file_path=file_path)
# Fit on the balanced training sample; the result is printed in the next cell.
model = iblm.fit(x_train, y_train)

In [12]:
# Code Model: the value returned by fit() prints as Python source code,
# i.e. the generated predict() function shown in the output below.
print(model)

import numpy as np
import pandas as pd

def predict(x):
    df = x.copy()
    
    output = []
    for index, row in df.iterrows():
        sum_row = row['a'] + row['b'] + row['c'] + row['d']
        if sum_row > 0:
            y = 0.9
        else:
            y = 0.1
        output.append(y)
    return np.array(output)

data = pd.DataFrame({
    'a': [-0.21, 0.78, 0.48, 1.05, -0.73],
    'b': [-0.63, 0.32, -0.3, -0.09, -0.21],
    'c': [0.57, -0.6, -0.0, -0.43, 0.5],
    'd': [0.98, -1.0, 0.03, -0.67, 0.82],
    'target': [1.0, 0.0, 1.0, 0.0, 1.0]
})

print(predict(data))


### Prediction

In [13]:
# Separate the label column from the feature columns of the held-out set.
y_test = df_test['target']
x_test = df_test.drop(columns='target')

In [14]:
# Model scores for the test rows; they are treated as probabilities below.
y_proba = iblm.predict(x_test)
# Threshold the scores at 0.5 to obtain hard 0/1 class predictions.
y_pred = (y_proba > 0.5).astype(int)

[0.9 0.1 0.9 0.1 0.9]


In [15]:
# Accuracy
accuracy = round(accuracy_score(y_true=y_test, y_pred=y_pred), ndigits=4)
print(f'Accuracy: {accuracy}')

# Precision
precision = round(precision_score(y_true=y_test, y_pred=y_pred), ndigits=4)
print(f'Precision: {precision}')

# Recall
recall = round(recall_score(y_true=y_test, y_pred=y_pred), ndigits=4)
print(f'Recall: {recall}')

# F1 score
f1 = round(f1_score(y_true=y_test, y_pred=y_pred), ndigits=4)
print(f'F1 score: {f1}')

# ROC-AUC is computed from the model scores (y_proba), not from the
# thresholded class predictions (y_pred).
roc_auc = round(roc_auc_score(y_true=y_test, y_score=y_proba), ndigits=4)
print(f'ROC-AUC: {roc_auc}')

Accuracy: 0.9135
Precision: 0.8898
Recall: 0.9436
F1 score: 0.9159
ROC-AUC: 0.9136
