# Pseudo data set

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/fuyu-quant/IBLM/blob/main/examples/iblmodel/iblmodel_pseudodata.ipynb)

In [None]:
%%capture
!pip install iblm --upgrade

In [1]:
import numpy as np
import pandas as pd
import string
from langchain.llms import OpenAI
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.datasets import make_classification

from iblm import IBLModel

import os
#os.environ["OPENAI_API_KEY"] = "OPENAI_API_KEY"

## Preparing the dataset

In [100]:
# --- Configuration -----------------------------------------------------------
columns = 4                  # number of synthetic feature columns
train_data = 80              # total number of training rows (both classes)
sampling = train_data // 2   # training rows drawn from each class
sample = 400                 # total number of synthetic rows generated
seed = 3658                  # shared seed for data generation and sampling

# --- Synthetic binary-classification data ------------------------------------
X, y = make_classification(n_samples=sample, n_features=columns, random_state=seed)
# Two decimals keep the feature values short.
X = np.round(X, 2)
# y holds integer class labels (0/1), so it needs no rounding.

# Name the features 'a', 'b', 'c', ... (one lowercase letter per column).
column_name = list(string.ascii_lowercase[:columns])

df = pd.DataFrame(X, columns=column_name)
df['target'] = y

# --- Balanced training split -------------------------------------------------
# Draw the same number of rows from each class.
df_1 = df[df['target'] == 1].sample(n=sampling, random_state=seed)
df_0 = df[df['target'] == 0].sample(n=sampling, random_state=seed)
df_len = len(df_1)

# Alternate positive / negative rows (1, 0, 1, 0, ...) by interleaving the two
# sampled indexes and selecting every row in one .loc call, instead of growing
# a DataFrame with pd.concat inside a loop.
interleaved_index = [
    label
    for pair in zip(df_1.index, df_0.index)
    for label in pair
]
df_train = df.loc[interleaved_index].copy()
df_train['target'] = df_train['target'].astype(int)

# Every row that was not sampled for training becomes test data.
df_test = df.drop(df_train.index)

# Sanity checks: balanced training set, and train/test partition the full frame.
assert len(df_train) == 2 * df_len
assert len(df_train) + len(df_test) == len(df)

df_train.head()

Unnamed: 0,a,b,c,d,target
28,1.16,1.38,0.16,1.87,1
9,-0.75,-0.71,0.05,-1.03,0
105,0.64,-1.0,-1.35,-0.69,1
90,-0.6,-0.5,0.1,-0.76,0
160,1.0,-0.63,-1.34,-0.16,1


In [101]:
# Separate the label column from the feature columns of the training split.
y_train = df_train['target']
x_train = df_train.drop(columns='target')

## Training

In [102]:
# LLM backend handed to IBLModel: gpt-4-0613 with temperature set to 0.
llm_model = OpenAI(model_name='gpt-4-0613', temperature=0)

# Options forwarded to IBLModel.
# NOTE(review): 'columns_name' presumably makes the prompt use the DataFrame's
# column names — confirm against the iblm documentation.
params = dict(columns_name=True)
iblm = IBLModel(
    llm_model=llm_model,
    params=params,
    mode='classification',
)

In [103]:
# Optional output directory for the alternative fit() call shown below.
#file_path = '/content/'

# Alternative: also pass a model name and an output path.
# NOTE(review): presumably this saves the generated model code to file_path — confirm against the iblm docs.
#model = iblm.fit(x_train, y_train, model_name = 'pseudodata', file_path=file_path)
# Fit on the training split; the returned value is printed in the next cell.
model = iblm.fit(x_train, y_train)

Tokens Used: 2400
	Prompt Tokens: 2001
	Completion Tokens: 399
Successful Requests: 1
Total Cost (USD): $0.08396999999999999


In [104]:
# Show what fit() returned: the text of the LLM-generated prediction code.
print(model)

Here is a simple Python code that uses a basic logistic regression model to predict the probability that the "target" of the unknown data is 1. This code does not use any existing machine learning model, but rather implements the logistic regression model from scratch.

```python
import numpy as np
import pandas as pd

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def predict(x):
    df = x.copy()
    output = []
    for index, row in df.iterrows():
        # Do not change the code before this point.
        # Please describe the process required to make the prediction below.
        z = row['a'] + row['b'] + row['c'] + row['d']
        y = sigmoid(z)
        # Do not change the code after this point.
        output.append(y)
    return np.array(output)
```

This code first defines a helper function `sigmoid(x)` that implements the sigmoid function, which is used in logistic regression to map any real-valued number into the range [0, 1]. This function is then used in the `predict(x)

## Prediction

In [98]:
# Separate the label column from the feature columns of the held-out split.
y_test = df_test['target']
x_test = df_test.drop(columns='target')

In [99]:
# Score the held-out rows with the fitted model.
# NOTE(review): y_proba is presumably an array of probabilities that target == 1 — confirm.
# NOTE(review): the saved output of this cell is "SyntaxError: invalid syntax (<string>, line 1)",
# and its execution count (99) is lower than that of the training cells (100-104), so the notebook
# was not run top to bottom. Presumably the generated model text (prose plus a fenced code block)
# is not valid Python as-is — confirm, then re-run with Restart Kernel -> Run All.
y_proba = iblm.predict(x_test)
# Threshold at 0.5 to turn the scores into hard 0/1 class labels.
y_pred = (y_proba > 0.5).astype(int)

SyntaxError: invalid syntax (<string>, line 1)

In [None]:
def _report_metric(label, metric_fn, y_true, y_score):
    """Compute one metric, print it as ``<label>: <value>`` and return it.

    Args:
        label: Name printed in front of the value.
        metric_fn: Callable taking ``(y_true, y_score)`` and returning a number.
        y_true: Ground-truth labels.
        y_score: Predictions passed to ``metric_fn`` (hard labels or scores).

    Returns:
        The metric value rounded to 4 decimal places.
    """
    value = round(metric_fn(y_true, y_score), 4)
    print(f'{label}: {value}')
    return value


# Threshold-dependent metrics are scored on the hard 0/1 labels in y_pred.
accuracy = _report_metric('Accuracy', accuracy_score, y_test, y_pred)
precision = _report_metric('Precision', precision_score, y_test, y_pred)
recall = _report_metric('Recall', recall_score, y_test, y_pred)
f1 = _report_metric('F1 score', f1_score, y_test, y_pred)

# ROC-AUC is scored on y_proba (the un-thresholded model output), not on y_pred.
roc_auc = _report_metric('ROC-AUC', roc_auc_score, y_test, y_proba)

Accuracy: 0.16
Precision: 0.165
Recall: 0.1717
F1 score: 0.1683
ROC-AUC: 0.0738
