# Pseudo dataset
* Get sample data [here](https://github.com/fuyu-quant/IBLM/tree/main/datasets).

In [None]:
%%capture
!pip install iblm --upgrade

In [1]:
%%capture
!pip install git+https://github.com/fuyu-quant/IBLM.git

In [2]:
import importlib.metadata

# Print the version of the installed IBLM distribution.
# importlib.metadata is the standard-library replacement for the deprecated
# pkg_resources API and reports the same version string.
print(importlib.metadata.version('IBLM'))

0.0.21


### Training

In [3]:
import pandas as pd
import numpy as np
from langchain.llms import OpenAI
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

from iblm import IBLMClassifier

In [7]:
# Dataset selector: the two numbers below are substituted into the CSV file name.
columns = 4
train_data = 10
file_name = f'pseudodata_{columns}f_{train_data}_train.csv'

# Read the training split; the last column is used as the target and
# every other column as a feature.
df = pd.read_csv(f'../data/pseudodata/{file_name}')
x_train = df.drop(columns=df.columns[-1])
y_train = df.iloc[:, -1]

In [8]:
# Name of the LLM handed to IBLMClassifier.
llm_model_name = 'gpt-4'

# Extra options forwarded to IBLMClassifier.
# NOTE(review): 'columns_name' presumably makes the generated code refer to
# features by column name -- confirm against the IBLM documentation.
params = dict(columns_name=True)

iblm = IBLMClassifier(
    llm_model_name=llm_model_name,
    params=params,
)

In [16]:
# Location passed to fit() as file_path (presumably where the generated
# model is stored -- confirm against the IBLM documentation).
file_path = '../models/pseudodata/'

# Create the model from the training data; this issues an LLM request,
# and the token usage and cost are reported in the cell output.
model = iblm.fit(
    x_train,
    y_train,
    model_name='pseudodata',
    file_path=file_path,
)

> Start of model creating.
Tokens Used: 875
	Prompt Tokens: 716
	Completion Tokens: 159
Successful Requests: 1
Total Cost (USD): $0.03102


In [17]:
# Show what fit() returned: the source code of the generated `predict`
# function that was created for this dataset.
print(model)

import numpy as np

def predict(x):
    df = x.copy()
    output = []
    for index, row in df.iterrows():
        # Do not change the code before this point.
        # Please describe the process required to make the prediction below.

        # Calculate the weighted sum of the features
        weighted_sum = row['a'] * 0.1 + row['b'] * 0.2 + row['c'] * 0.3 + row['d'] * 0.4

        # Apply a sigmoid function to the weighted sum to get the probability
        y = 1 / (1 + np.exp(-weighted_sum))

        # Do not change the code after this point.
        output.append(y)
    return np.array(output)


### Prediction

In [13]:
# Dataset selector: the two numbers below are substituted into the CSV file name.
columns = 4
train_data = 10
file_name = f'pseudodata_{columns}f_{train_data}_test.csv'

# Read the test split; the last column is used as the target and
# every other column as a feature.
df = pd.read_csv(f'../data/pseudodata/{file_name}')
x_test = df.drop(columns=df.columns[-1])
y_test = df.iloc[:, -1]

In [14]:
# Score the test rows with the fitted model. The outputs are treated as
# positive-class probabilities (the generated model shown above ends in a sigmoid).
y_proba = iblm.predict(x_test)
# Turn the scores into hard 0/1 labels using a 0.5 decision threshold.
y_pred = (y_proba > 0.5).astype(int)

In [15]:
# Label-based metrics: computed from the thresholded class predictions (y_pred).
accuracy = round(accuracy_score(y_test, y_pred), 4)
precision = round(precision_score(y_test, y_pred), 4)
recall = round(recall_score(y_test, y_pred), 4)
f1 = round(f1_score(y_test, y_pred), 4)

# ROC-AUC is computed from the predicted scores (y_proba), not from the
# thresholded class labels.
roc_auc = round(roc_auc_score(y_test, y_proba), 4)

# Report every metric, rounded to four decimal places.
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 score: {f1}')
print(f'ROC-AUC: {roc_auc}')

Accuracy: 0.5141
Precision: 0.5137
Recall: 0.5665
F1 score: 0.5388
ROC-AUC: 0.4878


### Creating Multiple Models

In [46]:
# Dataset selector: the two numbers below are substituted into the CSV file
# name and into each model name.
columns = 4
train_data = 10
file_name = f'pseudodata_{columns}f_{train_data}_train.csv'

# Read the training split; the last column is used as the target and
# every other column as a feature.
df = pd.read_csv(f'../data/pseudodata/{file_name}')
x_train = df.drop(columns=df.columns[-1])
y_train = df.iloc[:, -1]

# Fit 30 models on the same data, numbering the model names 1..30.
# Each fit() call issues its own LLM request (see the per-call cost in the
# output); `model` ends up holding only the result of the last call.
file_path = '../models/pseudodata/'
for i in range(1, 31):
    model = iblm.fit(
        x_train,
        y_train,
        model_name=f'pseudodata_{columns}f_{train_data}_{i}',
        file_path=file_path,
    )

> Start of model creating.
Tokens Used: 7629
	Prompt Tokens: 7470
	Completion Tokens: 159
Successful Requests: 1
Total Cost (USD): $0.23364
> Start of model creating.
Tokens Used: 7629
	Prompt Tokens: 7470
	Completion Tokens: 159
Successful Requests: 1
Total Cost (USD): $0.23364
> Start of model creating.
Tokens Used: 7618
	Prompt Tokens: 7470
	Completion Tokens: 148
Successful Requests: 1
Total Cost (USD): $0.23298
> Start of model creating.
Tokens Used: 7629
	Prompt Tokens: 7470
	Completion Tokens: 159
Successful Requests: 1
Total Cost (USD): $0.23364
> Start of model creating.
Tokens Used: 7642
	Prompt Tokens: 7470
	Completion Tokens: 172
Successful Requests: 1
Total Cost (USD): $0.23442
> Start of model creating.
Tokens Used: 7599
	Prompt Tokens: 7470
	Completion Tokens: 129
Successful Requests: 1
Total Cost (USD): $0.23184
> Start of model creating.
Tokens Used: 7629
	Prompt Tokens: 7470
	Completion Tokens: 159
Successful Requests: 1
Total Cost (USD): $0.23364
> Start of model cre