# Titanic dataset
* The sample data is available in the [`datasets` directory of the IBLM repository](https://github.com/fuyu-quant/IBLM/tree/main/datasets).

In [1]:
%%capture
!pip install git+https://github.com/fuyu-quant/IBLM.git

In [2]:
# Print the installed IBLM version so the recorded outputs can be tied to a release.
# importlib.metadata is the standard-library replacement for the deprecated
# pkg_resources API.
from importlib.metadata import version

print(version('IBLM'))

0.1.1


In [4]:
import pandas as pd
from langchain.llms import OpenAI
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

from iblm import IBLMClassifier


import os
#os.environ["OPENAI_API_KEY"] = "OPENAI_API_KEY"

In [5]:
# `n` selects which sampled split is read: ../data/titanic/titanic_<n>_train.csv.
n = 8

# Read the training sample, then separate the `survived` label from the features.
df = pd.read_csv(f'../data/titanic/titanic_{n}_train.csv')
y_train = df['survived']
x_train = df.drop(columns='survived')
print(len(x_train))

8


In [6]:
# Display the training features (only a handful of rows, so the full frame is shown).
x_train

Unnamed: 0,pclass,age,sibsp,parch,fare,sex_female,sex_male,embarked_C,embarked_Q,embarked_S,...,deck_A,deck_B,deck_C,deck_D,deck_E,deck_F,deck_G,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton
0,2,27.0,0,0,10.5,True,False,False,False,True,...,False,False,False,False,True,False,False,False,False,True
1,2,7.0,0,2,26.25,True,False,False,False,True,...,False,False,False,False,False,False,False,False,False,True
2,2,29.0,1,0,26.0,True,False,False,False,True,...,False,False,False,False,False,False,False,False,False,True
3,3,27.0,0,0,7.7958,False,True,False,False,True,...,False,False,False,False,False,False,False,False,False,True
4,3,20.0,0,0,9.225,False,True,False,False,True,...,False,False,False,False,False,False,False,False,False,True
5,3,21.0,0,0,8.4333,False,True,False,False,True,...,False,False,False,False,False,False,False,False,False,True
6,2,28.0,0,0,0.0,False,True,False,False,True,...,False,False,False,False,False,False,False,False,False,True
7,3,24.0,0,0,8.85,True,False,False,False,True,...,False,False,False,False,False,False,False,False,False,True


In [7]:
# Name of the LLM handed to IBLMClassifier.
llm_model_name = 'gpt-4'

# Extra options for the classifier.
# NOTE(review): `columns_name` presumably makes IBLM use the real column names in
# the prompt / generated code — confirm against the IBLM documentation.
params = dict(columns_name=True)

iblm = IBLMClassifier(params=params, llm_model_name=llm_model_name)

### Training

In [8]:
# Report the training-set size before making the LLM request.
print(f'Number of data:{len(x_train)}')

# Passed to fit() as `file_path` (presumably the directory where the generated
# model code is written — confirm against IBLM).
file_path = '../models/titanic/'
model = iblm.fit(x_train, y_train, file_path=file_path, model_name='titanic')

Number of data:8
> Start of model creating.
Tokens Used: 1357
	Prompt Tokens: 815
	Completion Tokens: 542
Successful Requests: 1
Total Cost (USD): $0.05696999999999999


In [10]:
# Print the value returned by iblm.fit(); the output is the Python source of the
# generated predict() function.
print(model)

import numpy as np

def predict(x):
    df = x.copy()
    output = []
    for index, row in df.iterrows():
        # Do not change the code before this point.
        
        # Calculate the probability based on the given data
        pclass = row['pclass']
        age = row['age']
        sibsp = row['sibsp']
        parch = row['parch']
        fare = row['fare']
        sex_female = row['sex_female']
        sex_male = row['sex_male']
        embarked_C = row['embarked_C']
        embarked_Q = row['embarked_Q']
        embarked_S = row['embarked_S']
        alive_no = row['alive_no']
        alive_yes = row['alive_yes']
        alone_False = row['alone_False']
        alone_True = row['alone_True']
        adult_male_False = row['adult_male_False']
        adult_male_True = row['adult_male_True']
        who_child = row['who_child']
        who_man = row['who_man']
        who_woman = row['who_woman']
        class_First = row['class_First']
        class_Second = row['class_Second

### Prediction

In [11]:
# Load the test CSV that matches the training sample (same `n`) and split it
# into the `survived` label and the feature columns.
df = pd.read_csv(f'../data/titanic/titanic_{n}_test.csv')
y_test = df['survived']
x_test = df.drop(columns='survived')

In [12]:
# Score the test features with the IBLM classifier.
y_proba = iblm.predict(x_test)
# Convert the scores to 0/1 class labels using a 0.5 cut-off.
y_pred = (y_proba > 0.5).astype(int)

In [13]:
# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# Precision
precision = precision_score(y_test, y_pred)
print(f'Precision: {precision}')

# Recall
recall = recall_score(y_test, y_pred)
print(f'Recall: {recall}')

# F1 score
f1 = f1_score(y_test, y_pred)
print(f'F1 score: {f1}')

# ROC-AUC is computed from the continuous scores (y_proba), not from the
# thresholded class labels (y_pred) used by the metrics above.
roc_auc = roc_auc_score(y_test, y_proba)
print(f'ROC-AUC: {roc_auc}')

Accuracy: 0.7859569648924122
Precision: 0.7350157728706624
Recall: 0.6893491124260355
F1 score: 0.7114503816793892
ROC-AUC: 0.8445876988219966


### Creating Multiple Models

In [14]:
# Reload the same n-row training sample used above.
n = 8

df = pd.read_csv(f'../data/titanic/titanic_{n}_train.csv')
y_train = df['survived']
x_train = df.drop(columns='survived')

# Fit 30 times on identical data, giving every run its own model name
# (titanic_<n>_1 ... titanic_<n>_30). Each iteration issues a separate LLM request.
file_path = '../models/titanic/'
for i in range(1, 31):
    model = iblm.fit(x_train, y_train, file_path=file_path, model_name=f'titanic_{n}_{i}')

> Start of model creating.
Tokens Used: 1134
	Prompt Tokens: 815
	Completion Tokens: 319
Successful Requests: 1
Total Cost (USD): $0.043590000000000004
> Start of model creating.
Tokens Used: 1326
	Prompt Tokens: 815
	Completion Tokens: 511
Successful Requests: 1
Total Cost (USD): $0.05511
> Start of model creating.
Tokens Used: 1197
	Prompt Tokens: 815
	Completion Tokens: 382
Successful Requests: 1
Total Cost (USD): $0.047369999999999995
> Start of model creating.
Tokens Used: 1407
	Prompt Tokens: 815
	Completion Tokens: 592
Successful Requests: 1
Total Cost (USD): $0.059969999999999996
> Start of model creating.
Tokens Used: 1638
	Prompt Tokens: 815
	Completion Tokens: 823
Successful Requests: 1
Total Cost (USD): $0.07382999999999999
> Start of model creating.
Tokens Used: 1421
	Prompt Tokens: 815
	Completion Tokens: 606
Successful Requests: 1
Total Cost (USD): $0.060809999999999996
> Start of model creating.
Tokens Used: 1293
	Prompt Tokens: 815
	Completion Tokens: 478
Successful Re

### Prediction with the saved models

In [15]:
import sys
import os

# Make the generated models importable as `models.titanic.<name>`.
# A notebook has no __file__, so the path is resolved from the working directory:
# the models are written to ../models/titanic/, hence the parent directory is the
# one that must be on sys.path.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:  # keep re-runs of this cell from adding duplicates
    sys.path.append(project_root)

In [19]:
import models.titanic.titanic_8_1 as codemodel

In [None]:
# The generated module's predict() takes the feature frame as its only argument.
codemodel.predict(x_test)

In [27]:
# Reload the test CSV for the current `n` and split it into the `survived`
# label and the feature columns.
df = pd.read_csv(f'../data/titanic/titanic_{n}_test.csv')
y_test = df['survived']
x_test = df.drop(columns='survived')

In [None]:
# Score the test features through the IBLM classifier object.
# NOTE(review): after the loop of 30 fits it is not visible here which generated
# model iblm.predict() uses — confirm against IBLM.
y_proba = iblm.predict(x_test)
# Convert the scores to 0/1 class labels using a 0.5 cut-off.
y_pred = (y_proba > 0.5).astype(int)


In [60]:

import importlib

# Index of the generated model to evaluate (titanic_8_1 ... titanic_8_30).
i = 3
# import_module accepts the dotted module name directly, so there is no need to
# build an import statement as a string and exec() it.
codemodel = importlib.import_module(f'models.titanic.titanic_8_{i}')

y_proba = codemodel.predict(x_test)
# NOTE(review): the scores recorded for this model are all negative, so the 0.5
# cut-off labels every row 0 (hence precision/recall of 0.0 while ROC-AUC stays
# high). Confirm whether the generated predict() really returns probabilities.
y_pred = (y_proba > 0.5).astype(int)

In [61]:
# Inspect the raw scores returned by the generated predict().
y_proba

array([-1.23085938, -0.59898477, -1.17026562, -0.88832488, -1.22539062,
       -1.21554687, -1.14748437, -1.18798438, -1.18798438, -0.85617597,
       -1.17026562, -0.99166062, -1.22539062, -1.22320312, -1.14748437,
       -1.13837187, -1.16317812, -1.20176562, -1.18798438, -1.10647812,
       -1.18247188, -1.20176562, -1.10647812, -1.13837188, -1.13837187,
       -1.16317812, -1.20570312, -1.13837187, -1.13837188, -1.22539062,
       -1.04906937, -0.59898477, -1.13837188, -1.21554687, -1.04906937,
       -1.17026562, -1.20570312, -1.22539062, -1.18798438, -1.10647812,
       -1.18798438, -1.07982912, -1.20570312, -0.85617597, -1.13837188,
       -1.22539062, -1.22320312, -1.13837188, -1.20176562, -1.18798438,
       -1.18798438, -1.22539062, -0.78498912, -1.07982912, -1.10647812,
       -1.13837188, -1.09372063, -1.20570312, -1.03120887, -1.18798438,
       -1.20570312, -0.78498912, -1.17026562, -1.18798438, -1.04906937,
       -1.21554687, -1.09372063, -1.22539062, -1.18798438, -1.23

In [62]:
# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# Precision
precision = precision_score(y_test, y_pred)
print(f'Precision: {precision}')

# Recall
recall = recall_score(y_test, y_pred)
print(f'Recall: {recall}')

# F1 score
f1 = f1_score(y_test, y_pred)
print(f'F1 score: {f1}')

# ROC-AUC is computed from the continuous scores (y_proba), not from the
# thresholded class labels (y_pred) used by the metrics above.
roc_auc = roc_auc_score(y_test, y_proba)
print(f'ROC-AUC: {roc_auc}')

Accuracy: 0.6172140430351076
Precision: 0.0
Recall: 0.0
F1 score: 0.0
ROC-AUC: 0.843206123446067
