# Titanic dataset
* Get sample data [here](https://github.com/fuyu-quant/IBLM/tree/main/datasets).

In [1]:
%%capture
!pip install git+https://github.com/fuyu-quant/IBLM.git@feature-in-context-learning 

In [2]:
import pkg_resources
print(pkg_resources.get_distribution('IBLM').version)

0.1.45


In [3]:
from dotenv import load_dotenv

# .env ファイルを読み込む
load_dotenv(override=True)

True

In [30]:
import pandas as pd
import numpy as np
from langchain.llms import OpenAI
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

from iblm import IBLMClassifier

import sys
import os

In [41]:
n = 20

df = pd.read_csv(f'../data/titanic/titanic_{n}_train.csv')
x_train = df.drop('survived', axis=1)
y_train = df['survived']
print(len(x_train))

20


In [42]:
llm_model = OpenAI(temperature=0, model_name = 'gpt-4')

params = {'columns_name': True}

iblm = IBLMClassifier(llm_model = llm_model, params=params)

### Training

In [43]:
file_path = '../models/titanic/'

model = iblm.fit(x_train, y_train, model_name = 'titanic', file_path=file_path)

Tokens Used: 3673
	Prompt Tokens: 3195
	Completion Tokens: 478
Successful Requests: 1
Total Cost (USD): $0.12452999999999999


In [44]:
# Code of the model created
print(model)

import numpy as np

def predict(x):
    df = x.copy()
    output = []
    for index, row in df.iterrows():
        # Do not change the code before this point.
        
        # Calculate the probability based on the given data
        pclass = row['pclass']
        age = row['age']
        fare = row['fare']
        sex_female = row['sex_female']
        embarked_C = row['embarked_C']
        embarked_Q = row['embarked_Q']
        alive_yes = row['alive_yes']
        alone_True = row['alone_True']
        adult_male_True = row['adult_male_True']
        who_child = row['who_child']
        who_woman = row['who_woman']
        class_First = row['class_First']
        class_Second = row['class_Second']
        deck_A = row['deck_A']
        deck_B = row['deck_B']
        deck_C = row['deck_C']
        deck_D = row['deck_D']
        deck_E = row['deck_E']
        deck_F = row['deck_F']
        embark_town_Cherbourg = row['embark_town_Cherbourg']
        embark_town_Queenstown = row['emba

### Prediction

In [48]:
df = pd.read_csv(f'../data/titanic/titanic_{n}_test.csv')
x_test = df.drop('survived', axis=1)
y_test = df['survived']

In [49]:
y_proba = iblm.predict(x_test)
y_pred = (y_proba > 0.5).astype(int)

In [50]:
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# Precision
precision = precision_score(y_test, y_pred)
print(f'Precision: {precision}')

# Recall
recall = recall_score(y_test, y_pred)
print(f'Recall: {recall}')

# F1 score
f1 = f1_score(y_test, y_pred)
print(f'F1 score: {f1}')

# ROC-AUC (you need prediction probabilities for this, not just class predictions)
# Here we just reuse y_pred for simplicity
roc_auc = roc_auc_score(y_test, y_proba)
print(f'ROC-AUC: {roc_auc}')

Accuracy: 0.7577497129735936
Precision: 0.6531645569620254
Recall: 0.7771084337349398
F1 score: 0.7097661623108665
ROC-AUC: 0.8275895790956032


### Creating Multiple Models

In [51]:
n = 20

df = pd.read_csv(f'../data/titanic/titanic_{n}_train.csv')
x_train = df.drop('survived', axis=1)
y_train = df['survived']

file_path = '../models/titanic/'
for i in range(1,31):
    model = iblm.fit(x_train, y_train, model_name = f'titanic_{n}_{i}', file_path=file_path)

Tokens Used: 4019
	Prompt Tokens: 3195
	Completion Tokens: 824
Successful Requests: 1
Total Cost (USD): $0.14528999999999997
Tokens Used: 3814
	Prompt Tokens: 3195
	Completion Tokens: 619
Successful Requests: 1
Total Cost (USD): $0.13299
Tokens Used: 3481
	Prompt Tokens: 3195
	Completion Tokens: 286
Successful Requests: 1
Total Cost (USD): $0.11301
Tokens Used: 3794
	Prompt Tokens: 3195
	Completion Tokens: 599
Successful Requests: 1
Total Cost (USD): $0.13179
Tokens Used: 4349
	Prompt Tokens: 3195
	Completion Tokens: 1154
Successful Requests: 1
Total Cost (USD): $0.16509


Retrying langchain.llms.openai.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-4 in organization org-rKmEAn8qrwuz778vK8jNCcB5 on tokens per min. Limit: 40000 / min. Please try again in 1ms. Contact us through our help center at help.openai.com if you continue to have issues..


Tokens Used: 3665
	Prompt Tokens: 3195
	Completion Tokens: 470
Successful Requests: 1
Total Cost (USD): $0.12405


Retrying langchain.llms.openai.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: You exceeded your current quota, please check your plan and billing details..


Tokens Used: 3549
	Prompt Tokens: 3195
	Completion Tokens: 354
Successful Requests: 1
Total Cost (USD): $0.11708999999999999


Retrying langchain.llms.openai.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: You exceeded your current quota, please check your plan and billing details..
Retrying langchain.llms.openai.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: You exceeded your current quota, please check your plan and billing details..
Retrying langchain.llms.openai.completion_with_retry.<locals>._completion_with_retry in 8.0 seconds as it raised RateLimitError: You exceeded your current quota, please check your plan and billing details..
Retrying langchain.llms.openai.completion_with_retry.<locals>._completion_with_retry in 10.0 seconds as it raised RateLimitError: You exceeded your current quota, please check your plan and billing details..


RateLimitError: You exceeded your current quota, please check your plan and billing details.

### Prediction

In [None]:
df = pd.read_csv(f'../data/titanic/titanic_{n}_test.csv')
x_test = df.drop('survived', axis=1)
y_test = df['survived']

sys.path.append('..')

error_count = 0
error_list = []
auc_list = []
acc_list = []


for i in range(1,31):

    import_file = f'import models.titanic.titanic_8_{i} as codemodel'

    exec(import_file)

    try:
        y_proba = codemodel.predict(x_test)
        y_pred = (y_proba > 0.5).astype(int)
        negative_values_exist = np.any(y_proba < 0)
        values_greater_than_one_exist = np.any(y_proba > 1)
        if negative_values_exist:
            error_list.append(i)
            error_count += 1
            print(f"Negative values exist：{negative_values_exist}")

        elif values_greater_than_one_exist:
            error_list.append(i)
            error_count += 1
            print(f"Positive values exist：{values_greater_than_one_exist}")

        else:
            roc_auc = roc_auc_score(y_test, y_proba)
            accuracy = round(accuracy_score(y_test, y_pred),4)
            auc_list.append(roc_auc)
            acc_list.append(accuracy)
        
    except Exception:
        print('run error')
        error_count += 1
        error_list.append(i)
        pass

print('-----------error-------------')
print(error_count)
print(error_list)

print('-----------AUC-------------')
print(auc_list)
average = sum(auc_list) / len(auc_list)
print("Average Value:", average)
max_value = max(auc_list)
min_value = min(auc_list)
print("Maximum Value:", max_value)
print("Minimum Value:", min_value)

print('-----------ACC-------------')
print(acc_list)
average = sum(acc_list) / len(acc_list)
print("Average Value:", average)
max_value = max(acc_list)
min_value = min(acc_list)
print("Maximum Value:", max_value)
print("Minimum Value:", min_value)

-----------error-------------
0
[]
-----------AUC-------------
[0.7781580804516585, 0.7201834862385321, 0.7011345746702135, 0.6221866348189565, 0.6571141631833235, 0.6347076705933445, 0.7459665599044569, 0.6207182020520059, 0.783798382281092, 0.6634438955539872, 0.5494679984799957, 0.32421964062754466, 0.6243797839422398, 0.783798382281092, 0.6320530915802616, 0.5802698007708593, 0.8802046577275936, 0.6046495847130993, 0.7011345746702135, 0.6634438955539872, 0.9127191791976549, 0.5522338635253243, 0.5295695130557516, 0.6953368438195539, 0.6634438955539872, 0.809323598067423, 0.7139758970739917, 0.5260599315998046, 0.5433798382281092, 0.7434422669779057]
Average Value: 0.6653505962397986
Maximum Value: 0.9127191791976549
Minimum Value: 0.32421964062754466
-----------ACC-------------
[0.6172, 0.6172, 0.6172, 0.6172, 0.6172, 0.6172, 0.3828, 0.6172, 0.6172, 0.6172, 0.6172, 0.6172, 0.6172, 0.6172, 0.6172, 0.6172, 0.6172, 0.6172, 0.6172, 0.6172, 0.6172, 0.6172, 0.6172, 0.6172, 0.6172, 0.7463