# Titanic dataset
* Get sample data [here](https://github.com/fuyu-quant/IBLM/tree/main/datasets).

In [1]:
%%capture
!pip install git+https://github.com/fuyu-quant/IBLM.git@feature-in-context-learning 

In [1]:
# Report the installed IBLM version.
# importlib.metadata is the standard-library replacement for the deprecated
# pkg_resources API and needs no setuptools at runtime.
from importlib.metadata import version

print(version('IBLM'))

0.1.28


In [8]:
from dotenv import load_dotenv

# Load the .env file.
# NOTE(review): presumably this supplies the API credentials for the LLM used
# later in the notebook — confirm. Keep secrets in .env, never in the notebook.
load_dotenv(override=True)

True

In [48]:
import importlib
import os
import sys

import numpy as np
import pandas as pd
from langchain.llms import OpenAI
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

from iblm import IBLMClassifier

In [10]:
# Number of training rows; it selects which train/test CSV pair is read.
n = 8

# Read the training split, then separate the target column from the features.
# NOTE(review): the frame displayed in the next cell contains an 'Unnamed: 0'
# column, which looks like a saved row index ending up among the features —
# consider reading with index_col=0 (for both train and test) after confirming.
df = pd.read_csv(f'../data/titanic/titanic_{n}_train.csv')
y_train = df['survived']
x_train = df.drop(columns='survived')
print(x_train.shape[0])

8


In [11]:
# Display the training features; the frame has only 8 rows (see the count
# printed by the previous cell), so it is shown in full.
x_train

Unnamed: 0,pclass,age,sibsp,parch,fare,sex_female,sex_male,embarked_C,embarked_Q,embarked_S,...,deck_A,deck_B,deck_C,deck_D,deck_E,deck_F,deck_G,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton
0,2,27.0,0,0,10.5,True,False,False,False,True,...,False,False,False,False,True,False,False,False,False,True
1,2,7.0,0,2,26.25,True,False,False,False,True,...,False,False,False,False,False,False,False,False,False,True
2,2,29.0,1,0,26.0,True,False,False,False,True,...,False,False,False,False,False,False,False,False,False,True
3,3,27.0,0,0,7.7958,False,True,False,False,True,...,False,False,False,False,False,False,False,False,False,True
4,3,20.0,0,0,9.225,False,True,False,False,True,...,False,False,False,False,False,False,False,False,False,True
5,3,21.0,0,0,8.4333,False,True,False,False,True,...,False,False,False,False,False,False,False,False,False,True
6,2,28.0,0,0,0.0,False,True,False,False,True,...,False,False,False,False,False,False,False,False,False,True
7,3,24.0,0,0,8.85,True,False,False,False,True,...,False,False,False,False,False,False,False,False,False,True


In [12]:
# LLM client that writes the prediction code: GPT-4 with temperature 0.
llm_model = OpenAI(model_name='gpt-4', temperature=0)

# Options forwarded to IBLMClassifier.
# NOTE(review): columns_name=True presumably makes IBLM expose the column
# names to the LLM — confirm against the IBLM documentation.
params = dict(columns_name=True)

iblm = IBLMClassifier(params=params, llm_model=llm_model)

### Training

In [18]:
# Directory handed to fit() as file_path.
# NOTE(review): presumably where the generated model code is written — a later
# cell imports models.titanic.* after adding '..' to sys.path. Confirm.
file_path = '../models/titanic/'

print(f'Number of data:{len(x_train)}')
# fit() sends a request to the LLM (the recorded output reports token usage and
# cost for one request) and returns the generated model, printed in the next cell.
model = iblm.fit(x_train, y_train, model_name = 'titanic', file_path=file_path)

Number of data:8
> Start of model creating.
Tokens Used: 1173
	Prompt Tokens: 842
	Completion Tokens: 331
Successful Requests: 1
Total Cost (USD): $0.04511999999999999


In [19]:
# Print the code of the model that fit() returned, so the generated
# predict() logic can be inspected before it is used.
print(model)

import numpy as np
def predict(x):
    df = x.copy()
    output = []
    for index, row in df.iterrows():
        # Do not change the code before this point.
        # Please describe the process required to make the prediction below.

        # Calculate the probability based on the given features
        pclass_factor = 0.9 if row['pclass'] == 1 else 0.7 if row['pclass'] == 2 else 0.5
        age_factor = 0.9 if row['age'] <= 16 else 0.7 if row['age'] <= 40 else 0.5
        fare_factor = 0.9 if row['fare'] >= 50 else 0.7 if row['fare'] >= 20 else 0.5
        sex_factor = 0.9 if row['sex_female'] else 0.5
        embarked_factor = 0.9 if row['embarked_C'] else 0.7 if row['embarked_Q'] else 0.5
        alone_factor = 0.9 if row['alone_True'] else 0.7

        # Combine the factors to calculate the final probability
        y = pclass_factor * age_factor * fare_factor * sex_factor * embarked_factor * alone_factor

        # Normalize the probability to be between 0 and 1
        y = (y 

### Prediction

In [20]:
# Read the test split that pairs with the n-row training sample and
# separate the target column from the features.
df = pd.read_csv(f'../data/titanic/titanic_{n}_test.csv')
y_test = df['survived']
x_test = df.drop(columns='survived')

In [21]:
# Scores produced by the generated model for every test row.
y_proba = iblm.predict(x_test)
# Hard labels: 1 where the score is strictly above 0.5, otherwise 0.
# NOTE(review): the recorded metrics show precision/recall of 0.0 alongside a
# ROC-AUC of about 0.82, which suggests the scores rank passengers reasonably
# but rarely (if ever) exceed 0.5 — the fixed threshold may need revisiting.
y_pred = (y_proba > 0.5).astype(int)

In [22]:
# Accuracy (uses the thresholded labels in y_pred)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# Precision
precision = precision_score(y_test, y_pred)
print(f'Precision: {precision}')

# Recall
recall = recall_score(y_test, y_pred)
print(f'Recall: {recall}')

# F1 score
f1 = f1_score(y_test, y_pred)
print(f'F1 score: {f1}')

# ROC-AUC needs the predicted scores rather than the class predictions,
# so y_proba (not the thresholded y_pred) is passed here.
roc_auc = roc_auc_score(y_test, y_proba)
print(f'ROC-AUC: {roc_auc}')

Accuracy: 0.6172140430351076
Precision: 0.0
Recall: 0.0
F1 score: 0.0
ROC-AUC: 0.8160930459801314


### Creating Multiple Models

In [14]:
# Number of training rows; it selects the training CSV and tags the model names.
n = 8

# Reload the training split so this section does not rely on earlier cells.
df = pd.read_csv(f'../data/titanic/titanic_{n}_train.csv')
y_train = df['survived']
x_train = df.drop(columns='survived')

# Generate 30 models from the same data, named titanic_{n}_1 ... titanic_{n}_30.
# Every fit() call is a separate LLM request; the recorded output shows each
# one costing a few cents, so re-running this cell is not free.
file_path = '../models/titanic/'
for i in range(1, 31):
    model = iblm.fit(
        x_train,
        y_train,
        model_name=f'titanic_{n}_{i}',
        file_path=file_path,
    )

> Start of model creating.
Tokens Used: 1134
	Prompt Tokens: 815
	Completion Tokens: 319
Successful Requests: 1
Total Cost (USD): $0.043590000000000004
> Start of model creating.
Tokens Used: 1326
	Prompt Tokens: 815
	Completion Tokens: 511
Successful Requests: 1
Total Cost (USD): $0.05511
> Start of model creating.
Tokens Used: 1197
	Prompt Tokens: 815
	Completion Tokens: 382
Successful Requests: 1
Total Cost (USD): $0.047369999999999995
> Start of model creating.
Tokens Used: 1407
	Prompt Tokens: 815
	Completion Tokens: 592
Successful Requests: 1
Total Cost (USD): $0.059969999999999996
> Start of model creating.
Tokens Used: 1638
	Prompt Tokens: 815
	Completion Tokens: 823
Successful Requests: 1
Total Cost (USD): $0.07382999999999999
> Start of model creating.
Tokens Used: 1421
	Prompt Tokens: 815
	Completion Tokens: 606
Successful Requests: 1
Total Cost (USD): $0.060809999999999996
> Start of model creating.
Tokens Used: 1293
	Prompt Tokens: 815
	Completion Tokens: 478
Successful Re

### Prediction with Multiple Models

In [50]:
# Evaluate each of the 30 generated models on the test split and summarise
# the ROC-AUC of those that return valid probabilities.
df = pd.read_csv(f'../data/titanic/titanic_{n}_test.csv')
x_test = df.drop('survived', axis=1)
y_test = df['survived']

# Make the parent directory importable so that `models.titanic.*` resolves.
# Guarded so re-running the cell does not keep appending the same entry.
if '..' not in sys.path:
    sys.path.append('..')

error_count = 0
error_list = []  # indices of models that raised or returned values outside [0, 1]
auc_list = []    # ROC-AUC of every model whose outputs all lie in [0, 1]


for i in range(1, 31):

    # Same naming scheme as the model_name used when the models were created
    # (titanic_{n}_{i}); the sample size is no longer hard-coded.
    module_name = f'models.titanic.titanic_{n}_{i}'

    try:
        # Import inside the try block: LLM-generated code can already fail at
        # import time (e.g. a syntax error), and one broken model must not
        # abort the evaluation of the remaining ones.
        codemodel = importlib.import_module(module_name)

        y_proba = codemodel.predict(x_test)
        # Not used for the AUC; kept so the cell still leaves the thresholded
        # labels of the last evaluated model in y_pred.
        y_pred = (y_proba > 0.5).astype(int)
        negative_values_exist = np.any(y_proba < 0)
        values_greater_than_one_exist = np.any(y_proba > 1)
        if negative_values_exist:
            error_list.append(i)
            error_count += 1
            print(f"Negative values exist：{negative_values_exist}")

        elif values_greater_than_one_exist:
            error_list.append(i)
            error_count += 1
            print(f"Values greater than 1 exist：{values_greater_than_one_exist}")

        else:
            roc_auc = roc_auc_score(y_test, y_proba)
            auc_list.append(roc_auc)

    except Exception as exc:
        # Report which model failed and why instead of hiding the cause.
        print(f'run error (model {i}): {exc!r}')
        error_count += 1
        error_list.append(i)

print(error_count)
print(error_list)
print(auc_list)
if auc_list:
    average = sum(auc_list) / len(auc_list)
    print("Average Value:", average)
    max_value = max(auc_list)
    min_value = min(auc_list)
    print("Maximum Value:", max_value)
    print("Minimum Value:", min_value)
else:
    # Every model failed or was rejected: there is nothing to average, and
    # sum()/len(), max() and min() would raise on the empty list.
    print("No model produced valid probabilities; no AUC statistics to report.")

Negative values exist：True
Negative values exist：True
Negative values exist：True
Negative values exist：True
Negative values exist：True
Negative values exist：True
run error
Negative values exist：True
Negative values exist：True
Negative values exist：True
Negative values exist：True
11
[1, 2, 3, 10, 13, 15, 17, 21, 24, 26, 28]
[0.8354812442321264, 0.9638836110960316, 0.8514494327126648, 0.8304109440312686, 0.8228054937299821, 0.829645513272895, 0.838610824602356, 0.2832283806525162, 0.7730850659573312, 0.7464388469681341, 0.8428532652950438, 0.8340236686390533, 0.8228054937299821, 0.8391238260680745, 0.8338309538027252, 0.8228054937299821, 0.8571494489984256, 0.8331903805439443, 0.8432712664893329]
Average Value: 0.8054785870816774
Maximum Value: 0.9638836110960316
Minimum Value: 0.2832283806525162
