# Titanic data set

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/fuyu-quant/IBLM/blob/main/examples/iblmodel/iblmodel_titanic.ipynb)

In [None]:
%%capture
!pip install iblm --upgrade

In [3]:
import pandas as pd
from langchain.llms import OpenAI
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import seaborn as sns

from iblm import IBLModel

import os
#os.environ["OPENAI_API_KEY"] = "OPENAI_API_KEY"

### Preparing data sets

In [4]:
df = sns.load_dataset('titanic')

sample = 40
sample_num = int(sample/2)
seed = 3656

df['age'].fillna(df['age'].median(), inplace=True)
df['embarked'].fillna(df['embarked'].mode()[0], inplace=True)

df = pd.get_dummies(df, columns=['sex'])
df = pd.get_dummies(df, columns=['embarked'])
df = pd.get_dummies(df, columns=['alive'])
df = pd.get_dummies(df, columns=['alone'])
df = pd.get_dummies(df, columns=['adult_male'])
df = pd.get_dummies(df, columns=['who'])
df = pd.get_dummies(df, columns=['class'])
df = pd.get_dummies(df, columns=['deck'])
df = pd.get_dummies(df, columns=['embark_town'])
df = df.replace({True: 1, False: 0})

cols = list(df.columns)
cols.remove('survived')
cols.append('survived')
df = df[cols]

df_1 = df[df['survived'] == 1].sample(n = sample_num, random_state = seed)
df_0 = df[df['survived'] == 0].sample(n = sample_num, random_state = seed)

df_train = pd.DataFrame()
df_len = len(df_1)
for i in range(df_len):
    df1 = pd.DataFrame([df_1.iloc[i]])
    df0 = pd.DataFrame([df_0.iloc[i]])
    df_train = pd.concat([df_train, df1, df0])

df_train['survived'] = df_train['survived'].astype(int)
df_test = df.drop(df_train.index)
df_train.head()

Unnamed: 0,pclass,age,sibsp,parch,fare,sex_female,sex_male,embarked_C,embarked_Q,embarked_S,...,deck_B,deck_C,deck_D,deck_E,deck_F,deck_G,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton,survived
717,2.0,27.0,0.0,0.0,10.5,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1
682,3.0,20.0,0.0,0.0,9.225,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
535,2.0,7.0,0.0,2.0,26.25,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
652,3.0,21.0,0.0,0.0,8.4333,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
133,2.0,29.0,1.0,0.0,26.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1


In [5]:
x_train = df_train.drop('survived', axis=1)
y_train = df_train['survived']

### Training

In [6]:
llm_model = OpenAI(temperature=0, model_name = 'gpt-4-0613')

params = {
    'columns_name': True,
    'objective': 'classification',
    }

iblm = IBLModel(llm_model = llm_model, params=params)

In [7]:
#file_path = '/content/'

#model = iblm.fit(x_train, y_train, model_name = 'titanic', file_path=file_path)
model = iblm.fit(x_train, y_train)

Tokens Used: 6224
	Prompt Tokens: 5842
	Completion Tokens: 382
Successful Requests: 1
Total Cost (USD): $0.19817999999999997


In [9]:
# Code Model
print(model)

import numpy as np

def predict(x):
    df = x.copy()
    output = []
    for index, row in df.iterrows():
        # Do not change the code before this point.
        # Please describe the process required to make the prediction below.

        # Based on the given data, we can see that the survival rate is higher for females, people in first class, and those who embarked from Cherbourg.
        # We can also see that the survival rate is lower for males, people in third class, and those who embarked from Southampton.
        # We will use these observations to make our predictions.

        y = 0.5  # start with a base probability of 0.5

        # increase probability for females
        if row['sex_female'] == 1.0:
            y += 0.3

        # decrease probability for males
        if row['sex_male'] == 1.0:
            y -= 0.3

        # increase probability for first class
        if row['class_First'] == 1.0:
            y += 0.2

        # decrease probability for third clas

### Prediction

In [10]:
x_test = df_test.drop('survived', axis=1)
y_test = df_test['survived']

In [11]:
y_proba = iblm.predict(x_test)
y_pred = (y_proba > 0.5).astype(int)

In [12]:
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# Precision
precision = precision_score(y_test, y_pred)
print(f'Precision: {precision}')

# Recall
recall = recall_score(y_test, y_pred)
print(f'Recall: {recall}')

# F1 score
f1 = f1_score(y_test, y_pred)
print(f'F1 score: {f1}')

# ROC-AUC (you need prediction probabilities for this, not just class predictions)
# Here we just reuse y_pred for simplicity
roc_auc = roc_auc_score(y_test, y_proba)
print(f'ROC-AUC: {roc_auc}')

Accuracy: 0.782608695652174
Precision: 0.7322033898305085
Recall: 0.6708074534161491
F1 score: 0.7001620745542951
ROC-AUC: 0.8375318484425083
