# Titanic data set

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/fuyu-quant/IBLM/blob/main/examples/iblmodel/iblmodel_titanic.ipynb)

In [1]:
%%capture
!pip install iblm --upgrade

In [2]:
import pandas as pd
from langchain.llms import OpenAI
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import seaborn as sns

from iblm import IBLModel

import os
#os.environ["OPENAI_API_KEY"] = "OPENAI_API_KEY"

### Preparing data sets

In [3]:
df = sns.load_dataset('titanic')

sample = 30
sample_num = int(sample/2)
seed = 3655

df['age'].fillna(df['age'].median(), inplace=True)
df['embarked'].fillna(df['embarked'].mode()[0], inplace=True)

df = pd.get_dummies(df, columns=['sex'])
df = pd.get_dummies(df, columns=['embarked'])
df = pd.get_dummies(df, columns=['alive'])
df = pd.get_dummies(df, columns=['alone'])
df = pd.get_dummies(df, columns=['adult_male'])
df = pd.get_dummies(df, columns=['who'])
df = pd.get_dummies(df, columns=['class'])
df = pd.get_dummies(df, columns=['deck'])
df = pd.get_dummies(df, columns=['embark_town'])
df = df.replace({True: 1, False: 0})

cols = list(df.columns)
cols.remove('survived')
cols.append('survived')
df = df[cols]

df_1 = df[df['survived'] == 1].sample(n = sample_num, random_state = seed)
df_0 = df[df['survived'] == 0].sample(n = sample_num, random_state = seed)

df_train = pd.DataFrame()
df_len = len(df_1)
for i in range(df_len):
    df1 = pd.DataFrame([df_1.iloc[i]])
    df0 = pd.DataFrame([df_0.iloc[i]])
    df_train = pd.concat([df_train, df1, df0])

df_train['survived'] = df_train['survived'].astype(int)
df_test = df.drop(df_train.index)
df_train.head()

Unnamed: 0,pclass,age,sibsp,parch,fare,sex_female,sex_male,embarked_C,embarked_Q,embarked_S,...,deck_B,deck_C,deck_D,deck_E,deck_F,deck_G,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton,survived
717,2.0,27.0,0.0,0.0,10.5,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1
682,3.0,20.0,0.0,0.0,9.225,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
535,2.0,7.0,0.0,2.0,26.25,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
652,3.0,21.0,0.0,0.0,8.4333,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
133,2.0,29.0,1.0,0.0,26.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1


In [4]:
x_train = df_train.drop('survived', axis=1)
y_train = df_train['survived']

### Training

In [5]:
llm_model = OpenAI(temperature=0, model_name = 'gpt-4-0613')

params = {
    'columns_name': True,
    'objective': 'classification',
    }

iblm = IBLModel(llm_model = llm_model, params=params)

In [6]:
#file_path = '/content/'

#model = iblm.fit(x_train, y_train, model_name = 'titanic', file_path=file_path)
model = iblm.fit(x_train, y_train)

Please create your code in compliance with all of the following conditions. 
・Analyze the data given below and write python code to predict the probability that the "target" of the unknown data is 1.
・Never use machine learning algorithms.
・Only Python functions should be output.
・Please come up with a logic that allows you to predict probability values as closely as possible.
・For the data given below, create a code that predicts a high probability value when "target" is likely to be 1 and a low probability value when "target" is unlikely to be 1.
・The 'target' value cannot be used for forecasting.
・Analyze the data in as much detail as possible.
・Each data type is float64, float64, float64, float64, float64, float64, float64, float64, float64, float64, float64, float64, float64, float64, float64, float64, float64, float64, float64, float64, float64, float64, float64, float64, float64, float64, float64, float64, float64, float64, float64, float64, int64.
・The column names, in order, a

In [7]:
# Code Model
print(model)

import numpy as np

def predict(x):
    df = x.copy()
    
    output = []
    for index, row in df.iterrows():
        # Do not change the code before this point.
        # Please describe the process required to make the prediction below.
        
        # Rule 1: If the passenger is female, the probability of survival is high.
        if row['sex_female'] == 1.0:
            y = 0.75
        # Rule 2: If the passenger is male and in the first class, the probability of survival is medium.
        elif row['sex_male'] == 1.0 and row['class_First'] == 1.0:
            y = 0.5
        # Rule 3: If the passenger is male and in the second or third class, the probability of survival is low.
        elif row['sex_male'] == 1.0 and (row['class_Second'] == 1.0 or row['class_Third'] == 1.0):
            y = 0.25
        # Rule 4: For all other cases, the probability of survival is very low.
        else:
            y = 0.1

        # Do not change the code after this point.
        output.ap

### Prediction

In [8]:
x_test = df_test.drop('survived', axis=1)
y_test = df_test['survived']

In [9]:
y_proba = iblm.predict(x_test)
y_pred = (y_proba > 0.5).astype(int)

In [10]:
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# Precision
precision = precision_score(y_test, y_pred)
print(f'Precision: {precision}')

# Recall
recall = recall_score(y_test, y_pred)
print(f'Recall: {recall}')

# F1 score
f1 = f1_score(y_test, y_pred)
print(f'F1 score: {f1}')

# ROC-AUC (you need prediction probabilities for this, not just class predictions)
# Here we just reuse y_pred for simplicity
roc_auc = roc_auc_score(y_test, y_proba)
print(f'ROC-AUC: {roc_auc}')

Accuracy: 0.7851335656213705
Precision: 0.7366666666666667
Recall: 0.6758409785932722
F1 score: 0.7049441786283891
ROC-AUC: 0.798815127879142
