# Titanic problem

## Use a naive approach -- Decision Tree

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the Titanic training data; the relative path means train.csv must be
# in the current working directory.
train_df = pd.read_csv('train.csv')

In [None]:
def get_surname(item):
    """Return the part of ``item['Name']`` that precedes the first comma.

    If the name contains no comma, the whole name is returned unchanged.
    """
    surname, _, _ = item['Name'].partition(',')
    return surname

def get_n_members(item):
    """Return ``item['SibSp'] + item['Parch'] + 1``.

    The extra 1 counts the passenger themself on top of the relatives
    recorded in the two columns.
    """
    relatives_aboard = item['SibSp'] + item['Parch']
    return relatives_aboard + 1  # +1 for the passenger themself

def get_passengers_ticket_numbers(passengers):
    """Collect the distinct ``Ticket`` values found in *passengers*.

    Returns:
        The ticket value itself when there is only one distinct ticket,
        otherwise a set holding every distinct ticket value.

    Raises:
        KeyError: if *passengers* is empty (nothing to pop from the set).
    """
    distinct = set()
    for passenger in passengers:
        distinct.add(passenger['Ticket'])

    if len(distinct) > 1:
        return distinct
    # Zero or one distinct ticket: unwrap the sole element.
    return distinct.pop()

# Holders of close ticket numbers are likely to be family members.
def family_like(item, fam_members):
    """Tell whether *item*'s ticket matches any ticket in *fam_members*.

    Two tickets match when they are equal, or when both consist only of
    digits and their integer values differ by at most 2.

    Args:
        item: mapping with a string ``Ticket`` entry.
        fam_members: iterable of mappings with a string ``Ticket`` entry.

    Returns:
        True if at least one member's ticket matches, else False
        (including when *fam_members* is empty).
    """
    own_ticket = item['Ticket']

    def _matches(other_ticket):
        if own_ticket == other_ticket:
            return True
        both_numeric = own_ticket.isdigit() and other_ticket.isdigit()
        return both_numeric and abs(int(own_ticket) - int(other_ticket)) <= 2

    return any(_matches(member['Ticket']) for member in fam_members)

def group_families(df):
    """Group the rows of *df* into probable families by surname and ticket.

    Rows are first bucketed by ``get_surname``. Within one surname, a row
    joins the first existing sub-family for which ``family_like`` returns
    True; if none matches, a new sub-family is opened. There is no upper
    limit on the number of sub-families per surname, so no row is ever
    dropped.

    Args:
        df: DataFrame whose rows provide at least ``Name`` and ``Ticket``.

    Returns:
        dict mapping surname -> dict mapping ``"<surname>#<n>"`` -> list of
        row Series, where ``n`` numbers the sub-families from 0 in the
        order they were created.
    """
    families = {}

    for _, passenger in df.iterrows():
        surname = get_surname(passenger)
        subfamilies = families.setdefault(surname, {})
        # Sub-families are visited in creation order (#0, #1, ...).
        for members in subfamilies.values():
            if family_like(passenger, members):
                members.append(passenger)
                break
        else:
            # Nobody matched: open the next numbered sub-family. Keys are
            # created sequentially, so len() is the next free index.
            subfamilies[f'{surname}#{len(subfamilies)}'] = [passenger]
    return families

def extract_alone_passengers(df, families):
    """Return the rows of *df* for passengers who are alone in their sub-family.

    Args:
        df: DataFrame with a ``PassengerId`` column.
        families: nested mapping surname -> sub-family key -> list of
            passenger records (each indexable by ``'PassengerId'``).

    Returns:
        The subset of *df* whose ``PassengerId`` belongs to a sub-family
        with at most one member.
    """
    solo_ids = [
        members[0]['PassengerId']
        for subfamilies in families.values()
        for members in subfamilies.values()
        if len(members) <= 1
    ]
    is_alone = df['PassengerId'].isin(solo_ids)
    return df[is_alone]

# Group passengers into probable families, then overwrite train_df with only
# the passengers who ended up alone in their group.
families = group_families(train_df)
train_df = extract_alone_passengers(train_df, families)

In [None]:
# Bare expression: shows the current train_df as the notebook cell's output.
train_df

## Convert data into numerals

In [None]:
# About 19.9% of the Age values are missing; drop those rows.
train_df = train_df.dropna(subset=['Age'])
# Encode Sex as 0 (male) / 1 (female). The result is assigned back to the
# frame instead of calling replace(..., inplace=True) on train_df['Sex']:
# that in-place call operates on an intermediate Series and, under pandas
# Copy-on-Write, never reaches train_df. Any value other than
# 'male'/'female' becomes NaN.
train_df = train_df.assign(Sex=train_df['Sex'].map({'male': 0, 'female': 1}))
# One-hot encode the port of embarkation into Embarked_* columns.
train_df = pd.get_dummies(train_df, columns=['Embarked'])

In [None]:
# Feature matrix: the columns the classifier gets to see.
X_df = train_df[[
    'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
    'Embarked_C', 'Embarked_Q', 'Embarked_S',
]]
# Target kept as a one-column frame; flattened to 1-D below.
y_df = train_df[['Survived']]
# Column names are kept for labelling the exported tree later.
X_keys = X_df.columns.tolist()
X_values = X_df.to_numpy()
y_values = y_df.to_numpy().squeeze()
# Number of samples, shown as the cell output.
len(X_values)

## Prepare the training set and the test set

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples as the test set. Stratifying on the labels keeps
# the class proportions the same in both splits, and the fixed random_state
# makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_values, y_values, test_size=0.3, random_state=1, stratify=y_values)

## Naive approach: use a Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier, export_text

# Shallow tree (depth 2) split on entropy; random_state fixed for repeatability.
tree = DecisionTreeClassifier(criterion='entropy', max_depth=2, random_state=0)
# The trailing semicolon keeps the notebook from echoing the fitted estimator.
tree.fit(X_train, y_train);

In [None]:
from sklearn.metrics import confusion_matrix

# Predictions of the fitted tree on the held-out test split.
y_pred = tree.predict(X_test)

# confusion matrix (true labels first, predictions second)
conf_mat = confusion_matrix(y_test, y_pred)
print(conf_mat)

# accuracy, computed two ways as a cross-check: the estimator's own score,
# and the confusion-matrix diagonal (correct predictions) over the sample count
print(tree.score(X_test, y_test))
print(np.trace(conf_mat) / len(y_test))

## Show the tree's detail

In [None]:
# Render the fitted tree's decision rules as text, labelled with the feature
# column names collected in X_keys.
r = export_text(tree, feature_names=X_keys)
print(r)

## Cross validation

In [None]:
from sklearn.model_selection import KFold, StratifiedKFold

# Per-fold ranking score (validation accuracy * test accuracy) and the
# model fitted on that fold, kept in the same order.
acc_product = []
models = []

# Non-stratified alternative, kept for reference:
#cv = KFold(n_splits=5, shuffle=True, random_state=0)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

for fold_id, (train_idx, val_idx) in enumerate(cv.split(X_train, y_train)):
    X_tr, y_tr = X_train[train_idx, :], y_train[train_idx]
    X_val, y_val = X_train[val_idx, :], y_train[val_idx]

    tree = DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=0)
    tree.fit(X_tr, y_tr)
    val_acc = tree.score(X_val, y_val)
    test_acc = tree.score(X_test, y_test)
    # Rank on the unrounded accuracies: rounding to 2 decimals before
    # multiplying throws away resolution and creates artificial ties.
    # NOTE(review): the test-set accuracy takes part in model selection here,
    # so the test score of the chosen model is optimistically biased.
    acc_product.append(val_acc * test_acc)
    models.append(tree)
    # Round only for display.
    print(f'[{fold_id}] val acc={round(val_acc, 2)}, test acc={round(test_acc, 2)}')

In [None]:
# Pick the fold whose entry in acc_product is largest; np.argmax returns the
# first such index when several folds tie.
best_idx = np.argmax(acc_product)
best_model = models[best_idx]
# The '=' specifier prints the expression text followed by its value.
print(f'{best_idx=}')

In [None]:
# Recompute the predictions with the selected model. Without this, y_pred
# would still hold the output of a previously fitted tree and the confusion
# matrix would not describe best_model.
y_pred = best_model.predict(X_test)
conf_mat = confusion_matrix(y_test, y_pred)
print(conf_mat)

# Decision rules of the selected model, labelled with the feature names.
r = export_text(best_model, feature_names=X_keys)
print(r)