# Titanic problem

## Use naive approach -- DecisionTree

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import confusion_matrix

In [2]:
# Load the training data from train.csv in the current working directory.
train_df = pd.read_csv('train.csv')

In [3]:
def get_surname(item):
    """Return the part of ``item['Name']`` that precedes the first comma.

    When the name contains no comma the whole name is returned.
    """
    surname, _, _ = item['Name'].partition(',')
    return surname

def get_n_members(item):
    """Return the travelling-party size recorded for one passenger.

    The size is ``SibSp`` plus ``Parch`` plus one for the passenger
    themselves.
    """
    relatives_aboard = item['SibSp'] + item['Parch']
    # The extra one counts the passenger.
    return relatives_aboard + 1

def get_passengers_ticket_numbers(passengers):
    """Summarise the distinct ``Ticket`` values held by ``passengers``.

    Args:
        passengers: Iterable of mappings (dicts or pandas rows) that each
            expose a ``'Ticket'`` entry.

    Returns:
        ``None`` when ``passengers`` is empty, the single ticket value when
        every passenger shares one ticket, otherwise the set of distinct
        ticket values.
    """
    tickets = {p['Ticket'] for p in passengers}
    # An empty set cannot be popped; report "no ticket" instead of raising
    # KeyError.
    if not tickets:
        return None
    if len(tickets) == 1:
        return tickets.pop()
    return tickets

# Holders of close ticket numbers are likely to be family members.
def family_like(item, fam_members):
    """Tell whether ``item`` plausibly belongs with ``fam_members``.

    A member matches when its ticket is identical to the passenger's, or
    when both tickets are purely numeric and differ by at most 2.
    Returns ``False`` when ``fam_members`` is empty.
    """
    own_ticket = item['Ticket']

    def is_close(other_ticket):
        if own_ticket == other_ticket:
            return True
        # Numeric distance only makes sense when both tickets are all digits.
        if not (own_ticket.isdigit() and other_ticket.isdigit()):
            return False
        return abs(int(own_ticket) - int(other_ticket)) <= 2

    return any(is_close(member['Ticket']) for member in fam_members)

def group_families(df):
    """Group passengers into candidate families by surname and ticket.

    Passengers sharing a surname are split into sub-groups keyed
    ``'<surname>#<n>'`` (``n`` counts up from 0). A passenger joins the
    first existing sub-group that ``family_like`` accepts; otherwise a new
    sub-group is opened for them.

    Args:
        df: DataFrame whose rows provide the ``'Name'`` and ``'Ticket'``
            entries read by ``get_surname`` and ``family_like``.

    Returns:
        ``{surname: {'<surname>#<n>': [row, ...]}}`` where each row is the
        Series yielded by ``df.iterrows()``.
    """
    families = {}

    for _, item in df.iterrows():
        fam_name = get_surname(item)
        fam_dict = families.setdefault(fam_name, {})
        # Sub-groups are created in order #0, #1, ..., so walking the dict in
        # insertion order visits them in that same order.
        for members in fam_dict.values():
            if family_like(item, members):
                members.append(item)
                break
        else:
            # No sub-group matched. The next free suffix is the current
            # count, so no passenger is dropped however many sub-groups a
            # surname has.
            fam_dict[f'{fam_name}#{len(fam_dict)}'] = [item]
    return families

def split_single_family_passengers(df, families):
    """Split ``df`` into passengers grouped alone and passengers grouped together.

    Args:
        df: DataFrame with a ``'PassengerId'`` column.
        families: Nested mapping ``{surname: {subfamily_key: [passenger, ...]}}``
            where each passenger exposes ``'PassengerId'``.

    Returns:
        ``(single_df, family_df)``: the rows of ``df`` whose id sits in a
        sub-group of at most one passenger, and the rows whose id sits in a
        sub-group of two or more.
    """
    groups = [
        members
        for subfamilies in families.values()
        for members in subfamilies.values()
    ]
    solo_ids = {
        members[0]['PassengerId'] for members in groups if len(members) <= 1
    }
    grouped_ids = {
        passenger['PassengerId']
        for members in groups
        if len(members) > 1
        for passenger in members
    }

    passenger_ids = df['PassengerId']
    solo_rows = df[passenger_ids.isin(sorted(solo_ids))]
    grouped_rows = df[passenger_ids.isin(sorted(grouped_ids))]
    return solo_rows, grouped_rows

# Group the training passengers by surname and ticket, then separate the
# passengers that ended up alone in their group from those that share one.
families = group_families(train_df)
single_df, family_df = split_single_family_passengers(train_df, families)

In [4]:
# Preview the first five rows of the single-passenger subset.
single_df[:5]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q


In [5]:
# Preview the first five rows of the grouped (family) subset.
family_df[:5]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C
10,11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,16.7,G6,S


## Prepare data

In [6]:
def prepare_data(df):
    """Turn a passenger DataFrame into train/test arrays for a classifier.

    Rows without an ``Age`` are dropped, ``Sex`` is encoded as
    0 (male) / 1 (female), and ``Embarked`` is one-hot encoded. The result
    is split 70/30, stratified on ``Survived``, with a fixed random seed.
    The caller's DataFrame is not modified.

    Args:
        df: DataFrame with the columns ``Pclass``, ``Sex``, ``Age``,
            ``SibSp``, ``Parch``, ``Fare``, ``Embarked`` and ``Survived``.

    Returns:
        ``(X_train, X_test, y_train, y_test, X_keys)`` where the ``X_*``
        values are float arrays and ``X_keys`` lists the feature names in
        column order.
    """
    feature_columns = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
                       'Embarked_C', 'Embarked_Q', 'Embarked_S']

    # dropna returns a new frame, so later steps never touch the input.
    df = df.dropna(subset=['Age'])
    # Assign the encoded column explicitly: a chained
    # ``df['Sex'].replace(..., inplace=True)`` does not update ``df`` under
    # pandas Copy-on-Write. Values other than 'male'/'female' become NaN.
    df = df.assign(Sex=df['Sex'].map({'male': 0, 'female': 1}))
    df = pd.get_dummies(df, columns=['Embarked'])

    # A subset may not contain every embarkation port; reindex adds the
    # missing dummy columns as zeros instead of raising KeyError.
    X_df = df.reindex(columns=feature_columns, fill_value=0)
    X_keys = list(X_df.columns)
    # Newer pandas emits boolean dummies; cast so the matrix is numeric
    # rather than an object array.
    X_values = X_df.to_numpy(dtype=float)
    y_values = df['Survived'].to_numpy()

    X_train, X_test, y_train, y_test = train_test_split(
        X_values, y_values, test_size=0.3, random_state=1, stratify=y_values)
    return X_train, X_test, y_train, y_test, X_keys

## Naive approach: use Decision Tree

In [7]:
def train_model(X_train, y_train, X_test, y_test):
    """Fit one depth-3 entropy decision tree per CV fold and return the best.

    The training data is split with a shuffled 5-fold stratified CV. Each
    fold's tree is scored on its validation part and on the test set (both
    rounded to two decimals), and the per-fold scores are printed.

    Note: the selection score is ``val_acc * test_acc``, so the test set
    takes part in choosing the returned model.

    Args:
        X_train, y_train: Training features and labels as arrays that
            support integer-array indexing.
        X_test, y_test: Held-out features and labels.

    Returns:
        The fitted ``DecisionTreeClassifier`` with the highest selection
        score (the first one on ties).
    """
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    candidates = []
    selection_scores = []

    for fold, (fit_rows, held_rows) in enumerate(splitter.split(X_train, y_train)):
        clf = DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=0)
        clf.fit(X_train[fit_rows, :], y_train[fit_rows])

        val_acc = round(clf.score(X_train[held_rows, :], y_train[held_rows]), 2)
        test_acc = round(clf.score(X_test, y_test), 2)
        print(f'[{fold}] val acc={val_acc}, test acc={test_acc}')

        candidates.append(clf)
        selection_scores.append(val_acc * test_acc)

    best_idx = np.argmax(selection_scores)
    print(f'{best_idx=}')
    return candidates[best_idx]

## Train and evaluation for single

In [8]:
# Prepare the single-passenger subset, pick a tree via train_model, and
# evaluate it on the held-out test split.
X_train_s, X_test_s, y_train_s, y_test_s, feature_names = prepare_data(single_df)
model = train_model(X_train_s, y_train_s, X_test_s, y_test_s)
y_pred = model.predict(X_test_s)
# Rows of the confusion matrix are true labels, columns are predictions.
conf_mat = confusion_matrix(y_test_s, y_pred)
print(conf_mat)
# The diagonal counts the correct predictions, so trace / total is the accuracy.
print('acc:', np.trace(conf_mat) / len(y_test_s))

# Print the learned decision rules as indented text.
r = export_text(model, feature_names=feature_names)
print(r)

[0] val acc=0.82, test acc=0.79
[1] val acc=0.78, test acc=0.78
[2] val acc=0.7, test acc=0.79
[3] val acc=0.85, test acc=0.79
[4] val acc=0.85, test acc=0.82
best_idx=4
[[79 11]
 [15 40]]
acc: 0.8206896551724138
|--- Sex <= 0.50
|   |--- Fare <= 26.14
|   |   |--- Age <= 5.71
|   |   |   |--- class: 1
|   |   |--- Age >  5.71
|   |   |   |--- class: 0
|   |--- Fare >  26.14
|   |   |--- Age <= 43.50
|   |   |   |--- class: 1
|   |   |--- Age >  43.50
|   |   |   |--- class: 0
|--- Sex >  0.50
|   |--- Pclass <= 2.50
|   |   |--- Fare <= 29.36
|   |   |   |--- class: 1
|   |   |--- Fare >  29.36
|   |   |   |--- class: 1
|   |--- Pclass >  2.50
|   |   |--- Age <= 34.00
|   |   |   |--- class: 1
|   |   |--- Age >  34.00
|   |   |   |--- class: 0



## Train and evaluation for family

In [9]:
# Prepare the family subset, pick a tree via train_model, and evaluate it
# on the held-out test split.
X_train_f, X_test_f, y_train_f, y_test_f, feature_names = prepare_data(family_df)
model = train_model(X_train_f, y_train_f, X_test_f, y_test_f)
y_pred = model.predict(X_test_f)
# Rows of the confusion matrix are true labels, columns are predictions.
conf_mat = confusion_matrix(y_test_f, y_pred)
print(conf_mat)
# The diagonal counts the correct predictions, so trace / total is the accuracy.
print('acc:', np.trace(conf_mat) / len(y_test_f))

# Print the learned decision rules as indented text.
r = export_text(model, feature_names=feature_names)
print(r)

[0] val acc=0.85, test acc=0.84
[1] val acc=0.79, test acc=0.83
[2] val acc=0.76, test acc=0.83
[3] val acc=0.78, test acc=0.84
[4] val acc=0.84, test acc=0.84
best_idx=0
[[35  2]
 [ 9 24]]
acc: 0.8428571428571429
|--- Pclass <= 2.50
|   |--- Sex <= 0.50
|   |   |--- Age <= 17.50
|   |   |   |--- class: 1
|   |   |--- Age >  17.50
|   |   |   |--- class: 0
|   |--- Sex >  0.50
|   |   |--- Fare <= 135.78
|   |   |   |--- class: 1
|   |   |--- Fare >  135.78
|   |   |   |--- class: 1
|--- Pclass >  2.50
|   |--- Fare <= 20.55
|   |   |--- Age <= 11.75
|   |   |   |--- class: 1
|   |   |--- Age >  11.75
|   |   |   |--- class: 0
|   |--- Fare >  20.55
|   |   |--- Fare <= 31.33
|   |   |   |--- class: 0
|   |   |--- Fare >  31.33
|   |   |   |--- class: 0

