# Titanic problem

## Use LightGBM

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import confusion_matrix

In [2]:
train_df = pd.read_csv('train.csv')

In [3]:
def get_surname(item):
    return item['Name'].split(',')[0]

def get_n_members(item):
    return (item['SibSp']+item['Parch'] +
           1) # self

def get_passengers_ticket_numbers(passengers):
    tickets = {p['Ticket'] for p in passengers}
    if len(tickets) <= 1:
        return tickets.pop()
    else:
        return tickets

# Holders of close ticket numbers are likely to be family members.
def family_like(item, fam_members):
    ticket = item['Ticket']
    for member in fam_members:
        mem_ticket = member['Ticket']
        if ticket == mem_ticket:
            return True
        if ticket.isdigit() and mem_ticket.isdigit() and abs(int(ticket) - int(mem_ticket)) <= 2:
            return True
    return False

def group_families(df):
    families = {}

    for i, (column_name, item) in enumerate(df.iterrows()):
        fam_name = get_surname(item)
        ticket_num = item['Ticket']
        fam_dict = families.setdefault(fam_name, {})
        for cnt in range(100):
            fam_name_mod = f'{fam_name}#{cnt}'
            if fam_name_mod in fam_dict:
                if family_like(item, fam_dict[fam_name_mod]):
                    fam_dict[fam_name_mod].append(item)
                    break
            else:
                fam_dict.setdefault(fam_name_mod, []).append(item)
                break
    return families

def split_single_family_passengers(df, families):
    single_indices = set()
    family_indices = set()
    for fam_name, subfamilies in families.items():
        for _, passengers in subfamilies.items():
            if len(passengers) <= 1:
                single_indices.add(passengers[0]['PassengerId'])
            else:
                for p in passengers:
                    family_indices.add(p['PassengerId'])
    return df[df['PassengerId'].isin(sorted(single_indices))], df[df['PassengerId'].isin(sorted(family_indices))]

families = group_families(train_df)
single_df, family_df = split_single_family_passengers(train_df, families)

## Prepare data

In [4]:
def prepare_data(df):
    # This makes df['Sex'] viewing instead of copying...?
    df = pd.DataFrame(df)
    df = df.dropna(subset=['Age'])
    df['Sex'].replace(['male', 'female'], [0, 1], inplace=True)
    df = pd.get_dummies(df, columns=['Embarked'])

    X_df = df[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked_C', 'Embarked_Q', 'Embarked_S']]
    y_df = df[['Survived']]
    X_keys = list(X_df.keys())
    X_values = X_df.values
    y_values = np.squeeze(y_df.values)

    X_train, X_test, y_train, y_test = train_test_split(
        X_values, y_values, test_size=0.3, random_state=1, stratify=y_values)
    return X_train, X_test, y_train, y_test, X_keys

## Use LightGBM

In [5]:
def prob2index(prob):
    return np.array([np.argmax(ps) for ps in prob])

def train_model(X_train, y_train):
    params = {
        'objective':'multi:softprob',
        'eval_metric': 'mlogloss',
        'num_class': 2,
        'max_depth': 3,
        'eta': 0.1,
        'verbosity': 0,
        'random_state': 71,
    }
    
    params = {
        'objective':'multiclass',
        'metric': 'multi_logloss',
        'num_class': 2,
        'max_depth': 3,
        'learning_rate': 0.1,
        'verbosity': -1,
        'early_stopping_round': 10,
    }
    
    accs = []
    models = []

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

    for fold_id, (train_idx, val_idx) in enumerate(cv.split(X_train, y_train)):
        X_tr  = X_train[train_idx, :]
        X_val = X_train[val_idx, :]
        y_tr  = y_train[train_idx]
        y_val = y_train[val_idx]
        dtrain = lgb.Dataset(X_tr, label=y_tr)
        dvalid = lgb.Dataset(X_val, label=y_val)

        callbacks = [
            lgb.log_evaluation(0),
            lgb.early_stopping(10),
        ]
        gbm = lgb.train(params, dtrain, num_boost_round=10, valid_sets=dvalid, callbacks=callbacks)
        y_pred = prob2index(gbm.predict(X_val))
        conf_mat = confusion_matrix(y_val, y_pred)
        val_acc = np.trace(conf_mat) / len(y_val)
        accs.append(val_acc)
        models.append(gbm)
        print(f'[{fold_id}] val acc={val_acc}')

    best_idx = np.argmax(accs)
    best_model = models[best_idx]
    print(f'{best_idx=}')
    return best_model

## Train and evaluation for single

In [6]:
X_train_s, X_test_s, y_train_s, y_test_s, _ = prepare_data(single_df)
model = train_model(X_train_s, y_train_s)
y_pred = prob2index(model.predict(X_test_s))
conf_mat = confusion_matrix(y_test_s, y_pred)
print(conf_mat)
print('acc:', np.trace(conf_mat) / len(y_test_s))

Training until validation scores don't improve for 10 rounds
[0] val acc=0.8235294117647058
Training until validation scores don't improve for 10 rounds
[1] val acc=0.7761194029850746
Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[10]	valid_0's multi_logloss: 0.521598
[2] val acc=0.746268656716418
Training until validation scores don't improve for 10 rounds
[3] val acc=0.835820895522388
Training until validation scores don't improve for 10 rounds
[4] val acc=0.8507462686567164
best_idx=4
[[84  6]
 [27 28]]
acc: 0.7724137931034483


## Train and evaluation for family

In [7]:
X_train_f, X_test_f, y_train_f, y_test_f, _ = prepare_data(family_df)
model = train_model(X_train_f, y_train_f)
y_pred = prob2index(model.predict(X_test_f))
conf_mat = confusion_matrix(y_test_f, y_pred)
print(conf_mat)
print('acc:', np.trace(conf_mat) / len(y_test_f))

Training until validation scores don't improve for 10 rounds
[0] val acc=0.7575757575757576
Training until validation scores don't improve for 10 rounds
[1] val acc=0.7272727272727273
Training until validation scores don't improve for 10 rounds
[2] val acc=0.8181818181818182
Training until validation scores don't improve for 10 rounds
[3] val acc=0.84375
Training until validation scores don't improve for 10 rounds
[4] val acc=0.6875
best_idx=3
[[35  2]
 [ 8 25]]
acc: 0.8571428571428571
