In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# importing pandas_profiling changes backend of matplotlib to Agg
#import pandas_profiling

In [None]:
# Load the training and test splits from the current working directory.
train_df, test_df = (pd.read_csv(path) for path in ('train.csv', 'test.csv'))

In [None]:
# Column labels of the training frame (what DataFrame.keys() returns).
train_df.columns

In [None]:
#train_df.profile_report()

In [None]:
def get_surname(item):
    """Return the part of ``item['Name']`` that precedes the first comma.

    When the name contains no comma the whole name is returned.
    """
    surname, _, _ = item['Name'].partition(',')
    return surname

def get_n_members(item):
    """Return ``item['SibSp'] + item['Parch'] + 1``.

    The extra 1 counts the passenger themself.
    """
    relatives_aboard = item['SibSp'] + item['Parch']
    return relatives_aboard + 1  # self

def get_ticker_numbers(passengers):
    """Collect the distinct ``'Ticket'`` values of ``passengers``.

    Args:
        passengers: iterable of mappings that each have a ``'Ticket'`` key.

    Returns:
        The ticket itself when every passenger shares exactly one ticket,
        otherwise the set of distinct tickets. An empty ``passengers``
        yields an empty set (previously this raised ``KeyError`` from
        popping an empty set).
    """
    tickets = {p['Ticket'] for p in passengers}
    if len(tickets) == 1:
        return tickets.pop()
    return tickets

## Show family info grouped by surname only

In [None]:
df_concat = pd.concat([train_df, test_df])

# Maps a group key to the list of passenger rows in that group.
# Passengers travelling with relatives are grouped under their surname;
# solo travellers each get their own '<surname>#<n>' key.
families = {}

for _, item in df_concat.iterrows():
    fam_name = get_surname(item)
    # XXX: The credibility of n_members is questionable. There may be some typos...
    n_members = get_n_members(item)
    if n_members <= 1:
        # Take the first unused '<surname>#<n>' slot and stop there, so the
        # passenger is stored exactly once and later solo travellers with
        # the same surname still find a free slot.
        cnt = 0
        while f'{fam_name}#{cnt}' in families:
            cnt += 1
        families[f'{fam_name}#{cnt}'] = [item]
    else:
        families.setdefault(fam_name, []).append(item)

# Print one summary line per group that holds more than one passenger:
# the group size, the size implied by the first member's SibSp/Parch,
# and the ticket(s) the group travelled on.
for fam_name, members in families.items():
    n_member = len(members)
    if n_member > 1:
        n_member2 = get_n_members(members[0])
        ticker_numbers = get_ticker_numbers(members)
        print(f'{fam_name=}, {n_member=}, {n_member2=} {ticker_numbers=}')

## Show family info grouped by surname and ticket number

In [None]:
def family_like(item, fam_members, max_ticket_gap=2):
    """Tell whether ``item`` plausibly belongs with ``fam_members`` by ticket.

    A passenger matches when their ``'Ticket'`` equals a member's ticket, or
    when both tickets are purely numeric and differ by at most
    ``max_ticket_gap``.

    Args:
        item: mapping with a string ``'Ticket'`` entry.
        fam_members: iterable of mappings with a string ``'Ticket'`` entry.
        max_ticket_gap: largest numeric ticket difference that still counts
            as a match (default 2).

    Returns:
        True if any member matches, False otherwise (including when
        ``fam_members`` is empty).
    """
    ticket = item['Ticket']
    for member in fam_members:
        mem_ticket = member['Ticket']
        if ticket == mem_ticket:
            return True
        # isdecimal() rather than isdigit(): isdigit() also accepts
        # characters such as superscript digits that int() cannot parse.
        if (ticket.isdecimal() and mem_ticket.isdecimal()
                and abs(int(ticket) - int(mem_ticket)) <= max_ticket_gap):
            return True
    return False

# Maps surname -> {'<surname>#<n>': [passenger rows]}. Passengers sharing a
# surname are split into sub-groups according to family_like().
families = {}

for _, item in train_df.iterrows():
    fam_name = get_surname(item)
    fam_dict = families.setdefault(fam_name, {})
    # Sub-group keys are created sequentially ('#0', '#1', ...), so walking
    # the dict in insertion order visits them in that same order.
    for members in fam_dict.values():
        if family_like(item, members):
            members.append(item)
            break
    else:
        # No existing sub-group matched: open the next one. There is no
        # upper bound on the number of sub-groups, so no passenger is dropped.
        fam_dict[f'{fam_name}#{len(fam_dict)}'] = [item]

# Print every sub-group with at least two passengers, one line per passenger.
for fam_name, subfamilies in families.items():
    for passengers in subfamilies.values():
        if len(passengers) < 2:
            continue
        print(f'[{fam_name}]')
        for p in passengers:
            ticket = p['Ticket']
            n_member = get_n_members(p)
            age = p['Age']
            survived = int(p['Survived'])
            name = p['Name']
            # Names of 40 or more characters are cut to 40 and marked with '...'.
            name = name if len(name) < 40 else name[:40] + '...'
            print(f' {ticket=}, {n_member=}, {age=}, {survived=}, {name=}')

## Drop rows without an age

In [None]:
# Report how many rows lack an age, as a count and as a percentage.
n_rows = len(train_df)
n_missing_age = int(train_df['Age'].isna().sum())
print(n_missing_age, n_rows)
print(f"Age: {round(n_missing_age/n_rows*100, 2)}% are NaN")

# Keep only the rows whose age is known.
train_df = train_df.dropna(subset=['Age'])
print(len(train_df))

## Fill missing Embarked values and show the distinct values per column

In [None]:
#train_df['Age'].fillna(train_df['Age'].mean().round(), inplace=True)
# Replace missing Embarked values with the most frequent one. The result is
# assigned back to train_df instead of calling fillna(inplace=True) on the
# selected column: that chained in-place form is deprecated and does not
# update the parent frame when pandas Copy-on-Write is active.
embarked_mode = train_df['Embarked'].mode().iloc[0]
train_df = train_df.fillna({'Embarked': embarked_mode})

# Show the distinct values of every column except the listed ones.
for k, v in train_df.items():
    if k in ['PassengerId', 'Name', 'Ticket', 'Fare', 'Cabin']:
        continue
    print(k, set(v))

In [None]:
# Restrict the frame to the columns used by the plots and tables below.
feature_columns = ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
sub_df = train_df[feature_columns]

In [None]:
# Overlay the age distributions of the two Survived classes (0 then 1).
for survived_value, bar_color in ((0, '#ffbb00'), (1, '#ff00ff')):
    ages = sub_df.loc[sub_df['Survived'] == survived_value, 'Age']
    plt.hist(ages, bins=30, alpha=0.5, label=str(survived_value), color=bar_color)
plt.xlabel('Age')
plt.ylabel('count')
plt.legend(title='Survived')
plt.show()

In [None]:
# Passenger counts per SibSp value, split by Survived.
sns.countplot(x='SibSp', hue='Survived', data=sub_df)
# Legend title corrected from the misspelled 'Survuved'.
plt.legend(loc='upper right', title='Survived')
plt.show()

## Show passengers with Pclass == 1

In [None]:
# First 30 rows with Pclass == 1.
sub_df.loc[sub_df['Pclass'] == 1].head(30)

## Show the share of each Pclass within each sex

In [None]:
# Relative frequency of each Pclass within each Sex. Because of
# normalize=True the "percentage" column holds fractions in [0, 1],
# not values out of 100.
pclass_share_by_sex = sub_df.groupby('Sex')['Pclass'].value_counts(normalize=True)
df2 = pclass_share_by_sex.rename("percentage").reset_index()

df2

In [None]:
# One bar panel per sex (male first), Pclass 1-3 on the x axis and the
# "percentage" column of df2 on the y axis.
sns.catplot(
    data=df2,
    kind='bar',
    x='Pclass',
    y='percentage',
    order=[1, 2, 3],
    col='Sex',
    col_order=['male', 'female'],
)