In [104]:
import kaggle
import pandas as pd
import numpy as np
import sklearn as sk
from sklearn import tree
from sklearn import ensemble
from sklearn.inspection import plot_partial_dependence
import matplotlib.pyplot as plt

In [105]:
def name_to_status(df):
    title_dict = {
    "Capt": "Officer",
    "Col": "Officer",
    "Major": "Officer",
    "Jonkheer": "Royalty",
    "Don": "Royalty",
    "Dona": "Royalty",
    "Sir" : "Royalty",
    "Dr": "Officer",
    "Rev": "Officer",
    "the Countess":"Royalty",
    "Mme": "Mrs",
    "Mlle": "Miss",
    "Ms": "Mrs",
    "Mr" : "Mr",
    "Mrs" : "Mrs",
    "Miss" : "Miss",
    "Master" : "Master",
    "Lady" : "Royalty"}
    
    titles = []
    for name in df['Name']:
        title = name.split(',')[-1].split('.')[0]
        titles.append(title[1:])
    df['Name'] = titles
    
    
    # a map of more aggregated title
    # we map each title
    df['Name'] = df.Name.map(title_dict)
    df.rename(columns = {'Name':'Status'}, inplace = True)
    return df

In [106]:
def add_ticket_type(df):
    
    ticket_type_dict = {
        'SCA4': 'A',
        'SCA3': 'A',
        'A4': 'A',
        'A': 'A',
        'AQ3': 'A',
        'AQ4': 'A',
        'SP': 'A',
        'SOP': 'A',
        'FA': 'A',
        'SCOW': 'A',
        'AS': 'A',
        'SOPP': 'A',
        'FC': 'A',
        'SOTONO2': 'A', 
        'CASOTON': 'A',
        'A5': 'A',
        'WC': 'A',
        'SOTONOQ': 'A',
        'STONOQ': 'A',
        'PC': 'B',
        'STONO2': 'B',
        'PP': 'B',
        'SCPARIS': 'B',
        'CA': 'B',
        'SOC': 'B',
        'C': 'B',
        'FCC': 'B',
        'SWPP': 'B',
        'SC': 'B',
        'STONO': 'B',
        'SCAH': 'B',
        'WEP': 'B',
        'PPP': 'B',
        'LP': 'B',
        'BASIC': 'C',
    }

    ticket_types = []
    for ticket in df['Ticket']:
        if pd.notna(ticket): 
            ticket_split = ticket.split(' ')
            if len(ticket_split) > 1:
                ticket_type = ticket_split[0]
                ticket_type = ticket_type.replace('.','')
                ticket_type = ticket_type.replace('/','')
                ticket_type = ticket_type.upper()
                ticket_types.append(ticket_type)
            else: ticket_types.append('BASIC')
        else: ticket_types.append(ticket)
    df['TicketType'] = ticket_types
#     df['TicketType'] = df.TicketType.map(ticket_type_dict)
    return df

In [107]:
# def add_ticket_type(df):
#     ticket_types = []
#     for ticket in df['Ticket']:
#         if pd.notna(ticket): 
#             ticket_split = ticket.split(' ')
#             if len(ticket_split) > 1:
#                 ticket_types.append(ticket_split[0])
#             else: ticket_types.append('basic')
#         else: ticket_types.append(ticket)
#     df['TicketType'] = ticket_types
# #     df.rename(columns = {'Ticket':'TicketType'}, inplace = True)
#     return df

In [108]:
def cabin_to_deck(df):
    decks = []
    for cabin in df['Cabin']:
        if pd.notna(cabin): decks.append(cabin[0])
        else: decks.append('Basic')
    df['Cabin'] = decks
    df.rename(columns = {'Cabin':'Deck'}, inplace = True)
    return df

In [238]:
def add_ticket_number(df):
    ticket_numbers = []
    for ticket in df['Ticket']:
        if ticket == 'LINE': ticket_numbers.append(0)
        elif pd.notna(ticket):
            ticket_split = ticket.split(' ')
            ticket_numbers.append(int(ticket_split[-1]))
        else: ticket_numbers.append(ticket)
    ticket_numbers_arr = np.array(ticket_numbers) #Experiment
    uni, count = np.unique(ticket_numbers_arr,return_counts=True)
    aux = uni[count < 2].tolist()
    for i, value in enumerate(ticket_numbers_arr):
        if value in aux:
            ticket_numbers_arr[i] = 0
#     ticket_numbers_arr[ticket_numbers_arr < 2] = 0 #Experiment
    df['TicketNumber'] = ticket_numbers_arr #Experiment
    return df

In [239]:
def add_group_size(df):
    group_size = []
    for i, row in df.iterrows():
        group_size.append(len((df['TicketNumber'] == row['TicketNumber']).values.nonzero()[0]))
    df['GroupSize'] = group_size
    return df

In [240]:
def add_number_of_friends(df):
    number_of_friends = []
    for i, row in df.iterrows():
        number_of_friends.append(row['GroupSize'] - row['FamSize'])
    df['NumberOfFriends'] = number_of_friends
    return df

In [241]:
def add_age_estimated(df):
    age_estimated = []
    for i in range(len(df)):
        if df['Age'].values[i] != df['Age'].values[i]:
            age_estimated.append(0)
        elif df['Age'].values[i] != int(df['Age'].values[i]) and df['Age'].values[i] > 1:
            age_estimated.append(1)
        else: 
            age_estimated.append(0)
    df['AgeEstimated'] = age_estimated
    return df

In [242]:
def add_fam_size(df):
    fam_size  = []
    for i in range(len(df)):
        fam_size.append(df['SibSp'][i]+df['Parch'][i])
    df['FamSize'] = fam_size
    return df

In [243]:
def add_fare_per_person(df):
    fare_per_person = []
    for i in range(len(df)):
        fare_per_person.append(df['Fare'][i]/(df['FamSize'][i]+1))
    df['FarePerPerson'] = fare_per_person
    return df

In [244]:
def fill_ages(df, train_index):
    ages = []
    grouped_train = df.iloc[:train_index].groupby(['Sex','Pclass','Status'])
    grouped_median_train = grouped_train.median()
    grouped_median_train = grouped_median_train.reset_index()[['Sex', 'Pclass', 'Status', 'Age']]
    for i, row in df.iterrows():
        if row['Age'] != row['Age']: 
            condition = (
                (grouped_median_train['Sex'] == row['Sex']) & 
                (grouped_median_train['Status'] == row['Status']) & 
                (grouped_median_train['Pclass'] == row['Pclass'])
            ) 
            ages.append(grouped_median_train[condition]['Age'].values[0])
        else: ages.append(row['Age'])
    df['Age'] = ages
    return df

In [245]:
def add_age_times_class(df):
    age_times_class = []
    for i in range(len(df)):
        age_times_class.append(df['Pclass'][i]*df['Age'][i])
    df['AgeClass'] = age_times_class
    return df

In [246]:
def add_life_stage(df):
    minor = np.zeros(len(df), 'int')
    adult = np.zeros(len(df), 'int')
    elderly = np.zeros(len(df), 'int')

    for i, age in enumerate(df['Age'].values):
        if age < 18:
            minor[i] = 1
        elif age < 60:
            adult[i] = 1
        else:
            elderly[i] = 1

    df['Minor'] = minor
    df['Adult'] = adult
    df['Elderly'] = elderly
    return df

In [247]:
def add_fam_size_cat(df):
    singleton = np.zeros(len(df), 'int')
    small_fam = np.zeros(len(df), 'int')
    large_fam = np.zeros(len(df), 'int')

    for i, fam_size in enumerate(df['FamSize'].values):
        if fam_size == 1:
            singleton[i] = 1
        elif fam_size <= 4:
            small_fam[i] = 1
        else:
            large_fam[i] = 1

    df['Singleton'] = singleton
    df['SmallFam'] = small_fam
    df['LargeFam'] = large_fam
    return df

In [248]:
def categorify(df, columns):
    rectifier = 1
    for column in columns:
        category_dict = {}
        unique_categories = df[column].unique()
        unknown_index = (unique_categories != unique_categories).nonzero()[0]
        unique_categories = unique_categories.tolist()

        if len(unknown_index): 
            category_indexes = list(range(len(unique_categories)))
            unknown = unique_categories[unknown_index[0]]
            category_indexes[0] = unknown
            unique_categories.pop(unknown_index[0])
            unique_categories.insert(0,unknown)

        else:
            category_indexes = list(range(1,len(unique_categories)+1))

        for i in range(len(unique_categories)):
            category_dict[unique_categories[i]] = category_indexes[i]

        new_column = []
        for i in range(len(df)):
            new_value = category_dict[df[column][i]]
            if new_value == new_value: new_value = int(new_value)
            new_column.append(new_value)

        df[column] = new_column
#         df[column] = new_column.astype('category')
    return df

In [249]:
def fill_missing(df, ignore_columns):
    for column in df.columns:
        if column in ignore_columns: continue
        elif df[column].isna().any():
            median = df[column].describe()['50%']
            nan_indexes = df[column].isna().values
            new_column = np.array(df[column])
            new_column[nan_indexes] = median
#             new_column = new_column.astype('int')
            df[column] = new_column
            df[f'{column}Missing'] = nan_indexes 
    return df

In [250]:
def integerize(df, columns):
    for column in columns:
        df = df.astype({column: 'int'})
    return df

In [251]:
def metric(predictions, targets):
    return 100*(len((predictions == targets).nonzero()[0]) / len(predictions))

In [252]:
def rf_feat_importance(m, df):
    return pd.DataFrame({'cols':df.columns, 'imp':m.feature_importances_}
    ).sort_values('imp', ascending=False)

In [253]:
# def one_hot_encode(df, key):
#     for i in range(len(df[key].unique())):
#         df[f'{key}_{i+1}'] = np.zeros(len(df), 'int')

#     for i in range(len(df)):
#         value = df[key].values[i]
#         df[f'{key}_{value}'][i] = 1
#     df = df.drop([key], axis=1)
#     return df

In [254]:
def one_hot_encode(df, key):
    array_list = []
    for i in range(len(df[key].unique())):
#         df[f'{key}_{i+1}'] = np.zeros(len(df), 'int')
        array_list.append(np.zeros(len(df),'int'))
        
    for i in range(len(df)):
        value = df[key].values[i]
        array_list[value-1][i] = 1
    
    for i in range(len(df[key].unique())):
        df[f'{key}_{i+1}'] = array_list[i]
        
    df = df.drop([key], axis=1)
    return df

In [255]:
path = '/home/francisco/workspace/titanic_kaggle/titanic'

In [256]:
original_train_df = pd.read_csv(f'{path}/train.csv',low_memory=False)
original_test_df = pd.read_csv(f'{path}/test.csv',low_memory=False)

In [257]:
complete_df = pd.concat([original_train_df, original_test_df], ignore_index=True)

In [258]:
complete_df = name_to_status(complete_df)
# complete_df = add_ticket_type(complete_df)
complete_df = add_ticket_number(complete_df)
# complete_df = add_group_size(complete_df)
complete_df = complete_df.drop(['Ticket'], axis=1)
complete_df = cabin_to_deck(complete_df)
complete_df = add_age_estimated(complete_df)
complete_df['AgeMissing'] = np.zeros(len(complete_df),'int')
complete_df['AgeMissing'][(complete_df['Age'] != complete_df['Age'])] = 1
complete_df = fill_ages(complete_df,891)
# complete_df = add_life_stage(complete_df)
complete_df = add_fam_size(complete_df)
# complete_df = add_number_of_friends(complete_df)
# complete_df = add_fam_size_cat(complete_df)
complete_df['Fare'] = complete_df['Fare'].fillna(complete_df['Fare'].median())
complete_df = add_fare_per_person(complete_df)
complete_df = complete_df.drop(['Fare'], axis=1)
complete_df = add_age_times_class(complete_df)
complete_df['Embarked'] = complete_df['Embarked'].fillna('S')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  complete_df['AgeMissing'][(complete_df['Age'] != complete_df['Age'])] = 1


In [259]:
# data = complete_df
# figure = plt.figure(figsize=(25, 7))
# plt.hist([data[data['Survived'] == 1]['Sex'], data[data['Survived'] == 0]['Sex']], 
#          stacked=True, color = ['g','r'],
#          bins = 50, label = ['Survived','Dead'])
# plt.xlabel('Fare')
# plt.ylabel('Number of passengers')
# plt.legend();

In [260]:
# A -> SCA4 A4 SP SOP FA SCOW AS SOPP FC SOTONO2 CASOTON A5 WC SOTONOQ
# B -> PC STONO2 PP SCPARIS CA SOC C FCC SWPP SC STONO SCAH WEP PPP
# C -> BASIC

In [261]:
# complete_df['TicketType'].iloc[891:].unique()

In [262]:
complete_df = categorify(complete_df,['Status', 'Deck', 'Sex', 'Embarked', 'TicketNumber'])
# complete_df = fill_missing(complete_df, ['Survived'])

In [263]:
for key in ['Pclass', 'Status', 'Deck', 'Embarked', 'TicketNumber']:
    complete_df = one_hot_encode(complete_df, key)

In [271]:
train_df = complete_df.iloc[:891]
valid_df = complete_df.iloc[712:891].reset_index()
test_df = complete_df.iloc[891:].reset_index()

In [272]:
# train_df = integerize(train_df, cat)
# valid_df = integerize(valid_df, cat)
# test_df = integerize(test_df, cat[1:])

In [273]:
# train_men_df = train_df[train_df['Sex'] == 2].drop(['Sex'], axis=1)
# valid_men_df = valid_df[valid_df['Sex'] == 2].drop(['Sex'], axis=1)
# train_women_df = train_df[train_df['Sex'] == 1].drop(['Sex'], axis=1)
# valid_women_df = valid_df[valid_df['Sex'] == 1].drop(['Sex'], axis=1)

In [274]:
parameters = {'bootstrap': False, 'min_samples_leaf': 3, 'n_estimators': 10000, 
              'min_samples_split': 10, 'max_features': 'sqrt', 'max_depth': 6}

rf = ensemble.RandomForestClassifier(**parameters)
# rf = ensemble.RandomForestClassifier(n_estimators=10000)

In [275]:
# #Fit man
# rf.fit(train_men_df.values[:,2:],train_men_df.values[:,1])
# preds_rf = rf.predict(valid_men_df.values[:,3:])
# metric(preds_rf.astype('int'), valid_men_df['Survived'].values)

In [276]:
# #Fit women
# rf.fit(train_women_df.values[:,2:],train_women_df.values[:,1])
# preds_rf = rf.predict(valid_women_df.values[:,3:])
# metric(preds_rf.astype('int'), valid_women_df['Survived'].values)

In [277]:
#Fit all data
rf.fit(train_df.values[:,2:],train_df.values[:,1])
preds_rf = rf.predict(valid_df.values[:,3:])
metric(preds_rf.astype('int'), valid_df['Survived'].values)

90.5027932960894

88.8268156424581

In [178]:
# original_train_df

In [62]:
rf_feat_importance(rf,valid_df.iloc[:,3:])
# rf_feat_importance(rf,train_df.drop(['Survived'], axis=1))

Unnamed: 0,cols,imp
14,Status_1,0.193558
0,Sex,0.188042
10,AgeClass,0.077365
16,Status_3,0.074504
15,Status_2,0.063069
...,...,...
45,TicketType_14,0.000000
41,TicketType_10,0.000000
39,TicketType_8,0.000000
28,Deck_9,0.000000


In [34]:
# valid_df['Sex'].hist()

In [86]:
# fig,ax = plt.subplots(figsize=(12, 4))
# plot_partial_dependence(rf, valid_df.iloc[:,3:], ['Age'],
# grid_resolution=20, ax=ax);

***

In [278]:
# preds_df = pd.DataFrame()
preds_rf_test = rf.predict(test_df.values[:,3:])
preds_rf_df = pd.DataFrame()

# preds_df['PassengerId'] = test_df['PassengerId'].values
# preds_df['Survived'] = preds_clf.astype('int')
preds_rf_df['PassengerId'] = test_df['PassengerId'].values
preds_rf_df['Survived'] = preds_rf_test.astype('int')

In [279]:
preds_rf_df.to_csv(f'{path}/submission_rf.csv',index=False)

***

In [54]:
preds_train = rf.predict(train_df.values[:,2:])

In [55]:
gt = train_df['Survived'].values

In [56]:
np.count_nonzero(preds_train == gt)/len(gt)

0.9876543209876543