# In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

from numpy import mean
from matplotlib.pyplot import figure

def process_data(data):
    """Run every feature-engineering step over ``data`` and return the result.

    The steps run in the order listed below and each one receives the frame
    returned by the previous one. The order is significant: several steps read
    columns created by earlier ones (for instance the middle-class step reads
    'Rich' and 'Poor'), and the final step drops 'Age', 'Ticket', 'Name' and
    'Cabin' once nothing needs them any more.
    """
    pipeline = (
        numerize_sex,
        create_rich_category,
        update_rich_category,
        create_poor_category,
        create_middleclass_category,
        create_rich_age_selection,
        create_middleclass_age_selection,
        create_poor_age_selection,
        create_familysize_category,
        create_familycombo_category,
        drop_passengerid_and_class,
        create_title_category,
        numerize_embarked,
        create_titleembarked_category,
        create_ticketcategory_category,
        drop_remaining_irrelevant_columns,
    )
    frame = data
    for step in pipeline:
        frame = step(frame)
    return frame

def numerize_sex(data):
    """Replace the 'Sex' column with a numeric code, in place.

    'male' becomes 0 and 'female' becomes 1. Values without an entry in the
    mapping become NaN, as ``Series.map`` does for unmapped keys.

    Returns the same ``data`` object.
    """
    sex_codes = {'male': 0, 'female': 1}
    encoded = data['Sex'].map(sex_codes)
    data['Sex'] = encoded
    return data

def create_rich_category(data):
    """Add a binary 'Rich' column derived from 'Fare', in place.

    A row gets 1 when its fare is at least 100 and 0 otherwise (a missing
    fare compares as False and therefore yields 0).

    The flag is built directly from the comparison rather than by seeding the
    column from 'Pclass' and patching it with ``> 100`` / ``< 100`` masks:
    with that approach a fare of exactly 100, or a missing fare, matched
    neither mask and left the passenger-class number in a column that is
    supposed to hold only 0 or 1.

    Returns the same ``data`` object.
    """
    data['Rich'] = (data['Fare'] >= 100).astype(int)
    return data

def update_rich_category(data):
    """Widen the 'Rich' flag: every row with a fare above 76 is set to 1.

    Rows with a fare of 76 or less, or with a missing fare, keep whatever
    'Rich' value they already have. The frame is modified in place and
    returned.
    """
    fare_above_cutoff = data['Fare'] > 76
    data.loc[fare_above_cutoff, 'Rich'] = 1
    return data

def create_poor_category(data):
    """Add a binary 'Poor' column derived from 'Fare', in place.

    A row gets 1 when its fare is below 10 and 0 otherwise (a missing fare
    compares as False and therefore yields 0).

    The flag is computed directly from the comparison instead of being seeded
    from 'Rich' and then overwritten: the ``< 10`` and ``>= 10`` masks never
    match a missing fare, so such rows used to keep the 'Rich' value as their
    'Poor' value.

    Returns the same ``data`` object.
    """
    data['Poor'] = (data['Fare'] < 10).astype(int)
    return data

def create_middleclass_category(data):
    """Add a binary 'MiddleClass' column, in place.

    A row gets 1 when its fare is above 10 and it is flagged neither 'Rich'
    nor 'Poor'; every other row gets 0. The fare bound is exclusive, so a fare
    of exactly 10 is not middle class (and, with 'Poor' defined as fare below
    10, not poor either).

    Only columns of ``data`` itself are read. An earlier version seeded the
    column from a name ``df`` that is not defined in this function, which
    either raised NameError or silently pulled values from an unrelated
    global frame.

    Returns the same ``data`` object.
    """
    fare_above_poor_line = data['Fare'] > 10
    is_rich = data['Rich'] == 1
    is_poor = data['Poor'] == 1
    is_middle = fare_above_poor_line & ~is_rich & ~is_poor
    data['MiddleClass'] = is_middle.astype(int)
    return data

def create_rich_age_selection(data):
    """Add 'AgeSelectionWhenRich': the 'Rich' flag refined by age, in place.

    The column starts as a copy of 'Rich'. Among rows where 'Rich' is 1,
    those younger than 28 are set to 1 and those older than 28 are set to 0.
    Rich rows whose age is exactly 28 or missing match neither comparison and
    keep the copied value; rows where 'Rich' is not 1 are never touched.

    Returns the same ``data`` object.
    """
    is_rich = data['Rich'] == 1
    ages = data['Age']

    data['AgeSelectionWhenRich'] = data['Rich']
    for flag, age_matches in ((1, ages < 28), (0, ages > 28)):
        data.loc[is_rich & age_matches, 'AgeSelectionWhenRich'] = flag
    return data

def create_middleclass_age_selection(data):
    """Add 'AgeSelectionWhenMiddle': 'MiddleClass' refined by age, in place.

    The column starts as a copy of 'MiddleClass'. Middle-class rows (flag
    equal to 1) younger than 6 are set to 1, and middle-class rows older than
    6 are set to 0. Middle-class rows aged exactly 6 or with a missing age
    keep the copied value, as do all rows that are not middle class.

    Returns the same ``data`` object.
    """
    in_middle_class = data['MiddleClass'] == 1
    younger_than_cutoff = in_middle_class & (data['Age'] < 6)
    older_than_cutoff = in_middle_class & (data['Age'] > 6)

    data['AgeSelectionWhenMiddle'] = data['MiddleClass']
    data.loc[younger_than_cutoff, 'AgeSelectionWhenMiddle'] = 1
    data.loc[older_than_cutoff, 'AgeSelectionWhenMiddle'] = 0
    return data

def create_poor_age_selection(data):
    """Add 'AgeSelectionWhenPoor': the 'Poor' flag refined by age, in place.

    The column starts as a copy of 'Poor'. Poor rows (flag equal to 1)
    younger than 14 are set to 1 and poor rows older than 14 are set to 0.
    Poor rows aged exactly 14 or with a missing age keep the copied value, and
    rows that are not poor are left alone.

    Returns the same ``data`` object.
    """
    target = 'AgeSelectionWhenPoor'
    is_poor = data['Poor'] == 1
    age = data['Age']

    data[target] = data['Poor']
    data.loc[is_poor & (age < 14), target] = 1
    data.loc[is_poor & (age > 14), target] = 0
    return data

def create_familysize_category(data):
    """Add 'FamilySize' as the row-wise sum of 'SibSp' and 'Parch', in place.

    Returns the same ``data`` object.
    """
    sibsp = data['SibSp']
    parch = data['Parch']
    data['FamilySize'] = sibsp + parch
    return data

def create_familycombo_category(data):
    """Add a binary 'FamilyCombo' column, in place.

    A row gets 1 when either of these holds, and 0 otherwise:

    * 'FamilySize' is 4 and 'Rich' is 1
    * 'FamilySize' is 6 and 'Poor' is 1

    The flag is derived straight from the two masks. Seeding the column from
    'Poor' and then zeroing only the rows where 'Poor' is exactly 0 or 1 would
    let any other 'Poor' value (for example NaN) survive in the result.

    Returns the same ``data`` object.
    """
    family_size = data['FamilySize']
    rich_combo = (family_size == 4) & (data['Rich'] == 1)
    poor_combo = (family_size == 6) & (data['Poor'] == 1)
    data['FamilyCombo'] = (rich_combo | poor_combo).astype(int)
    return data
    
def drop_passengerid_and_class(data):
    """Remove the 'PassengerId' and 'Pclass' columns from ``data`` in place.

    A column that is not present is skipped silently (``errors='ignore'``).
    Returns the same ``data`` object.
    """
    unwanted = ['PassengerId', 'Pclass']
    data.drop(columns=unwanted, errors='ignore', inplace=True)
    return data

def create_title_category(data):
    """Add a 'Title' column encoding the honorific found in 'Name', in place.

    The column starts as a copy of 'Name'. Each rule below is then applied in
    order: every row whose name contains the marker as a literal substring
    gets the paired integer code. A later rule overwrites an earlier one when
    both match. Rows whose name matches no marker keep the full name string,
    so the column can end up holding a mix of integers and text.

    Returns the same ``data`` object.
    """
    title_codes = (
        ('Mr.', 0),
        ('Master.', 1),
        ('Miss.', 2),
        ('Mrs.', 3),
        ('Dr.', 4),
        ('Rev.', 5),
        ('Don.', 6),
        ('Mme.', 7),
        ('Major.', 8),
        ('Ms.', 9),
        ('Lady.', 10),
        ('Col.', 11),
        ('Sir.', 12),
        ('Mlle.', 13),
        ('Countess.', 14),
        ('Capt.', 15),
        ('Jonkheer.', 16),
    )

    data['Title'] = data['Name']
    for marker, code in title_codes:
        # str.find does a plain substring search, so the dot is literal.
        has_marker = data['Name'].str.find(marker) > -1
        data.loc[has_marker, 'Title'] = code
    return data

def numerize_embarked(data):
    """Replace the 'Embarked' column with a numeric code, in place.

    'S' becomes 0, 'C' becomes 1 and 'Q' becomes 2. Values without an entry
    in the mapping become NaN, as ``Series.map`` does for unmapped keys.

    Returns the same ``data`` object.
    """
    port_codes = {'S': 0, 'C': 1, 'Q': 2}
    encoded = data['Embarked'].map(port_codes)
    data['Embarked'] = encoded
    return data

def create_titleembarked_category(data):
    """Add a binary 'TitleEmbarkedSelection' column, in place.

    A row gets 1 when it matches any of the combinations below and 0
    otherwise. 'Embarked' is expected to be numeric already (0 and 1 are the
    codes compared against) and 'Title' to hold the integer title codes.

    ===========  ========  =======================
    wealth flag  Embarked  Title codes
    ===========  ========  =======================
    Rich         0         1, 2, 3, 4, 14
    Rich         1         2, 3
    MiddleClass  0         9
    MiddleClass  1         1, 7, 10, 11, 12, 13
    Poor         1         1, 2, 3
    ===========  ========  =======================

    The flag is derived straight from the combined mask. Seeding the column
    from 'Poor' and then zeroing only rows where 'Poor' is exactly 0 or 1
    would let any other 'Poor' value (for example NaN) survive in the result.

    Returns the same ``data`` object.
    """
    title = data['Title']
    embarked_code_0 = data['Embarked'] == 0
    embarked_code_1 = data['Embarked'] == 1
    is_rich = data['Rich'] == 1
    is_middle = data['MiddleClass'] == 1
    is_poor = data['Poor'] == 1

    selected = (
        (is_rich & embarked_code_0 & title.isin([1, 2, 3, 4, 14]))
        | (is_rich & embarked_code_1 & title.isin([2, 3]))
        | (is_middle & embarked_code_0 & (title == 9))
        | (is_middle & embarked_code_1 & title.isin([1, 7, 10, 11, 12, 13]))
        | (is_poor & embarked_code_1 & title.isin([1, 2, 3]))
    )
    data['TitleEmbarkedSelection'] = selected.astype(int)
    return data

def create_ticketcategory_category(data):
    """Add a 'TicketCategory' column derived from 'Ticket', in place.

    The column starts as a copy of 'Ticket' and is refined in two passes:

    1. Marker pass: for each (marker, code) rule, rows whose ticket contains
       the marker as a literal substring get that code.
    2. Length pass: rows that still hold text made up only of numeric
       characters are coded by length (7 -> 0, 6 -> 1, 5 -> 2, 4 -> 3,
       3 -> 4).

    Rows matched by neither pass keep their original ticket text, so the
    column can contain a mix of integers and strings.

    Returns the same ``data`` object.
    """
    # Ordered on purpose: rules run top to bottom and a later match overwrites
    # an earlier one (e.g. a ticket containing both 'PC' and 'C ' ends up with
    # code 19, not 12).
    marker_codes = (
        ('STON', 5),
        ('SOTON', 6),
        ('SC.', 7),
        ('LINE', 8),
        ('C.A', 9),
        ('Paris', 10),
        ('PARIS', 11),
        ('PC', 12),
        ('PP', 13),
        ('W./C', 14),
        ('CA', 15),
        ('S.O./P.P.', 16),
        ('A/5', 17),
        ('A/4', 18),
        ('C ', 19),
        ('SO', 20),
        ('SC', 21),
        ('F.C.C', 22),
        ('F.C.', 23),
        ('A./5', 24),
        ('A.5', 25),
        ('Fa', 26),
        ('WE/P', 27),
        ('S.O.C', 28),
        ('W.E.P', 29),
        ('S.P', 30),
        ('S.C', 31),
        ('A/S', 32),
        ('A4.', 33),
        ('S.O.P.', 34),
    )
    # (number of characters, code) for tickets that are purely numeric.
    length_codes = ((7, 0), (6, 1), (5, 2), (4, 3), (3, 4))

    data['TicketCategory'] = data['Ticket']

    for marker, code in marker_codes:
        contains_marker = data['Ticket'].str.find(marker) > -1
        data.loc[contains_marker, 'TicketCategory'] = code

    # Build every length mask before assigning anything, so all of them see
    # the column exactly as the marker pass left it. Rows that already hold an
    # integer code are not strings; the .str methods give NaN for them and
    # they never match.
    category = data['TicketCategory']
    length_masks = [
        (category.str.isnumeric()) & (category.str.len() == length)
        for length, _ in length_codes
    ]
    for (_, code), mask in zip(length_codes, length_masks):
        data.loc[mask, 'TicketCategory'] = code
    return data

def drop_remaining_irrelevant_columns(data):
    """Remove the 'Age', 'Ticket', 'Name' and 'Cabin' columns in place.

    A column that is not present is skipped silently (``errors='ignore'``).
    Returns the same ``data`` object.
    """
    leftover_columns = ['Age', 'Ticket', 'Name', 'Cabin']
    data.drop(columns=leftover_columns, errors='ignore', inplace=True)
    return data
    
def save_predictions(data, file_name, template_path='./data/gender_submission.csv'):
    """Write predictions to ``file_name`` using a template CSV as skeleton.

    The template is loaded, its 'Survived' column is replaced with ``data``
    and the result is written without the index.

    Args:
        data: Values for the 'Survived' column. They are assigned as-is, so
            they must be compatible with the template's rows.
        file_name: Destination path of the CSV to write.
        template_path: CSV file to load as the skeleton. Defaults to
            './data/gender_submission.csv', so existing two-argument calls
            behave as before.

    Returns:
        The return value of ``DataFrame.to_csv`` (``None`` when writing to a
        path).
    """
    submission = pd.read_csv(template_path)
    submission['Survived'] = data
    return submission.to_csv(file_name, index=False)

def correlation_graph(data):
    """Draw a heatmap of the pairwise column correlations of ``data``.

    The correlation matrix comes from ``data.corr()``. The upper triangle,
    diagonal included, is masked out so each pair is shown only once. The
    heatmap is drawn with seaborn; nothing is returned.
    """
    corr = data.corr()
    # Mask for the upper triangle. The builtin ``bool`` is used because the
    # ``np.bool`` alias was deprecated in NumPy 1.20 and removed in 1.24,
    # where it raises AttributeError.
    mask = np.triu(np.ones_like(corr, dtype=bool))
    # Custom diverging colormap.
    cmap = sns.diverging_palette(290, 10, as_cmap=True)
    # Draw the heatmap with the mask and a square aspect ratio.
    sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.6, center=0,
                square=True, linewidths=.5, cbar_kws={"shrink": .9})
