In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [None]:
#Load the training data and drop the identifier columns that are not analysed below
df = pd.read_csv("ds_train.csv")
df = df.drop(columns=["PassengerId", "Ticket"])
#Add relatives together in a column to get the family size
#(parents/children + siblings/spouses); the new column goes in at position 5
df.insert(5, "FamSize", df["Parch"] + df["SibSp"], allow_duplicates=True)
df = df.rename(columns={"Parch": "ParCh"})
#Encode the survival labels as 1/0. infer_objects() keeps the column numeric on
#pandas versions where replace() no longer downcasts an object result silently
df["Survived"] = df["Survived"].replace({"yes": 1, "no": 0}).infer_objects()
#Forward-fill missing ages. The result is assigned back instead of calling
#fillna(inplace=True) on df["Age"]: that chained in-place call is deprecated and
#leaves df unchanged when pandas Copy-on-Write is active
df["Age"] = df["Age"].ffill()

In [None]:
#Age and Embarked have missing values, but we can fill them. Cabin is missing too often to extrapolate from the existing data
#The filled columns are assigned back instead of using fillna(inplace=True) on a
#column selection, which is deprecated and does not update df under pandas Copy-on-Write
#NOTE(review): the load cell already forward-fills Age, so the mean only reaches
#values the forward fill could not (missing ages at the very start of the file)
df["Age"] = df["Age"].fillna(df["Age"].mean())
df["Embarked"] = df["Embarked"].fillna(df["Embarked"].mode()[0])

In [None]:
#Create a function to extract the deck code from the cabin number, X for NaN
def deck(cabin_no):
    """Return the deck code for a cabin value.

    The deck code is the first run of ASCII letters found in the string form
    of ``cabin_no`` (for example ``"C85"`` gives ``"C"``).

    Parameters
    ----------
    cabin_no : object
        Raw cabin value; may be missing (NaN/None).

    Returns
    -------
    str
        The leading letter run, or ``"X"`` when the value is missing or
        contains no letters at all.
    """
    if pd.isna(cabin_no):
        return "X"
    found = re.search(r"[A-Za-z]+", str(cabin_no))
    # A non-missing value without any letter carries no deck information;
    # report it as unknown instead of failing on a None match object.
    return found.group(0) if found else "X"

In [None]:
#Derive each passenger's deck from the cabin value, then discard the raw cabin column
#Deck T is replaced as it was storage. Judging by fare, class and frequency, it is merged into deck B
df["Deck"] = df["Cabin"].map(deck).replace("T", "B")
df = df.drop(columns="Cabin")
#No missing values should remain in the dataset at this point

In [None]:
#Function to get name titles 
def title_gen(namen):
    """Pull the title out of a passenger name.

    Searches ``namen`` for the first comma that is followed by letters and
    spaces (optionally ended by a period) and returns that text with every
    ``", "`` removed, e.g. ``"Braund, Mr. Owen Harris"`` gives ``"Mr."``.

    Raises ``AttributeError`` when the name contains no such pattern,
    because the search then yields ``None``.
    """
    found = re.search(r"(,[A-Za-z ]+\.?)", namen)
    matched_text = found.group(0)
    return matched_text.replace(", ", "")

#Store the title extracted from every passenger name in a new column
df["Title"] = df["Name"].map(title_gen)

In [None]:
#Function to group the already extracted titles into broader categories based on sex and social rank
def title_mod(i):
    """Map a passenger record to one of four title categories.

    ``i`` is indexed by ``"Title"`` and, where needed, ``"Sex"``.

    * Titles in the nobility/officer list give ``"Noblewoman"`` for
      females and ``"Nobleman"`` otherwise.
    * Titles in the female list give ``"Mrs"``, but only when the
      passenger is female.
    * Everything else gives ``"Mr"``.
    """
    noble_titles = ('Don.', 'Major.', 'Lady.', 'Sir.', 'Col.', 'Capt.',
                    'the Countess.', 'Jonkheer.')
    female_titles = ('Dr.', 'Mrs.', 'Miss.', 'Mme.', 'Ms.', 'Mlle.')

    passenger_title = i["Title"]
    # "Sex" is looked up only on the branches that need it.
    if passenger_title in noble_titles:
        return "Noblewoman" if i["Sex"] == "female" else "Nobleman"
    if passenger_title in female_titles and i["Sex"] == "female":
        return "Mrs"
    return "Mr"

#Replace each raw title with its category, row by row
df["Title"] = df.apply(title_mod, axis=1)
#Every passenger now has a title column, so the name column can be dropped
df = df.drop(columns="Name")

In [None]:
#Survival counts and survival rate for children (Age < 18), one row per passenger class
class_labels = ["1", "2", "3"]
child_flags = [
    df.loc[(df.Age < 18) & (df.Pclass == int(label)), "Survived"]
    for label in class_labels
]
n_survived = np.array([flags.sum() for flags in child_flags], dtype=float)
n_total = np.array([len(flags) for flags in child_flags], dtype=float)
child = pd.DataFrame(
    {
        "Survived": n_survived,
        "Total": n_total,
        "Dead": n_total - n_survived,
        "Surv.Rate": n_survived / n_total,
    },
    index=pd.Index(class_labels, name="Class"),
)

In [None]:
#Survival counts and survival rate for adult men (Age >= 18), one row per passenger class
class_labels = ["1", "2", "3"]
male_flags = [
    df.loc[(df.Age >= 18) & (df.Sex == "male") & (df.Pclass == int(label)), "Survived"]
    for label in class_labels
]
n_survived = np.array([flags.sum() for flags in male_flags], dtype=float)
n_total = np.array([len(flags) for flags in male_flags], dtype=float)
male_vict = pd.DataFrame(
    {
        "Survived": n_survived,
        "Total": n_total,
        "Dead": n_total - n_survived,
        "Surv.Rate": n_survived / n_total,
    },
    index=pd.Index(class_labels, name="Class"),
)

In [None]:
#Survival counts and survival rate for adult women (Age >= 18), one row per passenger class
class_labels = ["1", "2", "3"]
female_flags = [
    df.loc[(df.Age >= 18) & (df.Sex == "female") & (df.Pclass == int(label)), "Survived"]
    for label in class_labels
]
n_survived = np.array([flags.sum() for flags in female_flags], dtype=float)
n_total = np.array([len(flags) for flags in female_flags], dtype=float)
fem_vict = pd.DataFrame(
    {
        "Survived": n_survived,
        "Total": n_total,
        "Dead": n_total - n_survived,
        "Surv.Rate": n_survived / n_total,
    },
    index=pd.Index(class_labels, name="Class"),
)

In [None]:
#Print each survival table under its own heading. Passing all three frames to a
#single print() joins them with one space, so the header of each table ends up
#on the last row of the previous one
for heading, table in (
    ("Adult men", male_vict),
    ("Adult women", fem_vict),
    ("Children (under 18)", child),
):
    print(heading, table, sep="\n", end="\n\n")
#Almost all women survived in 1st and 2nd class, and about half of them in 3rd class
#The majority of children survived in 1st and 2nd class, and about 40% in 3rd class

In [None]:
#Mean survival per family size (FamSize counts relatives aboard, so 0 means travelling alone)
fam_surv = df.loc[:, ["FamSize", "Survived"]]
fam_surv.groupby("FamSize").mean()
#Most of the survivors seem to have a family size of 1-3 persons
#Single passengers and larger families have higher mortality rates

In [None]:
#Mean survival per deck
deck_surv = df.loc[:, ["Deck", "Survived"]]
print(deck_surv.groupby("Deck").mean())
#Passengers on decks B, D and E had a higher survival rate, followed by decks C and F
#Decks G and A had ~50% survival rate, while passengers of unknown decks were more likely to perish