# EDA

## Import data

In [1]:
import pandas as pd
import plotly.express as px
import seaborn as sns

# Load the training data; the first CSV column is used as the row index.
titanicdf = pd.read_csv("../../data/train_v1.csv", index_col=0)
# Fixed seed so the preview shows the same ten rows on every Restart & Run All.
titanicdf.sample(10, random_state=42)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
331,1,3,"McCoy, Miss. Agnes",female,,2,0,367226,23.25,,Q
458,1,1,"Kenyon, Mrs. Frederick R (Marion)",female,,1,0,17464,51.8625,D21,S
462,0,3,"Morley, Mr. William",male,34.0,0,0,364506,8.05,,S
234,1,3,"Asplund, Miss. Lillian Gertrud",female,5.0,4,2,347077,31.3875,,S
553,0,3,"O'Brien, Mr. Timothy",male,,0,0,330979,7.8292,,Q
212,1,2,"Cameron, Miss. Clear Annie",female,35.0,0,0,F.C.C. 13528,21.0,,S
178,0,1,"Isham, Miss. Ann Elizabeth",female,50.0,0,0,PC 17595,28.7125,C49,C
509,0,3,"Olsen, Mr. Henry Margido",male,28.0,0,0,C 4001,22.525,,S
435,0,1,"Silvey, Mr. William Baird",male,50.0,1,0,13507,55.9,E44,S
318,0,2,"Moraweck, Dr. Ernest",male,54.0,0,0,29011,14.0,,S


## Search for NaNs

In [2]:
# Summarise dtypes and non-null counts per column to spot columns with missing values.
titanicdf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


In [3]:
# Count missing values per column and give both axes explicit names so the
# chart is readable on its own (a bare Series would be plotted with the generic
# "index"/"value" labels and a meaningless legend entry).
missing_counts = (
    titanicdf.isna()
    .sum()
    .rename_axis("column")
    .reset_index(name="missing_values")
)
px.bar(
    missing_counts,
    x="column",
    y="missing_values",
    title="Missing values per column",
)

In [4]:
# Derive the share of missing "Cabin" entries from the data itself instead of
# hard-coding the counts, so the message stays correct if the dataset changes.
cabin_missing_pct = titanicdf["Cabin"].isna().mean() * 100
print(f"Column \"Cabin\" has {cabin_missing_pct:0.1f}% missing values and will therefore be dropped for analysis.")
print("\"Age\" and \"Embarked\" need Imputation.")

Column "Cabin" has 77.1% missing values and will therefore be dropped for analysis.
"Age" and "Embarked" need Imputation.


In [5]:
# Tally the Survived column once, then read both outcomes by label (0 and 1).
outcome_counts = titanicdf["Survived"].value_counts()
died, surv = outcome_counts.loc[0], outcome_counts.loc[1]
total = outcome_counts.sum()

print(f"Out of {total} passenger {died} died and {surv} survived.")

Out of 891 passenger 549 died and 342 survived.


## Search for correlations

### Survival by Sex (Survival rate for females was higher)

In [6]:
# Passenger counts per sex, split by colour on the Survived column.
sex_survival_fig = px.histogram(data_frame=titanicdf, x="Sex", color="Survived")
sex_survival_fig

### Survival by Class (Pclasses 1 and 3 show correlations to survivability)

In [7]:
# Passenger counts per ticket class, split by colour on the Survived column.
# update_layout returns the figure, so the bar gap can be set in the same chain.
fig = px.histogram(
    data_frame=titanicdf,
    x="Pclass",
    color="Survived",
).update_layout(bargap=0.2)
fig.show()

### Survival by Age (also shows correlations with survival)

In [8]:
# Age distribution (nbins=16), split by colour on the Survived column.
age_survival_fig = px.histogram(
    data_frame=titanicdf,
    x="Age",
    color="Survived",
    nbins=16,
)
age_survival_fig

### Survival by Embarked (Embarkation at "S" shows a higher chance of dying)

In [9]:
# Passenger counts per port of embarkation, split by colour on the Survived column.
embarked_survival_fig = px.histogram(data_frame=titanicdf, x="Embarked", color="Survived")
embarked_survival_fig

### Survival by Sibling/Spouse (Without a SibSp your chance of dying seems higher)

In [10]:
# Passenger counts per SibSp value (nbins=16), split by colour on the Survived column.
sibsp_survival_fig = px.histogram(
    data_frame=titanicdf,
    x="SibSp",
    color="Survived",
    nbins=16,
)
sibsp_survival_fig

### Survival by Parent/Child (Same as SibSp)

In [11]:
# Passenger counts per Parch value, split by colour on the Survived column.
parch_survival_fig = px.histogram(data_frame=titanicdf, x="Parch", color="Survived")
parch_survival_fig

### Survival by Fare (If your Ticket was cheap you had a higher chance of dying)

In [12]:
# Fare distribution, split by colour on the Survived column.
fare_survival_fig = px.histogram(data_frame=titanicdf, x="Fare", color="Survived")
fare_survival_fig

### Note: Name and Ticket will be excluded from consideration, as they cannot easily be treated as ordinal or categorical data

## Further milestone tasks:

### 1st class survivors / Total number of 1st class passengers

In [13]:
# Share of first-class passengers (Pclass == 1) who survived.
class_sizes = titanicdf["Pclass"].value_counts()
first_class_pass_count = class_sizes.loc[1]

is_first_class_survivor = (titanicdf["Survived"] == 1) & (titanicdf["Pclass"] == 1)
first_class_survivors = len(titanicdf[is_first_class_survivor])

fcs = first_class_survivors / first_class_pass_count

print(f"{round(fcs * 100, 1):0.1f}% of first class passengers survived")

63.0% of first class passengers survived


### Average age: survivors vs. drowned

In [14]:
# Mean age per outcome. Series.mean skips NaN by default, so passengers with an
# unknown age do not contribute to either average.
survived_mask = titanicdf["Survived"] == 1
drowned_mask = titanicdf["Survived"] == 0

avg_age_sur = titanicdf.loc[survived_mask, "Age"].mean()
avg_age_dro = titanicdf.loc[drowned_mask, "Age"].mean()

print(f"Average age of survivors: {avg_age_sur:0.1f} years")
print(f"Average age of drowned: {avg_age_dro:0.1f} years")

Average age of survivors: 28.3 years
Average age of drowned: 30.6 years


### Create a DataFrame counting survivors vs. deaths, separated by Pclass and sex

In [15]:
# Keep only the three columns needed for the survival breakdown by sex and class.
breakdown_columns = ["Sex", "Pclass", "Survived"]
titanic_ssp = titanicdf.loc[:, breakdown_columns]

In [16]:
# Count passengers per (Sex, Pclass) group, separately for each outcome.
group_keys = ["Sex", "Pclass"]
tab1 = titanic_ssp[titanic_ssp["Survived"] == 1].groupby(group_keys).count()
tab2 = (
    titanic_ssp[titanic_ssp["Survived"] == 0]
    .groupby(group_keys)
    .count()
    .rename(columns={"Survived": "Drowned/Died"})
)
# A group that occurs in only one of the two outcomes would come out of the
# column-wise concat as NaN (turning the counts into floats); report it as an
# integer 0 instead and keep the rows in a stable, sorted order.
pd.concat([tab1, tab2], axis=1).fillna(0).astype(int).sort_index()


Unnamed: 0_level_0,Unnamed: 1_level_0,Survived,Drowned/Died
Sex,Pclass,Unnamed: 2_level_1,Unnamed: 3_level_1
female,1,91,3
female,2,70,6
female,3,72,72
male,1,45,77
male,2,17,91
male,3,47,300
