In [55]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Pandas Exercise


In [56]:
def df_info(df: pd.DataFrame) -> None:
    return df.head(n=20).style

### Titanic Dataset

-   PassengerId is the unique id of the row and it doesn't have any effect on target
-   Survived is the target variable we are trying to predict (0 or 1):
    -   1 = Survived
    -   0 = Not Survived
-   Pclass (Passenger Class) is the socio-economic status of the passenger:
    -   1 = Upper Class
    -   2 = Middle Class
    -   3 = Lower Class
-   Name
-   Sex
-   Age
-   SibSp is the total number of the passengers' siblings and spouse
-   Parch is the total number of the passengers' parents and children
-   Ticket is the ticket number of the passenger
-   Fare is the passenger fare
-   Cabin is the cabin number of the passenger
-   Embarked is port of embarkation and it is a categorical feature:
    -   C = Cherbourg
    -   Q = Queenstown
    -   S = Southampton

\*Embarked: sich einschiffen


In [57]:
cols = [
    "Survived",
    "Pclass",
    "Sex",
    "Age",
    "SibSp",
    "Parch",
    "Fare",
    "Cabin",
    "Embarked",
]

df = pd.read_csv("../data/titanic/dataset.csv")
print(df.columns)

df = pd.DataFrame(
    df[cols],
    index=df["PassengerId"],
)
df.index.name = "ID"

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


In [58]:
df_info(df)

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,1.0,1.0,female,38.0,1.0,0.0,71.2833,C85,C
2,1.0,3.0,female,26.0,0.0,0.0,7.925,,S
3,1.0,1.0,female,35.0,1.0,0.0,53.1,C123,S
4,0.0,3.0,male,35.0,0.0,0.0,8.05,,S
5,0.0,3.0,male,,0.0,0.0,8.4583,,Q
6,0.0,1.0,male,54.0,0.0,0.0,51.8625,E46,S
7,0.0,3.0,male,2.0,3.0,1.0,21.075,,S
8,1.0,3.0,female,27.0,0.0,2.0,11.1333,,S
9,1.0,2.0,female,14.0,1.0,0.0,30.0708,,C
10,1.0,3.0,female,4.0,1.0,1.0,16.7,G6,S


# Exercise 1:

-   Replace nan values at age, pclass and cabin column by -1 for numeric and "None" for str values
-   Change the dtype to int8 for age and pclass


In [59]:
df["Age"] = df["Age"].fillna(
    value=-1,
)

In [60]:
df["Pclass"] = df["Pclass"].fillna(
    value=-1,
)

In [61]:
df["Cabin"] = df["Cabin"].fillna(
    value="None",
)

In [62]:
df[["Age", "Pclass"]] = df[["Age", "Pclass"]].astype(np.int8)

In [63]:
df_info(df)

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,1.0,1,female,38,1.0,0.0,71.2833,C85,C
2,1.0,3,female,26,0.0,0.0,7.925,,S
3,1.0,1,female,35,1.0,0.0,53.1,C123,S
4,0.0,3,male,35,0.0,0.0,8.05,,S
5,0.0,3,male,-1,0.0,0.0,8.4583,,Q
6,0.0,1,male,54,0.0,0.0,51.8625,E46,S
7,0.0,3,male,2,3.0,1.0,21.075,,S
8,1.0,3,female,27,0.0,2.0,11.1333,,S
9,1.0,2,female,14,1.0,0.0,30.0708,,C
10,1.0,3,female,4,1.0,1.0,16.7,G6,S


In [64]:
df_info(df[df.isna().any(axis=1)])

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
61,1.0,1,female,38,0.0,0.0,80.0,B28,
829,1.0,1,female,62,0.0,0.0,80.0,B28,
891,,-1,,-1,,,,,


# Exercise 2:

-   Drop all rows that contains a nan value


In [65]:
df = df.dropna(axis=0)

df_info(df[df.isna().any(axis=1)])

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1


In [66]:
df_info(df)

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,1.0,1,female,38,1.0,0.0,71.2833,C85,C
2,1.0,3,female,26,0.0,0.0,7.925,,S
3,1.0,1,female,35,1.0,0.0,53.1,C123,S
4,0.0,3,male,35,0.0,0.0,8.05,,S
5,0.0,3,male,-1,0.0,0.0,8.4583,,Q
6,0.0,1,male,54,0.0,0.0,51.8625,E46,S
7,0.0,3,male,2,3.0,1.0,21.075,,S
8,1.0,3,female,27,0.0,2.0,11.1333,,S
9,1.0,2,female,14,1.0,0.0,30.0708,,C
10,1.0,3,female,4,1.0,1.0,16.7,G6,S


# Exercise 3

-   Compute the min, max, median and mean of the age for the groups:
    -   Survived and Male
    -   Survived and Female
    -   Not survived and Male
    -   Not survived and Female


In [67]:
grouped = df.groupby(["Survived", "Sex"])

In [68]:
grouped.Age.agg(["min", "max", "median", "mean"])

Unnamed: 0_level_0,Unnamed: 1_level_0,min,max,median,mean
Survived,Sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.0,female,-1,57,20.0,19.567901
0.0,male,-1,74,24.0,24.079229
1.0,female,-1,63,24.0,24.004329
1.0,male,-1,80,26.0,23.091743


# Exercise 4

-   Compute the most likely age to survive (except the -1 fillvalue)


In [69]:
survived = df["Survived"] == 1
has_age = df["Age"] > 0

In [70]:
print(np.count_nonzero(survived))

340


In [71]:
print(np.count_nonzero(has_age))

704


In [72]:
idxs = survived & has_age
print(np.count_nonzero(idxs))

281


In [73]:
df_sliced = df[idxs]
print(df_sliced.head(n=10))

    Survived  Pclass     Sex  Age  SibSp  Parch     Fare Cabin Embarked
ID                                                                     
1        1.0       1  female   38    1.0    0.0  71.2833   C85        C
2        1.0       3  female   26    0.0    0.0   7.9250  None        S
3        1.0       1  female   35    1.0    0.0  53.1000  C123        S
8        1.0       3  female   27    0.0    2.0  11.1333  None        S
9        1.0       2  female   14    1.0    0.0  30.0708  None        C
10       1.0       3  female    4    1.0    1.0  16.7000    G6        S
11       1.0       1  female   58    0.0    0.0  26.5500  C103        S
15       1.0       2  female   55    0.0    0.0  16.0000  None        S
21       1.0       2    male   34    0.0    0.0  13.0000   D56        S
22       1.0       3  female   15    0.0    0.0   8.0292  None        Q


In [74]:
survived_age_counts = df_sliced.Age.value_counts()
print(f"Result: {survived_age_counts}")

Result: Age
24    15
35    11
27    11
36    11
22    11
30    10
32    10
18     9
19     9
31     8
29     8
28     7
4      7
42     6
25     6
26     6
16     6
34     6
17     6
48     6
33     6
40     6
45     5
23     5
50     5
21     5
3      5
39     5
1      5
38     4
5      4
49     4
15     4
44     3
20     3
54     3
14     3
2      3
52     3
58     3
9      2
51     2
6      2
56     2
13     2
60     2
63     2
41     2
8      2
55     1
7      1
62     1
53     1
80     1
37     1
12     1
43     1
11     1
47     1
Name: count, dtype: int64


In [75]:
df_sliced = df[(df["Survived"] == 1) & (df["Age"] > 0)]
survived_age_counts = df_sliced.Age.value_counts()

print(f"Result: {survived_age_counts.iloc[0]}")

Result: 15
