In [1]:
import pandas as pd
# show the pandas version this notebook was executed with
pd.__version__

'2.2.3'

In [2]:
# location of the raw Titanic CSV (GitHub raw content, `main` branch of frandiego/cei)
url = 'https://raw.githubusercontent.com/frandiego/cei/refs/heads/main/data/titanic.csv'

In [3]:
# load the passenger table from the remote CSV and preview the first rows
df = pd.read_csv(filepath_or_buffer=url)
df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# are the column names already lowercase? -> no, they are mixed case
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [5]:
# normalise every column label to lowercase
df.columns = df.columns.map(str.lower)

In [6]:
# number of missing values per column -> age, cabin and embarked have gaps
df.isnull().sum()

passengerid      0
survived         0
pclass           0
name             0
sex              0
age            177
sibsp            0
parch            0
ticket           0
fare             0
cabin          687
embarked         2
dtype: int64

In [7]:
# same check expressed as the fraction of missing values per column (0-1 scale, not percent)
df.isnull().mean()

passengerid    0.000000
survived       0.000000
pclass         0.000000
name           0.000000
sex            0.000000
age            0.198653
sibsp          0.000000
parch          0.000000
ticket         0.000000
fare           0.000000
cabin          0.771044
embarked       0.002245
dtype: float64

In [8]:
# cabin is missing for 77% of the rows, so it is dropped;
# ticket is dropped as well because it is treated as uninformative here
df = df.drop(columns=['cabin', 'ticket'])

In [9]:
# inspect the rows whose embarked value is missing
df.loc[df['embarked'].isnull()]

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,fare,embarked
61,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,80.0,
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,80.0,


In [10]:
# both passengers are filled with 'S' (Southampton), based on their records at:
# https://www.encyclopedia-titanica.org/titanic-survivor/amelia-icard.html
# https://www.encyclopedia-titanica.org/titanic-survivor/martha-evelyn-stone.html
df = df.assign(embarked=df['embarked'].fillna(value='S'))

In [11]:
# confirm the two passengers (ids 62 and 830) now carry an embarkation port
df.loc[df['passengerid'].isin([62, 830])]

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,fare,embarked
61,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,80.0,S
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,80.0,S


In [12]:
# build a row-aligned series holding, for every passenger, the median age of
# the passengers sharing the same survived / pclass / sex / embarked values
# NOTE(review): 'survived' is one of the grouping keys; if this frame is later
# used to model survival, that leaks the target into the age feature - confirm
group_keys = ['survived', 'pclass', 'sex', 'embarked']
fill_age = df.groupby(group_keys)['age'].transform('median')
fill_age

0      25.0
1      36.0
2      23.0
3      35.0
4      25.0
       ... 
886    30.5
887    35.0
888    23.0
889    35.0
890    25.0
Name: age, Length: 891, dtype: float64

In [13]:
# fill missing ages with the per-group median held in fill_age.
# A group in which every age is missing has a NaN median, so its rows would
# stay NaN (and the later integer cast of `age` would fail); fall back to the
# overall median of the observed ages for those rows.
df['age'] = df['age'].fillna(fill_age).fillna(df['age'].median())

In [14]:
# sanity check: every column should now report zero missing values
df.isnull().sum()

passengerid    0
survived       0
pclass         0
name           0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
dtype: int64

In [15]:
# encode sex as a 0/1 indicator column: 1 where sex is 'female', 0 otherwise
# (the boolean comparison result is cast straight to integer)
df['is_female'] = df['sex'].eq('female').astype(int)
df.head()

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,fare,embarked,is_female
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S,0


In [16]:
# sex is now redundant with is_female; name is dropped as well
df = df.drop(columns=['name', 'sex'])
df.head()

Unnamed: 0,passengerid,survived,pclass,age,sibsp,parch,fare,embarked,is_female
0,1,0,3,22.0,1,0,7.25,S,0
1,2,1,1,38.0,1,0,71.2833,C,1
2,3,1,3,26.0,0,0,7.925,S,1
3,4,1,1,35.0,1,0,53.1,S,1
4,5,0,3,35.0,0,0,8.05,S,0


In [17]:
# lookup from the single-letter port codes to readable port names,
# so nobody has to remember what S, C and Q stand for
mapper_embarked = dict(S='southampton', C='cherbourg', Q='queenstown')

In [18]:
# replace the port codes with readable names
# (values that are not keys of mapper_embarked become NaN)
df = df.assign(embarked=df['embarked'].map(mapper_embarked))
df.head()

Unnamed: 0,passengerid,survived,pclass,age,sibsp,parch,fare,embarked,is_female
0,1,0,3,22.0,1,0,7.25,southampton,0
1,2,1,1,38.0,1,0,71.2833,cherbourg,1
2,3,1,3,26.0,0,0,7.925,southampton,1
3,4,1,1,35.0,1,0,53.1,southampton,1
4,5,0,3,35.0,0,0,8.05,southampton,0


In [19]:
# one-hot encode embarked: one boolean indicator column per port
df_embarked = pd.get_dummies(data=df['embarked'], prefix='embarked')
df_embarked.head()

Unnamed: 0,embarked_cherbourg,embarked_queenstown,embarked_southampton
0,False,False,True
1,True,False,False
2,False,False,True
3,False,False,True
4,False,False,True


In [20]:
# cast every boolean indicator column to integer (True -> 1, False -> 0)
df_embarked = df_embarked.astype({column: int for column in df_embarked.columns})
df_embarked.head()

Unnamed: 0,embarked_cherbourg,embarked_queenstown,embarked_southampton
0,0,0,1
1,1,0,0
2,0,0,1
3,0,0,1
4,0,0,1


In [23]:
# drop one indicator because it is redundant: a passenger who embarked in
# neither cherbourg nor queenstown must have embarked in southampton
df_embarked = df_embarked.drop(columns=['embarked_southampton'])
df_embarked.head()

Unnamed: 0,embarked_cherbourg,embarked_queenstown
0,0,0
1,1,0
2,0,0
3,0,0
4,0,0


In [27]:
# join df and df_embarked side by side (column-wise, aligned on the row index)
df = pd.concat(objs=[df, df_embarked], axis='columns')
df.head()

Unnamed: 0,passengerid,survived,pclass,age,sibsp,parch,fare,embarked,is_female,embarked_cherbourg,embarked_queenstown
0,1,0,3,22.0,1,0,7.25,southampton,0,0,0
1,2,1,1,38.0,1,0,71.2833,cherbourg,1,1,0
2,3,1,3,26.0,0,0,7.925,southampton,1,0,0
3,4,1,1,35.0,1,0,53.1,southampton,1,0,0
4,5,0,3,35.0,0,0,8.05,southampton,0,0,0


In [28]:
# the text column embarked is now redundant with the indicator columns
df = df.drop(columns=['embarked'])
df.head()

Unnamed: 0,passengerid,survived,pclass,age,sibsp,parch,fare,is_female,embarked_cherbourg,embarked_queenstown
0,1,0,3,22.0,1,0,7.25,0,0,0
1,2,1,1,38.0,1,0,71.2833,1,1,0
2,3,1,3,26.0,0,0,7.925,1,0,0
3,4,1,1,35.0,1,0,53.1,1,0,0
4,5,0,3,35.0,0,0,8.05,0,0,0


In [29]:
# store age as an integer; the cast discards any fractional part
df = df.astype({'age': int})
df.head()

Unnamed: 0,passengerid,survived,pclass,age,sibsp,parch,fare,is_female,embarked_cherbourg,embarked_queenstown
0,1,0,3,22,1,0,7.25,0,0,0
1,2,1,1,38,1,0,71.2833,1,1,0
2,3,1,3,26,0,0,7.925,1,0,0
3,4,1,1,35,1,0,53.1,1,0,0
4,5,0,3,35,0,0,8.05,0,0,0
