In [1]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sys
import warnings
# Suppress all warnings for the rest of the notebook to keep cell output clean.
# NOTE(review): this blanket filter also hides library deprecation warnings
# (e.g. from pandas); consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

#### Load and Basic Info

In [2]:
# Load seaborn's bundled 'titanic' example dataset into the working DataFrame
# that every later cell reads and modifies.
# NOTE(review): load_dataset presumably needs network access (or a local
# cache) on first use - confirm for offline runs.
df = sns.load_dataset('titanic')

In [3]:
# Display the frame; the notebook renders a truncated view (first and last rows).
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [4]:
# Show the first 5 rows (head() defaults to 5).
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [5]:
# Show the last 5 rows (tail() defaults to 5).
df.tail()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
886,0,2,male,27.0,0,0,13.0,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.45,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0,C,First,man,True,C,Cherbourg,yes,True
890,0,3,male,32.0,0,0,7.75,Q,Third,man,True,,Queenstown,no,True


In [6]:
# Dataset dimensions as a (rows, columns) tuple.
df.shape

(891, 15)

In [7]:
# List the column labels.
df.columns

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone'],
      dtype='object')

In [8]:
# Data type of each column; note that 'class' and 'deck' are categorical.
df.dtypes

survived          int64
pclass            int64
sex              object
age             float64
sibsp             int64
parch             int64
fare            float64
embarked         object
class          category
who              object
adult_male         bool
deck           category
embark_town      object
alive            object
alone              bool
dtype: object

In [9]:
# Summary statistics; with default arguments only the numeric columns are
# summarised, and 'age' has a lower count because of missing values.
df.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [10]:
# Per-column non-null counts, dtypes and memory usage (printed, not returned).
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [11]:
# Number of missing values in each column.
df.isna().sum()


survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [12]:

# Distinct values present in the 'sex' column.
df['sex'].unique()

array(['male', 'female'], dtype=object)

#### Selection and Filtering

In [13]:
# Select a single column; the result is a Series.
df['age']

0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
886    27.0
887    19.0
888     NaN
889    26.0
890    32.0
Name: age, Length: 891, dtype: float64

In [14]:
# Select several columns with a list of labels; the result is a DataFrame.
df[['age', 'sex', 'fare']]

Unnamed: 0,age,sex,fare
0,22.0,male,7.2500
1,38.0,female,71.2833
2,26.0,female,7.9250
3,35.0,female,53.1000
4,35.0,male,8.0500
...,...,...,...
886,27.0,male,13.0000
887,19.0,female,30.0000
888,,female,23.4500
889,26.0,male,30.0000


In [15]:
# Keep only the male passengers.
df.loc[df['sex'].eq('male')]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
5,0,3,male,,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
7,0,3,male,2.0,3,1,21.0750,S,Third,child,False,,Southampton,no,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
883,0,2,male,28.0,0,0,10.5000,S,Second,man,True,,Southampton,no,True
884,0,3,male,25.0,0,0,7.0500,S,Third,man,True,,Southampton,no,True
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [16]:
# Keep only the female passengers.
df.loc[df['sex'].eq('female')]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
8,1,3,female,27.0,0,2,11.1333,S,Third,woman,False,,Southampton,yes,False
9,1,2,female,14.0,1,0,30.0708,C,Second,child,False,,Cherbourg,yes,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
880,1,2,female,25.0,0,1,26.0000,S,Second,woman,False,,Southampton,yes,False
882,0,3,female,22.0,0,0,10.5167,S,Third,woman,False,,Southampton,no,True
885,0,3,female,39.0,0,5,29.1250,Q,Third,woman,False,,Queenstown,no,False
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True


In [17]:
# Passengers strictly older than 60 (rows with a missing age never match).
df.loc[df['age'].gt(60)]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
33,0,2,male,66.0,0,0,10.5,S,Second,man,True,,Southampton,no,True
54,0,1,male,65.0,0,1,61.9792,C,First,man,True,B,Cherbourg,no,False
96,0,1,male,71.0,0,0,34.6542,C,First,man,True,A,Cherbourg,no,True
116,0,3,male,70.5,0,0,7.75,Q,Third,man,True,,Queenstown,no,True
170,0,1,male,61.0,0,0,33.5,S,First,man,True,B,Southampton,no,True
252,0,1,male,62.0,0,0,26.55,S,First,man,True,C,Southampton,no,True
275,1,1,female,63.0,1,0,77.9583,S,First,woman,False,D,Southampton,yes,False
280,0,3,male,65.0,0,0,7.75,Q,Third,man,True,,Queenstown,no,True
326,0,3,male,61.0,0,0,6.2375,S,Third,man,True,,Southampton,no,True
438,0,1,male,64.0,1,4,263.0,S,First,man,True,C,Southampton,no,False


In [18]:
# Passengers who survived (survived flag equals 1).
df.loc[df['survived'].eq(1)]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
8,1,3,female,27.0,0,2,11.1333,S,Third,woman,False,,Southampton,yes,False
9,1,2,female,14.0,1,0,30.0708,C,Second,child,False,,Cherbourg,yes,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
875,1,3,female,15.0,0,0,7.2250,C,Third,child,False,,Cherbourg,yes,True
879,1,1,female,56.0,0,1,83.1583,C,First,woman,False,C,Cherbourg,yes,False
880,1,2,female,25.0,0,1,26.0000,S,Second,woman,False,,Southampton,yes,False
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True


In [19]:
# Passengers who did not survive (survived flag equals 0).
df.loc[df['survived'].eq(0)]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
5,0,3,male,,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
7,0,3,male,2.0,3,1,21.0750,S,Third,child,False,,Southampton,no,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
884,0,3,male,25.0,0,0,7.0500,S,Third,man,True,,Southampton,no,True
885,0,3,female,39.0,0,5,29.1250,Q,Third,woman,False,,Queenstown,no,False
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False


In [20]:
# Rows where a deck value is recorded.
df.loc[df['deck'].notna()]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
10,1,3,female,4.0,1,1,16.7000,S,Third,child,False,G,Southampton,yes,False
11,1,1,female,58.0,0,0,26.5500,S,First,woman,False,C,Southampton,yes,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
871,1,1,female,47.0,1,1,52.5542,S,First,woman,False,D,Southampton,yes,False
872,0,1,male,33.0,0,0,5.0000,S,First,man,True,B,Southampton,no,True
879,1,1,female,56.0,0,1,83.1583,C,First,woman,False,C,Cherbourg,yes,False
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True


In [21]:
# Quick look at the first two rows.
df.head(2)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False


In [22]:
# Rows where age is missing.
df.loc[df['age'].isna()]


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
5,0,3,male,,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True
17,1,2,male,,0,0,13.0000,S,Second,man,True,,Southampton,yes,True
19,1,3,female,,0,0,7.2250,C,Third,woman,False,,Cherbourg,yes,True
26,0,3,male,,0,0,7.2250,C,Third,man,True,,Cherbourg,no,True
28,1,3,female,,0,0,7.8792,Q,Third,woman,False,,Queenstown,yes,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
859,0,3,male,,0,0,7.2292,C,Third,man,True,,Cherbourg,no,True
863,0,3,female,,8,2,69.5500,S,Third,woman,False,,Southampton,no,False
868,0,3,male,,0,0,9.5000,S,Third,man,True,,Southampton,no,True
878,0,3,male,,0,0,7.8958,S,Third,man,True,,Southampton,no,True


#### Sorting & Indexing

In [23]:
# Sort ascending by age; rows with a missing age appear at the end.
# The sorted copy is only displayed - df itself keeps its original order.
df.sort_values(by='age')

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
803,1,3,male,0.42,0,1,8.5167,C,Third,child,False,,Cherbourg,yes,False
755,1,2,male,0.67,1,1,14.5000,S,Second,child,False,,Southampton,yes,False
644,1,3,female,0.75,2,1,19.2583,C,Third,child,False,,Cherbourg,yes,False
469,1,3,female,0.75,2,1,19.2583,C,Third,child,False,,Cherbourg,yes,False
78,1,2,male,0.83,0,2,29.0000,S,Second,child,False,,Southampton,yes,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
859,0,3,male,,0,0,7.2292,C,Third,man,True,,Cherbourg,no,True
863,0,3,female,,8,2,69.5500,S,Third,woman,False,,Southampton,no,False
868,0,3,male,,0,0,9.5000,S,Third,man,True,,Southampton,no,True
878,0,3,male,,0,0,7.8958,S,Third,man,True,,Southampton,no,True


In [24]:
# Sort by fare from highest to lowest (displayed only; df is not reordered).
df.sort_values(by='fare', ascending=False)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
258,1,1,female,35.0,0,0,512.3292,C,First,woman,False,,Cherbourg,yes,True
737,1,1,male,35.0,0,0,512.3292,C,First,man,True,B,Cherbourg,yes,True
679,1,1,male,36.0,0,1,512.3292,C,First,man,True,B,Cherbourg,yes,False
88,1,1,female,23.0,3,2,263.0000,S,First,woman,False,C,Southampton,yes,False
27,0,1,male,19.0,3,2,263.0000,S,First,man,True,C,Southampton,no,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
633,0,1,male,,0,0,0.0000,S,First,man,True,,Southampton,no,True
413,0,2,male,,0,0,0.0000,S,Second,man,True,,Southampton,no,True
822,0,1,male,38.0,0,0,0.0000,S,First,man,True,,Southampton,no,True
732,0,2,male,,0,0,0.0000,S,Second,man,True,,Southampton,no,True


In [25]:
# Rebuild a default integer index, discarding the old one (drop=True).
# The result is displayed but not assigned, so df is unchanged.
df.reset_index(drop=True)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [26]:
# Use the 'embarked' column as the row index.
# The result is displayed but not assigned, so df keeps its original index.
df.set_index('embarked')

Unnamed: 0_level_0,survived,pclass,sex,age,sibsp,parch,fare,class,who,adult_male,deck,embark_town,alive,alone
embarked,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
S,0,3,male,22.0,1,0,7.2500,Third,man,True,,Southampton,no,False
C,1,1,female,38.0,1,0,71.2833,First,woman,False,C,Cherbourg,yes,False
S,1,3,female,26.0,0,0,7.9250,Third,woman,False,,Southampton,yes,True
S,1,1,female,35.0,1,0,53.1000,First,woman,False,C,Southampton,yes,False
S,0,3,male,35.0,0,0,8.0500,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
S,0,2,male,27.0,0,0,13.0000,Second,man,True,,Southampton,no,True
S,1,1,female,19.0,0,0,30.0000,First,woman,False,B,Southampton,yes,True
S,0,3,female,,1,2,23.4500,Third,woman,False,,Southampton,no,False
C,1,1,male,26.0,0,0,30.0000,First,man,True,C,Cherbourg,yes,True


In [27]:
# Sort by passenger class first, then by age within each class
# (displayed only; df is not reordered).
df.sort_values(by=['class', 'age'])

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
305,1,1,male,0.92,1,2,151.5500,S,First,child,False,C,Southampton,yes,False
297,0,1,female,2.00,1,2,151.5500,S,First,child,False,C,Southampton,no,False
445,1,1,male,4.00,0,2,81.8583,S,First,child,False,A,Southampton,yes,False
802,1,1,male,11.00,1,2,120.0000,S,First,child,False,B,Southampton,yes,False
435,1,1,female,14.00,1,2,120.0000,S,First,child,False,B,Southampton,yes,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
859,0,3,male,,0,0,7.2292,C,Third,man,True,,Cherbourg,no,True
863,0,3,female,,8,2,69.5500,S,Third,woman,False,,Southampton,no,False
868,0,3,male,,0,0,9.5000,S,Third,man,True,,Southampton,no,True
878,0,3,male,,0,0,7.8958,S,Third,man,True,,Southampton,no,True


In [28]:
# Positional slice: rows at positions 0 through 4 (the end position 5 is excluded).
df.iloc[0:5]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [29]:
# Label-based lookup of a single value: index label 0, column 'age'.
df.loc[0, 'age']

22.0

In [30]:
# Every 5th row by position, starting from the first row.
df.iloc[::5]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
5,0,3,male,,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True
10,1,3,female,4.0,1,1,16.7000,S,Third,child,False,G,Southampton,yes,False
15,1,2,female,55.0,0,0,16.0000,S,Second,woman,False,,Southampton,yes,True
20,0,2,male,35.0,0,0,26.0000,S,Second,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
870,0,3,male,26.0,0,0,7.8958,S,Third,man,True,,Southampton,no,True
875,1,3,female,15.0,0,0,7.2250,C,Third,child,False,,Cherbourg,yes,True
880,1,2,female,25.0,0,1,26.0000,S,Second,woman,False,,Southampton,yes,False
885,0,3,female,39.0,0,5,29.1250,Q,Third,woman,False,,Queenstown,no,False


In [31]:
# The 10 passengers with the smallest age values.
df.nsmallest(10, 'age')

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
803,1,3,male,0.42,0,1,8.5167,C,Third,child,False,,Cherbourg,yes,False
755,1,2,male,0.67,1,1,14.5,S,Second,child,False,,Southampton,yes,False
469,1,3,female,0.75,2,1,19.2583,C,Third,child,False,,Cherbourg,yes,False
644,1,3,female,0.75,2,1,19.2583,C,Third,child,False,,Cherbourg,yes,False
78,1,2,male,0.83,0,2,29.0,S,Second,child,False,,Southampton,yes,False
831,1,2,male,0.83,1,1,18.75,S,Second,child,False,,Southampton,yes,False
305,1,1,male,0.92,1,2,151.55,S,First,child,False,C,Southampton,yes,False
164,0,3,male,1.0,4,1,39.6875,S,Third,child,False,,Southampton,no,False
172,1,3,female,1.0,1,1,11.1333,S,Third,child,False,,Southampton,yes,False
183,1,2,male,1.0,2,1,39.0,S,Second,child,False,F,Southampton,yes,False


In [32]:
# The 10 passengers with the largest age values.
df.nlargest(10, 'age')

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
630,1,1,male,80.0,0,0,30.0,S,First,man,True,A,Southampton,yes,True
851,0,3,male,74.0,0,0,7.775,S,Third,man,True,,Southampton,no,True
96,0,1,male,71.0,0,0,34.6542,C,First,man,True,A,Cherbourg,no,True
493,0,1,male,71.0,0,0,49.5042,C,First,man,True,,Cherbourg,no,True
116,0,3,male,70.5,0,0,7.75,Q,Third,man,True,,Queenstown,no,True
672,0,2,male,70.0,0,0,10.5,S,Second,man,True,,Southampton,no,True
745,0,1,male,70.0,1,1,71.0,S,First,man,True,B,Southampton,no,False
33,0,2,male,66.0,0,0,10.5,S,Second,man,True,,Southampton,no,True
54,0,1,male,65.0,0,1,61.9792,C,First,man,True,B,Cherbourg,no,False
280,0,3,male,65.0,0,0,7.75,Q,Third,man,True,,Queenstown,no,True


#### Missing Value Handling

In [33]:
# Fill missing ages with the median age.
# The filled column is assigned back instead of calling fillna(inplace=True)
# on the df['age'] selection: that chained in-place form is deprecated and
# does not update df when pandas Copy-on-Write is active.
df['age'] = df['age'].fillna(df['age'].median())


In [34]:
# Fill missing 'embarked' codes with the most frequent code.
# mode() returns a Series, so [0] picks the first (most common) value.
# Assign the result back rather than using fillna(inplace=True) on a column
# selection, which is deprecated and ineffective under Copy-on-Write.
df['embarked'] = df['embarked'].fillna(df['embarked'].mode()[0])

In [35]:
# Show the rows that have no missing value in any column.
# The result is displayed only; it is not assigned, so df still contains
# every row (and 'deck' is the column that removes most rows here).
df.dropna()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
10,1,3,female,4.0,1,1,16.7000,S,Third,child,False,G,Southampton,yes,False
11,1,1,female,58.0,0,0,26.5500,S,First,woman,False,C,Southampton,yes,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
871,1,1,female,47.0,1,1,52.5542,S,First,woman,False,D,Southampton,yes,False
872,0,1,male,33.0,0,0,5.0000,S,First,man,True,B,Southampton,no,True
879,1,1,female,56.0,0,1,83.1583,C,First,woman,False,C,Cherbourg,yes,False
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True


In [36]:
# Show the frame without the mostly-empty 'deck' column.
# The result is displayed only; df itself still has 'deck'.
df.drop(columns='deck')

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,Southampton,yes,True
888,0,3,female,28.0,1,2,23.4500,S,Third,woman,False,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,Cherbourg,yes,True


In [37]:
# Replace missing embarkation towns with the placeholder 'Unknown'.
# Assign the result back rather than using fillna(inplace=True) on a column
# selection, which is deprecated and ineffective under Copy-on-Write.
df['embark_town'] = df['embark_town'].fillna('Unknown')


In [38]:
# Fill any remaining missing 'embarked' codes with 'S'.
# NOTE: 'embarked' was already filled with its mode in an earlier cell, so
# this normally changes nothing; it is kept as an alternative demonstration.
# Assign the result back rather than using fillna(inplace=True) on a column
# selection, which is deprecated and ineffective under Copy-on-Write.
df['embarked'] = df['embarked'].fillna('S')

In [39]:
# Re-count missing values per column after the fills above;
# only 'deck' should still report missing entries.
df.isna().sum()

survived         0
pclass           0
sex              0
age              0
sibsp            0
parch            0
fare             0
embarked         0
class            0
who              0
adult_male       0
deck           688
embark_town      0
alive            0
alone            0
dtype: int64

In [40]:
# Show only the rows that have an age value (displayed, not assigned).
# Ages were imputed earlier in the notebook, so at this point every row is kept.
df.dropna(subset=['age'])

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,28.0,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [41]:
# Replace missing embarkation towns with 'Unknown'.
# NOTE: this repeats an earlier cell, so no missing values remain to fill.
# Assign the result back rather than using fillna(inplace=True) on a column
# selection, which is deprecated and ineffective under Copy-on-Write.
df['embark_town'] = df['embark_town'].fillna('Unknown')

In [42]:
# Demonstrates mean imputation as an alternative to the median fill.
# The result is displayed only (not assigned), and because 'age' was already
# filled with the median earlier, there are no missing values left to replace.
df['age'].fillna(df['age'].mean())

0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
886    27.0
887    19.0
888    28.0
889    26.0
890    32.0
Name: age, Length: 891, dtype: float64

In [43]:
# Display the frame after the missing-value handling above.
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,28.0,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


#### GroupBy & Aggregation

In [44]:
# Mean age for each sex.
# Note: 'age' was median-imputed earlier, so these means include imputed values.
df.groupby('sex')['age'].mean()

sex
female    27.929936
male      30.140676
Name: age, dtype: float64

In [45]:
# Mean fare for each passenger class.
# 'class' is a categorical column, so `observed` is passed explicitly: the
# implicit default for categorical groupers is deprecated in pandas, and
# stating it keeps the result stable across versions.
df.groupby('class', observed=False)['fare'].mean()

class
First     84.154687
Second    20.662183
Third     13.675550
Name: fare, dtype: float64

In [46]:
# Number of survivors per class (summing the 0/1 'survived' flag).
# `observed` is passed explicitly because 'class' is categorical and the
# implicit default for categorical groupers is deprecated in pandas.
df.groupby('class', observed=False)['survived'].sum()

class
First     136
Second     87
Third     119
Name: survived, dtype: int64

In [47]:
# Number of passengers in each class, ordered from most to least frequent.
df['class'].value_counts()

class
Third     491
First     216
Second    184
Name: count, dtype: int64

In [48]:
# Mean fare paid, split by sex.
df.groupby('sex')['fare'].mean()

sex
female    44.479818
male      25.523893
Name: fare, dtype: float64

In [49]:
# Median age within each passenger class (ages include the earlier median
# imputation).
# `observed` is passed explicitly because 'class' is categorical and the
# implicit default for categorical groupers is deprecated in pandas.
df.groupby('class', observed=False)['age'].median()

class
First     35.0
Second    28.0
Third     28.0
Name: age, dtype: float64

In [50]:
# Passenger count for each (sex, survived) combination.
# count() tallies non-null 'age' values; since 'age' was filled earlier this
# equals the number of rows in each group.
df.groupby(['sex', 'survived'])['age'].count()

sex     survived
female  0            81
        1           233
male    0           468
        1           109
Name: age, dtype: int64

In [51]:
# Pivot table of mean age: one row per class, one column per survival outcome.
# `observed` is passed explicitly because the 'class' index is categorical and
# the implicit default for categorical groupers is deprecated in pandas.
df.pivot_table(
    index='class',
    columns='survived',
    values='age',
    aggfunc='mean',
    observed=False,
)

survived,0,1
class,Unnamed: 1_level_1,Unnamed: 2_level_1
First,40.55625,34.609706
Second,33.14433,25.998046
Third,26.951613,22.747227


In [52]:

# Mean age for each (class, sex) combination.
# `observed` is passed explicitly because 'class' is categorical and the
# implicit default for categorical groupers is deprecated in pandas.
df.groupby(['class', 'sex'], observed=False)['age'].mean()

class   sex   
First   female    33.978723
        male      38.995246
Second  female    28.703947
        male      30.512315
Third   female    23.572917
        male      26.911873
Name: age, dtype: float64

In [53]:
# Number of rows for each embarkation code.
df.groupby('embarked').size()

embarked
C    168
Q     77
S    646
dtype: int64

#### New Columns & Transform

In [54]:
# Boolean flag: True when age is strictly below 12.
# Ages that were originally missing were median-imputed earlier, so those
# passengers are flagged according to the imputed value.
df['is_child'] = df['age'] < 12

In [55]:
# Preview the first rows to check the new 'is_child' column.
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,is_child
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True,False
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True,False


In [56]:
# Boolean flag: True when age is strictly above 60.
df['is_elderly'] = df['age'] > 60

In [57]:
# Preview the first rows to check the new 'is_elderly' column.
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,is_child,is_elderly
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False,False,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,False,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True,False,False
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False,False,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True,False,False


In [58]:
# Bucket ages into labelled groups.
# pd.cut intervals are right-inclusive by default, so the buckets are
# (0, 12] Child, (12, 18] Teen, (18, 60] Adult, (60, 100] Senior.
# NOTE(review): an age of exactly 12 is labelled 'Child' here, while the
# 'is_child' flag uses age < 12 - confirm which boundary is intended.
df['age_group'] = pd.cut(df['age'], bins=[0, 12, 18, 60, 100], labels=['Child', 'Teen', 'Adult', 'Senior'])


In [59]:
# Preview the first rows to check the new 'age_group' column.
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,is_child,is_elderly,age_group
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False,False,False,Adult
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,False,False,Adult
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True,False,False,Adult
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False,False,False,Adult
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True,False,False,Adult


In [60]:
# Fare per person, computed for every passenger (no condition is applied).
# total_people = siblings/spouses + parents/children + 1; the + 1 presumably
# counts the passenger themself, which also keeps the divisor at least 1.
df['total_people'] = df['sibsp'] + df['parch'] + 1
df['fare_per_person'] = df['fare'] / df['total_people']


In [61]:
# Encode sex as an integer: male -> 0, female -> 1.
# Any value not present in the mapping would become NaN.
df['sex_encoded'] = df['sex'].map({'male': 0, 'female': 1})

In [62]:
# Preview the new 'total_people', 'fare_per_person' and 'sex_encoded' columns.
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,...,deck,embark_town,alive,alone,is_child,is_elderly,age_group,total_people,fare_per_person,sex_encoded
0,0,3,male,22.0,1,0,7.25,S,Third,man,...,,Southampton,no,False,False,False,Adult,2,3.625,0
1,1,1,female,38.0,1,0,71.2833,C,First,woman,...,C,Cherbourg,yes,False,False,False,Adult,2,35.64165,1
2,1,3,female,26.0,0,0,7.925,S,Third,woman,...,,Southampton,yes,True,False,False,Adult,1,7.925,1
3,1,1,female,35.0,1,0,53.1,S,First,woman,...,C,Southampton,yes,False,False,False,Adult,2,26.55,1
4,0,3,male,35.0,0,0,8.05,S,Third,man,...,,Southampton,no,True,False,False,Adult,1,8.05,0


In [63]:
# Numeric version of the passenger class: First -> 1, Second -> 2, Third -> 3.
# 'class' is categorical, and mapping a categorical column yields another
# categorical result, so cast to int to get a genuinely numeric column.
# (Every row has a class value, so the integer cast cannot hit a NaN.)
df['class_num'] = df['class'].map({'First': 1, 'Second': 2, 'Third': 3}).astype(int)

In [64]:
# Preview the first rows to check the new 'class_num' column.
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,...,embark_town,alive,alone,is_child,is_elderly,age_group,total_people,fare_per_person,sex_encoded,class_num
0,0,3,male,22.0,1,0,7.25,S,Third,man,...,Southampton,no,False,False,False,Adult,2,3.625,0,3
1,1,1,female,38.0,1,0,71.2833,C,First,woman,...,Cherbourg,yes,False,False,False,Adult,2,35.64165,1,1
2,1,3,female,26.0,0,0,7.925,S,Third,woman,...,Southampton,yes,True,False,False,Adult,1,7.925,1,3
3,1,1,female,35.0,1,0,53.1,S,First,woman,...,Southampton,yes,False,False,False,Adult,2,26.55,1,1
4,0,3,male,35.0,0,0,8.05,S,Third,man,...,Southampton,no,True,False,False,Adult,1,8.05,0,3


In [65]:
# Boolean version of the 0/1 'survived' flag (0 -> False, 1 -> True).
df['survived_bool'] = df['survived'].astype(bool)

In [66]:
# Character length of the 'who' label (man / woman / child).
# NOTE: this dataset has no passenger-name column, so despite the column name
# this measures the 'who' label, not a real name.
# Uses the vectorised string accessor instead of a per-row Python lambda;
# astype(str) keeps the original str(x) conversion semantics.
df['name_length'] = df['who'].astype(str).str.len()

In [67]:
# Preview the new 'survived_bool' and 'name_length' columns.
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,...,alone,is_child,is_elderly,age_group,total_people,fare_per_person,sex_encoded,class_num,survived_bool,name_length
0,0,3,male,22.0,1,0,7.25,S,Third,man,...,False,False,False,Adult,2,3.625,0,3,False,3
1,1,1,female,38.0,1,0,71.2833,C,First,woman,...,False,False,False,Adult,2,35.64165,1,1,True,5
2,1,3,female,26.0,0,0,7.925,S,Third,woman,...,True,False,False,Adult,1,7.925,1,3,True,5
3,1,1,female,35.0,1,0,53.1,S,First,woman,...,False,False,False,Adult,2,26.55,1,1,True,5
4,0,3,male,35.0,0,0,8.05,S,Third,man,...,True,False,False,Adult,1,8.05,0,3,False,3


In [68]:
# 'title' is a plain copy of the 'who' column (man / woman / child).
# NOTE(review): no title is extracted from a name here - this dataset has no
# name column; confirm whether this placeholder is intended.
df['title'] = df['who']

In [69]:
# Preview the first rows to check the new 'title' column.
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,...,is_child,is_elderly,age_group,total_people,fare_per_person,sex_encoded,class_num,survived_bool,name_length,title
0,0,3,male,22.0,1,0,7.25,S,Third,man,...,False,False,Adult,2,3.625,0,3,False,3,man
1,1,1,female,38.0,1,0,71.2833,C,First,woman,...,False,False,Adult,2,35.64165,1,1,True,5,woman
2,1,3,female,26.0,0,0,7.925,S,Third,woman,...,False,False,Adult,1,7.925,1,3,True,5,woman
3,1,1,female,35.0,1,0,53.1,S,First,woman,...,False,False,Adult,2,26.55,1,1,True,5,woman
4,0,3,male,35.0,0,0,8.05,S,Third,man,...,False,False,Adult,1,8.05,0,3,False,3,man


In [70]:
# Log-transform the fare to reduce its right skew.
# log1p computes log(1 + x), which stays finite for the fares equal to 0.
df['log_fare'] = np.log1p(df['fare'])

In [71]:
# Preview the first rows to check the new 'log_fare' column.
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,...,is_elderly,age_group,total_people,fare_per_person,sex_encoded,class_num,survived_bool,name_length,title,log_fare
0,0,3,male,22.0,1,0,7.25,S,Third,man,...,False,Adult,2,3.625,0,3,False,3,man,2.110213
1,1,1,female,38.0,1,0,71.2833,C,First,woman,...,False,Adult,2,35.64165,1,1,True,5,woman,4.280593
2,1,3,female,26.0,0,0,7.925,S,Third,woman,...,False,Adult,1,7.925,1,3,True,5,woman,2.188856
3,1,1,female,35.0,1,0,53.1,S,First,woman,...,False,Adult,2,26.55,1,1,True,5,woman,3.990834
4,0,3,male,35.0,0,0,8.05,S,Third,man,...,False,Adult,1,8.05,0,3,False,3,man,2.202765


#### Boolean Masks & Filters

In [72]:
# Male passengers younger than 30; both conditions must hold.
# Ages were median-imputed earlier, so rows with an imputed age can match.
df.loc[df['sex'].eq('male') & df['age'].lt(30)]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,...,is_elderly,age_group,total_people,fare_per_person,sex_encoded,class_num,survived_bool,name_length,title,log_fare
0,0,3,male,22.0,1,0,7.2500,S,Third,man,...,False,Adult,2,3.625000,0,3,False,3,man,2.110213
5,0,3,male,28.0,0,0,8.4583,Q,Third,man,...,False,Adult,1,8.458300,0,3,False,3,man,2.246893
7,0,3,male,2.0,3,1,21.0750,S,Third,child,...,False,Child,5,4.215000,0,3,False,5,child,3.094446
12,0,3,male,20.0,0,0,8.0500,S,Third,man,...,False,Adult,1,8.050000,0,3,False,3,man,2.202765
16,0,3,male,2.0,4,1,29.1250,Q,Third,child,...,False,Child,6,4.854167,0,3,False,5,child,3.405355
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
878,0,3,male,28.0,0,0,7.8958,S,Third,man,...,False,Adult,1,7.895800,0,3,False,3,man,2.185579
883,0,2,male,28.0,0,0,10.5000,S,Second,man,...,False,Adult,1,10.500000,0,2,False,3,man,2.442347
884,0,3,male,25.0,0,0,7.0500,S,Third,man,...,False,Adult,1,7.050000,0,3,False,3,man,2.085672
886,0,2,male,27.0,0,0,13.0000,S,Second,man,...,False,Adult,1,13.000000,0,2,False,3,man,2.639057


In [73]:
# Female passengers older than 50
df.loc[df['sex'].eq('female') & df['age'].gt(50)]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,...,is_elderly,age_group,total_people,fare_per_person,sex_encoded,class_num,survived_bool,name_length,title,log_fare
11,1,1,female,58.0,0,0,26.55,S,First,woman,...,False,Adult,1,26.55,1,1,True,5,woman,3.316003
15,1,2,female,55.0,0,0,16.0,S,Second,woman,...,False,Adult,1,16.0,1,2,True,5,woman,2.833213
195,1,1,female,58.0,0,0,146.5208,C,First,woman,...,False,Adult,1,146.5208,1,1,True,5,woman,4.993969
268,1,1,female,58.0,0,1,153.4625,S,First,woman,...,False,Adult,2,76.73125,1,1,True,5,woman,5.039951
275,1,1,female,63.0,1,0,77.9583,S,First,woman,...,True,Senior,2,38.97915,1,1,True,5,woman,4.36892
366,1,1,female,60.0,1,0,75.25,C,First,woman,...,False,Adult,2,37.625,1,1,True,5,woman,4.334017
483,1,3,female,63.0,0,0,9.5875,S,Third,woman,...,True,Senior,1,9.5875,1,3,True,5,woman,2.359674
496,1,1,female,54.0,1,0,78.2667,C,First,woman,...,False,Adult,2,39.13335,1,1,True,5,woman,4.372818
513,1,1,female,54.0,1,0,59.4,C,First,woman,...,False,Adult,2,29.7,1,1,True,5,woman,4.100989
571,1,1,female,53.0,2,0,51.4792,S,First,woman,...,False,Adult,3,17.159733,1,1,True,5,woman,3.960417


In [74]:
# First-class passengers under 12 years old
df.loc[df['class'].eq('First') & df['age'].lt(12)]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,...,is_elderly,age_group,total_people,fare_per_person,sex_encoded,class_num,survived_bool,name_length,title,log_fare
297,0,1,female,2.0,1,2,151.55,S,First,child,...,False,Child,4,37.8875,1,1,False,5,child,5.027492
305,1,1,male,0.92,1,2,151.55,S,First,child,...,False,Child,4,37.8875,0,1,True,5,child,5.027492
445,1,1,male,4.0,0,2,81.8583,S,First,child,...,False,Child,3,27.2861,0,1,True,5,child,4.417132
802,1,1,male,11.0,1,2,120.0,S,First,child,...,False,Child,4,30.0,0,1,True,5,child,4.795791


In [75]:
# Passengers older than 60 who survived
df.loc[df['age'].gt(60) & df['survived'].eq(1)]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,...,is_elderly,age_group,total_people,fare_per_person,sex_encoded,class_num,survived_bool,name_length,title,log_fare
275,1,1,female,63.0,1,0,77.9583,S,First,woman,...,True,Senior,2,38.97915,1,1,True,5,woman,4.36892
483,1,3,female,63.0,0,0,9.5875,S,Third,woman,...,True,Senior,1,9.5875,1,3,True,5,woman,2.359674
570,1,2,male,62.0,0,0,10.5,S,Second,man,...,True,Senior,1,10.5,0,2,True,3,man,2.442347
630,1,1,male,80.0,0,0,30.0,S,First,man,...,True,Senior,1,30.0,0,1,True,3,man,3.433987
829,1,1,female,62.0,0,0,80.0,S,First,woman,...,True,Senior,1,80.0,1,1,True,5,woman,4.394449


In [76]:
# "Rich" passengers: fare strictly above 100
df.loc[df['fare'].gt(100)]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,...,is_elderly,age_group,total_people,fare_per_person,sex_encoded,class_num,survived_bool,name_length,title,log_fare
27,0,1,male,19.0,3,2,263.0,S,First,man,...,False,Adult,6,43.833333,0,1,False,3,man,5.575949
31,1,1,female,28.0,1,0,146.5208,C,First,woman,...,False,Adult,2,73.2604,1,1,True,5,woman,4.993969
88,1,1,female,23.0,3,2,263.0,S,First,woman,...,False,Adult,6,43.833333,1,1,True,5,woman,5.575949
118,0,1,male,24.0,0,1,247.5208,C,First,man,...,False,Adult,2,123.7604,0,1,False,3,man,5.515527
195,1,1,female,58.0,0,0,146.5208,C,First,woman,...,False,Adult,1,146.5208,1,1,True,5,woman,4.993969
215,1,1,female,31.0,1,0,113.275,C,First,woman,...,False,Adult,2,56.6375,1,1,True,5,woman,4.738608
258,1,1,female,35.0,0,0,512.3292,C,First,woman,...,False,Adult,1,512.3292,1,1,True,5,woman,6.240917
268,1,1,female,58.0,0,1,153.4625,S,First,woman,...,False,Adult,2,76.73125,1,1,True,5,woman,5.039951
269,1,1,female,35.0,0,0,135.6333,S,First,woman,...,False,Adult,1,135.6333,1,1,True,5,woman,4.917301
297,0,1,female,2.0,1,2,151.55,S,First,child,...,False,Child,4,37.8875,1,1,False,5,child,5.027492


In [77]:
# Third-class passengers who paid less than 10
df.loc[df['class'].eq('Third') & df['fare'].lt(10)]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,...,is_elderly,age_group,total_people,fare_per_person,sex_encoded,class_num,survived_bool,name_length,title,log_fare
0,0,3,male,22.0,1,0,7.2500,S,Third,man,...,False,Adult,2,3.6250,0,3,False,3,man,2.110213
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,...,False,Adult,1,7.9250,1,3,True,5,woman,2.188856
4,0,3,male,35.0,0,0,8.0500,S,Third,man,...,False,Adult,1,8.0500,0,3,False,3,man,2.202765
5,0,3,male,28.0,0,0,8.4583,Q,Third,man,...,False,Adult,1,8.4583,0,3,False,3,man,2.246893
12,0,3,male,20.0,0,0,8.0500,S,Third,man,...,False,Adult,1,8.0500,0,3,False,3,man,2.202765
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
877,0,3,male,19.0,0,0,7.8958,S,Third,man,...,False,Adult,1,7.8958,0,3,False,3,man,2.185579
878,0,3,male,28.0,0,0,7.8958,S,Third,man,...,False,Adult,1,7.8958,0,3,False,3,man,2.185579
881,0,3,male,33.0,0,0,7.8958,S,Third,man,...,False,Adult,1,7.8958,0,3,False,3,man,2.185579
884,0,3,male,25.0,0,0,7.0500,S,Third,man,...,False,Adult,1,7.0500,0,3,False,3,man,2.085672


In [78]:
# Passengers whose embarkation code is 'C' (Cherbourg)
df.loc[df['embarked'].eq('C')]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,...,is_elderly,age_group,total_people,fare_per_person,sex_encoded,class_num,survived_bool,name_length,title,log_fare
1,1,1,female,38.0,1,0,71.2833,C,First,woman,...,False,Adult,2,35.64165,1,1,True,5,woman,4.280593
9,1,2,female,14.0,1,0,30.0708,C,Second,child,...,False,Teen,2,15.03540,1,2,True,5,child,3.436268
19,1,3,female,28.0,0,0,7.2250,C,Third,woman,...,False,Adult,1,7.22500,1,3,True,5,woman,2.107178
26,0,3,male,28.0,0,0,7.2250,C,Third,man,...,False,Adult,1,7.22500,0,3,False,3,man,2.107178
30,0,1,male,40.0,0,0,27.7208,C,First,man,...,False,Adult,1,27.72080,0,1,False,3,man,3.357622
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
866,1,2,female,27.0,1,0,13.8583,C,Second,woman,...,False,Adult,2,6.92915,1,2,True,5,woman,2.698559
874,1,2,female,28.0,1,0,24.0000,C,Second,woman,...,False,Adult,2,12.00000,1,2,True,5,woman,3.218876
875,1,3,female,15.0,0,0,7.2250,C,Third,child,...,False,Teen,1,7.22500,1,3,True,5,child,2.107178
879,1,1,female,56.0,0,1,83.1583,C,First,woman,...,False,Adult,2,41.57915,1,1,True,5,woman,4.432700


In [79]:
# Everyone whose embarkation code is anything other than 'S'
df.loc[df['embarked'].ne('S')]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,...,is_elderly,age_group,total_people,fare_per_person,sex_encoded,class_num,survived_bool,name_length,title,log_fare
1,1,1,female,38.0,1,0,71.2833,C,First,woman,...,False,Adult,2,35.641650,1,1,True,5,woman,4.280593
5,0,3,male,28.0,0,0,8.4583,Q,Third,man,...,False,Adult,1,8.458300,0,3,False,3,man,2.246893
9,1,2,female,14.0,1,0,30.0708,C,Second,child,...,False,Teen,2,15.035400,1,2,True,5,child,3.436268
16,0,3,male,2.0,4,1,29.1250,Q,Third,child,...,False,Child,6,4.854167,0,3,False,5,child,3.405355
19,1,3,female,28.0,0,0,7.2250,C,Third,woman,...,False,Adult,1,7.225000,1,3,True,5,woman,2.107178
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
875,1,3,female,15.0,0,0,7.2250,C,Third,child,...,False,Teen,1,7.225000,1,3,True,5,child,2.107178
879,1,1,female,56.0,0,1,83.1583,C,First,woman,...,False,Adult,2,41.579150,1,1,True,5,woman,4.432700
885,0,3,female,39.0,0,5,29.1250,Q,Third,woman,...,False,Adult,6,4.854167,1,3,False,5,woman,3.405355
889,1,1,male,26.0,0,0,30.0000,C,First,man,...,False,Adult,1,30.000000,0,1,True,3,man,3.433987


In [80]:
# Passengers under 12 who did not survive
df.loc[df['age'].lt(12) & df['survived'].eq(0)]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,...,is_elderly,age_group,total_people,fare_per_person,sex_encoded,class_num,survived_bool,name_length,title,log_fare
7,0,3,male,2.0,3,1,21.075,S,Third,child,...,False,Child,5,4.215,0,3,False,5,child,3.094446
16,0,3,male,2.0,4,1,29.125,Q,Third,child,...,False,Child,6,4.854167,0,3,False,5,child,3.405355
24,0,3,female,8.0,3,1,21.075,S,Third,child,...,False,Child,5,4.215,1,3,False,5,child,3.094446
50,0,3,male,7.0,4,1,39.6875,S,Third,child,...,False,Child,6,6.614583,0,3,False,5,child,3.705921
59,0,3,male,11.0,5,2,46.9,S,Third,child,...,False,Child,8,5.8625,0,3,False,5,child,3.869116
63,0,3,male,4.0,3,2,27.9,S,Third,child,...,False,Child,6,4.65,0,3,False,5,child,3.363842
119,0,3,female,2.0,4,2,31.275,S,Third,child,...,False,Child,7,4.467857,1,3,False,5,child,3.474293
147,0,3,female,9.0,2,2,34.375,S,Third,child,...,False,Child,5,6.875,1,3,False,5,child,3.566005
164,0,3,male,1.0,4,1,39.6875,S,Third,child,...,False,Child,6,6.614583,0,3,False,5,child,3.705921
171,0,3,male,4.0,4,1,29.125,Q,Third,child,...,False,Child,6,4.854167,0,3,False,5,child,3.405355


In [81]:
# Survivors older than 60 (same filter as the earlier "Survived elderly" cell)
df.loc[df['survived'].eq(1) & df['age'].gt(60)]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,...,is_elderly,age_group,total_people,fare_per_person,sex_encoded,class_num,survived_bool,name_length,title,log_fare
275,1,1,female,63.0,1,0,77.9583,S,First,woman,...,True,Senior,2,38.97915,1,1,True,5,woman,4.36892
483,1,3,female,63.0,0,0,9.5875,S,Third,woman,...,True,Senior,1,9.5875,1,3,True,5,woman,2.359674
570,1,2,male,62.0,0,0,10.5,S,Second,man,...,True,Senior,1,10.5,0,2,True,3,man,2.442347
630,1,1,male,80.0,0,0,30.0,S,First,man,...,True,Senior,1,30.0,0,1,True,3,man,3.433987
829,1,1,female,62.0,0,0,80.0,S,First,woman,...,True,Senior,1,80.0,1,1,True,5,woman,4.394449


#### Value Counts & Crosstabs

In [82]:
# Number of passengers per sex, most frequent first
df.loc[:, 'sex'].value_counts()

sex
male      577
female    314
Name: count, dtype: int64

In [83]:
# Passenger counts: rows = ticket class, columns = survived flag (0/1)
pd.crosstab(index=df['class'], columns=df['survived'])

survived,0,1
class,Unnamed: 1_level_1,Unnamed: 2_level_1
First,80,136
Second,97,87
Third,372,119


In [84]:
# Passenger counts: rows = sex, columns = survived flag (0/1)
pd.crosstab(index=df['sex'], columns=df['survived'])

survived,0,1
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
female,81,233
male,468,109


In [85]:
# Passenger counts: rows = sex, columns = ticket class
pd.crosstab(index=df['sex'], columns=df['class'])

class,First,Second,Third
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,94,76,144
male,122,108,347


In [86]:
# Passenger counts: rows = embarkation town, columns = survived flag (0/1)
pd.crosstab(index=df['embark_town'], columns=df['survived'])

survived,0,1
embark_town,Unnamed: 1_level_1,Unnamed: 2_level_1
Cherbourg,75,93
Queenstown,47,30
Southampton,427,217
Unknown,0,2


In [87]:
# Row-normalised crosstab: each sex row sums to 1, giving survival shares per sex
pd.crosstab(
    index=df['sex'],
    columns=df['survived'],
    normalize='index',
)

survived,0,1
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
female,0.257962,0.742038
male,0.811092,0.188908


In [88]:
# Survival rate per ticket class, as a percentage.
# `observed` is passed explicitly so the result does not depend on the pandas
# default for categorical group keys (that default is being changed, and the
# notebook-wide warnings filter would hide the deprecation notice).
df.groupby('class', observed=False)['survived'].mean() * 100

class
First     62.962963
Second    47.282609
Third     24.236253
Name: survived, dtype: float64

In [89]:
# Number of passengers in each 'who' category (man / woman / child)
df.loc[:, 'who'].value_counts()


who
man      537
woman    271
child     83
Name: count, dtype: int64

In [90]:
# Passenger counts: rows = age group, columns = survived flag (0/1)
pd.crosstab(index=df['age_group'], columns=df['survived'])

survived,0,1
age_group,Unnamed: 1_level_1,Unnamed: 2_level_1
Child,29,40
Teen,40,30
Adult,463,267
Senior,17,5


In [91]:
# Number of distinct age values (missing values are not counted)
df['age'].nunique(dropna=True)

88

#### Advanced Filtering & Sampling

In [94]:
# Count unique ages (repeats the preceding cell; NaN is excluded by nunique's default)
df['age'].nunique()

88

In [93]:
# Display the current frame; for a long frame pandas shows only the first and last rows
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,...,is_elderly,age_group,total_people,fare_per_person,sex_encoded,class_num,survived_bool,name_length,title,log_fare
0,0,3,male,22.0,1,0,7.2500,S,Third,man,...,False,Adult,2,3.62500,0,3,False,3,man,2.110213
1,1,1,female,38.0,1,0,71.2833,C,First,woman,...,False,Adult,2,35.64165,1,1,True,5,woman,4.280593
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,...,False,Adult,1,7.92500,1,3,True,5,woman,2.188856
3,1,1,female,35.0,1,0,53.1000,S,First,woman,...,False,Adult,2,26.55000,1,1,True,5,woman,3.990834
4,0,3,male,35.0,0,0,8.0500,S,Third,man,...,False,Adult,1,8.05000,0,3,False,3,man,2.202765
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,...,False,Adult,1,13.00000,0,2,False,3,man,2.639057
887,1,1,female,19.0,0,0,30.0000,S,First,woman,...,False,Adult,1,30.00000,1,1,True,5,woman,3.433987
888,0,3,female,28.0,1,2,23.4500,S,Third,woman,...,False,Adult,4,5.86250,1,3,False,5,woman,3.196630
889,1,1,male,26.0,0,0,30.0000,C,First,man,...,False,Adult,1,30.00000,0,1,True,3,man,3.433987


In [None]:
# Demonstration: keep only the first row seen for each distinct 'sex' value.
# The result is displayed, not assigned, so df itself is unchanged.
df.drop_duplicates(subset=['sex'], keep='first')

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,...,is_elderly,age_group,total_people,fare_per_person,sex_encoded,class_num,survived_bool,name_length,title,log_fare
0,0,3,male,22.0,1,0,7.25,S,Third,man,...,False,Adult,2,3.625,0,3,False,3,man,2.110213
1,1,1,female,38.0,1,0,71.2833,C,First,woman,...,False,Adult,2,35.64165,1,1,True,5,woman,4.280593


In [96]:
# Sample 10 random rows; the fixed seed returns the same rows on every run,
# so the displayed output is reproducible after Restart & Run All
df.sample(10, random_state=42)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,...,is_elderly,age_group,total_people,fare_per_person,sex_encoded,class_num,survived_bool,name_length,title,log_fare
95,0,3,male,28.0,0,0,8.05,S,Third,man,...,False,Adult,1,8.05,0,3,False,3,man,2.202765
544,0,1,male,50.0,1,0,106.425,C,First,man,...,False,Adult,2,53.2125,0,1,False,3,man,4.676793
821,1,3,male,27.0,0,0,8.6625,S,Third,man,...,False,Adult,1,8.6625,0,3,True,3,man,2.268252
115,0,3,male,21.0,0,0,7.925,S,Third,man,...,False,Adult,1,7.925,0,3,False,3,man,2.188856
699,0,3,male,42.0,0,0,7.65,S,Third,man,...,False,Adult,1,7.65,0,3,False,3,man,2.157559
406,0,3,male,51.0,0,0,7.75,S,Third,man,...,False,Adult,1,7.75,0,3,False,3,man,2.169054
120,0,2,male,21.0,2,0,73.5,S,Second,man,...,False,Adult,3,24.5,0,2,False,3,man,4.310799
60,0,3,male,22.0,0,0,7.2292,C,Third,man,...,False,Adult,1,7.2292,0,3,False,3,man,2.107689
657,0,3,female,32.0,1,1,15.5,Q,Third,woman,...,False,Adult,3,5.166667,1,3,False,5,woman,2.80336
682,0,3,male,20.0,0,0,9.225,S,Third,man,...,False,Adult,1,9.225,0,3,False,3,man,2.324836


In [97]:
# Sample 5% of the rows; the fixed seed returns the same rows on every run,
# so the displayed output is reproducible after Restart & Run All
df.sample(frac=0.05, random_state=42)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,...,is_elderly,age_group,total_people,fare_per_person,sex_encoded,class_num,survived_bool,name_length,title,log_fare
316,1,2,female,24.0,1,0,26.0,S,Second,woman,...,False,Adult,2,13.0,1,2,True,5,woman,3.295837
845,0,3,male,42.0,0,0,7.55,S,Third,man,...,False,Adult,1,7.55,0,3,False,3,man,2.145931
552,0,3,male,28.0,0,0,7.8292,Q,Third,man,...,False,Adult,1,7.8292,0,3,False,3,man,2.178064
401,0,3,male,26.0,0,0,8.05,S,Third,man,...,False,Adult,1,8.05,0,3,False,3,man,2.202765
275,1,1,female,63.0,1,0,77.9583,S,First,woman,...,True,Senior,2,38.97915,1,1,True,5,woman,4.36892
319,1,1,female,40.0,1,1,134.5,C,First,woman,...,False,Adult,3,44.833333,1,1,True,5,woman,4.908972
659,0,1,male,58.0,0,2,113.275,C,First,man,...,False,Adult,3,37.758333,0,1,False,3,man,4.738608
192,1,3,female,19.0,1,0,7.8542,S,Third,woman,...,False,Adult,2,3.9271,1,3,True,5,woman,2.180892
42,0,3,male,28.0,0,0,7.8958,C,Third,man,...,False,Adult,1,7.8958,0,3,False,3,man,2.185579
451,0,3,male,28.0,1,0,19.9667,S,Third,man,...,False,Adult,2,9.98335,0,3,False,3,man,3.042935


In [98]:
# Rows whose 'who' value contains the substring "man".
# NOTE: this is a substring match, so "woman" matches as well as "man";
# missing values are treated as non-matches (na=False).
df.loc[df['who'].str.contains('man', na=False)]


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,...,is_elderly,age_group,total_people,fare_per_person,sex_encoded,class_num,survived_bool,name_length,title,log_fare
0,0,3,male,22.0,1,0,7.2500,S,Third,man,...,False,Adult,2,3.62500,0,3,False,3,man,2.110213
1,1,1,female,38.0,1,0,71.2833,C,First,woman,...,False,Adult,2,35.64165,1,1,True,5,woman,4.280593
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,...,False,Adult,1,7.92500,1,3,True,5,woman,2.188856
3,1,1,female,35.0,1,0,53.1000,S,First,woman,...,False,Adult,2,26.55000,1,1,True,5,woman,3.990834
4,0,3,male,35.0,0,0,8.0500,S,Third,man,...,False,Adult,1,8.05000,0,3,False,3,man,2.202765
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,...,False,Adult,1,13.00000,0,2,False,3,man,2.639057
887,1,1,female,19.0,0,0,30.0000,S,First,woman,...,False,Adult,1,30.00000,1,1,True,5,woman,3.433987
888,0,3,female,28.0,1,2,23.4500,S,Third,woman,...,False,Adult,4,5.86250,1,3,False,5,woman,3.196630
889,1,1,male,26.0,0,0,30.0000,C,First,man,...,False,Adult,1,30.00000,0,1,True,3,man,3.433987


In [99]:
# Passengers aged 20 to 40, both endpoints included
df.loc[df['age'].ge(20) & df['age'].le(40)]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,...,is_elderly,age_group,total_people,fare_per_person,sex_encoded,class_num,survived_bool,name_length,title,log_fare
0,0,3,male,22.0,1,0,7.2500,S,Third,man,...,False,Adult,2,3.625000,0,3,False,3,man,2.110213
1,1,1,female,38.0,1,0,71.2833,C,First,woman,...,False,Adult,2,35.641650,1,1,True,5,woman,4.280593
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,...,False,Adult,1,7.925000,1,3,True,5,woman,2.188856
3,1,1,female,35.0,1,0,53.1000,S,First,woman,...,False,Adult,2,26.550000,1,1,True,5,woman,3.990834
4,0,3,male,35.0,0,0,8.0500,S,Third,man,...,False,Adult,1,8.050000,0,3,False,3,man,2.202765
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
885,0,3,female,39.0,0,5,29.1250,Q,Third,woman,...,False,Adult,6,4.854167,1,3,False,5,woman,3.405355
886,0,2,male,27.0,0,0,13.0000,S,Second,man,...,False,Adult,1,13.000000,0,2,False,3,man,2.639057
888,0,3,female,28.0,1,2,23.4500,S,Third,woman,...,False,Adult,4,5.862500,1,3,False,5,woman,3.196630
889,1,1,male,26.0,0,0,30.0000,C,First,man,...,False,Adult,1,30.000000,0,1,True,3,man,3.433987


In [100]:
# Five highest fares, showing only the sex and fare columns
df.nlargest(n=5, columns='fare').loc[:, ['sex', 'fare']]

Unnamed: 0,sex,fare
258,female,512.3292
679,male,512.3292
737,male,512.3292
27,male,263.0
88,female,263.0


In [101]:
# Five oldest passengers, showing only the sex and age columns
df.nlargest(n=5, columns='age').loc[:, ['sex', 'age']]

Unnamed: 0,sex,age
630,male,80.0
851,male,74.0
96,male,71.0
493,male,71.0
116,male,70.5


In [102]:
# Survivors travelling with at least one sibling/spouse or parent/child
df.loc[df['survived'].eq(1) & df['sibsp'].add(df['parch']).gt(0)]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,...,is_elderly,age_group,total_people,fare_per_person,sex_encoded,class_num,survived_bool,name_length,title,log_fare
1,1,1,female,38.0,1,0,71.2833,C,First,woman,...,False,Adult,2,35.641650,1,1,True,5,woman,4.280593
3,1,1,female,35.0,1,0,53.1000,S,First,woman,...,False,Adult,2,26.550000,1,1,True,5,woman,3.990834
8,1,3,female,27.0,0,2,11.1333,S,Third,woman,...,False,Adult,3,3.711100,1,3,True,5,woman,2.495954
9,1,2,female,14.0,1,0,30.0708,C,Second,child,...,False,Teen,2,15.035400,1,2,True,5,child,3.436268
10,1,3,female,4.0,1,1,16.7000,S,Third,child,...,False,Child,3,5.566667,1,3,True,5,child,2.873565
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
869,1,3,male,4.0,1,1,11.1333,S,Third,child,...,False,Child,3,3.711100,0,3,True,5,child,2.495954
871,1,1,female,47.0,1,1,52.5542,S,First,woman,...,False,Adult,3,17.518067,1,1,True,5,woman,3.980694
874,1,2,female,28.0,1,0,24.0000,C,Second,woman,...,False,Adult,2,12.000000,1,2,True,5,woman,3.218876
879,1,1,female,56.0,0,1,83.1583,C,First,woman,...,False,Adult,2,41.579150,1,1,True,5,woman,4.432700


In [103]:
# First-class passengers who survived
df.loc[df['class'].eq('First') & df['survived'].eq(1)]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,...,is_elderly,age_group,total_people,fare_per_person,sex_encoded,class_num,survived_bool,name_length,title,log_fare
1,1,1,female,38.0,1,0,71.2833,C,First,woman,...,False,Adult,2,35.641650,1,1,True,5,woman,4.280593
3,1,1,female,35.0,1,0,53.1000,S,First,woman,...,False,Adult,2,26.550000,1,1,True,5,woman,3.990834
11,1,1,female,58.0,0,0,26.5500,S,First,woman,...,False,Adult,1,26.550000,1,1,True,5,woman,3.316003
23,1,1,male,28.0,0,0,35.5000,S,First,man,...,False,Adult,1,35.500000,0,1,True,3,man,3.597312
31,1,1,female,28.0,1,0,146.5208,C,First,woman,...,False,Adult,2,73.260400,1,1,True,5,woman,4.993969
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
862,1,1,female,48.0,0,0,25.9292,S,First,woman,...,False,Adult,1,25.929200,1,1,True,5,woman,3.293211
871,1,1,female,47.0,1,1,52.5542,S,First,woman,...,False,Adult,3,17.518067,1,1,True,5,woman,3.980694
879,1,1,female,56.0,0,1,83.1583,C,First,woman,...,False,Adult,2,41.579150,1,1,True,5,woman,4.432700
887,1,1,female,19.0,0,0,30.0000,S,First,woman,...,False,Adult,1,30.000000,1,1,True,5,woman,3.433987


In [104]:
# Three conditions combined: male AND second class AND survived
df.loc[
    df['sex'].eq('male')
    & df['class'].eq('Second')
    & df['survived'].eq(1)
]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,...,is_elderly,age_group,total_people,fare_per_person,sex_encoded,class_num,survived_bool,name_length,title,log_fare
17,1,2,male,28.0,0,0,13.0,S,Second,man,...,False,Adult,1,13.0,0,2,True,3,man,2.639057
21,1,2,male,34.0,0,0,13.0,S,Second,man,...,False,Adult,1,13.0,0,2,True,3,man,2.639057
78,1,2,male,0.83,0,2,29.0,S,Second,child,...,False,Child,3,9.666667,0,2,True,5,child,3.401197
183,1,2,male,1.0,2,1,39.0,S,Second,child,...,False,Child,4,9.75,0,2,True,5,child,3.688879
193,1,2,male,3.0,1,1,26.0,S,Second,child,...,False,Child,3,8.666667,0,2,True,5,child,3.295837
226,1,2,male,19.0,0,0,10.5,S,Second,man,...,False,Adult,1,10.5,0,2,True,3,man,2.442347
288,1,2,male,42.0,0,0,13.0,S,Second,man,...,False,Adult,1,13.0,0,2,True,3,man,2.639057
340,1,2,male,2.0,1,1,26.0,S,Second,child,...,False,Child,3,8.666667,0,2,True,5,child,3.295837
407,1,2,male,3.0,1,1,18.75,S,Second,child,...,False,Child,3,6.25,0,2,True,5,child,2.983153
543,1,2,male,32.0,1,0,26.0,S,Second,man,...,False,Adult,2,13.0,0,2,True,3,man,3.295837


#### Exporting and Final Tweaks

In [105]:
# Write the frame to CSV in the working directory, leaving out the row index
df.to_csv(path_or_buf='titanic_clean.csv', index=False)

In [106]:
# Write the frame to an Excel workbook, leaving out the row index
df.to_excel(excel_writer='titanic_clean.xlsx', index=False)

In [107]:
# Write the frame to a JSON file using to_json's default layout
df.to_json(path_or_buf='titanic_clean.json')

In [108]:
# Rename 'fare' to 'ticket_fare'.
# Rebinding df (instead of inplace=True) avoids mutating the object earlier cells
# displayed; re-running the cell is harmless because rename ignores labels that
# are no longer present.
df = df.rename(columns={'fare': 'ticket_fare'})

In [109]:
# Preview the first five rows to confirm the column is now called ticket_fare
df.head(n=5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,ticket_fare,embarked,class,who,...,is_elderly,age_group,total_people,fare_per_person,sex_encoded,class_num,survived_bool,name_length,title,log_fare
0,0,3,male,22.0,1,0,7.25,S,Third,man,...,False,Adult,2,3.625,0,3,False,3,man,2.110213
1,1,1,female,38.0,1,0,71.2833,C,First,woman,...,False,Adult,2,35.64165,1,1,True,5,woman,4.280593
2,1,3,female,26.0,0,0,7.925,S,Third,woman,...,False,Adult,1,7.925,1,3,True,5,woman,2.188856
3,1,1,female,35.0,1,0,53.1,S,First,woman,...,False,Adult,2,26.55,1,1,True,5,woman,3.990834
4,0,3,male,35.0,0,0,8.05,S,Third,man,...,False,Adult,1,8.05,0,3,False,3,man,2.202765


In [None]:
# family_size = siblings/spouses + parents/children aboard (the passenger is not counted)
df['family_size'] = df['sibsp'].add(df['parch'])

In [112]:
# Display the frame to check the new family_size column
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,ticket_fare,embarked,class,who,...,age_group,total_people,fare_per_person,sex_encoded,class_num,survived_bool,name_length,title,log_fare,family_size
0,0,3,male,22.0,1,0,7.2500,S,Third,man,...,Adult,2,3.62500,0,3,False,3,man,2.110213,1
1,1,1,female,38.0,1,0,71.2833,C,First,woman,...,Adult,2,35.64165,1,1,True,5,woman,4.280593,1
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,...,Adult,1,7.92500,1,3,True,5,woman,2.188856,0
3,1,1,female,35.0,1,0,53.1000,S,First,woman,...,Adult,2,26.55000,1,1,True,5,woman,3.990834,1
4,0,3,male,35.0,0,0,8.0500,S,Third,man,...,Adult,1,8.05000,0,3,False,3,man,2.202765,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,...,Adult,1,13.00000,0,2,False,3,man,2.639057,0
887,1,1,female,19.0,0,0,30.0000,S,First,woman,...,Adult,1,30.00000,1,1,True,5,woman,3.433987,0
888,0,3,female,28.0,1,2,23.4500,S,Third,woman,...,Adult,4,5.86250,1,3,False,5,woman,3.196630,3
889,1,1,male,26.0,0,0,30.0000,C,First,man,...,Adult,1,30.00000,0,1,True,3,man,3.433987,0


In [113]:
# Boolean flag: True when the passenger has no family aboard (family_size is 0)
df['is_alone'] = df['family_size'].eq(0)

In [114]:
# Display the frame to check the new is_alone column
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,ticket_fare,embarked,class,who,...,total_people,fare_per_person,sex_encoded,class_num,survived_bool,name_length,title,log_fare,family_size,is_alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,...,2,3.62500,0,3,False,3,man,2.110213,1,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,...,2,35.64165,1,1,True,5,woman,4.280593,1,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,...,1,7.92500,1,3,True,5,woman,2.188856,0,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,...,2,26.55000,1,1,True,5,woman,3.990834,1,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,...,1,8.05000,0,3,False,3,man,2.202765,0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,...,1,13.00000,0,2,False,3,man,2.639057,0,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,...,1,30.00000,1,1,True,5,woman,3.433987,0,True
888,0,3,female,28.0,1,2,23.4500,S,Third,woman,...,4,5.86250,1,3,False,5,woman,3.196630,3,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,...,1,30.00000,0,1,True,3,man,3.433987,0,True


In [115]:
# Translate the one-letter embarkation code into the port name;
# any code that is not S, C or Q becomes NaN
df['embarked_full'] = df['embarked'].map(
    dict(S='Southampton', C='Cherbourg', Q='Queenstown')
)

In [116]:
# Display the frame to check the new embarked_full column
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,ticket_fare,embarked,class,who,...,fare_per_person,sex_encoded,class_num,survived_bool,name_length,title,log_fare,family_size,is_alone,embarked_full
0,0,3,male,22.0,1,0,7.2500,S,Third,man,...,3.62500,0,3,False,3,man,2.110213,1,False,Southampton
1,1,1,female,38.0,1,0,71.2833,C,First,woman,...,35.64165,1,1,True,5,woman,4.280593,1,False,Cherbourg
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,...,7.92500,1,3,True,5,woman,2.188856,0,True,Southampton
3,1,1,female,35.0,1,0,53.1000,S,First,woman,...,26.55000,1,1,True,5,woman,3.990834,1,False,Southampton
4,0,3,male,35.0,0,0,8.0500,S,Third,man,...,8.05000,0,3,False,3,man,2.202765,0,True,Southampton
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,...,13.00000,0,2,False,3,man,2.639057,0,True,Southampton
887,1,1,female,19.0,0,0,30.0000,S,First,woman,...,30.00000,1,1,True,5,woman,3.433987,0,True,Southampton
888,0,3,female,28.0,1,2,23.4500,S,Third,woman,...,5.86250,1,3,False,5,woman,3.196630,3,False,Southampton
889,1,1,male,26.0,0,0,30.0000,C,First,man,...,30.00000,0,1,True,3,man,3.433987,0,True,Cherbourg


In [118]:
# Ticket fare rounded to two decimal places, stored as a separate column
df['fare_rounded'] = df['ticket_fare'].round(decimals=2)


In [119]:
# Display the frame to check the new fare_rounded column
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,ticket_fare,embarked,class,who,...,sex_encoded,class_num,survived_bool,name_length,title,log_fare,family_size,is_alone,embarked_full,fare_rounded
0,0,3,male,22.0,1,0,7.2500,S,Third,man,...,0,3,False,3,man,2.110213,1,False,Southampton,7.25
1,1,1,female,38.0,1,0,71.2833,C,First,woman,...,1,1,True,5,woman,4.280593,1,False,Cherbourg,71.28
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,...,1,3,True,5,woman,2.188856,0,True,Southampton,7.92
3,1,1,female,35.0,1,0,53.1000,S,First,woman,...,1,1,True,5,woman,3.990834,1,False,Southampton,53.10
4,0,3,male,35.0,0,0,8.0500,S,Third,man,...,0,3,False,3,man,2.202765,0,True,Southampton,8.05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,...,0,2,False,3,man,2.639057,0,True,Southampton,13.00
887,1,1,female,19.0,0,0,30.0000,S,First,woman,...,1,1,True,5,woman,3.433987,0,True,Southampton,30.00
888,0,3,female,28.0,1,2,23.4500,S,Third,woman,...,1,3,False,5,woman,3.196630,3,False,Southampton,23.45
889,1,1,male,26.0,0,0,30.0000,C,First,man,...,0,1,True,3,man,3.433987,0,True,Cherbourg,30.00


In [122]:

# Export the frame with the added columns to CSV and Excel, without the row index
df.to_csv(path_or_buf='titanic_updated.csv', index=False)
df.to_excel(excel_writer='titanic_updated.xlsx', index=False)


In [121]:
# Show the frame without rows that are missing age or ticket_fare.
# The result is only displayed, not assigned, so df itself keeps every row.
df.dropna(subset=['age', 'ticket_fare'], how='any')

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,ticket_fare,embarked,class,who,...,sex_encoded,class_num,survived_bool,name_length,title,log_fare,family_size,is_alone,embarked_full,fare_rounded
0,0,3,male,22.0,1,0,7.2500,S,Third,man,...,0,3,False,3,man,2.110213,1,False,Southampton,7.25
1,1,1,female,38.0,1,0,71.2833,C,First,woman,...,1,1,True,5,woman,4.280593,1,False,Cherbourg,71.28
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,...,1,3,True,5,woman,2.188856,0,True,Southampton,7.92
3,1,1,female,35.0,1,0,53.1000,S,First,woman,...,1,1,True,5,woman,3.990834,1,False,Southampton,53.10
4,0,3,male,35.0,0,0,8.0500,S,Third,man,...,0,3,False,3,man,2.202765,0,True,Southampton,8.05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,...,0,2,False,3,man,2.639057,0,True,Southampton,13.00
887,1,1,female,19.0,0,0,30.0000,S,First,woman,...,1,1,True,5,woman,3.433987,0,True,Southampton,30.00
888,0,3,female,28.0,1,2,23.4500,S,Third,woman,...,1,3,False,5,woman,3.196630,3,False,Southampton,23.45
889,1,1,male,26.0,0,0,30.0000,C,First,man,...,0,1,True,3,man,3.433987,0,True,Cherbourg,30.00
