In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
df_titanic = pd.read_csv("Titanic-Dataset.csv")

In [3]:
df_titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
df_titanic.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [5]:
df_titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


<h1>Data Cleaning and Preparation</h1>

<h4>Filtering Out Missing Data</h4>

In [6]:
df_titanic.isna()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,False,False,False,False,False,False,False,False,False,False,True,False
1,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,True,False
3,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...
886,False,False,False,False,False,False,False,False,False,False,True,False
887,False,False,False,False,False,False,False,False,False,False,False,False
888,False,False,False,False,False,True,False,False,False,False,True,False
889,False,False,False,False,False,False,False,False,False,False,False,False


In [7]:
df_titanic.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [8]:
df_titanic_test = df_titanic.dropna()

In [9]:
df_titanic_test.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [10]:
(df_titanic["Cabin"].notna()).sum()
df_titanic[df_titanic["Cabin"].notna()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
10,11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,16.7000,G6,S
11,12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.5500,C103,S
...,...,...,...,...,...,...,...,...,...,...,...,...
871,872,1,1,"Beckwith, Mrs. Richard Leonard (Sallie Monypeny)",female,47.0,1,1,11751,52.5542,D35,S
872,873,0,1,"Carlsson, Mr. Frans Olof",male,33.0,0,0,695,5.0000,B51 B53 B55,S
879,880,1,1,"Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)",female,56.0,0,1,11767,83.1583,C50,C
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S


In [11]:
df_titanic[df_titanic.notna().all(axis = 1)]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
10,11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,16.7000,G6,S
11,12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.5500,C103,S
...,...,...,...,...,...,...,...,...,...,...,...,...
871,872,1,1,"Beckwith, Mrs. Richard Leonard (Sallie Monypeny)",female,47.0,1,1,11751,52.5542,D35,S
872,873,0,1,"Carlsson, Mr. Frans Olof",male,33.0,0,0,695,5.0000,B51 B53 B55,S
879,880,1,1,"Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)",female,56.0,0,1,11767,83.1583,C50,C
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S


In [12]:
df_titanic["Embarked"].value_counts()

Embarked
S    644
C    168
Q     77
Name: count, dtype: int64

In [13]:
df_titanic["Embarked"].value_counts(sort = False).sort_index()

Embarked
C    168
Q     77
S    644
Name: count, dtype: int64

In [14]:
df_titanic[df_titanic["Embarked"].isna()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
61,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,


In [15]:
df_titanic.dropna(axis = "columns", how = "all")

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [16]:
df_titanic.dropna(axis = "columns", how = "any")

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,SibSp,Parch,Ticket,Fare
0,1,0,3,"Braund, Mr. Owen Harris",male,1,0,A/5 21171,7.2500
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,1,0,PC 17599,71.2833
2,3,1,3,"Heikkinen, Miss. Laina",female,0,0,STON/O2. 3101282,7.9250
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,1,0,113803,53.1000
4,5,0,3,"Allen, Mr. William Henry",male,0,0,373450,8.0500
...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,0,0,211536,13.0000
887,888,1,1,"Graham, Miss. Margaret Edith",female,0,0,112053,30.0000
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,1,2,W./C. 6607,23.4500
889,890,1,1,"Behr, Mr. Karl Howell",male,0,0,111369,30.0000


In [17]:
df_titanic.head(1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S


In [18]:
df_titanic.iloc[[0,1,8], [0,8]]

Unnamed: 0,PassengerId,Ticket
0,1,A/5 21171
1,2,PC 17599
8,9,347742


<h3>Filling In Missing Data</h3>

In [19]:
df_titanic.fillna(0)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,0,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,0,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,0,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,0,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,0.0,1,2,W./C. 6607,23.4500,0,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [20]:
df_titanic.fillna({"Age":0})

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,0.0,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [21]:
df_titanic.fillna(method = "ffill").tail()

  df_titanic.fillna(method = "ffill").tail()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,C50,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,19.0,1,2,W./C. 6607,23.45,B42,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,C148,Q


In [22]:
df_titanic.fillna(df_titanic.bfill()).tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,B42,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,26.0,1,2,W./C. 6607,23.45,C148,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [23]:
df_titanic["Age"].fillna(df_titanic["Age"].mean()).tail()

886    27.000000
887    19.000000
888    29.699118
889    26.000000
890    32.000000
Name: Age, dtype: float64

In [24]:
df_titanic["Age"].describe()

count    714.000000
mean      29.699118
std       14.526497
min        0.420000
25%       20.125000
50%       28.000000
75%       38.000000
max       80.000000
Name: Age, dtype: float64

<h3>Removing Duplicates</h3>

In [25]:
df_titanic.duplicated()

0      False
1      False
2      False
3      False
4      False
       ...  
886    False
887    False
888    False
889    False
890    False
Length: 891, dtype: bool

In [26]:
df_titanic.duplicated().sum()

0

In [27]:
df_titanic["test_duplicate"] = range(len(df_titanic))

In [28]:
df_titanic

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,test_duplicate
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,2
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,3
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S,886
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,887
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S,888
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,889


In [29]:
del df_titanic["test_duplicate"]

In [30]:
df_titanic

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [31]:
dupes = df_titanic.head(3)       
df_dup = pd.concat([df_titanic, dupes], ignore_index=True)

In [32]:
df_dup

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.7500,,Q
891,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
892,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [33]:
df_dup.duplicated()

0      False
1      False
2      False
3      False
4      False
       ...  
889    False
890    False
891     True
892     True
893     True
Length: 894, dtype: bool

In [34]:
df_dup[df_dup.duplicated(subset = ["PassengerId"])]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
891,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
892,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
893,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [35]:
df_dup[df_dup.duplicated()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
891,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
892,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
893,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [36]:
df_dup.drop_duplicates()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [37]:
df_dup.drop_duplicates(subset =["PassengerId"])

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [38]:
df_dup

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.7500,,Q
891,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
892,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


<h3>Transforming Data Using a Function or Mapping</h3>

In [39]:
embarked = {
    "S":"Southampton",
    "C":"Cherbourg",
    "Q":"Queenstown"
}

In [40]:
df_titanic["Embarked"] = df_titanic["Embarked"].map(embarked)

In [41]:
df_titanic[:8]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,Southampton
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,Cherbourg
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,Southampton
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,Southampton
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,Southampton
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Queenstown
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,Southampton
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,Southampton


<h3>Replacing Values</h3>

In [42]:
df_titanic.replace(["Southampton", "Cherbourg", "Queenstown"], ["S", "C", "Q"])

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [43]:
df_titanic.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [44]:
df_titanic.index

RangeIndex(start=0, stop=891, step=1)

In [45]:
df_titanic.rename(columns = str.upper)

Unnamed: 0,PASSENGERID,SURVIVED,PCLASS,NAME,SEX,AGE,SIBSP,PARCH,TICKET,FARE,CABIN,EMBARKED
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,Southampton
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,Cherbourg
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,Southampton
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,Southampton
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,Southampton
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,Southampton
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,Southampton
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,Southampton
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,Cherbourg


In [46]:
df_titanic = df_titanic.dropna(subset = "Embarked")

In [47]:
df_titanic = df_titanic.set_index(["Embarked"])

In [48]:
df_titanic

Unnamed: 0_level_0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin
Embarked,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Southampton,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,
Cherbourg,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85
Southampton,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,
Southampton,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123
Southampton,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,
...,...,...,...,...,...,...,...,...,...,...,...
Southampton,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,
Southampton,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42
Southampton,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,
Cherbourg,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148


In [49]:
df_titanic.index

Index(['Southampton', 'Cherbourg', 'Southampton', 'Southampton', 'Southampton',
       'Queenstown', 'Southampton', 'Southampton', 'Southampton', 'Cherbourg',
       ...
       'Southampton', 'Southampton', 'Southampton', 'Southampton',
       'Queenstown', 'Southampton', 'Southampton', 'Southampton', 'Cherbourg',
       'Queenstown'],
      dtype='object', name='Embarked', length=889)

In [50]:
df_titanic.rename(index = str.upper)

Unnamed: 0_level_0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin
Embarked,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
SOUTHAMPTON,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,
CHERBOURG,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85
SOUTHAMPTON,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,
SOUTHAMPTON,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123
SOUTHAMPTON,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,
...,...,...,...,...,...,...,...,...,...,...,...
SOUTHAMPTON,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,
SOUTHAMPTON,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42
SOUTHAMPTON,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,
CHERBOURG,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148


In [51]:
def transform(x):
    return x[:5].upper()

In [52]:
df_titanic.index.map(transform)

Index(['SOUTH', 'CHERB', 'SOUTH', 'SOUTH', 'SOUTH', 'QUEEN', 'SOUTH', 'SOUTH',
       'SOUTH', 'CHERB',
       ...
       'SOUTH', 'SOUTH', 'SOUTH', 'SOUTH', 'QUEEN', 'SOUTH', 'SOUTH', 'SOUTH',
       'CHERB', 'QUEEN'],
      dtype='object', name='Embarked', length=889)

In [53]:
df_titanic.index = df_titanic.index.map(transform)

In [54]:
df_titanic

Unnamed: 0_level_0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin
Embarked,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
SOUTH,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,
CHERB,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85
SOUTH,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,
SOUTH,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123
SOUTH,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,
...,...,...,...,...,...,...,...,...,...,...,...
SOUTH,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,
SOUTH,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42
SOUTH,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,
CHERB,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148


In [55]:
df_titanic.reset_index(inplace=True)

In [56]:
df_titanic

Unnamed: 0,Embarked,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin
0,SOUTH,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,
1,CHERB,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85
2,SOUTH,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,
3,SOUTH,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123
4,SOUTH,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,
...,...,...,...,...,...,...,...,...,...,...,...,...
884,SOUTH,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,
885,SOUTH,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42
886,SOUTH,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,
887,CHERB,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148


<h3>Discretization and Binning
</h3>

In [57]:
bins = [0, 18, 35, 50, 100]
labels = ["Child", "Young", "Adult", "Senior"]
df_titanic["AgeGroup"]= pd.cut(df_titanic["Age"], bins = bins, labels = labels)

In [58]:
df_titanic[["Age", "AgeGroup"]].head(10)

Unnamed: 0,Age,AgeGroup
0,22.0,Young
1,38.0,Adult
2,26.0,Young
3,35.0,Young
4,35.0,Young
5,,
6,54.0,Senior
7,2.0,Child
8,27.0,Young
9,14.0,Child


In [59]:
age_categories= pd.cut(df_titanic["Age"], bins = bins)

In [60]:
age_categories

0      (18.0, 35.0]
1      (35.0, 50.0]
2      (18.0, 35.0]
3      (18.0, 35.0]
4      (18.0, 35.0]
           ...     
884    (18.0, 35.0]
885    (18.0, 35.0]
886             NaN
887    (18.0, 35.0]
888    (18.0, 35.0]
Name: Age, Length: 889, dtype: category
Categories (4, interval[int64, right]): [(0, 18] < (18, 35] < (35, 50] < (50, 100]]

In [61]:
pd.value_counts(age_categories)

  pd.value_counts(age_categories)


Age
(18, 35]     358
(35, 50]     152
(0, 18]      139
(50, 100]     63
Name: count, dtype: int64

In [62]:
df_titanic["AgeGroup"].value_counts()

AgeGroup
Young     358
Adult     152
Child     139
Senior     63
Name: count, dtype: int64

In [63]:
df_titanic["FareBin"] = pd.qcut(df_titanic["Fare"], 4, labels = ["Low", "Medium", "High", "Luxury"])

In [64]:
df_titanic[["FareBin", "Fare"]].head()

Unnamed: 0,FareBin,Fare
0,Low,7.25
1,Luxury,71.2833
2,Medium,7.925
3,Luxury,53.1
4,Medium,8.05


In [65]:
df_titanic["FareBin"].value_counts()

FareBin
Medium    224
Low       223
High      222
Luxury    220
Name: count, dtype: int64

In [66]:
df_titanic.pivot_table(index  = "AgeGroup", values = "Survived", aggfunc = "mean")

Unnamed: 0_level_0,Survived
AgeGroup,Unnamed: 1_level_1
Child,0.503597
Young,0.382682
Adult,0.394737
Senior,0.333333


In [67]:
df_titanic.pivot_table(index = "FareBin", values = "Survived", aggfunc = "mean")

Unnamed: 0_level_0,Survived
FareBin,Unnamed: 1_level_1
Low,0.197309
Medium,0.303571
High,0.454955
Luxury,0.577273


<h3>Detecting and Filtering Outliers</h3>

In [68]:
df_titanic.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,889.0,889.0,889.0,712.0,889.0,889.0,889.0
mean,446.0,0.382452,2.311586,29.642093,0.524184,0.382452,32.096681
std,256.998173,0.48626,0.8347,14.492933,1.103705,0.806761,49.697504
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,224.0,0.0,2.0,20.0,0.0,0.0,7.8958
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.0,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [69]:
df_titanic

Unnamed: 0,Embarked,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,AgeGroup,FareBin
0,SOUTH,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,Young,Low
1,CHERB,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,Adult,Luxury
2,SOUTH,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,Young,Medium
3,SOUTH,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,Young,Luxury
4,SOUTH,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,Young,Medium
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
884,SOUTH,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,Young,Medium
885,SOUTH,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,Young,High
886,SOUTH,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,,High
887,CHERB,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,Young,High


In [70]:
df_titanic.sample(n=3)

Unnamed: 0,Embarked,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,AgeGroup,FareBin
13,SOUTH,14,0,3,"Andersson, Mr. Anders Johan",male,39.0,1,5,347082,31.275,,Adult,Luxury
568,SOUTH,570,1,3,"Jonsson, Mr. Carl",male,32.0,0,0,350417,7.8542,,Young,Low
800,SOUTH,802,1,2,"Collyer, Mrs. Harvey (Charlotte Annie Tate)",female,31.0,1,1,C.A. 31921,26.25,,Young,High


<h3>Computing Indicator/Dummy Variables
</h3>

In [71]:
dummies = pd.get_dummies(df_titanic["Embarked"], dtype = float)
df_with_dummy = df_titanic[["Ticket"]].join(dummies)
df_with_dummy

Unnamed: 0,Ticket,CHERB,QUEEN,SOUTH
0,A/5 21171,0.0,0.0,1.0
1,PC 17599,1.0,0.0,0.0
2,STON/O2. 3101282,0.0,0.0,1.0
3,113803,0.0,0.0,1.0
4,373450,0.0,0.0,1.0
...,...,...,...,...
884,211536,0.0,0.0,1.0
885,112053,0.0,0.0,1.0
886,W./C. 6607,0.0,0.0,1.0
887,111369,1.0,0.0,0.0


In [72]:
df_titanic["fname"] = df_titanic["Name"].str.split(",").str[0]

In [73]:
df_titanic.head()

Unnamed: 0,Embarked,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,AgeGroup,FareBin,fname
0,SOUTH,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,Young,Low,Braund
1,CHERB,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,Adult,Luxury,Cumings
2,SOUTH,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,Young,Medium,Heikkinen
3,SOUTH,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,Young,Luxury,Futrelle
4,SOUTH,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,Young,Medium,Allen


In [74]:
df_titanic["fname"].value_counts()

fname
Andersson    9
Sage         7
Skoog        6
Carter       6
Johnson      6
            ..
Hanna        1
Lewy         1
Mineff       1
Haas         1
Dooley       1
Name: count, Length: 665, dtype: int64

In [75]:
df_titanic["fname"] = df_titanic["fname"].str.upper()
df_titanic["fname"] = df_titanic["fname"].str.strip()
df_titanic["Sex"] = df_titanic["Sex"].str.replace("male", "M").str.replace("female", "F")

In [76]:
df_titanic.head(1)

Unnamed: 0,Embarked,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,AgeGroup,FareBin,fname
0,SOUTH,1,0,3,"Braund, Mr. Owen Harris",M,22.0,1,0,A/5 21171,7.25,,Young,Low,BRAUND


<h3>Regular Expressions
</h3>

In [77]:
import re

In [78]:
df_titanic["TicketNum"] = df_titanic["Ticket"].str.extract(r'(\d+)')
df_titanic["Title"] = df_titanic["Name"].str.extract(r'([A-Za-z]+)\.')

In [79]:
pattern = r'([A-Za-z]+)\.'
name = df_titanic["Name"].str.findall(pattern)
name

0        [Mr]
1       [Mrs]
2      [Miss]
3       [Mrs]
4        [Mr]
        ...  
884     [Rev]
885    [Miss]
886    [Miss]
887      [Mr]
888      [Mr]
Name: Name, Length: 889, dtype: object

In [80]:
df_titanic[df_titanic["Name"].str.contains(r'Mrs')].count()

Embarked       128
PassengerId    128
Survived       128
Pclass         128
Name           128
Sex            128
Age            111
SibSp          128
Parch          128
Ticket         128
Fare           128
Cabin           45
AgeGroup       111
FareBin        128
fname          128
TicketNum      128
Title          128
dtype: int64

In [81]:
(df_titanic["Name"].str.contains(r'Mrs')).sum()

128

<h3>Background and Motivation</h3>


In [82]:
pd.unique(df_titanic["Pclass"])

array([3, 1, 2], dtype=int64)

In [83]:
df_titanic["Ticket"].value_counts()

Ticket
347082      7
CA. 2343    7
1601        7
3101295     6
CA 2144     6
           ..
9234        1
19988       1
2693        1
PC 17612    1
370376      1
Name: count, Length: 680, dtype: int64

<h3>Categorical Extension Type in pandas
</h3>

In [84]:
age_category = df_titanic["AgeGroup"].astype('category')

In [85]:
age_category

0      Young
1      Adult
2      Young
3      Young
4      Young
       ...  
884    Young
885    Young
886      NaN
887    Young
888    Young
Name: AgeGroup, Length: 889, dtype: category
Categories (4, object): ['Child' < 'Young' < 'Adult' < 'Senior']

In [86]:
c = age_category.array
c

['Young', 'Adult', 'Young', 'Young', 'Young', ..., 'Young', 'Young', NaN, 'Young', 'Young']
Length: 889
Categories (4, object): ['Child' < 'Young' < 'Adult' < 'Senior']

In [87]:
type(c)

pandas.core.arrays.categorical.Categorical

In [88]:
c.categories

Index(['Child', 'Young', 'Adult', 'Senior'], dtype='object')

In [89]:
c.codes

array([ 1,  2,  1,  1,  1, -1,  3,  0,  1,  0,  0,  3,  1,  2,  0,  3,  0,
       -1,  1, -1,  1,  1,  0,  1,  0,  2, -1,  1, -1, -1,  2, -1, -1,  3,
        1,  2, -1,  1,  0,  0,  2,  1, -1,  0,  1, -1, -1, -1, -1,  0,  0,
        1,  2,  1,  3, -1,  1,  1,  0,  0,  1,  2,  0, -1, -1,  1,  1,  0,
        1,  1,  0,  1,  1,  1,  1, -1, -1,  0,  1,  1,  1, -1,  1,  0,  1,
        0, -1,  1,  1,  1,  1,  2,  1,  3, -1,  3,  1,  1,  1,  1, -1,  1,
        1,  2,  1,  1, -1,  2, -1,  2,  0,  1,  1,  0,  1,  3,  1,  1,  0,
        1, -1,  1,  1,  3,  0, -1,  1, -1,  2,  1,  1,  2,  1,  1,  1,  1,
        2,  0,  1, -1,  1,  1,  1,  0,  1,  1,  0,  2,  2,  3,  1,  3,  2,
       -1,  3,  0,  1, -1, -1,  2,  2,  1,  0,  0,  0, -1,  2, -1,  1,  3,
        0,  0,  1,  3,  0, -1,  2,  1,  2, -1, -1,  0,  0,  0, -1, -1,  2,
        2,  2,  1,  1,  1,  0,  2,  3, -1,  2, -1,  1,  1, -1,  1,  2,  0,
        0,  1,  1,  0,  2,  1,  1,  1,  1, -1,  1,  1,  2,  1,  1,  0,  1,
        3, -1,  2,  1,  1

In [90]:
dict(enumerate(c.categories))

{0: 'Child', 1: 'Young', 2: 'Adult', 3: 'Senior'}

In [91]:
df_titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 889 entries, 0 to 888
Data columns (total 17 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   Embarked     889 non-null    object  
 1   PassengerId  889 non-null    int64   
 2   Survived     889 non-null    int64   
 3   Pclass       889 non-null    int64   
 4   Name         889 non-null    object  
 5   Sex          889 non-null    object  
 6   Age          712 non-null    float64 
 7   SibSp        889 non-null    int64   
 8   Parch        889 non-null    int64   
 9   Ticket       889 non-null    object  
 10  Fare         889 non-null    float64 
 11  Cabin        202 non-null    object  
 12  AgeGroup     712 non-null    category
 13  FareBin      889 non-null    category
 14  fname        889 non-null    object  
 15  TicketNum    885 non-null    object  
 16  Title        889 non-null    object  
dtypes: category(2), float64(2), int64(5), object(8)
memory usage: 106.4+ KB


<h1>Data Wrangling: Join, Combine, and Reshape</h1>

In [92]:
df_titanic.iloc[:3, :2]

Unnamed: 0,Embarked,PassengerId
0,SOUTH,1
1,CHERB,2
2,SOUTH,3


In [93]:
df_titanic[["Embarked"]]

Unnamed: 0,Embarked
0,SOUTH
1,CHERB
2,SOUTH
3,SOUTH
4,SOUTH
...,...
884,SOUTH
885,SOUTH
886,SOUTH
887,CHERB


In [94]:
df_titanic.unstack()

Embarked  0      SOUTH
          1      CHERB
          2      SOUTH
          3      SOUTH
          4      SOUTH
                 ...  
Title     884      Rev
          885     Miss
          886     Miss
          887       Mr
          888       Mr
Length: 15113, dtype: object

In [95]:
df_titanic.stack()

0    Embarked                         SOUTH
     PassengerId                          1
     Survived                             0
     Pclass                               3
     Name           Braund, Mr. Owen Harris
                             ...           
888  AgeGroup                         Young
     FareBin                            Low
     fname                           DOOLEY
     TicketNum                       370376
     Title                               Mr
Length: 14068, dtype: object

In [96]:
df_titanic.index

RangeIndex(start=0, stop=889, step=1)

In [97]:
df_titanic.columns

Index(['Embarked', 'PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age',
       'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'AgeGroup', 'FareBin',
       'fname', 'TicketNum', 'Title'],
      dtype='object')

In [98]:
df_titanic.head(5)

Unnamed: 0,Embarked,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,AgeGroup,FareBin,fname,TicketNum,Title
0,SOUTH,1,0,3,"Braund, Mr. Owen Harris",M,22.0,1,0,A/5 21171,7.25,,Young,Low,BRAUND,5,Mr
1,CHERB,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",feM,38.0,1,0,PC 17599,71.2833,C85,Adult,Luxury,CUMINGS,17599,Mrs
2,SOUTH,3,1,3,"Heikkinen, Miss. Laina",feM,26.0,0,0,STON/O2. 3101282,7.925,,Young,Medium,HEIKKINEN,2,Miss
3,SOUTH,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",feM,35.0,1,0,113803,53.1,C123,Young,Luxury,FUTRELLE,113803,Mrs
4,SOUTH,5,0,3,"Allen, Mr. William Henry",M,35.0,0,0,373450,8.05,,Young,Medium,ALLEN,373450,Mr


In [99]:
df_titanic.index.nlevels

1

<h3>Hierarchical Indexing</h3>

In [100]:
titanic_mi = df_titanic.set_index(["Embarked", "Pclass", "Sex"]).swaplevel(0,1).sort_index(level = 0, ascending = True)
titanic_mi.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,PassengerId,Survived,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,AgeGroup,FareBin,fname,TicketNum,Title
Pclass,Embarked,Sex,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,CHERB,M,31,0,"Uruchurtu, Don. Manuel E",40.0,0,0,PC 17601,27.7208,,Adult,High,URUCHURTU,17601,Don
1,CHERB,M,35,0,"Meyer, Mr. Edgar Joseph",28.0,1,0,PC 17604,82.1708,,Young,Luxury,MEYER,17604,Mr
1,CHERB,M,55,0,"Ostby, Mr. Engelhart Cornelius",65.0,0,1,113509,61.9792,B30,Senior,Luxury,OSTBY,113509,Mr


<h3>Summary Statistics by Level
</h3>

In [101]:
titanic_group = df_titanic.groupby(["Embarked", "Pclass", "Sex"])["Survived"].mean()
titanic_group

Embarked  Pclass  Sex
CHERB     1       M      0.404762
                  feM    0.976744
          2       M      0.200000
                  feM    1.000000
          3       M      0.232558
                  feM    0.652174
QUEEN     1       M      0.000000
                  feM    1.000000
          2       M      0.000000
                  feM    1.000000
          3       M      0.076923
                  feM    0.727273
SOUTH     1       M      0.354430
                  feM    0.958333
          2       M      0.154639
                  feM    0.910448
          3       M      0.128302
                  feM    0.375000
Name: Survived, dtype: float64

In [102]:
wide = titanic_group.unstack("Sex")
wide

Unnamed: 0_level_0,Sex,M,feM
Embarked,Pclass,Unnamed: 2_level_1,Unnamed: 3_level_1
CHERB,1,0.404762,0.976744
CHERB,2,0.2,1.0
CHERB,3,0.232558,0.652174
QUEEN,1,0.0,1.0
QUEEN,2,0.0,1.0
QUEEN,3,0.076923,0.727273
SOUTH,1,0.35443,0.958333
SOUTH,2,0.154639,0.910448
SOUTH,3,0.128302,0.375


In [103]:
wider = titanic_group.unstack(["Pclass", "Sex"])
wider

Pclass,1,1,2,2,3,3
Sex,M,feM,M,feM,M,feM
Embarked,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
CHERB,0.404762,0.976744,0.2,1.0,0.232558,0.652174
QUEEN,0.0,1.0,0.0,1.0,0.076923,0.727273
SOUTH,0.35443,0.958333,0.154639,0.910448,0.128302,0.375


In [104]:
back = wide.stack(dropna = False)
back

Embarked  Pclass  Sex
CHERB     1       M      0.404762
                  feM    0.976744
          2       M      0.200000
                  feM    1.000000
          3       M      0.232558
                  feM    0.652174
QUEEN     1       M      0.000000
                  feM    1.000000
          2       M      0.000000
                  feM    1.000000
          3       M      0.076923
                  feM    0.727273
SOUTH     1       M      0.354430
                  feM    0.958333
          2       M      0.154639
                  feM    0.910448
          3       M      0.128302
                  feM    0.375000
dtype: float64

In [105]:
df_titanic

Unnamed: 0,Embarked,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,AgeGroup,FareBin,fname,TicketNum,Title
0,SOUTH,1,0,3,"Braund, Mr. Owen Harris",M,22.0,1,0,A/5 21171,7.2500,,Young,Low,BRAUND,5,Mr
1,CHERB,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",feM,38.0,1,0,PC 17599,71.2833,C85,Adult,Luxury,CUMINGS,17599,Mrs
2,SOUTH,3,1,3,"Heikkinen, Miss. Laina",feM,26.0,0,0,STON/O2. 3101282,7.9250,,Young,Medium,HEIKKINEN,2,Miss
3,SOUTH,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",feM,35.0,1,0,113803,53.1000,C123,Young,Luxury,FUTRELLE,113803,Mrs
4,SOUTH,5,0,3,"Allen, Mr. William Henry",M,35.0,0,0,373450,8.0500,,Young,Medium,ALLEN,373450,Mr
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
884,SOUTH,887,0,2,"Montvila, Rev. Juozas",M,27.0,0,0,211536,13.0000,,Young,Medium,MONTVILA,211536,Rev
885,SOUTH,888,1,1,"Graham, Miss. Margaret Edith",feM,19.0,0,0,112053,30.0000,B42,Young,High,GRAHAM,112053,Miss
886,SOUTH,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",feM,,1,2,W./C. 6607,23.4500,,,High,JOHNSTON,6607,Miss
887,CHERB,890,1,1,"Behr, Mr. Karl Howell",M,26.0,0,0,111369,30.0000,C148,Young,High,BEHR,111369,Mr


In [106]:
reset = titanic_mi.reset_index()
reset

Unnamed: 0,Pclass,Embarked,Sex,PassengerId,Survived,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,AgeGroup,FareBin,fname,TicketNum,Title
0,1,CHERB,M,31,0,"Uruchurtu, Don. Manuel E",40.0,0,0,PC 17601,27.7208,,Adult,High,URUCHURTU,17601,Don
1,1,CHERB,M,35,0,"Meyer, Mr. Edgar Joseph",28.0,1,0,PC 17604,82.1708,,Young,Luxury,MEYER,17604,Mr
2,1,CHERB,M,55,0,"Ostby, Mr. Engelhart Cornelius",65.0,0,1,113509,61.9792,B30,Senior,Luxury,OSTBY,113509,Mr
3,1,CHERB,M,65,0,"Stewart, Mr. Albert A",,0,0,PC 17605,27.7208,,,High,STEWART,17605,Mr
4,1,CHERB,M,97,0,"Goldschmidt, Mr. George B",71.0,0,0,PC 17754,34.6542,A5,Senior,Luxury,GOLDSCHMIDT,17754,Mr
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
884,3,SOUTH,feM,824,1,"Moor, Mrs. (Beila)",27.0,0,1,392096,12.4750,E121,Young,Medium,MOOR,392096,Mrs
885,3,SOUTH,feM,856,1,"Aks, Mrs. Sam (Leah Rosen)",18.0,0,1,392091,9.3500,,Child,Medium,AKS,392091,Mrs
886,3,SOUTH,feM,864,0,"Sage, Miss. Dorothy Edith ""Dolly""",,8,2,CA. 2343,69.5500,,,Luxury,SAGE,2343,Miss
887,3,SOUTH,feM,883,0,"Dahlberg, Miss. Gerda Ulrika",22.0,0,0,7552,10.5167,,Young,Medium,DAHLBERG,7552,Miss


In [107]:
reset.index.names = ["keys"]
reset.columns.names = ["col"]
reset

col,Pclass,Embarked,Sex,PassengerId,Survived,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,AgeGroup,FareBin,fname,TicketNum,Title
keys,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,1,CHERB,M,31,0,"Uruchurtu, Don. Manuel E",40.0,0,0,PC 17601,27.7208,,Adult,High,URUCHURTU,17601,Don
1,1,CHERB,M,35,0,"Meyer, Mr. Edgar Joseph",28.0,1,0,PC 17604,82.1708,,Young,Luxury,MEYER,17604,Mr
2,1,CHERB,M,55,0,"Ostby, Mr. Engelhart Cornelius",65.0,0,1,113509,61.9792,B30,Senior,Luxury,OSTBY,113509,Mr
3,1,CHERB,M,65,0,"Stewart, Mr. Albert A",,0,0,PC 17605,27.7208,,,High,STEWART,17605,Mr
4,1,CHERB,M,97,0,"Goldschmidt, Mr. George B",71.0,0,0,PC 17754,34.6542,A5,Senior,Luxury,GOLDSCHMIDT,17754,Mr
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
884,3,SOUTH,feM,824,1,"Moor, Mrs. (Beila)",27.0,0,1,392096,12.4750,E121,Young,Medium,MOOR,392096,Mrs
885,3,SOUTH,feM,856,1,"Aks, Mrs. Sam (Leah Rosen)",18.0,0,1,392091,9.3500,,Child,Medium,AKS,392091,Mrs
886,3,SOUTH,feM,864,0,"Sage, Miss. Dorothy Edith ""Dolly""",,8,2,CA. 2343,69.5500,,,Luxury,SAGE,2343,Miss
887,3,SOUTH,feM,883,0,"Dahlberg, Miss. Gerda Ulrika",22.0,0,0,7552,10.5167,,Young,Medium,DAHLBERG,7552,Miss


In [108]:
reset.reset_index()

col,keys,Pclass,Embarked,Sex,PassengerId,Survived,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,AgeGroup,FareBin,fname,TicketNum,Title
0,0,1,CHERB,M,31,0,"Uruchurtu, Don. Manuel E",40.0,0,0,PC 17601,27.7208,,Adult,High,URUCHURTU,17601,Don
1,1,1,CHERB,M,35,0,"Meyer, Mr. Edgar Joseph",28.0,1,0,PC 17604,82.1708,,Young,Luxury,MEYER,17604,Mr
2,2,1,CHERB,M,55,0,"Ostby, Mr. Engelhart Cornelius",65.0,0,1,113509,61.9792,B30,Senior,Luxury,OSTBY,113509,Mr
3,3,1,CHERB,M,65,0,"Stewart, Mr. Albert A",,0,0,PC 17605,27.7208,,,High,STEWART,17605,Mr
4,4,1,CHERB,M,97,0,"Goldschmidt, Mr. George B",71.0,0,0,PC 17754,34.6542,A5,Senior,Luxury,GOLDSCHMIDT,17754,Mr
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
884,884,3,SOUTH,feM,824,1,"Moor, Mrs. (Beila)",27.0,0,1,392096,12.4750,E121,Young,Medium,MOOR,392096,Mrs
885,885,3,SOUTH,feM,856,1,"Aks, Mrs. Sam (Leah Rosen)",18.0,0,1,392091,9.3500,,Child,Medium,AKS,392091,Mrs
886,886,3,SOUTH,feM,864,0,"Sage, Miss. Dorothy Edith ""Dolly""",,8,2,CA. 2343,69.5500,,,Luxury,SAGE,2343,Miss
887,887,3,SOUTH,feM,883,0,"Dahlberg, Miss. Gerda Ulrika",22.0,0,0,7552,10.5167,,Young,Medium,DAHLBERG,7552,Miss
