# Data Cleaning with Pandas
&copy; QuantSprout

Import pandas

In [1]:
import pandas as pd

Read the train.csv file as a DataFrame and investigate the contents (using head, info and describe)

In [2]:
train = pd.read_csv('train.csv') # ton of read functions! sql, sas, stata, spss etc

In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [4]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Calculate average age

In [5]:
avg_age = train.Age.mean()
print(avg_age)

29.69911764705882


Create a new column called `'Age_clean'` using `.fillna()` to replace null ages with the average age

In [6]:
# instead of overwriting, Age column + imputed values for na's using average age

train['Age_clean'] = train.Age.fillna(avg_age)

In [7]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
 12  Age_clean    891 non-null    float64
dtypes: float64(3), int64(5), object(5)
memory usage: 90.6+ KB


Convert Sex into a boolean variable as a new columned called `"Is_female"`

In [8]:
train.Sex == 'female'

0      False
1       True
2       True
3       True
4      False
       ...  
886    False
887     True
888     True
889    False
890    False
Name: Sex, Length: 891, dtype: bool

In [9]:
train['Is_female'] = train.Sex == 'female'

In [10]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_clean,Is_female
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,22.0,False
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,38.0,True
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,26.0,True
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,35.0,True
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,35.0,False


Investigate the unique values in the Embarked column

In [11]:
train.Embarked.unique()

array(['S', 'C', 'Q', nan], dtype=object)

Create dummy values for Embarked

In [12]:
embarked = pd.get_dummies(train.Embarked, prefix='Emb') # important: index remains unchanged

In [13]:
embarked # way to dummify data for ml algorithms 

Unnamed: 0,Emb_C,Emb_Q,Emb_S
0,0,0,1
1,1,0,0
2,0,0,1
3,0,0,1
4,0,0,1
...,...,...,...
886,0,0,1
887,0,0,1
888,0,0,1
889,1,0,0


Merge the dummy data with the original data set

In [14]:
# merging original data with dummified data
train = pd.merge(train, embarked, left_index = True, right_index = True)

In [15]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_clean,Is_female,Emb_C,Emb_Q,Emb_S
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,22.0,False,0,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,38.0,True,1,0,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,26.0,True,0,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,35.0,True,0,0,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,35.0,False,0,0,1


In [16]:
# using on operator
embarked_labels = pd.DataFrame({'Port': ['Cherbourg', 'Queenstown', 'Southampton'], 'Embarked': ['C', 'Q', 'S']})
embarked_labels

Unnamed: 0,Port,Embarked
0,Cherbourg,C
1,Queenstown,Q
2,Southampton,S


In [17]:
pd.merge(train, embarked_labels, left_on='Embarked', right_on='Embarked')

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_clean,Is_female,Emb_C,Emb_Q,Emb_S,Port
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,22.000000,False,0,0,1,Southampton
1,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,26.000000,True,0,0,1,Southampton
2,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,35.000000,True,0,0,1,Southampton
3,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,35.000000,False,0,0,1,Southampton
4,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,54.000000,False,0,0,1,Southampton
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
884,791,0,3,"Keane, Mr. Andrew ""Andy""",male,,0,0,12460,7.7500,,Q,29.699118,False,0,1,0,Queenstown
885,826,0,3,"Flynn, Mr. John",male,,0,0,368323,6.9500,,Q,29.699118,False,0,1,0,Queenstown
886,829,1,3,"McCormack, Mr. Thomas Joseph",male,,0,0,367228,7.7500,,Q,29.699118,False,0,1,0,Queenstown
887,886,0,3,"Rice, Mrs. William (Margaret Norton)",female,39.0,0,5,382652,29.1250,,Q,39.000000,True,0,1,0,Queenstown


In [18]:
pd.merge(train, embarked_labels, on='Embarked') # more succinct

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_clean,Is_female,Emb_C,Emb_Q,Emb_S,Port
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,22.000000,False,0,0,1,Southampton
1,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,26.000000,True,0,0,1,Southampton
2,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,35.000000,True,0,0,1,Southampton
3,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,35.000000,False,0,0,1,Southampton
4,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,54.000000,False,0,0,1,Southampton
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
884,791,0,3,"Keane, Mr. Andrew ""Andy""",male,,0,0,12460,7.7500,,Q,29.699118,False,0,1,0,Queenstown
885,826,0,3,"Flynn, Mr. John",male,,0,0,368323,6.9500,,Q,29.699118,False,0,1,0,Queenstown
886,829,1,3,"McCormack, Mr. Thomas Joseph",male,,0,0,367228,7.7500,,Q,29.699118,False,0,1,0,Queenstown
887,886,0,3,"Rice, Mrs. William (Margaret Norton)",female,39.0,0,5,382652,29.1250,,Q,39.000000,True,0,1,0,Queenstown


### Exercise #1 - Clean Test Data

Read the test.csv file as a DataFrame and investigate the contents

In [19]:
test = pd.read_csv('test.csv')

In [20]:
test.info()
test.head()

# very similar, only missing survived column, eventually to be predicted

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


Create a new column called `'Age_clean'` using `.fillna()` to replace null ages with the average age from the train.csv data

In [21]:
test['Age_clean'] = test.Age.fillna(avg_age)

In [22]:
test.info()
test.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
 11  Age_clean    418 non-null    float64
dtypes: float64(3), int64(4), object(5)
memory usage: 39.3+ KB


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_clean
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,34.5
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,47.0
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,62.0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,27.0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,22.0


Fill in missing fares with the average Fare of the training data

In [23]:
# hint: use inplace=True
test.Fare.fillna(train.Fare.mean(), inplace=True)

In [24]:
test.info()
test.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         418 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
 11  Age_clean    418 non-null    float64
dtypes: float64(3), int64(4), object(5)
memory usage: 39.3+ KB


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_clean
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,34.5
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,47.0
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,62.0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,27.0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,22.0


Convert Sex into a boolean variable as a new columned called `"Is_female"`

In [25]:
test['Is_female'] = test.Sex == 'female'

In [26]:
test.info()
test.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         418 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
 11  Age_clean    418 non-null    float64
 12  Is_female    418 non-null    bool   
dtypes: bool(1), float64(3), int64(4), object(5)
memory usage: 39.7+ KB


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_clean,Is_female
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,34.5,False
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,47.0,True
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,62.0,False
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,27.0,False
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,22.0,True


Convert Embarked into dummy variables and merge the results with the original data

In [27]:
embarked = pd.get_dummies(train.Embarked, prefix='Emb') # important: index remains unchanged
embarked

Unnamed: 0,Emb_C,Emb_Q,Emb_S
0,0,0,1
1,1,0,0
2,0,0,1
3,0,0,1
4,0,0,1
...,...,...,...
886,0,0,1
887,0,0,1
888,0,0,1
889,1,0,0


In [28]:
# merging original data with dummified data
test = pd.merge(test, embarked, left_index = True, right_index = True)
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_clean,Is_female,Emb_C,Emb_Q,Emb_S
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,34.5,False,0,0,1
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,47.0,True,1,0,0
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,62.0,False,0,0,1
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,27.0,False,0,0,1
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,22.0,True,0,0,1


### More Data Cleaning for the Training Data

Convert fares from GBP to USD as a new column called `Fare_usd`

In [29]:
train['Fare_usd'] = train.Fare * 1.3

In [30]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_clean,Is_female,Emb_C,Emb_Q,Emb_S,Fare_usd
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,22.0,False,0,0,1,9.425
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,38.0,True,1,0,0,92.66829
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,26.0,True,0,0,1,10.3025
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,35.0,True,0,0,1,69.03
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,35.0,False,0,0,1,10.465


Add the Sibsp and Parch columns into a new column called `Family_size`

In [31]:
# can't use dot notation to declare a new column

train['Family_size'] = train.SibSp + train.Parch

In [32]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_clean,Is_female,Emb_C,Emb_Q,Emb_S,Fare_usd,Family_size
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,22.0,False,0,0,1,9.425,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,38.0,True,1,0,0,92.66829,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,26.0,True,0,0,1,10.3025,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,35.0,True,0,0,1,69.03,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,35.0,False,0,0,1,10.465,0


Extract the title from a name

In [33]:
name = train.Name[0]
name

'Braund, Mr. Owen Harris'

In [34]:
# string methods 
name_list = name.split(', ')
name_list

['Braund', 'Mr. Owen Harris']

In [35]:
# indexing list, to isolate 'title + first name' from 'last name'
title_and_first_name = name_list[1]
title_and_first_name

'Mr. Owen Harris'

In [36]:
title_list = title_and_first_name.split('. ')
title_list

['Mr', 'Owen Harris']

In [37]:
title = title_list[0]
title

'Mr'

Create a function that extracts the name from a title

In [38]:
# defining functions
def extract_title(name):
    title_and_first = name.split(', ')[1]
    title = title_and_first.split('. ')[0]
    return title

In [39]:
name = train.Name[30]

In [40]:
extract_title(name)

'Don'

Apply a function to a column of a DataFrame and store the results as a new column called `Title`

In [41]:
train.Name.apply(extract_title)

0        Mr
1       Mrs
2      Miss
3       Mrs
4        Mr
       ... 
886     Rev
887    Miss
888    Miss
889      Mr
890      Mr
Name: Name, Length: 891, dtype: object

In [42]:
train['Title'] = train.Name.apply(extract_title)

In [43]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_clean,Is_female,Emb_C,Emb_Q,Emb_S,Fare_usd,Family_size,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,22.0,False,0,0,1,9.425,1,Mr
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,38.0,True,1,0,0,92.66829,1,Mrs
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,26.0,True,0,0,1,10.3025,0,Miss
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,35.0,True,0,0,1,69.03,1,Mrs
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,35.0,False,0,0,1,10.465,0,Mr


In [44]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 20 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
 12  Age_clean    891 non-null    float64
 13  Is_female    891 non-null    bool   
 14  Emb_C        891 non-null    uint8  
 15  Emb_Q        891 non-null    uint8  
 16  Emb_S        891 non-null    uint8  
 17  Fare_usd     891 non-null    float64
 18  Family_size  891 non-null    int64  
 19  Title   

Inspect the frequencies of each title 

In [45]:
# unique frequency counts by title
# train.Title.value_counts(normalize=True, dropna=False)
train.Title.value_counts()

# could potentially use title to predict survival rate 
# in ml algorithm

Mr              517
Miss            182
Mrs             125
Master           40
Dr                7
Rev               6
Mlle              2
Major             2
Col               2
Lady              1
Don               1
Sir               1
Jonkheer          1
Mme               1
Capt              1
Ms                1
the Countess      1
Name: Title, dtype: int64

Create dummy variables for most common titles (Mr, Mrs, Master, Miss)

In [46]:
# creating dummies for most frequent categories
title_dummies = pd.get_dummies(train.Title)
title_dummies

Unnamed: 0,Capt,Col,Don,Dr,Jonkheer,Lady,Major,Master,Miss,Mlle,Mme,Mr,Mrs,Ms,Rev,Sir,the Countess
0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
887,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
888,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
889,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


In [47]:
# subsetting to 4 relevant columns
title_dummies = title_dummies[['Mr', 'Mrs', 'Miss', 'Master']]
title_dummies

Unnamed: 0,Mr,Mrs,Miss,Master
0,1,0,0,0
1,0,1,0,0
2,0,0,1,0
3,0,1,0,0
4,1,0,0,0
...,...,...,...,...
886,0,0,0,0
887,0,0,1,0
888,0,0,1,0
889,1,0,0,0


Merge only the dummy variables for the most frequent titles to the train DataFrame

In [48]:
# a lot like a sql query: studying sql will improve joins and merges
train = pd.merge(train, title_dummies, left_index=True, right_index=True)

Save your clean data as a csv file (remember to set `index=False`)

In [49]:
# don't save your csv files as excel, it makes things very complicated
train.to_csv('clean_train.csv', index=False)

### Exercise #2 - More Test Data Cleaning

Add the Sibsp and Parch columns into a new column called `Family_size`

In [50]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_clean,Is_female,Emb_C,Emb_Q,Emb_S
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,34.5,False,0,0,1
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,47.0,True,1,0,0
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,62.0,False,0,0,1
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,27.0,False,0,0,1
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,22.0,True,0,0,1


In [51]:
test['Family_size'] = test.SibSp + test.Parch

In [52]:
test.info()
test.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 0 to 417
Data columns (total 17 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         418 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
 11  Age_clean    418 non-null    float64
 12  Is_female    418 non-null    bool   
 13  Emb_C        418 non-null    uint8  
 14  Emb_Q        418 non-null    uint8  
 15  Emb_S        418 non-null    uint8  
 16  Family_size  418 non-null    int64  
dtypes: bool(1), float64(3), int64(5), object(5), uint8(3)
memory usage: 47.4+ KB


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_clean,Is_female,Emb_C,Emb_Q,Emb_S,Family_size
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,34.5,False,0,0,1,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,47.0,True,1,0,0,1
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,62.0,False,0,0,1,0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,27.0,False,0,0,1,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,22.0,True,0,0,1,2


Extract the titles from the Names and store them in a new column called `Title` (apply the function we already created)

In [53]:
test['Title'] = test.Name.apply(extract_title)

In [54]:
test.info()
test.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 0 to 417
Data columns (total 18 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         418 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
 11  Age_clean    418 non-null    float64
 12  Is_female    418 non-null    bool   
 13  Emb_C        418 non-null    uint8  
 14  Emb_Q        418 non-null    uint8  
 15  Emb_S        418 non-null    uint8  
 16  Family_size  418 non-null    int64  
 17  Title        418 non-null    object 
dtypes: bool(1), float64(3), int64(5), object(6), uint8

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_clean,Is_female,Emb_C,Emb_Q,Emb_S,Family_size,Title
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,34.5,False,0,0,1,0,Mr
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,47.0,True,1,0,0,1,Mrs
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,62.0,False,0,0,1,0,Mr
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,27.0,False,0,0,1,0,Mr
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,22.0,True,0,0,1,2,Mrs


Convert the Titles into dummy variables and merge the most common titles to the test DataFrame

In [55]:
# creating dummies for most frequent categories
title_dummies = pd.get_dummies(test.Title)
title_dummies

Unnamed: 0,Col,Dona,Dr,Master,Miss,Mr,Mrs,Ms,Rev
0,0,0,0,0,0,1,0,0,0
1,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,1,0,0,0
4,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...
413,0,0,0,0,0,1,0,0,0
414,0,1,0,0,0,0,0,0,0
415,0,0,0,0,0,1,0,0,0
416,0,0,0,0,0,1,0,0,0


In [56]:
# subsetting to 4 relevant columns
title_dummies = title_dummies[['Mr', 'Mrs', 'Miss', 'Master']]
title_dummies

Unnamed: 0,Mr,Mrs,Miss,Master
0,1,0,0,0
1,0,1,0,0
2,1,0,0,0
3,1,0,0,0
4,0,1,0,0
...,...,...,...,...
413,1,0,0,0
414,0,0,0,0
415,1,0,0,0
416,1,0,0,0


Save test data as `clean_test.csv`

In [57]:
# a lot like a sql query: studying sql will improve joins and merges
test = pd.merge(test, title_dummies, left_index=True, right_index=True)

# rerunning will add duplicates

In [58]:
# don't save your csv files as excel, it makes things very complicated
test.to_csv('clean_test.csv', index=False)