# Feature Engineering: Cleaning

* <b>PassengerId</b>: random ID assigned to passenger
* <b>Pclass</b>: Ticket class of passenger (i.e., 1st class, 2nd class, and 3rd class)
* <b>Name</b>: Name of the passenger
* <b>Sex</b>: Male or Female
* <b>Age</b>: Age of passenger
* <b>SibSp</b>: Number of siblings and spouses accompanying passenger
* <b>Parch</b>: Number of parents and children accompanying passenger
* <b>Ticket</b>: Ticket number of passenger
* <b>Fare</b>: Fare paid for ticket
* <b>Cabin</b>: Cabin number of passenger
* <b>Embarked</b>: Port from which passenger embarked
* <b>Survived</b>: 0 and 1 for died and survived, respectively

In [33]:
# Import the libraries
import pandas as pd
import numpy as np

In [34]:
# Load the raw passenger records from the CSV file into a DataFrame
rawCsvPath = './titanic.csv'
titanicDf = pd.read_csv(rawCsvPath)

In [35]:
# Preview the first rows of the freshly loaded data (head() shows 5 rows by
# default) to check that the columns were parsed as expected
titanicDf.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [36]:
# Discard the columns that are not used as features from here on;
# the frame is modified in place
irrelevantFeats = ['PassengerId', 'Name', 'Ticket']
titanicDf.drop(columns=irrelevantFeats, inplace=True)
titanicDf.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.25,,S
1,1,1,female,38.0,1,0,71.2833,C85,C
2,1,3,female,26.0,0,0,7.925,,S
3,1,1,female,35.0,1,0,53.1,C123,S
4,0,3,male,35.0,0,0,8.05,,S


In [37]:
# Derive a binary indicator: 1 when a cabin is recorded, 0 when it is missing
hasCabin = titanicDf['Cabin'].notnull()
titanicDf['cabin_ind'] = hasCabin.astype(int)
titanicDf.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,cabin_ind
0,0,3,male,22.0,1,0,7.25,,S,0
1,1,1,female,38.0,1,0,71.2833,C85,C,1
2,1,3,female,26.0,0,0,7.925,,S,0
3,1,1,female,35.0,1,0,53.1,C123,S,1
4,0,3,male,35.0,0,0,8.05,,S,0


In [38]:
# 'Cabin' is now summarised by 'cabin_ind', so drop the original column in place
titanicDf.drop(columns=['Cabin'], inplace=True)
titanicDf.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,cabin_ind
0,0,3,male,22.0,1,0,7.25,S,0
1,1,1,female,38.0,1,0,71.2833,C,1
2,1,3,female,26.0,0,0,7.925,S,0
3,1,1,female,35.0,1,0,53.1,S,1
4,0,3,male,35.0,0,0,8.05,S,0


In [39]:
# Encode 'Sex' numerically: 'male' -> 0, 'female' -> 1
sexFlag = dict(male=0, female=1)

encodedSex = titanicDf['Sex'].map(sexFlag)
titanicDf['Sex'] = encodedSex
titanicDf.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,cabin_ind
0,0,3,0,22.0,1,0,7.25,S,0
1,1,1,1,38.0,1,0,71.2833,C,1
2,1,3,1,26.0,0,0,7.925,S,0
3,1,1,1,35.0,1,0,53.1,S,1
4,0,3,0,35.0,0,0,8.05,S,0


In [40]:
# Encode the port of embarkation numerically: 'S' -> 1, 'Q' -> 2, 'C' -> 3.
# Rows with a missing port stay NaN, which is why the column becomes float.
embarkedFlag = dict(S=1, Q=2, C=3)

encodedPort = titanicDf['Embarked'].map(embarkedFlag)
titanicDf['Embarked'] = encodedPort
titanicDf.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,cabin_ind
0,0,3,0,22.0,1,0,7.25,1.0,0
1,1,1,1,38.0,1,0,71.2833,3.0,1
2,1,3,1,26.0,0,0,7.925,1.0,0
3,1,1,1,35.0,1,0,53.1,1.0,1
4,0,3,0,35.0,0,0,8.05,1.0,0


In [41]:
# Confirm changes: inspect dtypes and non-null counts after the encoding steps
# to see which columns still contain missing values
titanicDf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Survived   891 non-null    int64  
 1   Pclass     891 non-null    int64  
 2   Sex        891 non-null    int64  
 3   Age        714 non-null    float64
 4   SibSp      891 non-null    int64  
 5   Parch      891 non-null    int64  
 6   Fare       891 non-null    float64
 7   Embarked   889 non-null    float64
 8   cabin_ind  891 non-null    int64  
dtypes: float64(3), int64(6)
memory usage: 62.8 KB


In [42]:
# Fill in missing 'Age' values with the mean of the observed ages.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the column selection: that chained in-place
# form emits a FutureWarning in pandas 2.x and leaves the DataFrame
# unchanged once Copy-on-Write is in effect (pandas 3.0).
meanAge = titanicDf['Age'].mean()
titanicDf['Age'] = titanicDf['Age'].fillna(meanAge)
titanicDf.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,cabin_ind
0,0,3,0,22.0,1,0,7.25,1.0,0
1,1,1,1,38.0,1,0,71.2833,3.0,1
2,1,3,1,26.0,0,0,7.925,1.0,0
3,1,1,1,35.0,1,0,53.1,1.0,1
4,0,3,0,35.0,0,0,8.05,1.0,0


In [43]:
# Confirm changes: 'Age' should now report no missing values
titanicDf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Survived   891 non-null    int64  
 1   Pclass     891 non-null    int64  
 2   Sex        891 non-null    int64  
 3   Age        891 non-null    float64
 4   SibSp      891 non-null    int64  
 5   Parch      891 non-null    int64  
 6   Fare       891 non-null    float64
 7   Embarked   889 non-null    float64
 8   cabin_ind  891 non-null    int64  
dtypes: float64(3), int64(6)
memory usage: 62.8 KB


In [44]:
# Keep only the rows whose 'Embarked' value is present
condition = ~titanicDf['Embarked'].isna()
titanicDf = titanicDf.loc[condition]
titanicDf.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,cabin_ind
0,0,3,0,22.0,1,0,7.25,1.0,0
1,1,1,1,38.0,1,0,71.2833,3.0,1
2,1,3,1,26.0,0,0,7.925,1.0,0
3,1,1,1,35.0,1,0,53.1,1.0,1
4,0,3,0,35.0,0,0,8.05,1.0,0


In [45]:
# Confirm changes: every column should now have the same non-null count
# as the number of remaining rows
titanicDf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Survived   889 non-null    int64  
 1   Pclass     889 non-null    int64  
 2   Sex        889 non-null    int64  
 3   Age        889 non-null    float64
 4   SibSp      889 non-null    int64  
 5   Parch      889 non-null    int64  
 6   Fare       889 non-null    float64
 7   Embarked   889 non-null    float64
 8   cabin_ind  889 non-null    int64  
dtypes: float64(3), int64(6)
memory usage: 69.5 KB


In [46]:
# Write the cleaned data to a new CSV file, leaving out the row index
cleanedCsvPath = './titanic_cleaned.csv'
titanicDf.to_csv(cleanedCsvPath, index=False)