In [1]:
# load packages
import numpy as np
import pandas as pd

In [2]:
# Locations of the raw training and test CSV files (relative paths).
train_file, test_file = "data/train.csv", "data/test.csv"

In [3]:
# Read both CSV files into DataFrames, training set first.
train, test = (pd.read_csv(csv_path) for csv_path in (train_file, test_file))

In [4]:
# Preview the first rows of the training frame (head() defaults to 5 rows).
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
# Preview the first rows of the test frame (head() defaults to 5 rows).
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [6]:
# Keep only the columns judged useful: discard the free-text fields.
# errors="ignore" keeps the original semantics of silently skipping any of
# these columns that is not present in a frame.
unused_columns = ["Name", "Ticket", "Cabin"]
train = train.drop(columns=unused_columns, errors="ignore")
test = test.drop(columns=unused_columns, errors="ignore")

In [7]:
# Count missing values per column in the training frame.
train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Sex              0
Age            177
SibSp            0
Parch            0
Fare             0
Embarked         2
dtype: int64

In [8]:
# Count missing values per column in the test frame.
test.isna().sum()

PassengerId     0
Pclass          0
Sex             0
Age            86
SibSp           0
Parch           0
Fare            1
Embarked        0
dtype: int64

In [9]:
# fill the missing values
# Each frame's missing ages are replaced by that frame's own mean age.
# The filled Series is assigned back to the frame: calling
# fillna(inplace=True) on an attribute-accessed column is chained assignment,
# which pandas warns about and which leaves the frame unchanged under
# Copy-on-Write.
train["Age"] = train["Age"].fillna(train["Age"].mean())
test["Age"] = test["Age"].fillna(test["Age"].mean())

In [10]:
# Replace missing fares in the test frame with the test-set median fare.
# Assign the result back instead of using inplace=True on the column: the
# chained inplace form does not update the frame under Copy-on-Write.
test["Fare"] = test["Fare"].fillna(test["Fare"].median())

In [11]:
# Replace missing embarkation ports in the training frame with "S".
# Assign the result back instead of using inplace=True on the column: the
# chained inplace form does not update the frame under Copy-on-Write.
train["Embarked"] = train["Embarked"].fillna("S")

In [12]:
# Show the first rows of the training frame again (head() defaults to 5 rows).
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22.0,1,0,7.25,S
1,2,1,1,female,38.0,1,0,71.2833,C
2,3,1,3,female,26.0,0,0,7.925,S
3,4,1,1,female,35.0,1,0,53.1,S
4,5,0,3,male,35.0,0,0,8.05,S


In [13]:
# convert object columns to numeric columns
# Encode train and test against one shared, sorted label list per column so a
# given label always maps to the same integer code in both frames. Encoding
# each frame independently would shift the codes whenever a label is absent
# from one of them. Missing values still receive the code -1.
for text_col, code_col in (("Embarked", "EmbarkedCode"), ("Sex", "SexCode")):
    labels = sorted(set(train[text_col].dropna()) | set(test[text_col].dropna()))
    shared_dtype = pd.CategoricalDtype(categories=labels)
    train[code_col] = train[text_col].astype(shared_dtype).cat.codes
    test[code_col] = test[text_col].astype(shared_dtype).cat.codes

In [14]:
# Show the first rows of the training frame with the new code columns
# (head() defaults to 5 rows).
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,EmbarkedCode,SexCode
0,1,0,3,male,22.0,1,0,7.25,S,2,1
1,2,1,1,female,38.0,1,0,71.2833,C,0,0
2,3,1,3,female,26.0,0,0,7.925,S,2,0
3,4,1,1,female,35.0,1,0,53.1,S,2,0
4,5,0,3,male,35.0,0,0,8.05,S,2,1


In [15]:
# Show which integer code was assigned to each Sex label:
# one row per distinct (Sex, SexCode) pair.
train.loc[:, ["Sex", "SexCode"]].drop_duplicates()

Unnamed: 0,Sex,SexCode
0,male,1
1,female,0


In [16]:
# Show which integer code was assigned to each Embarked label:
# one row per distinct (Embarked, EmbarkedCode) pair.
train.loc[:, ["Embarked", "EmbarkedCode"]].drop_duplicates()

Unnamed: 0,Embarked,EmbarkedCode
0,S,2
1,C,0
5,Q,1


In [17]:
# Drop the original text columns now that their integer-coded counterparts
# exist. errors="ignore" keeps the original semantics of silently skipping a
# column that is not present in a frame.
encoded_sources = ["Sex", "Embarked"]
train = train.drop(columns=encoded_sources, errors="ignore")
test = test.drop(columns=encoded_sources, errors="ignore")

In [18]:
# Preview the first rows of the numeric-only training frame
# (head() defaults to 5 rows).
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,EmbarkedCode,SexCode
0,1,0,3,22.0,1,0,7.25,2,1
1,2,1,1,38.0,1,0,71.2833,0,0
2,3,1,3,26.0,0,0,7.925,2,0
3,4,1,1,35.0,1,0,53.1,2,0
4,5,0,3,35.0,0,0,8.05,2,1


In [19]:
# Preview the first rows of the numeric-only test frame
# (head() defaults to 5 rows).
test.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,EmbarkedCode,SexCode
0,892,3,34.5,0,0,7.8292,1,1
1,893,3,47.0,1,0,7.0,2,0
2,894,2,62.0,0,0,9.6875,1,1
3,895,3,27.0,0,0,8.6625,2,1
4,896,3,22.0,1,1,12.2875,2,0


In [20]:
# Re-count missing values per column in the training frame after cleaning.
train.isna().sum()

PassengerId     0
Survived        0
Pclass          0
Age             0
SibSp           0
Parch           0
Fare            0
EmbarkedCode    0
SexCode         0
dtype: int64

In [21]:
# Re-count missing values per column in the test frame after cleaning.
test.isna().sum()

PassengerId     0
Pclass          0
Age             0
SibSp           0
Parch           0
Fare            0
EmbarkedCode    0
SexCode         0
dtype: int64

In [22]:
# Write both cleaned frames to CSV, omitting the row index.
for cleaned_frame, output_path in (
    (train, "data/train-cleaned.csv"),
    (test, "data/test-cleaned.csv"),
):
    cleaned_frame.to_csv(output_path, index=False)