In [1]:
import pandas as pd
import numpy as np

In [2]:
titanic = pd.read_csv("titanic_train.csv")

In [3]:
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
df = titanic.copy()

### Description of features in the dataset

![Titanic-Features-Description.png](attachment:Titanic-Features-Description.png)

Name feature doesn't seem to make a contribution as to whether the person survives or not, so can be deleted. Even the ticket number might not be relevant.

In [5]:
df.drop(["Name","Ticket"],axis=1,inplace=True)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,1,0,3,male,22.0,1,0,7.25,,S
1,2,1,1,female,38.0,1,0,71.2833,C85,C
2,3,1,3,female,26.0,0,0,7.925,,S
3,4,1,1,female,35.0,1,0,53.1,C123,S
4,5,0,3,male,35.0,0,0,8.05,,S


Sex, and Embarked are two string based featues, which will be dropped in place of one hot encoded features

In [6]:
def male_ohe(g):
    if g == "male":
        return 1
    else:
        return 0

In [7]:
def female_ohe(g):
    if g == "female":
        return 1
    else:
        return 0

In [8]:
df["Male"] = df["Sex"].apply(male_ohe)

In [9]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Male
0,1,0,3,male,22.0,1,0,7.25,,S,1
1,2,1,1,female,38.0,1,0,71.2833,C85,C,0
2,3,1,3,female,26.0,0,0,7.925,,S,0
3,4,1,1,female,35.0,1,0,53.1,C123,S,0
4,5,0,3,male,35.0,0,0,8.05,,S,1


In [10]:
df["Female"] = df["Sex"].apply(female_ohe)

In [11]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Male,Female
0,1,0,3,male,22.0,1,0,7.25,,S,1,0
1,2,1,1,female,38.0,1,0,71.2833,C85,C,0,1
2,3,1,3,female,26.0,0,0,7.925,,S,0,1
3,4,1,1,female,35.0,1,0,53.1,C123,S,0,1
4,5,0,3,male,35.0,0,0,8.05,,S,1,0


In [12]:
df.drop("Sex",axis=1,inplace=True)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Cabin,Embarked,Male,Female
0,1,0,3,22.0,1,0,7.25,,S,1,0
1,2,1,1,38.0,1,0,71.2833,C85,C,0,1
2,3,1,3,26.0,0,0,7.925,,S,0,1
3,4,1,1,35.0,1,0,53.1,C123,S,0,1
4,5,0,3,35.0,0,0,8.05,,S,1,0


In [13]:
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Male,Female
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208,0.647587,0.352413
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429,0.47799,0.47799
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104,0.0,0.0
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542,1.0,0.0
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0,1.0,1.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292,1.0,1.0


In [14]:
df[df["Survived"]==1].describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Male,Female
count,342.0,342.0,342.0,290.0,342.0,342.0,342.0,342.0,342.0
mean,444.368421,1.0,1.950292,28.34369,0.473684,0.464912,48.395408,0.318713,0.681287
std,252.35884,0.0,0.863321,14.950952,0.708688,0.771712,66.596998,0.46666,0.46666
min,2.0,1.0,1.0,0.42,0.0,0.0,0.0,0.0,0.0
25%,250.75,1.0,1.0,19.0,0.0,0.0,12.475,0.0,0.0
50%,439.5,1.0,2.0,28.0,0.0,0.0,26.0,0.0,1.0
75%,651.5,1.0,3.0,36.0,1.0,1.0,57.0,1.0,1.0
max,890.0,1.0,3.0,80.0,4.0,5.0,512.3292,1.0,1.0


In [15]:
df[df["Survived"]==0].describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Male,Female
count,549.0,549.0,549.0,424.0,549.0,549.0,549.0,549.0,549.0
mean,447.016393,0.0,2.531876,30.626179,0.553734,0.32969,22.117887,0.852459,0.147541
std,260.640469,0.0,0.735805,14.17211,1.288399,0.823166,31.388207,0.354968,0.354968
min,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,211.0,0.0,2.0,21.0,0.0,0.0,7.8542,1.0,0.0
50%,455.0,0.0,3.0,28.0,0.0,0.0,10.5,1.0,0.0
75%,675.0,0.0,3.0,39.0,1.0,0.0,26.0,1.0,0.0
max,891.0,0.0,3.0,74.0,8.0,6.0,263.0,1.0,1.0


In [16]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Age            177
SibSp            0
Parch            0
Fare             0
Cabin          687
Embarked         2
Male             0
Female           0
dtype: int64

In [17]:
df["Age"] = df.groupby("Survived").transform(lambda x: x.fillna(x.mean()))
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Cabin,Embarked,Male,Female
0,1,0,3,1.0,1,0,7.25,,S,1,0
1,2,1,1,2.0,1,0,71.2833,C85,C,0,1
2,3,1,3,3.0,0,0,7.925,,S,0,1
3,4,1,1,4.0,1,0,53.1,C123,S,0,1
4,5,0,3,5.0,0,0,8.05,,S,1,0


In [18]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Age              0
SibSp            0
Parch            0
Fare             0
Cabin          687
Embarked         2
Male             0
Female           0
dtype: int64

In [20]:
df.shape

(891, 11)

In [21]:
df.Cabin.fillna(0,inplace=True)

In [22]:
df.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Age            0
SibSp          0
Parch          0
Fare           0
Cabin          0
Embarked       2
Male           0
Female         0
dtype: int64

In [26]:
df.dropna(inplace=True)
df.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Age            0
SibSp          0
Parch          0
Fare           0
Cabin          0
Embarked       0
Male           0
Female         0
dtype: int64

In [27]:
df.shape

(889, 11)

In [28]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Cabin,Embarked,Male,Female
0,1,0,3,1.0,1,0,7.25,0,S,1,0
1,2,1,1,2.0,1,0,71.2833,C85,C,0,1
2,3,1,3,3.0,0,0,7.925,0,S,0,1
3,4,1,1,4.0,1,0,53.1,C123,S,0,1
4,5,0,3,5.0,0,0,8.05,0,S,1,0


In [30]:
df["Embarked"].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [31]:
def cherbourg(s):
    if s=="C":
        return 1
    else:
        return 0
    
def queenstown(s):
    if s=="Q":
        return 1
    else:
        return 0

def southamptom(s):
    if s=="S":
        return 1
    else:
        return 0

In [34]:
df["Cherbourg"] = df.Embarked.apply(cherbourg)
df["Queenstown"] = df.Embarked.apply(queenstown)
df["Southamptom"] = df.Embarked.apply(southamptom)
df.drop("Embarked",axis=1, inplace=True)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Cabin,Male,Female,Cherbourg,Queenstown,Southamptom
0,1,0,3,1.0,1,0,7.25,0,1,0,0,0,1
1,2,1,1,2.0,1,0,71.2833,C85,0,1,1,0,0
2,3,1,3,3.0,0,0,7.925,0,0,1,0,0,1
3,4,1,1,4.0,1,0,53.1,C123,0,1,0,0,1
4,5,0,3,5.0,0,0,8.05,0,1,0,0,0,1


In [35]:
df.shape

(889, 13)

In [36]:
df.to_csv("Titanic_cleaned.csv")