In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
# Load the training data and preview the first five rows (head's default).
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Remove columns: ['PassengerId', 'Name', 'Ticket', 'Cabin']

In [6]:
# Discard the columns that are not used as features, rebinding df to the result.
df = df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])

In [7]:
# Preview the first five rows (head's default) after removing the columns.
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [8]:
# Count missing values per column (isna is the same check as isnull).
df.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

# Fill missing values in the Age and Embarked columns

In [9]:
# Impute Age with the column mean and Embarked with its most frequent value.
# NOTE(review): both statistics are computed on the full dataset, before the
# train/test split further down, so test rows influence the imputed values.
df = df.fillna({
    'Age': df['Age'].mean(),
    'Embarked': df['Embarked'].mode().iloc[0],
})

In [10]:
# Re-check missing values per column after imputation.
df.isna().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

# One-hot encode the Sex and Embarked columns (using pd.get_dummies)

In [11]:
from sklearn.preprocessing import OneHotEncoder

In [12]:
# Encode the Sex column as a single 0/1 indicator (1 = female, 0 = male).
dummies_sex = pd.get_dummies(df['Sex'])
# Keep only the 'female' indicator; 'male' is implied by a 0.
new_sex = dummies_sex.drop(columns=['male']).astype(int)
df['Sex'] = new_sex
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,0,22.0,1,0,7.25,S
1,1,1,1,38.0,1,0,71.2833,C
2,1,3,1,26.0,0,0,7.925,S
3,1,1,1,35.0,1,0,53.1,S
4,0,3,0,35.0,0,0,8.05,S


In [14]:
# One-hot encode the Embarked column into one indicator column per port.
# dtype=int keeps the indicators as 0/1 integers on every pandas version
# (pandas >= 2.0 returns bool by default), matching the integer Sex indicator.
# NOTE(review): all three indicators are kept here, whereas the Sex encoding
# keeps a single one; confirm that is intended for the downstream model.
dummies_embarked = pd.get_dummies(df['Embarked'], dtype=int)
dummies_embarked

Unnamed: 0,C,Q,S
0,0,0,1
1,1,0,0
2,0,0,1
3,0,0,1
4,0,0,1
...,...,...,...
886,0,0,1
887,0,0,1
888,0,0,1
889,1,0,0


In [15]:
# Append the Embarked indicator columns to df, matching rows on the shared index.
df = pd.concat(objs=[df, dummies_embarked], axis='columns', join='inner')

In [17]:
# The original text column is redundant once its indicator columns are attached.
df = df.drop(columns=['Embarked'])

In [20]:
# Peek at the first two rows after the encoding steps.
df.head(2)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,C,Q,S
0,0,3,0,22.0,1,0,7.25,0,0,1
1,1,1,1,38.0,1,0,71.2833,1,0,0


# Train test split

In [21]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as a test set; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='Survived'),  # features: every column except the label
    df['Survived'],               # label
    test_size=0.3,
    random_state=0,
)

# Show the (rows, columns) shape of each feature split.
X_train.shape, X_test.shape

((623, 9), (268, 9))

# Use StandardScaler

In [19]:
from sklearn.preprocessing import StandardScaler

In [24]:
# Learn the scaling parameters from the training set only; fit() returns the
# fitted scaler itself, so construction and fitting can be chained.
scaler = StandardScaler().fit(X_train)

# Apply the training-set parameters to both splits.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [25]:
# Per-feature means the scaler learned from X_train (one entry per column).
scaler.mean_

array([ 2.28731942,  0.34510433, 29.87334388,  0.53130016,  0.39325843,
       32.45827255,  0.17335474,  0.08988764,  0.73675762])

In [26]:
# scaler.transform returned plain arrays above, so rebuild labelled DataFrames.
# Reuse the original row index as well as the column labels: y_train / y_test
# keep the shuffled index from train_test_split, and a fresh 0..n-1 index here
# would silently misalign any index-based operation between features and labels.
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

In [29]:
# Summary statistics of the unscaled training features, rounded to one decimal.
X_train.describe().round(1)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,C,Q,S
count,623.0,623.0,623.0,623.0,623.0,623.0,623.0,623.0,623.0
mean,2.3,0.3,29.9,0.5,0.4,32.5,0.2,0.1,0.7
std,0.8,0.5,13.0,1.2,0.8,48.3,0.4,0.3,0.4
min,1.0,0.0,0.7,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.5,0.0,23.0,0.0,0.0,7.9,0.0,0.0,0.0
50%,3.0,0.0,29.7,0.0,0.0,15.0,0.0,0.0,1.0
75%,3.0,1.0,36.0,1.0,0.0,31.4,0.0,0.0,1.0
max,3.0,1.0,80.0,8.0,6.0,512.3,1.0,1.0,1.0


In [30]:
# Same summary for the scaled features: mean should be ~0 and std ~1 per column.
X_train_scaled.describe().round(1)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,C,Q,S
count,623.0,623.0,623.0,623.0,623.0,623.0,623.0,623.0,623.0
mean,0.0,-0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,0.0
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-1.5,-0.7,-2.2,-0.5,-0.5,-0.7,-0.5,-0.3,-1.7
25%,-0.9,-0.7,-0.5,-0.5,-0.5,-0.5,-0.5,-0.3,-1.7
50%,0.8,-0.7,-0.0,-0.5,-0.5,-0.4,-0.5,-0.3,0.6
75%,0.8,1.4,0.5,0.4,-0.5,-0.0,-0.5,-0.3,0.6
max,0.8,1.4,3.9,6.4,6.7,10.0,2.2,3.2,0.6
