<a href="https://colab.research.google.com/github/datapaf/Kaggle/blob/master/titanix.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [3]:
train_df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


# Logistic Regression


## Preprocessing

1. Imputation
1. encode categorical features
2. scale features
3. dimension reduction (?)
4. correlating predictors

### Imputation

In [4]:
train_df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [5]:
test_df.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

* use mean value for age
* encode cabin with NaN values (leave it for now)
* use most frequent value for embarked
* use most frequent value for fare

In [6]:
from sklearn.impute import SimpleImputer

In [7]:
def impute(column, strategy):
    train_imputer = SimpleImputer(strategy=strategy)
    test_imputer = SimpleImputer(strategy=strategy)

    train_df[column] = train_imputer.fit_transform(train_df[[column]])
    test_df[column] = test_imputer.fit_transform(test_df[[column]])

In [8]:
# age imputation

impute(column="Age", strategy="mean")

train_df['Age'] = train_df['Age'].astype(int) 
test_df['Age'] = test_df['Age'].astype(int) 

In [9]:
# embark imputation

impute(column="Embarked", strategy="most_frequent")

In [10]:
# fare imputation

impute(column="Fare", strategy="most_frequent")

### Encoding

In [11]:
train_df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,29,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26,0,0,111369,30.0000,C148,C


* use OneHotEncoder for Sex
* use OrdinalEncoder for Ticket
* use OrdinalEncoder for Cabin
* use OneHotEncoder for Embarked

In [12]:
from sklearn.preprocessing import OrdinalEncoder

In [13]:
def ohe(df, column):
    new_features = pd.get_dummies(df[[column]])
    new_df = pd.concat([df, new_features], axis=1)
    return new_df.drop(column, axis=1)

In [20]:
def ord_enc(df, column, handle_nan=False):
    enc = OrdinalEncoder()
    df[[column]] = enc.fit_transform(df[[column]]).astype(int)

In [15]:
# gender encoding

train_df = ohe(train_df, "Sex")
test_df = ohe(test_df, "Sex")

In [16]:
# ticket encoding

ord_enc(train_df, "Ticket")
ord_enc(test_df, "Ticket")

In [18]:
# cabin encoding

ord_enc(train_df, "Cabin", handle_nan=True)
ord_enc(test_df, "Cabin", handle_nan=True)

In [21]:
# embarked encoding

train_df = ohe(train_df, "Embarked")
test_df = ohe(test_df, "Embarked")

In [22]:
train_df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,1,0,3,"Braund, Mr. Owen Harris",22,1,0,523,7.2500,-9223372036854775808,0,1,0,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38,1,0,596,71.2833,81,1,0,1,0,0
2,3,1,3,"Heikkinen, Miss. Laina",26,0,0,669,7.9250,-9223372036854775808,1,0,0,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35,1,0,49,53.1000,55,1,0,0,0,1
4,5,0,3,"Allen, Mr. William Henry",35,0,0,472,8.0500,-9223372036854775808,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",27,0,0,101,13.0000,-9223372036854775808,0,1,0,0,1
887,888,1,1,"Graham, Miss. Margaret Edith",19,0,0,14,30.0000,30,1,0,0,0,1
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",29,1,2,675,23.4500,-9223372036854775808,1,0,0,0,1
889,890,1,1,"Behr, Mr. Karl Howell",26,0,0,8,30.0000,60,0,1,1,0,0


### Scaling

In [27]:
predictors = ["Pclass", "Age", "SibSp", "Parch", "Ticket", "Fare", "Cabin", "Sex_female", "Sex_male", "Embarked_C", "Embarked_Q", "Embarked_S"]
target = "Survived"

In [30]:
from sklearn.preprocessing import StandardScaler

train_sc = StandardScaler()
train_df[predictors] = train_sc.fit_transform(train_df[predictors])

test_sc = StandardScaler()
test_df[predictors] = test_sc.fit_transform(test_df[predictors])

In [40]:
train_df.corr().style.background_gradient(cmap='coolwarm')

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Ticket,Fare,Cabin,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
PassengerId,1.0,-0.005007,-0.035144,0.033741,-0.057527,-0.001652,-0.056554,0.012658,0.019919,-0.042939,0.042939,-0.001205,-0.033606,0.022204
Survived,-0.005007,1.0,-0.338481,-0.067809,-0.035322,0.081629,-0.164549,0.257307,0.316912,0.543351,-0.543351,0.16824,0.00365,-0.149683
Pclass,-0.035144,-0.338481,1.0,-0.335071,0.083081,0.018443,0.319869,-0.5495,-0.725541,-0.1319,0.1319,-0.243292,0.221009,0.074053
Age,0.033741,-0.067809,-0.335071,1.0,-0.232743,-0.176744,-0.068962,0.093856,0.236172,-0.082533,0.082533,0.030786,-0.021164,-0.013652
SibSp,-0.057527,-0.035322,0.083081,-0.232743,1.0,0.414838,0.079461,0.159651,-0.04046,0.114631,-0.114631,-0.059528,-0.026354,0.068734
Parch,-0.001652,0.081629,0.018443,-0.176744,0.414838,1.0,0.020003,0.216225,0.036987,0.245489,-0.245489,-0.011069,-0.081228,0.060814
Ticket,-0.056554,-0.164549,0.319869,-0.068962,0.079461,0.020003,1.0,-0.013885,-0.249303,-0.059372,0.059372,-0.021381,0.064396,-0.021794
Fare,0.012658,0.257307,-0.5495,0.093856,0.159651,0.216225,-0.013885,1.0,0.482075,0.182333,-0.182333,0.269335,-0.117216,-0.162184
Cabin,0.019919,0.316912,-0.725541,0.236172,-0.04046,0.036987,-0.249303,0.482075,1.0,0.140391,-0.140391,0.208528,-0.129572,-0.101139
Sex_female,-0.042939,0.543351,-0.1319,-0.082533,0.114631,0.245489,-0.059372,0.182333,0.140391,1.0,-1.0,0.082853,0.074115,-0.119224


we see some considerable correlation between:
* Fare vs. Pclass -> Pclass
* Cabin vs. Pclass -> Pclass
* Sex_male vs. Sex_female -> Sex_male
* Embarked_S vs. Embarked_C -> Embarked_C
* Embarked_S vs. Embarked_Q -> Embarked_Q

In [41]:
predictors = list(set(predictors) - set(["Fare", "Cabin", "Sex_female", "Embarked_S"]))

In [43]:
predictors

['Embarked_C',
 'Ticket',
 'Pclass',
 'SibSp',
 'Embarked_Q',
 'Age',
 'Sex_male',
 'Parch']