### Data Source: https://www.kaggle.com/c/titanic

In [1]:
import pandas as pd
from pandas import Series, DataFrame

from sklearn.linear_model import LogisticRegression

import numpy as np

# Read from .csv

In [2]:
train_data = pd.read_csv("train.csv", index_col=0)
test_data = pd.read_csv("test.csv", index_col=0)

In [3]:
train_data.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


In [5]:
test_data.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [6]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 892 to 1309
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    418 non-null    int64  
 1   Name      418 non-null    object 
 2   Sex       418 non-null    object 
 3   Age       332 non-null    float64
 4   SibSp     418 non-null    int64  
 5   Parch     418 non-null    int64  
 6   Ticket    418 non-null    object 
 7   Fare      417 non-null    float64
 8   Cabin     91 non-null     object 
 9   Embarked  418 non-null    object 
dtypes: float64(2), int64(3), object(5)
memory usage: 35.9+ KB


# Convert Features to Numerical Categories

## Age

In [7]:
# Age to Age Category - Child (age < 15 y.o), Adult (15 <= age < 65 y.o), Senior (age >= 65 y.o) Categories

# Add new column to input categories (write in same dataframe)
def age_categorization(passenger_age):
    # Compare the age, otherwise leave the sex
    #print(passenger_age, type(passenger_age))
    
    if passenger_age < 16:
        return 1
    elif passenger_age >= 16 and passenger_age < 65:
        return 2
    elif passenger_age >= 65:
        return 3
    else: # Unknown
        return 0

In [8]:
train_data["Age Category"] = train_data['Age'].apply(age_categorization) 
train_data.head(n=10)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age Category
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,2
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,2
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,2
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,2
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,2
6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q,0
7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,2
8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S,1
9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S,2
10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C,1


In [9]:
test_data["Age Category"] = test_data['Age'].apply(age_categorization)
test_data.head(n=10)

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age Category
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,2
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,2
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,2
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,2
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,2
897,3,"Svensson, Mr. Johan Cervin",male,14.0,0,0,7538,9.225,,S,1
898,3,"Connolly, Miss. Kate",female,30.0,0,0,330972,7.6292,,Q,2
899,2,"Caldwell, Mr. Albert Francis",male,26.0,1,1,248738,29.0,,S,2
900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18.0,0,0,2657,7.2292,,C,2
901,3,"Davies, Mr. John Samuel",male,21.0,2,0,A/4 48871,24.15,,S,2


## Gender

In [10]:
def sex_categorization(passenger_sex):
    # Compare the age, otherwise leave the sex
    #print(passenger_age, type(passenger_age))
    
    if passenger_sex == "male":
        return 1
    elif passenger_sex == "female":
        return 2
    else: # Unknown
        return 0

In [11]:
train_data["Sex"] = train_data['Sex'].apply(sex_categorization) 
train_data.head(n = 10)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age Category
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,S,2
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",2,38.0,1,0,PC 17599,71.2833,C85,C,2
3,1,3,"Heikkinen, Miss. Laina",2,26.0,0,0,STON/O2. 3101282,7.925,,S,2
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",2,35.0,1,0,113803,53.1,C123,S,2
5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,S,2
6,0,3,"Moran, Mr. James",1,,0,0,330877,8.4583,,Q,0
7,0,1,"McCarthy, Mr. Timothy J",1,54.0,0,0,17463,51.8625,E46,S,2
8,0,3,"Palsson, Master. Gosta Leonard",1,2.0,3,1,349909,21.075,,S,1
9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",2,27.0,0,2,347742,11.1333,,S,2
10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",2,14.0,1,0,237736,30.0708,,C,1


In [12]:
test_data["Sex"] = test_data["Sex"].apply(sex_categorization)
test_data.head(n=10)

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age Category
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
892,3,"Kelly, Mr. James",1,34.5,0,0,330911,7.8292,,Q,2
893,3,"Wilkes, Mrs. James (Ellen Needs)",2,47.0,1,0,363272,7.0,,S,2
894,2,"Myles, Mr. Thomas Francis",1,62.0,0,0,240276,9.6875,,Q,2
895,3,"Wirz, Mr. Albert",1,27.0,0,0,315154,8.6625,,S,2
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",2,22.0,1,1,3101298,12.2875,,S,2
897,3,"Svensson, Mr. Johan Cervin",1,14.0,0,0,7538,9.225,,S,1
898,3,"Connolly, Miss. Kate",2,30.0,0,0,330972,7.6292,,Q,2
899,2,"Caldwell, Mr. Albert Francis",1,26.0,1,1,248738,29.0,,S,2
900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",2,18.0,0,0,2657,7.2292,,C,2
901,3,"Davies, Mr. John Samuel",1,21.0,2,0,A/4 48871,24.15,,S,2


## Alone

In [13]:
# Sibsp, Parch to Accompanied Categories (Yes and No)

def accomponied_categorization(passenger):
    age_category, sibsp, parch = passenger
    
    family_members = sibsp + parch
    
    if family_members > 0:
        return 'False'
    elif family_members == 0:
        return 'True'
    elif family_members == 0 and age_category == "Child": # Assuming the child is accompanied by a nanny, already included
        return "False"
    else:
        return "Unknown"

In [14]:
train_data["Alone"] = train_data[['Age Category', 'SibSp', 'Parch']].apply(accomponied_categorization, axis = 1) 
train_data.head(n=10)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age Category,Alone
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,S,2,False
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",2,38.0,1,0,PC 17599,71.2833,C85,C,2,False
3,1,3,"Heikkinen, Miss. Laina",2,26.0,0,0,STON/O2. 3101282,7.925,,S,2,True
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",2,35.0,1,0,113803,53.1,C123,S,2,False
5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,S,2,True
6,0,3,"Moran, Mr. James",1,,0,0,330877,8.4583,,Q,0,True
7,0,1,"McCarthy, Mr. Timothy J",1,54.0,0,0,17463,51.8625,E46,S,2,True
8,0,3,"Palsson, Master. Gosta Leonard",1,2.0,3,1,349909,21.075,,S,1,False
9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",2,27.0,0,2,347742,11.1333,,S,2,False
10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",2,14.0,1,0,237736,30.0708,,C,1,False


In [15]:
test_data["Alone"] = test_data[['Age Category', 'SibSp', 'Parch']].apply(accomponied_categorization, axis = 1) 
test_data.head(n=10)

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age Category,Alone
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
892,3,"Kelly, Mr. James",1,34.5,0,0,330911,7.8292,,Q,2,True
893,3,"Wilkes, Mrs. James (Ellen Needs)",2,47.0,1,0,363272,7.0,,S,2,False
894,2,"Myles, Mr. Thomas Francis",1,62.0,0,0,240276,9.6875,,Q,2,True
895,3,"Wirz, Mr. Albert",1,27.0,0,0,315154,8.6625,,S,2,True
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",2,22.0,1,1,3101298,12.2875,,S,2,False
897,3,"Svensson, Mr. Johan Cervin",1,14.0,0,0,7538,9.225,,S,1,True
898,3,"Connolly, Miss. Kate",2,30.0,0,0,330972,7.6292,,Q,2,True
899,2,"Caldwell, Mr. Albert Francis",1,26.0,1,1,248738,29.0,,S,2,False
900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",2,18.0,0,0,2657,7.2292,,C,2,True
901,3,"Davies, Mr. John Samuel",1,21.0,2,0,A/4 48871,24.15,,S,2,False


In [16]:
def alone_categorization(alone_class):
    if alone_class == 'True':
        return 1
    elif alone_class == 'False':
        return 2
    else:
        return 0

In [17]:
train_data["Alone"] = train_data['Alone'].apply(alone_categorization) 
train_data.head(n=10)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age Category,Alone
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,S,2,2
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",2,38.0,1,0,PC 17599,71.2833,C85,C,2,2
3,1,3,"Heikkinen, Miss. Laina",2,26.0,0,0,STON/O2. 3101282,7.925,,S,2,1
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",2,35.0,1,0,113803,53.1,C123,S,2,2
5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,S,2,1
6,0,3,"Moran, Mr. James",1,,0,0,330877,8.4583,,Q,0,1
7,0,1,"McCarthy, Mr. Timothy J",1,54.0,0,0,17463,51.8625,E46,S,2,1
8,0,3,"Palsson, Master. Gosta Leonard",1,2.0,3,1,349909,21.075,,S,1,2
9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",2,27.0,0,2,347742,11.1333,,S,2,2
10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",2,14.0,1,0,237736,30.0708,,C,1,2


In [18]:
test_data["Alone"] = test_data['Alone'].apply(alone_categorization) 
test_data.head(n=10)

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age Category,Alone
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
892,3,"Kelly, Mr. James",1,34.5,0,0,330911,7.8292,,Q,2,1
893,3,"Wilkes, Mrs. James (Ellen Needs)",2,47.0,1,0,363272,7.0,,S,2,2
894,2,"Myles, Mr. Thomas Francis",1,62.0,0,0,240276,9.6875,,Q,2,1
895,3,"Wirz, Mr. Albert",1,27.0,0,0,315154,8.6625,,S,2,1
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",2,22.0,1,1,3101298,12.2875,,S,2,2
897,3,"Svensson, Mr. Johan Cervin",1,14.0,0,0,7538,9.225,,S,1,1
898,3,"Connolly, Miss. Kate",2,30.0,0,0,330972,7.6292,,Q,2,1
899,2,"Caldwell, Mr. Albert Francis",1,26.0,1,1,248738,29.0,,S,2,2
900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",2,18.0,0,0,2657,7.2292,,C,2,1
901,3,"Davies, Mr. John Samuel",1,21.0,2,0,A/4 48871,24.15,,S,2,2


## Cabin

In [19]:
# NaN data type is float but cabin is string. Conflicting data types cause issue in processing. Need to convert all to string.
values = {'Cabin': '0'}

train_data.fillna(value=values, inplace=True)
test_data.fillna(value=values, inplace=True)

# Cabin data to Deck Position Conversion
def cabinToDeckConversion(cabin):
    #print(cabin, cabin[0], type(cabin))
    
    if cabin[0] == '0':
        # print("No data present")
        return 'Unknown'
    else:
        # print(cabin[0])
        return cabin[0]

In [20]:
train_data['Deck'] = train_data['Cabin'].apply(cabinToDeckConversion) 
train_data.head(n = 10)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age Category,Alone,Deck
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,0,S,2,2,Unknown
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",2,38.0,1,0,PC 17599,71.2833,C85,C,2,2,C
3,1,3,"Heikkinen, Miss. Laina",2,26.0,0,0,STON/O2. 3101282,7.925,0,S,2,1,Unknown
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",2,35.0,1,0,113803,53.1,C123,S,2,2,C
5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,0,S,2,1,Unknown
6,0,3,"Moran, Mr. James",1,,0,0,330877,8.4583,0,Q,0,1,Unknown
7,0,1,"McCarthy, Mr. Timothy J",1,54.0,0,0,17463,51.8625,E46,S,2,1,E
8,0,3,"Palsson, Master. Gosta Leonard",1,2.0,3,1,349909,21.075,0,S,1,2,Unknown
9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",2,27.0,0,2,347742,11.1333,0,S,2,2,Unknown
10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",2,14.0,1,0,237736,30.0708,0,C,1,2,Unknown


In [21]:
test_data['Deck'] = test_data['Cabin'].apply(cabinToDeckConversion) 
test_data.head(n = 10)

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age Category,Alone,Deck
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
892,3,"Kelly, Mr. James",1,34.5,0,0,330911,7.8292,0,Q,2,1,Unknown
893,3,"Wilkes, Mrs. James (Ellen Needs)",2,47.0,1,0,363272,7.0,0,S,2,2,Unknown
894,2,"Myles, Mr. Thomas Francis",1,62.0,0,0,240276,9.6875,0,Q,2,1,Unknown
895,3,"Wirz, Mr. Albert",1,27.0,0,0,315154,8.6625,0,S,2,1,Unknown
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",2,22.0,1,1,3101298,12.2875,0,S,2,2,Unknown
897,3,"Svensson, Mr. Johan Cervin",1,14.0,0,0,7538,9.225,0,S,1,1,Unknown
898,3,"Connolly, Miss. Kate",2,30.0,0,0,330972,7.6292,0,Q,2,1,Unknown
899,2,"Caldwell, Mr. Albert Francis",1,26.0,1,1,248738,29.0,0,S,2,2,Unknown
900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",2,18.0,0,0,2657,7.2292,0,C,2,1,Unknown
901,3,"Davies, Mr. John Samuel",1,21.0,2,0,A/4 48871,24.15,0,S,2,2,Unknown


In [22]:
def deck_numerical_categorization(deck_class):
    if deck_class == 'A':
        return 1
    elif deck_class == 'B':
        return 2
    elif deck_class == 'C':
        return 3
    elif deck_class == 'D':
        return 4
    elif deck_class == 'E':
        return 5
    elif deck_class == 'F':
        return 6
    elif deck_class == 'G':
        return 7
    else:  # Merge T with Unknown
        return 0

In [23]:
train_data["Deck"] = train_data['Deck'].apply(deck_numerical_categorization) 
train_data.head(n = 10)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age Category,Alone,Deck
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,0,S,2,2,0
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",2,38.0,1,0,PC 17599,71.2833,C85,C,2,2,3
3,1,3,"Heikkinen, Miss. Laina",2,26.0,0,0,STON/O2. 3101282,7.925,0,S,2,1,0
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",2,35.0,1,0,113803,53.1,C123,S,2,2,3
5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,0,S,2,1,0
6,0,3,"Moran, Mr. James",1,,0,0,330877,8.4583,0,Q,0,1,0
7,0,1,"McCarthy, Mr. Timothy J",1,54.0,0,0,17463,51.8625,E46,S,2,1,5
8,0,3,"Palsson, Master. Gosta Leonard",1,2.0,3,1,349909,21.075,0,S,1,2,0
9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",2,27.0,0,2,347742,11.1333,0,S,2,2,0
10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",2,14.0,1,0,237736,30.0708,0,C,1,2,0


In [24]:
test_data["Deck"] = test_data['Deck'].apply(deck_numerical_categorization) 
test_data.head(n = 10)

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age Category,Alone,Deck
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
892,3,"Kelly, Mr. James",1,34.5,0,0,330911,7.8292,0,Q,2,1,0
893,3,"Wilkes, Mrs. James (Ellen Needs)",2,47.0,1,0,363272,7.0,0,S,2,2,0
894,2,"Myles, Mr. Thomas Francis",1,62.0,0,0,240276,9.6875,0,Q,2,1,0
895,3,"Wirz, Mr. Albert",1,27.0,0,0,315154,8.6625,0,S,2,1,0
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",2,22.0,1,1,3101298,12.2875,0,S,2,2,0
897,3,"Svensson, Mr. Johan Cervin",1,14.0,0,0,7538,9.225,0,S,1,1,0
898,3,"Connolly, Miss. Kate",2,30.0,0,0,330972,7.6292,0,Q,2,1,0
899,2,"Caldwell, Mr. Albert Francis",1,26.0,1,1,248738,29.0,0,S,2,2,0
900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",2,18.0,0,0,2657,7.2292,0,C,2,1,0
901,3,"Davies, Mr. John Samuel",1,21.0,2,0,A/4 48871,24.15,0,S,2,2,0


## Port Embarked from

In [25]:
def embarked_numerical_categorization(deck_class):
    if deck_class == 'S':
        return 1
    elif deck_class == 'C':
        return 2
    elif deck_class == 'Q':
        return 3
    else:  # Merge T with Unknown
        return 0

In [26]:
train_data["Embarked"] = train_data['Embarked'].apply(embarked_numerical_categorization) 
train_data.head(n = 20)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age Category,Alone,Deck
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,0,1,2,2,0
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",2,38.0,1,0,PC 17599,71.2833,C85,2,2,2,3
3,1,3,"Heikkinen, Miss. Laina",2,26.0,0,0,STON/O2. 3101282,7.925,0,1,2,1,0
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",2,35.0,1,0,113803,53.1,C123,1,2,2,3
5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,0,1,2,1,0
6,0,3,"Moran, Mr. James",1,,0,0,330877,8.4583,0,3,0,1,0
7,0,1,"McCarthy, Mr. Timothy J",1,54.0,0,0,17463,51.8625,E46,1,2,1,5
8,0,3,"Palsson, Master. Gosta Leonard",1,2.0,3,1,349909,21.075,0,1,1,2,0
9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",2,27.0,0,2,347742,11.1333,0,1,2,2,0
10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",2,14.0,1,0,237736,30.0708,0,2,1,2,0


In [27]:
test_data["Embarked"] = test_data['Embarked'].apply(embarked_numerical_categorization) 
test_data.head(n = 20)

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age Category,Alone,Deck
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
892,3,"Kelly, Mr. James",1,34.5,0,0,330911,7.8292,0,3,2,1,0
893,3,"Wilkes, Mrs. James (Ellen Needs)",2,47.0,1,0,363272,7.0,0,1,2,2,0
894,2,"Myles, Mr. Thomas Francis",1,62.0,0,0,240276,9.6875,0,3,2,1,0
895,3,"Wirz, Mr. Albert",1,27.0,0,0,315154,8.6625,0,1,2,1,0
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",2,22.0,1,1,3101298,12.2875,0,1,2,2,0
897,3,"Svensson, Mr. Johan Cervin",1,14.0,0,0,7538,9.225,0,1,1,1,0
898,3,"Connolly, Miss. Kate",2,30.0,0,0,330972,7.6292,0,3,2,1,0
899,2,"Caldwell, Mr. Albert Francis",1,26.0,1,1,248738,29.0,0,1,2,2,0
900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",2,18.0,0,0,2657,7.2292,0,2,2,1,0
901,3,"Davies, Mr. John Samuel",1,21.0,2,0,A/4 48871,24.15,0,1,2,2,0


# Machine Learning Implementation

In [28]:
X_train = train_data.loc[:, ['Age Category', 'Sex', 'Alone', 'Pclass', 'Embarked', 'Deck']]
X_train

Unnamed: 0_level_0,Age Category,Sex,Alone,Pclass,Embarked,Deck
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,2,1,2,3,1,0
2,2,2,2,1,2,3
3,2,2,1,3,1,0
4,2,2,2,1,1,3
5,2,1,1,3,1,0
...,...,...,...,...,...,...
887,2,1,1,2,1,0
888,2,2,1,1,1,2
889,0,2,2,3,1,0
890,2,1,1,1,2,3


In [29]:
y_train = train_data['Survived']
y_train

PassengerId
1      0
2      1
3      1
4      1
5      0
      ..
887    0
888    1
889    0
890    1
891    0
Name: Survived, Length: 891, dtype: int64

In [30]:
X_test = test_data.loc[:, ['Age Category', 'Sex', 'Alone', 'Pclass', 'Embarked', 'Deck']]
X_test

Unnamed: 0_level_0,Age Category,Sex,Alone,Pclass,Embarked,Deck
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
892,2,1,1,3,3,0
893,2,2,2,3,1,0
894,2,1,1,2,3,0
895,2,1,1,3,1,0
896,2,2,2,3,1,0
...,...,...,...,...,...,...
1305,0,1,1,3,1,0
1306,2,2,1,1,2,3
1307,2,1,1,3,1,0
1308,0,1,1,3,1,0


## Logistic Regression

In [31]:
from sklearn.linear_model import LogisticRegression

log_regr = LogisticRegression(max_iter=200)
log_regr.fit(X_train, y_train)
print(f"Coefficients > \n{log_regr.coef_}\nIntercept > {log_regr.intercept_}")

Coefficients > 
[[-0.06232809  2.48536936  0.08893663 -0.78815223  0.27831793  0.17333148]]
Intercept > [-2.71515379]


In [32]:
y_pred = log_regr.predict(X_test)
print(f"y_pred >\n{y_pred}")

y_pred >
[0 1 0 0 1 0 1 0 1 0 0 0 1 0 1 1 0 0 1 1 0 0 1 0 1 0 1 0 0 0 0 0 1 1 1 0 1
 1 0 0 0 1 0 1 1 0 0 0 1 1 0 0 1 1 0 0 0 0 0 1 0 0 0 1 1 1 1 0 1 1 1 0 1 1
 1 1 0 1 0 1 0 0 0 0 0 0 1 1 1 0 1 0 1 0 0 0 1 0 1 0 1 0 0 0 1 0 0 0 0 0 0
 1 1 1 1 0 0 1 1 1 1 0 1 0 0 1 0 1 0 0 0 1 1 0 0 0 0 0 1 0 1 1 0 0 0 0 1 0
 0 0 1 0 0 1 0 0 1 1 0 1 1 0 1 0 0 1 0 0 1 1 0 0 0 0 0 1 1 1 1 1 0 1 1 0 1
 0 1 0 1 0 0 0 0 0 0 0 1 1 0 1 1 0 1 1 0 0 1 0 1 0 0 0 0 1 1 0 1 0 1 0 1 0
 1 0 1 1 0 1 0 0 0 1 0 0 1 0 1 0 1 1 1 1 1 0 0 1 1 0 1 1 1 0 1 0 0 0 0 0 1
 0 0 0 1 1 0 0 0 0 1 0 1 0 1 1 0 1 0 0 0 0 1 0 1 1 1 0 0 0 0 0 0 1 0 0 0 0
 1 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 1 1 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0
 1 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 1 1 0 0 0 1 1 1 0 0 1 0 1 1 0 1 1 0 1 1 0
 0 1 0 0 1 1 1 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0 1 0 1 0 1 0 0
 1 1 1 1 1 1 0 1 0 0 0]


## Artificial Neural Network (Multi-Layer Perceptron Classifier)

In [33]:
from sklearn.neural_network import MLPClassifier

ANN_object = MLPClassifier(max_iter=800)  # Appears to be ineffective computation wise.
ANN_object.fit(X_train, y_train)

MLPClassifier(max_iter=800)

In [34]:
y_pred_ANN = ANN_object.predict(X_test)
print(f"y_pred >\n{y_pred_ANN}")

y_pred >
[0 0 0 0 0 0 1 0 1 0 0 0 1 0 1 1 0 0 0 1 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 1 0 1 1 0 0 0 1 0 0 0 1 1 0 0 0 0 0 1 0 0 0 1 1 0 1 0 0 1 1 0 0 1
 1 0 0 1 0 1 0 0 0 0 0 0 1 0 1 1 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0
 1 1 1 1 0 0 1 0 1 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0
 0 0 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 1 1 0 1 1 0 1
 0 1 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0 1 0 0 1 0 1 0 0 0 0 1 0 0 1 0 1 0 1 0
 1 0 1 0 0 1 0 0 0 1 0 0 0 0 1 0 1 1 1 1 1 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 1
 0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0 0 0 0
 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0
 1 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 1 1 0 0 0 0 1 1 0 0 0 0 1 1 0 1 0 0 0 1 0
 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0 1 0 1 0 1 0 0
 0 1 0 1 1 0 0 1 0 0 0]


# Create Submission .csv
Series with PassengerID, Survived

In [35]:
ls_X_test_PassengerId = (X_test.index).values
ls_X_test_PassengerId

array([ 892,  893,  894,  895,  896,  897,  898,  899,  900,  901,  902,
        903,  904,  905,  906,  907,  908,  909,  910,  911,  912,  913,
        914,  915,  916,  917,  918,  919,  920,  921,  922,  923,  924,
        925,  926,  927,  928,  929,  930,  931,  932,  933,  934,  935,
        936,  937,  938,  939,  940,  941,  942,  943,  944,  945,  946,
        947,  948,  949,  950,  951,  952,  953,  954,  955,  956,  957,
        958,  959,  960,  961,  962,  963,  964,  965,  966,  967,  968,
        969,  970,  971,  972,  973,  974,  975,  976,  977,  978,  979,
        980,  981,  982,  983,  984,  985,  986,  987,  988,  989,  990,
        991,  992,  993,  994,  995,  996,  997,  998,  999, 1000, 1001,
       1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012,
       1013, 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
       1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031, 1032, 1033, 1034,
       1035, 1036, 1037, 1038, 1039, 1040, 1041, 10

In [36]:
ls_submission = []
for index, PassengerId in enumerate(ls_X_test_PassengerId):
    ls_submission.append([PassengerId, y_pred[index]])
    
ls_submission

[[892, 0],
 [893, 1],
 [894, 0],
 [895, 0],
 [896, 1],
 [897, 0],
 [898, 1],
 [899, 0],
 [900, 1],
 [901, 0],
 [902, 0],
 [903, 0],
 [904, 1],
 [905, 0],
 [906, 1],
 [907, 1],
 [908, 0],
 [909, 0],
 [910, 1],
 [911, 1],
 [912, 0],
 [913, 0],
 [914, 1],
 [915, 0],
 [916, 1],
 [917, 0],
 [918, 1],
 [919, 0],
 [920, 0],
 [921, 0],
 [922, 0],
 [923, 0],
 [924, 1],
 [925, 1],
 [926, 1],
 [927, 0],
 [928, 1],
 [929, 1],
 [930, 0],
 [931, 0],
 [932, 0],
 [933, 1],
 [934, 0],
 [935, 1],
 [936, 1],
 [937, 0],
 [938, 0],
 [939, 0],
 [940, 1],
 [941, 1],
 [942, 0],
 [943, 0],
 [944, 1],
 [945, 1],
 [946, 0],
 [947, 0],
 [948, 0],
 [949, 0],
 [950, 0],
 [951, 1],
 [952, 0],
 [953, 0],
 [954, 0],
 [955, 1],
 [956, 1],
 [957, 1],
 [958, 1],
 [959, 0],
 [960, 1],
 [961, 1],
 [962, 1],
 [963, 0],
 [964, 1],
 [965, 1],
 [966, 1],
 [967, 1],
 [968, 0],
 [969, 1],
 [970, 0],
 [971, 1],
 [972, 0],
 [973, 0],
 [974, 0],
 [975, 0],
 [976, 0],
 [977, 0],
 [978, 1],
 [979, 1],
 [980, 1],
 [981, 0],
 [982, 1],

In [37]:
df_submission = pd.DataFrame(ls_submission, columns=['PassengerId', 'Survived'])
df_submission

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [38]:
df_submission.to_csv('submission_ekng.csv', index=False)