In [100]:
import numpy as np
import matplotlib.pyplot as plt

import pandas as pd

In [101]:
data = pd.read_csv("../datasets/titanic/train.csv")

In [102]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [103]:
data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [104]:
mod = data.drop(['PassengerId', 'Name','Ticket', 'Fare', 'Cabin', 'Embarked'], axis=1)

In [105]:
np.unique(mod.Sex)

array(['female', 'male'], dtype=object)

In [106]:
from sklearn.preprocessing import LabelEncoder

In [107]:
le = LabelEncoder()

In [108]:
mod["Sex"] = le.fit_transform(mod["Sex"])

In [109]:
study_data = mod.dropna()

In [110]:
study_data.describe()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch
count,714.0,714.0,714.0,714.0,714.0,714.0
mean,0.406162,2.236695,0.634454,29.699118,0.512605,0.431373
std,0.49146,0.83825,0.481921,14.526497,0.929783,0.853289
min,0.0,1.0,0.0,0.42,0.0,0.0
25%,0.0,1.0,0.0,20.125,0.0,0.0
50%,0.0,2.0,1.0,28.0,0.0,0.0
75%,1.0,3.0,1.0,38.0,1.0,1.0
max,1.0,3.0,1.0,80.0,5.0,6.0


In [111]:
table = study_data.groupby(["Sex", "Pclass"]).mean()[["Age"]].reset_index()

In [122]:
def fill_age(row):
    if not row.notna()["Age"]:
        row["Age"] = table.loc[((table['Sex'] == row["Sex"]) & (table['Pclass'] == row["Pclass"]))]["Age"]  
    return row

In [123]:
mod = mod.apply(fill_age, axis=1)

In [124]:
mod.describe()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch
count,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.383838,2.308642,0.647587,29.318643,0.523008,0.381594
std,0.486592,0.836071,0.47799,13.281103,1.102743,0.806057
min,0.0,1.0,0.0,0.42,0.0,0.0
25%,0.0,2.0,0.0,21.75,0.0,0.0
50%,0.0,3.0,1.0,26.507589,0.0,0.0
75%,1.0,3.0,1.0,36.0,1.0,0.0
max,1.0,3.0,1.0,80.0,8.0,6.0


In [131]:
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier

In [126]:
X, y = mod.drop(["Survived"], axis=1), mod[["Survived"]]

In [130]:
X_train, X_test, y_train, y_test = train_test_split(
...     X, y, test_size=0.33, random_state=42)

In [136]:
model = DecisionTreeClassifier(criterion="entropy", max_depth=10)

In [137]:
model.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=10,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [138]:
model.score(X_test, y_test)

0.7762711864406779

In [139]:
def entropy(col):
    items, counts = np.unique(col, return_counts=True)
    probs = counts / np.sum(counts)
    return -np.sum(probs * np.log10(probs))
   

In [140]:
entropy(y)

0.2892018955359784

In [150]:
def info_gain(X, y, label):
    
    pivot = np.mean(X[label])
    y_left = y.loc[X[label] < pivot]
    y_right = y.loc[X[label] >= pivot]
    
    e_left = entropy(y_left)
    e_right = entropy(y_right)
    
    p_left = len(y_left)/len(y)
    p_right = len(y_right)/len(y)
    
    gain = entropy(y) - p_left*e_left - p_right*e_right
    return gain
    

In [151]:
for col in X.columns:
    print(col, info_gain(X, y, col), sep=":")

Pclass:0.022816155338440947
Sex:0.06552222096426633
Age:0.0003328207876600875
SibSp:0.002885234580529028
Parch:0.004630068458377962
