In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder

In [2]:
df = pd.read_csv("../datasets/titanic/train.csv")

In [3]:
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [4]:
cut = df[['Survived', 'Pclass', 'Sex', 'Age', 'SibSp',
       'Parch']].copy()

In [5]:
cut.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch
count,891.0,891.0,714.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594
std,0.486592,0.836071,14.526497,1.102743,0.806057
min,0.0,1.0,0.42,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0
50%,0.0,3.0,28.0,0.0,0.0
75%,1.0,3.0,38.0,1.0,0.0
max,1.0,3.0,80.0,8.0,6.0


In [6]:
cut.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 6 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Sex         891 non-null object
Age         714 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
dtypes: float64(1), int64(4), object(1)
memory usage: 41.8+ KB


In [7]:
droped = cut.loc[cut["Age"].isna()]

In [8]:
cut.dropna(inplace=True)

In [9]:
board = cut.groupby(["Survived", "Pclass", "Sex"])[["Age"]].mean()
board.reset_index(inplace=True)

In [24]:
board

Unnamed: 0,Survived,Pclass,Sex,Age
0,0,1,female,25.666667
1,0,1,male,44.581967
2,0,2,female,36.0
3,0,2,male,33.369048
4,0,3,female,23.818182
5,0,3,male,27.255814
6,1,1,female,34.939024
7,1,1,male,36.248
8,1,2,female,28.080882
9,1,2,male,16.022


In [10]:
def get_age(row):
    item = board.loc[(board["Survived"]==row["Survived"]) & (board["Pclass"]==row["Pclass"]) & (board["Sex"]==row["Sex"])]
    row["Age"] = item.iat[0,-1]
    return row

In [11]:
board.loc[(board["Survived"]==0) & (board["Pclass"]==1) & (board["Sex"]=="male")]

Unnamed: 0,Survived,Pclass,Sex,Age
1,0,1,male,44.581967


In [12]:
droped = droped.apply(get_age, axis=1)

In [13]:
total = pd.concat([cut, droped])

In [14]:
total.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 888
Data columns (total 6 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Sex         891 non-null object
Age         891 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
dtypes: float64(1), int64(4), object(1)
memory usage: 48.7+ KB


In [15]:
le = LabelEncoder()

In [16]:
total["Sex"] = le.fit_transform(total["Sex"])

In [17]:
le.classes_

array(['female', 'male'], dtype=object)

In [18]:
X = total.drop(["Survived"], axis=1)

In [19]:
y = total["Survived"]

In [20]:
X_train, X_test, y_train, y_test = train_test_split(
...     X, y, test_size=0.33, random_state=42)

In [21]:
model = DecisionTreeClassifier(max_depth=15)

In [22]:
model.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=15,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [23]:
model.score(X_test, y_test)

0.823728813559322

In [176]:
def entropy(col):
    vals, counts = np.unique(col, return_counts=True)
    
    acc = 0
    probs = counts / len(col)
    for prob in probs:
        acc += prob * np.log2(prob)
        
    return - acc

In [177]:
entropy(total["Survived"])

0.9607079018756469

In [180]:
def infogain(X, col_name, y):
    
    pivot = X[col_name].mean()
    left = y.loc[X[col_name] < pivot]
    right = y.loc[X[col_name] >= pivot]
    
    l_entropy = entropy(left)
    r_entropy = entropy(right)
    tot_entropy = entropy(y)
    p_left = len(left)/len(y)
    p_right = len(right)/len(y)
    
    return tot_entropy - p_left*l_entropy - p_right*r_entropy


In [181]:
for col in X.columns:
    print("ingogain for ", col, infogain(X, col, y))

ingogain for  Pclass 0.07579362743608165
ingogain for  Sex 0.2176601066606143
ingogain for  Age 0.0008052404301705507
ingogain for  SibSp 0.009584541813400127
ingogain for  Parch 0.015380754493137666
