In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
df = pd.read_csv("../titanic/train.csv")

In [3]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [5]:
# Columns kept for modelling: the target ("Survived") followed by the
# features used later in the notebook.
selected_cols = [
    "Survived",
    "Pclass",
    "Sex",
    "Age",
    "SibSp",
    "Parch",
]

In [6]:
# Work on an explicit copy of the selected columns: the cells below assign
# new values into `data`, and an independent frame guarantees those writes
# never alias `df` or trigger pandas' SettingWithCopyWarning.
data = df.loc[:, selected_cols].copy()

In [7]:
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch
0,0,3,male,22.0,1,0
1,1,1,female,38.0,1,0
2,1,3,female,26.0,0,0
3,1,1,female,35.0,1,0
4,0,3,male,35.0,0,0


In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 6 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Sex         891 non-null object
Age         714 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
dtypes: float64(1), int64(4), object(1)
memory usage: 41.8+ KB


In [9]:
# Impute missing ages with the mean age (Series.mean skips NaN, exactly as
# np.mean on a Series does). The filled column is assigned back instead of
# calling fillna(inplace=True) on `data.Age`: an in-place call on a column
# obtained by attribute access is chained assignment, which newer pandas
# warns about and which does not update `data` under Copy-on-Write.
data["Age"] = data["Age"].fillna(data["Age"].mean())

In [10]:
le = LabelEncoder()

In [11]:
data["Sex"] = le.fit_transform(data.Sex)

In [12]:
# le.classes_

In [13]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 6 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Sex         891 non-null int64
Age         891 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
dtypes: float64(1), int64(5)
memory usage: 41.8 KB


In [14]:
data.columns

Index(['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch'], dtype='object')

In [15]:
# Feature matrix: the five predictor columns.
X = data[["Pclass", "Sex", "Age", "SibSp", "Parch"]]
# Target vector: the "Survived" column.
y = data.loc[:, "Survived"]

In [16]:
def entropy(column):
    """Return the Shannon entropy, in bits, of the values in ``column``.

    Every distinct value with relative frequency ``p`` contributes
    ``p * log2(p)``; the negated total of those terms is returned. An empty
    ``column`` has no terms, so the result is 0.
    """
    n_items = len(column)
    frequencies = np.unique(column, return_counts=True)[1]

    # Accumulate the terms one by one, in the order np.unique reports them.
    weighted_logs = (
        (freq / n_items) * np.log2(freq / n_items) for freq in frequencies
    )
    return -sum(weighted_logs)

In [17]:
entropy(y)

0.9607079018756469

In [18]:
def info_gain(X, y, label):
    """Information gain from splitting ``y`` on column ``label`` at its mean.

    Rows whose ``X[label]`` is below the column mean go left, rows at or
    above it go right. The gain is the entropy of ``y`` minus the
    size-weighted entropies of the two sides.

    Returns:
        The gain, or the sentinel ``-1000`` when one side of the split is
        empty (the column cannot separate the rows).
    """
    feature = X[label]
    threshold = np.mean(feature)

    y_left = y[feature < threshold]
    y_right = y[feature >= threshold]
    n_left, n_right = len(y_left), len(y_right)

    # A one-sided split carries no information; report it with the sentinel.
    if n_left == 0 or n_right == 0:
        return -1000

    n_total = len(y)
    left = (n_left / n_total) * entropy(y_left)
    right = (n_right / n_total) * entropy(y_right)

    return entropy(y) - left - right

In [19]:
for label in X.columns:
    print(label, info_gain(X, y, label))

Pclass 0.07579362743608165
Sex 0.2176601066606143
Age 0.001158644038169343
SibSp 0.009584541813400127
Parch 0.015380754493137666


In [20]:
class Node:
    """One node of the decision tree.

    An internal node stores the feature name in ``label`` and the split
    threshold in ``value``; a leaf keeps ``label`` as None and stores its
    prediction in ``result``.
    """

    def __init__(self, label=None, value=None, result=None, left=None, right=None):
        """Create a node.

        Args:
            label: Name of the feature this node splits on (None for a leaf).
            value: Threshold compared against the feature.
            result: Prediction stored on a leaf.
            left: Child node for rows below the threshold.
            right: Child node for rows at or above the threshold.
        """
        self.label = label
        self.value = value
        self.result = result
        # Always define the children so every node, leaves included, can be
        # inspected without an AttributeError; they may still be assigned
        # after construction.
        self.left = left
        self.right = right


In [21]:
class DecisionTree:
    """Binary decision-tree classifier for a 0/1 target.

    Each internal node splits on a single feature at the mean of that
    feature over the rows that reach the node; the feature chosen is the one
    with the highest ``info_gain``. A leaf stores the mean of the target
    values that reached it, and the predicted class is 1 when that mean is
    strictly greater than 0.5.
    """

    def __init__(self, max_depth=5):
        """Create an unfitted tree.

        Args:
            max_depth: Number of levels in the tree, counting the leaf
                level. A value of 1 or less yields a single leaf.
        """
        self.max_depth = max_depth
        # Assigned by fit(); None marks a model that has not been trained.
        self.root = None

    def fit(self, X, y):
        """Build the tree from the feature frame ``X`` and the target ``y``.

        Raises:
            ValueError: If ``y`` is empty.
        """
        if len(y) == 0:
            raise ValueError("Cannot fit a DecisionTree on an empty training set.")
        self.root = self.generate(X, y, self.max_depth)

    def generate(self, X, y, depth):
        """Recursively build and return the subtree for the rows in ``X``/``y``.

        Args:
            X: DataFrame of features for the rows reaching this node.
            y: Target values aligned with ``X``.
            depth: Remaining number of levels, counting the leaf level.
        """
        # `<= 1` rather than `== 1`, so a max_depth below 1 terminates
        # instead of recursing until RecursionError. A pure node cannot be
        # improved by splitting, so it becomes a leaf immediately.
        if depth <= 1 or len(np.unique(y)) <= 1:
            return Node(result=np.mean(y))

        gains = [(info_gain(X, y, label), label) for label in X.columns]
        if not gains:
            # No features to split on.
            return Node(result=np.mean(y))
        selected_label = max(gains)[1]

        pivot = np.mean(X[selected_label])
        left = X[selected_label] < pivot
        right = X[selected_label] >= pivot

        # If even the best feature sends every row to one side, splitting
        # would create an empty child whose mean is NaN. Stop here instead.
        if not left.any() or not right.any():
            return Node(result=np.mean(y))

        node = Node(selected_label, pivot)
        node.left = self.generate(X[left], y[left], depth - 1)
        node.right = self.generate(X[right], y[right], depth - 1)
        return node

    def display(self, node=None, indent=0):
        """Print the subtree rooted at ``node``, one node per line.

        Args:
            node: Node to start from; defaults to the fitted root.
            indent: Number of tab characters printed before the first line.
        """
        if node is None:
            node = self._fitted_root()
        self._display(node, indent)

    def _display(self, node, indent):
        """Recursive worker for display()."""
        if node.label is None:
            # Same decision rule as predict(): class 1 only when result > 0.5.
            if node.result > .5:
                print("\t" * indent, "Survived :)))))")
            else:
                print("\t" * indent, "Died :(")
            return

        print(indent * "\t", node.label, node.value)
        self._display(node.left, indent + 1)
        self._display(node.right, indent + 1)

    def predict_point(self, row, node):
        """Return the leaf ``result`` reached by ``row`` starting at ``node``."""
        if node.label is None:
            return node.result

        if row[node.label] < node.value:
            return self.predict_point(row, node.left)
        return self.predict_point(row, node.right)

    def predict(self, X):
        """Return an integer array of 0/1 predictions, one per row of ``X``.

        Raises:
            RuntimeError: If the model has not been fitted.
        """
        root = self._fitted_root()
        predictions = [
            int(self.predict_point(row, root) > .5) for _, row in X.iterrows()
        ]
        return np.array(predictions, dtype=int)

    def score(self, X, y):
        """Return the fraction of rows in ``X`` whose prediction equals ``y``.

        ``y`` may be a pandas Series, a NumPy array or a plain sequence.
        """
        yp = self.predict(X)
        return np.sum(np.asarray(y) == yp) / len(y)

    def _fitted_root(self):
        """Return the root node, or raise if fit() has not been called."""
        root = getattr(self, "root", None)
        if root is None:
            raise RuntimeError("DecisionTree is not fitted yet; call fit() first.")
        return root
    
            
            
        

In [22]:
# Hold out 33% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

In [35]:
model = DecisionTree(8)
model.fit(X_train, y_train)

In [36]:
# model.display(model.root)

In [37]:
# model.root.left.label

In [38]:
# model.predict(X.loc[:20])


In [39]:
model.score(X_test, y_test)

0.8033898305084746

In [28]:
model.score(X_train, y_train)

0.9228187919463087

In [29]:
sum(y == 0) / len(y)

0.6161616161616161

In [109]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

In [100]:
model = DecisionTreeClassifier?

In [None]:
model = DecisionTreeClassifier

In [105]:
model = DecisionTreeClassifier(max_depth=3, criterion="entropy")

In [106]:
model.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [107]:
model.score(X_test, y_test)

0.8101694915254237

In [108]:
model.score(X_train, y_train)

0.8104026845637584

In [92]:
model = RandomForestClassifier(n_estimators=40, max_depth=3, max_features=5, random_state=0)

In [93]:
model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=3, max_features=5, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=40, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [94]:
model.score(X_test, y_test)

0.8203389830508474

In [95]:
model.score(X_train, y_train)

0.8204697986577181

In [110]:
model = AdaBoostClassifier?

In [111]:
model = AdaBoostClassifier(n_estimators=50)

In [113]:
model.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)

In [114]:
model.score(X_test, y_test)

0.8169491525423729