In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv('data_temp/titanic.csv')
data['Label'] = data.Survived

In [3]:
data = data.drop(['PassengerId','Survived','Ticket','Cabin','Name'],axis=1)

In [4]:
mode_embarked = data.Embarked.mode()[0]
age_median = data.Age.median()
data = data.fillna({'Age':age_median,'Embarked':mode_embarked})

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
Pclass      891 non-null int64
Sex         891 non-null object
Age         891 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
Embarked    891 non-null object
Label       891 non-null int64
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB


In [6]:
from sklearn.model_selection import train_test_split

In [55]:
train,test = train_test_split(data,test_size=0.2)

### Helper Functions

In [59]:
#Check Data purity
def purity_check(data):
    y = data[:,-1]
    if len(set(y)) == 1:
        return True
    else:
        return False

def classify(data):
    '''Returns the most frequent label occuring in the
        dependent variable data'''
    y = data[:,-1]
    unique_labels,label_counts = np.unique(y,return_counts=True)
    classification = unique_labels[np.argmax(label_counts)]
    return classification

def get_feature_types(df):
    n_unique_value_threshold = 5
    feature_types = []
    for index in range(df.shape[1]):
        datatype = str(df.iloc[:,index].dtype)
        cond1_for_num = 'float' in datatype or 'int' in datatype
        cond2_for_num = len(np.unique(df.iloc[:,index].values)) > n_unique_value_threshold
        if cond1_for_num and cond2_for_num:
            feature_type = 'numerical'
        else:
            feature_type = 'categorical'
        feature_types.append(feature_type)
    return feature_types

def get_potential_splits(data,feature_types):
    '''Return a dictionary of potential splits with keys 
        corresponding to columns of feature matrix and 
        values corresponding to pontial spilts in each 
        column''' 
    X = data[:,:-1] #Extracting the feature matrix
    potential_splits = {}
    for column_index in range(X.shape[1]):
        unique_values = np.unique(X[:,column_index])
        if feature_types[column_index] == 'numerical' and len(unique_values) > 1:
            potential_splits[column_index] = []
            for index in range(1,len(unique_values)): #Skiping the first index
                current_value = unique_values[index]
                previous_value = unique_values[index - 1]
                potential_split = (current_value + previous_value) / 2  
                potential_splits[column_index].append(potential_split)
        elif feature_types[column_index] == 'categorical' and len(unique_values) > 1:
            potential_splits[column_index] = list(unique_values)
    return potential_splits

def split_data(data,split_col,split_value,feature_types):
    feature_type = feature_types[split_col]
    
    if feature_type == 'numerical':
        data_left = data[data[:,split_col] <= split_value]
        data_right = data[data[:,split_col] > split_value ]

    elif feature_type == 'categorical':        
        data_left = data[data[:,split_col] == split_value]
        data_right = data[data[:,split_col] != split_value]
    
    return data_left, data_right

def leaf_entropy(y):
    _,counts = np.unique(y,return_counts=True)
    probabilities = counts/counts.sum()
    entropy = np.sum(probabilities*-np.log2(probabilities))
    return entropy

def split_entropy(data_left,data_right):
    n = len(data_left) + len(data_right)
    p_left = len(data_left)/n
    p_right = len(data_right)/n
    entropy = p_left*leaf_entropy(data_left[:,-1]) + p_right*leaf_entropy(data_right[:,-1])
    return entropy

def determine_best_split(data,potential_splits,feature_types):
    entropy_best_split = 999
    for column_index in potential_splits:
        for value in potential_splits[column_index]:
            data_left,data_right = split_data(data,column_index,value,feature_types)
            entropy = split_entropy(data_left,data_right)
            if entropy <= entropy_best_split:
                entropy_best_split = entropy
                best_split_col = column_index
                best_split_value = value
    return best_split_col,best_split_value

def decision_tree_classification(data,min_samples,max_depth,column_headers,feature_types,counter):
    potential_splits = get_potential_splits(data,feature_types)
    #Base case, if the data is pure
    if (purity_check(data)) or (len(data) < min_samples) or (counter == max_depth) or len(potential_splits) == 0:
        return classify(data)                                                         # if there are no potential splits,
                                                                                      # classify the data
    #Recrusive Part
    else:
        counter += 1
        #helper functions
        split_col,split_val = determine_best_split(data,potential_splits,feature_types)
        data_left, data_right = split_data(data,split_col,split_val,feature_types) #f
        
        col_name = column_headers[split_col]
        feat_type = feature_types[split_col]
        if feat_type == 'numerical':
            question = f'{col_name} <= {split_val}'
        else:
            question = f'{col_name} = {split_val}'
        subtree = {question: []}
        yes_ans = decision_tree_classification(data_left,min_samples,max_depth,column_headers,feature_types,counter)
        no_ans = decision_tree_classification(data_right,min_samples,max_depth,column_headers,feature_types,counter)
        
        if yes_ans == no_ans:
            subtree = yes_ans
        else:
            subtree[question].append(yes_ans)
            subtree[question].append(no_ans)

    return subtree

def single_predict(example,tree):
    question = list(tree.keys())[0]
    col_name,operator,value = question.split()
    if operator == '<=':
        if str(example[col_name]) <= value:
            answer = tree[question][0]
        else:
            answer = tree[question][1]
    else:
         if str(example[col_name]) == value:
            answer = tree[question][0]
         else:
            answer = tree[question][1]
    if not isinstance(answer, dict):
        return answer
    else:
        residual_tree = answer
        return single_predict(example,residual_tree)

In [9]:
class DecisionTreeClassifier():
    def __init__(self,max_depth=None,min_samples=3):
        self.tree = None
        self.max_depth = max_depth
        self.min_samples = min_samples
        self.column_headers = None
    
    def train(self,X,y):
        '''The feature matrix(X) must be a data frame, so that the column
        headers may be extracted'''
        column_headers = X.columns
        feature_types = get_feature_types(X)
        self.counter = 0
        X = X.values
        y = y.values.reshape(y.size,-1)
        data = np.hstack((X,y)) #Converting the dataframes into an array
        
        self.tree = decision_tree_classification(data,self.min_samples,self.max_depth,column_headers,feature_types,self.counter)
        return self.tree
    
    def predict(self,X):
        predictions = []
        for index in range(len(X)):
            example = X.iloc[index]
            prediction = single_predict(example,self.tree)
            predictions.append(prediction)
        predictions = np.array(predictions)
        return predictions

In [44]:
clf = DecisionTreeClassifier(max_depth=10,min_samples=2)

In [60]:
from pprint import pprint
pprint(clf.train(data.iloc[:,:-1],data.iloc[:,-1]))

{'Sex = male': [{'Fare <= 26.26875': [{'Age <= 13.5': [{'SibSp <= 2.0': [{'Parch <= 0.5': [{'Fare <= 15.014600000000002': [1,
                                                                                                                           0]},
                                                                                           1]},
                                                                         0]},
                                                       {'Embarked = C': [{'Fare <= 15.1479': [{'Age <= 29.5': [{'Age <= 28.75': [{'Age <= 22.5': [{'Parch <= 0.5': [{'Fare <= 5.61875': [0,
                                                                                                                                                                                         1]},
                                                                                                                                                                    0]},
                         

In [61]:
pred = clf.predict(test.iloc[:,:-1])

In [62]:
truth = test.values[:,-1]
a = pred == truth
a.sum()/a.size

0.8324022346368715