In [3]:
#setup kaggle cli by using 
#kg config -g -u $USER -p $PASSWoRD -c titanic
#kg download

In [4]:
import pandas as pd

In [5]:
data = pd.read_csv('data/train.csv')

In [6]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [7]:
X_df = data[['Age', 'Sex', 'PassengerId']]
Y_df = data['Survived']

In [8]:
X_df = X_df.set_index('PassengerId')

In [9]:
X_df.head()

Unnamed: 0_level_0,Age,Sex
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,22.0,male
2,38.0,female
3,26.0,female
4,35.0,female
5,35.0,male


Goal is to create a decision tree based off questions:    is  Age > x and Sex === y

1) create a question class with signature (question_type, x) => question in natural lang


2) create a function that inputs the records (rows) and computes impurity at a node
3 create a function  (node1,node2, question)  => information_gain

then a function that trains the tree

question

partition (question, dataset) => dataset1 dataset2

gini: rows => gini measure

info_gain: (false_rows, left_rows_ gini_impurity at begining row) => info_gain

find best split

 

In [10]:
class Question:

    def __init__(self, question_type, value):
        self.question_type = question_type
       
        self.value = value
        

    def __repr__(self):
        
        if self.question_type =="Age" :
            return "Age >= {} ?".format(self.value)
        elif self.question_type == "Sex":
            return "Sex == {} ?".format(self.value)
        
    def match(self, passenger):
        #passenger will be a DataFrame row
        if self.question_type =="Age" :
            return self.value > passenger.Age
        elif self.question_type == "Sex":
            return self.value == passenger.Sex

Some demos below to see that it works as intended.

In [11]:
q_age = Question("Age",1)

In [12]:
q_age

Age >= 1 ?

In [13]:
q_sex = Question("Sex", "male")

In [14]:
q_sex

Sex == male ?

In [15]:
passenger = X_df.ix[1]

In [16]:
passenger

Age      22
Sex    male
Name: 1, dtype: object

In [17]:
q_age.match(passenger)

False

In [18]:
q_age2 = Question("Age",23)
q_age2.match(passenger)

True

In [19]:
q_sex.match(passenger)

True

In [20]:
Y_df[Y_df==0].count()

549

In [23]:
def gini_impurity(Y_df):
    unique_values = Y_df.unique()
    #will be 0 (died) or 1 (survived)  
    died_count = Y_df[Y_df==0].count()
    survived_count = Y_df[Y_df==1].count()
    total_count = Y_df.count()
    
    result = 1 - (survived_count/total_count)**2 - (died_count/total_count)**2
    return result
    

In [24]:
gini_impurity(Y_df)

0.47301295786144276

In [26]:
gini_impurity(Y_df[Y_df==1])

0.0

Seems like it works.