In [8]:
import numpy as np
import pandas as pd
from scipy.stats import mode

## Prob 1

In [None]:
def giniImpurity(y,classes):
    """Return the Gini impurity of the labels ``y``.

    For each class ``k`` in ``classes`` the fraction of entries of ``y``
    equal to ``k`` is computed; the result is one minus the sum of the
    squared fractions.

    Parameters
    ----------
    y : object supporting ``y == k`` (element-wise) and ``len(y)``
        Observed labels, e.g. a NumPy array or a one-column DataFrame.
    classes : iterable
        The class values whose frequencies are measured.

    Returns
    -------
    A scalar when ``(y == k).sum()`` is a scalar (1-D array input);
    otherwise an array with one entry per column of ``y``.
    """
    n_obs = 1.*len(y)
    fractions = []
    for k in classes:
        fractions.append((y == k).sum()/n_obs)
    fractions = np.array(fractions)
    # The built-in sum adds along the first (class) axis only, so a trailing
    # per-column axis of the fractions survives in the result.
    return 1 - sum(fractions**2.)
#classes = [0,1]
#giniImpurity(trainSurvive,classes)

## Prob 2

In [83]:
def split(D,y,p,x):
    """Partition the rows of ``D`` (and the matching rows of ``y``) in two.

    Parameters
    ----------
    D : pandas.DataFrame
        Feature table.
    y : object indexable by a boolean row mask
        Labels, row-aligned with ``D``.
    p : int
        Position of the column of ``D`` to split on.
    x : scalar
        Split value. When ``x`` is exactly a ``float`` or ``numpy.float64``
        the split is a threshold (``<= x`` versus ``> x``); for any other
        type it is an equality test (``== x`` versus ``!= x``).

    Returns
    -------
    D1, y1, D2, y2
        Rows satisfying the first condition, then rows satisfying the second.
    """
    # .iloc selects the column by position; the older .ix indexer is no
    # longer available in current pandas.
    column = D.iloc[:,p]
    if type(x) in (float, np.float64):
        left = column <= x
        # Kept as an explicit ">" (not "~left") so rows whose value compares
        # False both ways, such as NaN, still land in neither part.
        right = column > x
    else:
        left = column == x
        right = column != x
    return D[left],y[left],D[right],y[right]


## Prob 3

In [84]:
def infoGain(D,y,D1,y1,D2,y2,classes):
    """Return the drop in Gini impurity achieved by a two-way split.

    The impurity of each part (``y1``, ``y2``) is weighted by its share of
    the rows of ``D`` and subtracted from the impurity of ``y``.

    ``D1`` and ``D2`` are accepted for symmetry with ``split`` but are not
    read here.
    """
    n_rows = 1.*len(D)
    weight_left = len(y1)/n_rows
    weight_right = len(y2)/n_rows

    parent_impurity = giniImpurity(y,classes)
    left_term = weight_left*giniImpurity(y1,classes)
    right_term = weight_right*giniImpurity(y2,classes)
    return parent_impurity - left_term - right_term
# NOTE(review): this call needs train, trainSurvive, D1, y1, D2, y2 and
# classes to already exist in the kernel; none of them is defined above this
# line, so it cannot run top-to-bottom as written.
infoGain(train,trainSurvive,D1,y1,D2,y2,classes)

array([  9.92344045e-05])

## Prob 4

In [12]:
def optimalSplit(D,y,classes):
    """Search every column and every observed value for the best split.

    Each unique value ``x`` of each column position ``p`` of ``D`` is tried
    with ``split``; the candidate with the strictly largest ``infoGain`` is
    kept (the first one found wins ties).

    Parameters
    ----------
    D : pandas.DataFrame
        Feature table.
    y : labels, row-aligned with ``D``.
    classes : iterable
        Class values, forwarded to ``infoGain``.

    Returns
    -------
    (best_p, best_x)
        Column position and split value, or ``(None, None)`` when no
        candidate separates the rows into two non-empty parts.
    """
    best_x = None
    best_p = None
    max_I = -np.inf
    for p in range(len(D.columns)):
        # Positional column access; the older .ix indexer is no longer
        # available in current pandas.
        for x in np.unique(D.iloc[:,p]):
            D1,y1,D2,y2 = split(D,y,p,x)
            # A split with an empty side separates nothing. Previously such
            # candidates were rejected only because the impurity of an empty
            # part evaluates to NaN; skipping them makes that explicit and
            # avoids the 0/0 computation.
            if len(y1) == 0 or len(y2) == 0:
                continue
            I = infoGain(D,y,D1,y1,D2,y2,classes)
            if I > max_I:
                best_x = x
                best_p = p
                max_I = I
    return best_p,best_x
#p,x = optimalSplit(train,trainSurvive,classes)

## Prob 5 - 7

In [None]:
class Node:
    """One node of a binary classification tree, grown recursively.

    Constructing a node either makes it a leaf (``label`` set, ``p``/``x``
    and both children ``None``) or an internal node (``label`` is ``None``,
    ``p``/``x`` describe the split and both children are ``Node`` objects).

    Parameters
    ----------
    D : pandas.DataFrame
        Feature rows reaching this node.
    y : labels, row-aligned with ``D``.
    classes : iterable
        Class values, forwarded to ``giniImpurity`` / ``optimalSplit``.
    max_dep : int
        A node whose depth equals this value becomes a leaf.
    tol : float
        A node whose Gini impurity is below this value becomes a leaf.
    cur_dep : int, optional
        Depth of this node; the root is 1.
    """
    def __init__(self,D,y,classes,max_dep,tol,cur_dep=1):
        # The training rows are not stored on the node; the attribute is
        # kept (always None) so existing readers of it keep working.
        self.D = None

        self.tol = tol
        self.cur_dep = cur_dep
        self.max_dep = max_dep

        self.G = giniImpurity(y,classes)

        # Start from a "no split" state; the split fields are filled in only
        # if this node ends up internal.
        self.label = None
        self.leftChild = None
        self.rightChild = None
        self.p = None
        self.x = None

        if self.G < self.tol or self.cur_dep == self.max_dep:
            self.label = self._majorityLabel(y)
            return

        p,x = optimalSplit(D,y,classes)
        if p is None:
            # No usable split was found: fall back to a leaf.
            self.label = self._majorityLabel(y)
            return

        D1,y1,D2,y2 = split(D,y,p,x)
        self.leftChild = Node(D1,y1,classes,max_dep,tol,cur_dep+1)
        self.rightChild = Node(D2,y2,classes,max_dep,tol,cur_dep+1)
        self.p = p
        self.x = x

    def _majorityLabel(self,y):
        """Return the most frequent value in (the first column of) ``y``."""
        # np.ravel makes this independent of whether scipy's mode keeps or
        # drops the reduced axis, which differs between SciPy versions.
        return np.ravel(mode(np.asarray(y)).mode)[0]

    def printTree(self):
        """Print the subtree rooted here, one node per line.

        Internal nodes print ``p`` and ``x``; leaves print their label. Each
        line is indented by one space per level below the root.
        """
        indent = " "*(self.cur_dep-1)
        if self.p is not None:
            # A single pre-formatted string prints identically under the
            # Python 2 print statement and the Python 3 print function.
            print("%s %s %s" % (indent,self.p,self.x))
            self.leftChild.printTree()
            self.rightChild.printTree()
        else:
            print("%s %s" % (indent,self.label))

    def classify(self,d):
        """Return the predicted label for one observation ``d``.

        ``d`` is read by column position: a pandas Series is accessed through
        ``.iloc``; anything else through ``d[p]``. The split test mirrors
        ``split``: threshold when ``x`` is exactly a ``float`` or
        ``numpy.float64``, equality otherwise.
        """
        if self.label is not None:
            return self.label
        # A row Series is usually indexed by column name, so an integer key
        # must go through .iloc rather than rely on positional fallback.
        value = d.iloc[self.p] if hasattr(d,"iloc") else d[self.p]
        if type(self.x) == float or type(self.x) == np.float64:
            goes_left = value <= self.x
        else:
            goes_left = value == self.x
        child = self.leftChild if goes_left else self.rightChild
        return child.classify(d)

## Prob 8

In [33]:
def _featuresAndLabels(frame):
    """Return (features, labels) for one slice of the raw table.

    Missing values are filled with 0, the label is the one-column
    "Survived" frame, and the features are "Pclass", "Sex" and "Age" with
    'male'/'female' recoded to 1/0.
    """
    filled = frame.fillna(0)
    labels = filled[["Survived"]]
    features = filled[["Pclass","Sex","Age"]].replace({'male': 1, 'female': 0})
    return features,labels

allData = pd.read_csv("titanic4real.csv")

# The first 80% of the rows train the tree; the remainder is held out.
cutoff = int(len(allData)*.8)
train,trainSurvive = _featuresAndLabels(allData[:cutoff])
test,testSurvive = _featuresAndLabels(allData[cutoff:])

classes = [0,1]

In [34]:
# Grow the tree on the training rows: stop at depth 10 or once a node's
# Gini impurity drops below 0.1.
tree = Node(train,trainSurvive,classes,max_dep=10,tol=.1)

In [35]:
# Dump the fitted tree: internal nodes print "column-position split-value",
# leaves print their label, with one extra leading space per depth level.
tree.printTree()

 1 0
  0 2.0
   0 1.0
    1.0
    2 55.0
     2 0.0
      1.0
      2 17.0
       1.0
       2 18.0
        1.0
        2 44.0
         2 42.0
          1.0
          0.0
         1.0
     0.0
   2 5.0
    2 2.0
     2 0.0
      1.0
      2 0.75
       1.0
       2 1.0
        0.0
        0.0
     1.0
    2 11.0
     0.0
     2 16.0
      2 15.0
       1.0
       1.0
      2 36.0
       2 21.0
        2 19.0
         2 18.5
          0.0
          1.0
         0.0
        2 24.0
         2 22.0
          1.0
          1.0
         2 32.0
          0.0
          1.0
       2 43.0
        2 38.0
         2 37.0
          0.0
          0.0
         0.0
        2 45.0
         0.0
         0.0
  0 1.0
   2 54.0
    2 47.0
     2 45.0
      2 0.0
       0.0
       2 13.0
        1.0
        2 36.0
         2 33.0
          0.0
          1.0
         2 42.0
          0.0
          0.0
      0.0
     2 49.0
      2 48.0
       1.0
       1.0
      2 50.0
       0.0
       2 53.0
        2 52.

In [58]:
# Predict a label for every held-out row; axis=1 hands each row to
# tree.classify as one observation.
labs = test.apply(tree.classify,axis=1)

In [91]:
# Accuracy on the held-out rows: the fraction of predictions that equal the
# recorded outcome. (The former labs.to_frame() call was dropped: its result
# was discarded, so it had no effect.)
lv = labs.values
# testSurvive is a one-column frame; flatten it so it lines up with lv.
tv = testSurvive.values.reshape(lv.shape[0])
# The whole expression is parenthesised so it prints the same value under
# both the Python 2 print statement and the Python 3 print function.
print((tv == lv).sum()/(1.*len(tv)))

0.782442748092
