In [None]:
import numpy as np
import pandas as pd
from scipy.stats import mode

In addition to randomizing the variables considered at each split, my random forest also builds each tree from a randomly chosen subset of the training data.

In [2]:
def giniImpurity(y,classes):
    """Return the Gini impurity of the labels in ``y``.

    The impurity is ``1 - sum_k f_k**2`` where ``f_k`` is the fraction of
    entries of ``y`` that equal class ``k``.

    Parameters
    ----------
    y : object supporting elementwise ``==`` followed by ``.sum()``, and ``len``
        The observed labels.
    classes : iterable
        The possible labels.

    Returns
    -------
    One minus the sum of the squared class fractions.

    Notes
    -----
    There is no guard for an empty ``y``; the fractions are then computed
    as a division by ``0.0``.
    """
    total = 1.*len(y)
    fractions = np.array([(y == label).sum()/total for label in classes])
    return 1 - sum(fractions**2.)

def split(D,y,p,x):
    """Partition the rows of ``D`` (and the matching rows of ``y``) on column ``p``.

    Parameters
    ----------
    D : pandas DataFrame
        One row per observation.
    y : pandas object indexed like ``D``
        The labels; filtered with the same boolean masks as ``D``.
    p : int
        Positional index of the column to split on.
    x : scalar
        Split value. A float (``np.float64`` is a ``float`` subclass) gives a
        threshold split; any other type gives an equality split.

    Returns
    -------
    D1, y1, D2, y2
        ``D1, y1`` hold the rows with ``column <= x`` (float ``x``) or
        ``column == x`` (otherwise); ``D2, y2`` hold the rows with
        ``column > x`` or ``column != x`` respectively.
    """
    # `.ix` was removed in pandas 1.0; every caller in this notebook passes a
    # column position, so positional `.iloc` is the equivalent lookup.
    column = D.iloc[:,p]
    if isinstance(x,float):
        left = column <= x
        # Not `~left`: rows holding NaN must fail both comparisons.
        right = column > x
    else:
        left = column == x
        right = column != x
    return D[left],y[left],D[right],y[right]

def infoGain(D,y,D1,y1,D2,y2,classes):
    """Return the reduction in Gini impurity achieved by a split.

    The gain is the impurity of ``y`` minus the impurities of ``y1`` and
    ``y2``, each weighted by its share ``len(y_i)/len(D)`` of the rows.

    Parameters
    ----------
    D, y : the data and labels before the split (only ``len(D)`` is used of D)
    D1, y1 : the first part of the split (``D1`` itself is not used)
    D2, y2 : the second part of the split (``D2`` itself is not used)
    classes : iterable of the possible labels
    """
    total = 1.*len(D)
    parent = giniImpurity(y,classes)
    first = len(y1)/total*giniImpurity(y1,classes)
    second = len(y2)/total*giniImpurity(y2,classes)
    return parent - first - second

def optimalSplit(D,y,classes,avail_vars):
    """Find the (column, value) pair whose split gives the largest gain.

    Every unique value of every column listed in ``avail_vars`` is tried as
    a split value via ``split`` and scored with ``infoGain``.

    Parameters
    ----------
    D : pandas DataFrame
        One row per observation.
    y : pandas object indexed like ``D``
        The labels.
    classes : iterable
        The possible labels.
    avail_vars : iterable of int
        Positional indices of the candidate columns.

    Returns
    -------
    best_p, best_x
        The chosen column position and split value, or ``(None, None)`` when
        no candidate produced two non-empty parts.
    """
    best_x = None
    best_p = None
    # `np.infty` was removed in NumPy 2.0; the builtin float works everywhere.
    max_I = -float("inf")
    for p in avail_vars:
        # `.ix` was removed in pandas 1.0; the candidates are column positions.
        for x in np.unique(D.iloc[:,p]):
            D1,y1,D2,y2 = split(D,y,p,x)
            # A split with an empty side separates nothing. Previously such
            # splits were only rejected because their gain evaluated to NaN;
            # skip them explicitly instead of dividing by zero.
            if len(y1) == 0 or len(y2) == 0:
                continue
            I = infoGain(D,y,D1,y1,D2,y2,classes)
            if I > max_I:
                best_x = x
                best_p = p
                max_I = I
    return best_p,best_x

class ForestNode:
    """A node of a randomized classification tree; building the root builds the tree.

    A node is a leaf when ``label`` is not None. Otherwise it stores the
    split column position ``p``, the split value ``x`` and two children.
    """
    def __init__(self,D,y,classes,max_dep,num_vars,tol,cur_dep=1,avail_vars=None):
        """Build this node and, recursively, its subtree.

        Parameters
        ----------
        D : pandas DataFrame
            One row per observation reaching this node.
        y : pandas object indexed like ``D``
            The labels of those observations.
        classes : iterable
            The possible labels.
        max_dep : int
            Depth at which a node is forced to be a leaf.
        num_vars : int
            Number of randomly chosen columns examined for the split.
        tol : float
            A node whose Gini impurity is below this value becomes a leaf.
        cur_dep : int
            Depth of this node (the root is 1).
        avail_vars : iterable of int or None
            Column positions the node may split on; None means all columns.
        """
        self.D = None
        # `avail_vars == None` compares elementwise when an ndarray is passed
        # (as it is for every child node) and then fails inside `if`.
        candidates = range(D.shape[1]) if avail_vars is None else avail_vars
        # Only columns that still take more than one value can split the data.
        # `.ix` was removed in pandas 1.0; the candidates are column positions.
        self.avail_vars = np.array(
            [p for p in candidates if len(np.unique(D.iloc[:,p])) > 1],dtype=int)

        self.tol = tol
        self.cur_dep = cur_dep
        self.max_dep = max_dep

        self.G = giniImpurity(y,classes)

        p = x = None
        stop = (self.G < self.tol or self.cur_dep == self.max_dep
                or len(self.avail_vars) < num_vars)
        if not stop:
            split_vars = np.random.permutation(self.avail_vars)[:num_vars]
            p,x = optimalSplit(D,y,classes,split_vars)

        if p is None:
            # Either a stopping rule fired or no usable split was found.
            self.label = self._majority(y)
            self.leftChild = None
            self.rightChild = None
            self.p = None
            self.x = None
        else:
            self.label = None
            D1,y1,D2,y2 = split(D,y,p,x)
            self.leftChild = ForestNode(D1,y1,classes,max_dep,num_vars,tol,cur_dep+1,avail_vars = self.avail_vars)
            self.rightChild = ForestNode(D2,y2,classes,max_dep,num_vars,tol,cur_dep+1,avail_vars = self.avail_vars)
            self.p = p
            self.x = x

    @staticmethod
    def _majority(y):
        """Return the most frequent label in ``y`` (smallest label on ties).

        Replaces ``mode(y).mode[0][0]``, whose indexing depends on the shape
        scipy returns and that shape differs between SciPy versions. For
        two-dimensional input only the first column is used.
        """
        values = np.asarray(y)
        if values.ndim > 1:
            values = values[:,0]
        labels,counts = np.unique(values,return_counts=True)
        # np.unique sorts the labels and argmax returns the first maximum.
        return labels[np.argmax(counts)]

    @staticmethod
    def _feature(d,p):
        """Return the value at position ``p`` of observation ``d``.

        pandas objects are read with ``.iloc`` so an integer ``p`` is never
        interpreted as a label; other sequences are indexed directly.
        """
        return d.iloc[p] if hasattr(d,"iloc") else d[p]

    def printTree(self):
        """Print the subtree, one node per line, indented by depth."""
        indent = " "*(self.cur_dep-1)
        # A single pre-formatted string prints identically on Python 2 and 3.
        if self.p is not None:
            print("%s %s %s" % (indent,self.p,self.x))
            self.leftChild.printTree()
            self.rightChild.printTree()
        else:
            print("%s %s" % (indent,self.label))

    def classify(self,d):
        """Return the label of the leaf that observation ``d`` falls into.

        ``d`` holds the feature values in the column order of the training
        data (a pandas Series, an ndarray or a plain sequence).
        """
        if self.label is not None:
            return self.label
        value = self._feature(d,self.p)
        # Same rule as `split`: float split values are thresholds, anything
        # else is matched by equality (np.float64 is a float subclass).
        if isinstance(self.x,float):
            goes_left = value <= self.x
        else:
            goes_left = value == self.x
        child = self.leftChild if goes_left else self.rightChild
        return child.classify(d)
                
class Forest:
    """A collection of randomized trees that classify by majority vote."""
    def __init__(self,D,y,classes, max_dep,tol, num_trees,num_datums,num_vars):
        """
        Train a collection of random trees.
        Parameters
        ----------
        D : pandas DataFrame of shape (n,k)
        Each row is an observation.
        y : pandas object with n rows
        The label of each observation.
        classes : iterable
        The possible labels or classes.
        max_dep : int
        The maximum depth for the trees
        tol : float
        The Gini impurity tolerance
        num_trees : int
        The number of trees in the forest.
        num_datums : int
        The number of rows, drawn at random without repetition, used to
        build each tree.
        num_vars : int
        The number of variables randomly selected at each node.
        """
        self.trees = []
        # range/print() in this form run on both Python 2 and Python 3.
        for i in range(num_trees):
            print("Building tree  %s" % (i,))
            # Sorting random numbers yields a random ordering of the row
            # positions; keep the first num_datums of them.
            sort = np.argsort(np.random.rand(D.shape[0]))[:num_datums]
            # `.ix` was removed in pandas 1.0; `sort` holds row positions.
            tree = ForestNode(D.iloc[sort],y.iloc[sort],classes,max_dep,num_vars,tol)
            self.trees.append(tree)

    def classify(self,d):
        """Return the label most trees assign to ``d`` (smallest label on ties)."""
        votes = np.array([tree.classify(d) for tree in self.trees])
        # Counted with numpy because the indexing `mode(votes).mode[0]`
        # depends on the return shape of the installed SciPy version.
        labels,counts = np.unique(votes,return_counts=True)
        return labels[np.argmax(counts)]

In [3]:
# Load the passenger data; the first 80% of the rows are used for training
# and the remainder for testing.
allData = pd.read_csv("titanic4real.csv")
cutoff = int(len(allData)*.8)
featureCols = ["Pclass","Sex","Age"]
sexCodes = {'male': 1, 'female': 0}

# Missing values are filled with 0 before the label and feature columns are
# separated; the Sex strings are then replaced by integer codes.
train = allData[:cutoff].fillna(0)
trainSurvive = train[["Survived"]]
train = train[featureCols].replace(sexCodes)

test = allData[cutoff:].fillna(0)
testSurvive = test[["Survived"]]
test = test[featureCols].replace(sexCodes)

classes = [0,1]

In [4]:
# Train 100 trees of depth at most 10, each on 50 randomly chosen training
# rows, examining 2 randomly chosen variables per node.
forest = Forest(
    D=train,
    y=trainSurvive,
    classes=classes,
    max_dep=10,
    tol=.1,
    num_trees=100,
    num_datums=50,
    num_vars=2,
)

Building tree  0




Building tree  1
Building tree  2
Building tree  3
Building tree  4
Building tree  5
Building tree  6
Building tree  7
Building tree  8
Building tree  9
Building tree  10
Building tree  11
Building tree  12
Building tree  13
Building tree  14
Building tree  15
Building tree  16
Building tree  17
Building tree  18
Building tree  19
Building tree  20
Building tree  21
Building tree  22
Building tree  23
Building tree  24
Building tree  25
Building tree  26
Building tree  27
Building tree  28
Building tree  29
Building tree  30
Building tree  31
Building tree  32
Building tree  33
Building tree  34
Building tree  35
Building tree  36
Building tree  37
Building tree  38
Building tree  39
Building tree  40
Building tree  41
Building tree  42
Building tree  43
Building tree  44
Building tree  45
Building tree  46
Building tree  47
Building tree  48
Building tree  49
Building tree  50
Building tree  51
Building tree  52
Building tree  53
Building tree  54
Building tree  55
Building tree  56
B

In [5]:
# Classify every test passenger (one row at a time) with the trained forest.
labs = test.apply(lambda passenger: forest.classify(passenger),axis=1)

In [6]:
# Fraction of test passengers whose predicted label matches the true one.
# (The former `labs.to_frame()` call discarded its result and is dropped.)
lv = labs.values
tv = testSurvive.values.reshape(lv.shape[0])
# Parenthesising the whole expression prints the same value on Python 2 and
# also works on Python 3, where `print (...).sum()` would call `.sum()` on
# the return value of print.
print((tv == lv).sum()/(1.*len(tv)))

0.797709923664


On this data set, the random forest really didn't do much better than a regular classification tree (about 79.8% accuracy on the test set).