# Random Forest

A popular method to combat overfitting in decision trees is random forest. We'll first implement a simple majority classifier and will then use it to build a random forest.

In [1]:
from C45 import C45, mean

## MajorityClassifier

This classifier takes a set of already fitted models. To predict the label for a new data point, every model is asked for its prediction, and the label with a relative majority of the votes is returned.

In [2]:
from collections import Counter

class MajorityClassifier:
    """
    Ensemble that predicts by relative-majority vote over already fitted models.

    Every model only has to provide ``predict_single(x)`` and return a
    hashable label (the votes are tallied with a Counter).
    """

    def __init__(self, models):
        """
        models: collection of already fitted models exposing predict_single(x)
        """
        self.models = models

    def predict_single(self, x):
        """
        Returns the label predicted by the largest number of models for the
        single data point x
        """
        votes = Counter(model.predict_single(x) for model in self.models)
        # Only the winner is needed, so ask for the single most common label.
        return votes.most_common(1)[0][0]

    def predict(self, X):
        """
        Returns a list with one predicted label per data point in X
        """
        return [self.predict_single(x) for x in X]

    def score(self, X, y):
        """
        Returns the accuracy for predicting the given dataset X

        y may be any sequence of labels (list or numpy array) with one entry
        per data point in X. Raises ZeroDivisionError if y is empty.
        """
        predictions = self.predict(X)
        # Count matches pairwise. `predictions == y` is only element-wise when
        # y is a numpy array; for two plain lists it is a single bool, which
        # cannot be summed.
        correct = sum(1 for predicted, actual in zip(predictions, y) if predicted == actual)
        return float(correct) / len(y)

## RandomForest

Random forest trains a bunch of decision trees on different training data and then gives them to a majority classifier.

In [3]:
import random

class RandomForest(MajorityClassifier):
    """
    Majority-vote ensemble of C45 trees, each fitted on its own random
    sample of the training rows.
    """

    def __init__(self, num_trees, continuous, max_depth=float("inf")):
        """
        Creates num_trees unfitted C45 trees; continuous and max_depth are
        forwarded unchanged to every tree.
        """
        trees = []
        for _ in range(num_trees):
            trees.append(C45(continuous=continuous, max_depth=max_depth))
        self.models = trees

    def fit(self, X, y, k=0.1):
        """
        Fits every tree on int(len(X) * k) rows drawn uniformly at random,
        with replacement, from X and y.

        X and y are indexed with a list of row numbers, so they have to
        support that kind of indexing (e.g. numpy arrays).
        """
        sample_size = int(len(X) * k)
        row_ids = range(len(X))

        for tree in self.models:
            # A fresh sample per tree is what makes the trees differ.
            picked = [random.choice(row_ids) for _ in range(sample_size)]
            tree.fit(X[picked], y[picked])

    def prune(self, X_val, y_val):
        """
        Prunes every tree against the same validation set.
        """
        for tree in self.models:
            tree.prune(X_val, y_val)

## Titanic dataset

Again, we'll use the titanic dataset to explore how well our algorithm works.

In [4]:
import pandas as pd
import numpy as np
from pandas import DataFrame
from sklearn.preprocessing import Imputer, LabelEncoder
from sklearn.model_selection import train_test_split

In [5]:
def preprocess(data, encode_labels=False, impute=False):
    """
    Turns the raw Titanic DataFrame into a feature matrix.

    data: DataFrame containing (at least) the columns "Survived", "Name",
        "Ticket" and "Cabin", which are dropped
    encode_labels: if True, every remaining column is label-encoded with a
        fresh LabelEncoder (needed for sklearn)
    impute: if True, the matrix is passed through Imputer with its default
        settings

    Prints the first ten rows of the remaining columns as a side effect and
    returns the features as a numpy array.
    """
    # Keyword `axis=1` instead of a bare positional 1: accepted by old and new pandas.
    X = data.drop(["Survived", "Name", "Ticket", "Cabin"], axis=1)

    if encode_labels: # for sklearn
        # NOTE(review): encoding runs before imputation, so missing values appear
        # to be encoded as ordinary labels - confirm this is intended.
        X = X.apply(LabelEncoder().fit_transform)

    # Single-argument parenthesised print behaves the same on Python 2 and 3.
    print(X.head(10))

    # `.values` replaces the removed `as_matrix()` and returns the same array.
    X = X.values

    if impute:
        X = Imputer().fit_transform(X)

    return X

In [6]:
# Load the Titanic training data. `DataFrame.from_csv` and `as_matrix()` were
# removed from pandas; `read_csv(..., index_col=0)` still uses the first
# column (PassengerId) as the index and `.values` yields the same array.
data = pd.read_csv("./titanic/train.csv", index_col=0)
y = data["Survived"].values
X = preprocess(data)
# 80/20 train/test split, then a further 80/20 split of the training part into
# a sub-training set and a validation set used for pruning.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_sub, X_val, y_train_sub, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=3)

             Pclass     Sex   Age  SibSp  Parch     Fare Embarked
PassengerId                                                      
1                 3    male  22.0      1      0   7.2500        S
2                 1  female  38.0      1      0  71.2833        C
3                 3  female  26.0      0      0   7.9250        S
4                 1  female  35.0      1      0  53.1000        S
5                 3    male  35.0      0      0   8.0500        S
6                 3    male   NaN      0      0   8.4583        Q
7                 1    male  54.0      0      0  51.8625        S
8                 3    male   2.0      3      1  21.0750        S
9                 3  female  27.0      0      2  11.1333        S
10                2  female  14.0      1      0  30.0708        C


### Without Pruning

In [7]:
# Train an unpruned 10-tree forest for several sampling fractions k and report
# the accuracy on the test set.
for k in [0.02, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.5]:
    clf = RandomForest(num_trees=10, continuous={2, 5})
    clf.fit(X_train, y_train, k=k)

    # The score is averaged over 100 runs (presumably because prediction is not
    # deterministic - TODO confirm against C45).
    acc = mean([clf.score(X_test, y_test) for _ in range(100)])
    # Single-argument parenthesised print prints the same text on Python 2 and 3.
    print("k=%.2f got %.5f accuracy" % (k, acc))

k=0.02 got 0.75128 accuracy
k=0.05 got 0.72279 accuracy
k=0.10 got 0.73642 accuracy
k=0.15 got 0.75441 accuracy
k=0.20 got 0.74855 accuracy
k=0.25 got 0.78922 accuracy
k=0.30 got 0.78749 accuracy
k=0.35 got 0.80285 accuracy
k=0.50 got 0.79860 accuracy


### With Pruning

In [8]:
# Hold out 20% of the training data as a validation set for pruning.
X_train_sub, X_val, y_train_sub, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=3)

# Same sweep over k as without pruning, but each forest is fitted on the
# sub-training set and pruned on the validation set before scoring.
for k in [0.02, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.5]:
    clf = RandomForest(num_trees=10, continuous={2, 5})
    clf.fit(X_train_sub, y_train_sub, k=k)

    clf.prune(X_val, y_val)

    acc = mean([clf.score(X_test, y_test) for _ in range(100)])
    # Single-argument parenthesised print prints the same text on Python 2 and 3.
    print("k=%.2f got %.5f accuracy" % (k, acc))

k=0.02 got 0.70950 accuracy
k=0.05 got 0.78726 accuracy
k=0.10 got 0.78212 accuracy
k=0.15 got 0.78212 accuracy
k=0.20 got 0.79358 accuracy
k=0.25 got 0.78212 accuracy
k=0.30 got 0.78994 accuracy
k=0.35 got 0.78184 accuracy
k=0.50 got 0.78531 accuracy


In [9]:
# Baseline for comparison: a single C45 tree fitted on the whole training
# split, without pruning.
clf45 = C45(continuous={2, 5})
clf45.fit(X_train, y_train)
# Mean test accuracy over 100 scoring runs; the value is shown as the cell output.
mean([clf45.score(X_test, y_test) for _ in range(100)])

0.7788268156424585

## Alternative RandomForest

A popular alternative to just splitting the training data is using different features for each trained decision tree.

In [10]:
class FeatureRandomForest(RandomForest):
    """
    Forest variant in which every tree is trained on all rows but only sees
    a random subset of the feature columns.
    """

    def fit(self, X, y, num_features, p=None):
        """
        Fits every tree on X with all but num_features randomly chosen
        columns blanked out.

        p: optional selection probabilities, one per column, passed on to
            np.random.choice (columns are drawn without replacement)
        """
        self.num_total_features = X.shape[1]
        self.features = {}

        candidates = range(self.num_total_features)

        for tree in self.models:
            chosen = np.random.choice(candidates, size=num_features, p=p, replace=False)
            # Remember the selection per tree so prune() can mask the same columns.
            self.features[tree] = set(chosen)

            tree.fit(self._cut_data(tree, X), y)

    def _cut_data(self, model, X):
        """
        Returns a copy of X in which every column not selected for model is
        overwritten with 0; X itself is left untouched.
        """
        kept = self.features[model]

        dropped = []
        for column in range(self.num_total_features):
            if column not in kept:
                dropped.append(column)

        masked = X.copy()
        masked[:, dropped] = 0
        return masked

    def prune(self, X_val, y_val):
        """
        Prunes every tree on the validation data, masked to that tree's
        own feature selection.
        """
        for tree in self.models:
            tree.prune(self._cut_data(tree, X_val), y_val)

Now there's a lot of randomness involved because some trees might only get features that are not useful. With few trees (e.g. two), this is especially bad.

In [11]:
# Repeat the same experiment ten times to show how much the result varies:
# each run builds a fresh two-tree forest with four random features per tree.
accs = []

for _ in range(10):
    clf = FeatureRandomForest(num_trees=2, continuous={2, 5})
    clf.fit(X_train, y_train, num_features=4)
    # Store the mean test accuracy of this run (averaged over 100 scoring runs).
    accs.append(mean([clf.score(X_test, y_test) for _ in range(100)]))

# Best accuracy among the ten runs; shown as the cell output.
max(accs)

0.7988826815642457

In [12]:
# Show the accuracy of every individual run to make the spread visible.
accs

[0.771452513966481,
 0.6203910614525137,
 0.7430167597765371,
 0.7988826815642457,
 0.6608938547486031,
 0.6770391061452513,
 0.7877094972067051,
 0.6902234636871508,
 0.6757541899441344,
 0.6674860335195528]

###  With Pruning

In [13]:
# Sweep the number of features per tree (2..7); each forest is fitted on the
# sub-training set and pruned on the validation set before scoring.
for num_features in range(2, 8):
    clf = FeatureRandomForest(num_trees=10, continuous={2, 5})
    clf.fit(X_train_sub, y_train_sub, num_features=num_features)

    clf.prune(X_val, y_val)

    acc = mean([clf.score(X_test, y_test) for _ in range(100)])

    # Single-argument parenthesised print prints the same text on Python 2 and 3.
    print("forest with %d features has %.4f accuracy" % (num_features, acc))

forest with 2 features has 0.7758 accuracy
forest with 3 features has 0.7901 accuracy
forest with 4 features has 0.7307 accuracy
forest with 5 features has 0.7842 accuracy
forest with 6 features has 0.7961 accuracy
forest with 7 features has 0.8214 accuracy


In [14]:
# The split is seeded (random_state=3), so it is identical on every iteration;
# compute it once instead of inside the loop.
X_train_sub, X_val, y_train_sub, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=3)

# Same experiment as above for 7 down to 4 features per tree.
for num_features in reversed(range(4, 8)):
    clf = FeatureRandomForest(num_trees=10, continuous={2, 5})
    clf.fit(X_train_sub, y_train_sub, num_features=num_features)

    clf.prune(X_val, y_val)

    acc = mean([clf.score(X_test, y_test) for _ in range(100)])

    # Single-argument parenthesised print prints the same text on Python 2 and 3.
    print("forest with %d features has %.4f accuracy" % (num_features, acc))

forest with 7 features has 0.8323 accuracy
forest with 6 features has 0.8011 accuracy
forest with 5 features has 0.7863 accuracy
forest with 4 features has 0.7923 accuracy


### Without Pruning

In [15]:
# Sweep the number of features per tree (2..7) without pruning; the forests are
# fitted on the whole training split.
for num_features in range(2, 8):
    clf = FeatureRandomForest(num_trees=10, continuous={2, 5})
    clf.fit(X_train, y_train, num_features=num_features)

    acc = mean([clf.score(X_test, y_test) for _ in range(100)])

    # Single-argument parenthesised print prints the same text on Python 2 and 3.
    print("forest with %d features has %.4f accuracy" % (num_features, acc))

forest with 2 features has 0.7020 accuracy
forest with 3 features has 0.7563 accuracy
forest with 4 features has 0.7229 accuracy
forest with 5 features has 0.7943 accuracy
forest with 6 features has 0.7527 accuracy
forest with 7 features has 0.7837 accuracy


### Prior weights for features

In [16]:
# Hand-picked selection probability per feature column, in this order:
# Pclass     Sex   Age  SibSp  Parch     Fare Embarked
p = [0.15, 0.35, 0.1, 0.05, 0.05, 0.2, 0.1]

# Ten trees with four features each, drawn according to p, then pruned on the
# validation set.
clf = FeatureRandomForest(num_trees=10, continuous={2, 5})
clf.fit(X_train_sub, y_train_sub, num_features=4, p=p)
clf.prune(X_val, y_val)
# Mean test accuracy over 100 scoring runs; shown as the cell output.
mean([clf.score(X_test, y_test) for _ in range(100)])

0.8039106145251403

In [17]:
# Second, flatter set of selection probabilities per feature column, in this order:
# Pclass     Sex   Age  SibSp  Parch     Fare Embarked
p = [0.2, 0.3, 0.1, 0.1, 0.1, 0.1, 0.1]

# Same setup as before: ten trees, four features each drawn according to p,
# pruned on the validation set.
clf = FeatureRandomForest(num_trees=10, continuous={2, 5})
clf.fit(X_train_sub, y_train_sub, num_features=4, p=p)
clf.prune(X_val, y_val)
# Mean test accuracy over 100 scoring runs; shown as the cell output.
mean([clf.score(X_test, y_test) for _ in range(100)])

0.7987150837988832

## Checking with sklearn

Our best accuracy is higher than sklearn's, but we did a lot more hyperparameter tuning for our implementation. sklearn's implementation is also a lot less vulnerable to random fluctuations.

In [18]:
from sklearn.ensemble import RandomForestClassifier

In [19]:
# Re-run the preprocessing with label encoding and imputation switched on for
# sklearn. This rebinds X, X_train, X_test, y_train and y_test; the split uses
# the same test_size and random_state as the earlier one.
X = preprocess(data, encode_labels=True, impute=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

             Pclass  Sex  Age  SibSp  Parch  Fare  Embarked
PassengerId                                                
1                 2    1   28      1      0    18         3
2                 0    0   51      1      0   207         1
3                 2    0   34      0      0    41         3
4                 0    0   47      1      0   189         3
5                 2    1   47      0      0    43         3
6                 2    1  110      0      0    51         2
7                 0    1   69      0      0   186         3
8                 2    1    6      3      1   124         3
9                 2    0   35      0      2    74         3
10                1    0   18      1      0   154         1


  flag = np.concatenate(([True], aux[1:] != aux[:-1]))


In [20]:
# Reference result: sklearn's random forest with default settings on the
# label-encoded data.
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
# Single-argument parenthesised print prints the same text on Python 2 and 3.
print("train accuracy = %.5f" % clf.score(X_train, y_train))
print("test accuracy = %.5f" % clf.score(X_test, y_test))

train accuracy = 0.97191
test accuracy = 0.81564
