In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from collections import Counter
from sklearn.metrics import log_loss

class Node:
    """One node of a binary CART tree (classification or regression).

    A node either stores a prediction (leaf) or a split ``feature <= threshold``
    together with a ``left`` child (rows satisfying the predicate) and a
    ``right`` child (rows with ``feature > threshold``).

    Parameters
    ----------
    depth : int
        Depth of this node; children are created with ``depth + 1``.
    max_depth : int
        Nodes whose depth reaches this value become leaves.
    impurity : str
        ``'gini'`` builds a classifier (leaf value ``self.c`` is the most
        common label); any other value is treated as the regression
        criterion (leaf value ``self.average`` is the target mean).
    is_leaf : bool
        Initial leaf flag; ``fit`` sets it to True when the node stops.
    """

    def __init__(self, depth, max_depth, impurity, is_leaf=False):
        self.is_leaf = is_leaf
        self.depth = depth
        self.max_depth = max_depth
        self.impurity = impurity

    def stopping_criteria(self, X, y):
        """Return True when the node must not be split any further.

        Splitting stops at ``max_depth`` or when all targets are identical.
        The purity check is applied to both criteria: splitting a constant
        target cannot change any prediction, it only grows the tree.
        """
        return self.depth >= self.max_depth or np.unique(y).shape[0] <= 1

    def compute_gini_impurity(self, X, y):
        """Gini impurity ``1 - sum(p_k ** 2)`` of the labels in ``y``.

        ``X`` is unused and kept only for interface compatibility.
        """
        # list(...) is required on Python 3, where dict.values() is a view
        # that numpy would wrap into a 0-d object array.
        counts = np.array(list(Counter(y).values()), dtype=float)
        prob = counts / counts.sum()
        return 1. - np.sum(prob ** 2)

    def compute_mse_impurity(self, X, y):
        """Variance of ``y`` (mean squared error around the node mean).

        The per-node value is deliberately NOT multiplied by the node size:
        ``compute_impurity_change`` already weights every child by its share
        of the samples, so scaling here as well would weight children by the
        square of their size instead of minimising the summed squared error.
        ``X`` is unused and kept only for interface compatibility.
        """
        return np.var(y)

    def compute_impurity(self, X, y):
        """Impurity of ``(X, y)`` under the criterion chosen for this node."""
        if self.impurity == 'gini':
            return self.compute_gini_impurity(X, y)
        return self.compute_mse_impurity(X, y)

    def compute_impurity_change(self, feature, threshold, X, y):
        """Score the split ``X[:, feature] <= threshold`` (lower is better).

        Despite the name, the value returned is the sample-weighted impurity
        of the two children, not a difference to the parent impurity.
        """
        goes_left = X[:, feature] <= threshold
        goes_right = X[:, feature] > threshold

        X_left, y_left = X[goes_left], y[goes_left]
        X_right, y_right = X[goes_right], y[goes_right]

        n = float(X.shape[0])
        left_share = X_left.shape[0] / n
        right_share = X_right.shape[0] / n

        return (left_share * self.compute_impurity(X_left, y_left) +
                right_share * self.compute_impurity(X_right, y_right))

    def get_predicate(self, X, y):
        """Search every feature for the split with the lowest weighted impurity.

        Candidate thresholds are the midpoints between consecutive distinct
        values of a feature. Returns ``(feature_index, threshold)``, or None
        when every feature is constant and no split exists.
        """
        best_pair = None
        best_impurity = None

        for i in range(X.shape[1]):
            values = np.unique(X[:, i])  # already sorted
            # Divide by 2.0 so integer features also get true midpoints
            # (plain `/ 2` floors on Python 2).
            thresholds = (values[:-1] + values[1:]) / 2.0

            for t in thresholds:
                new_impurity = self.compute_impurity_change(i, t, X, y)
                if best_pair is None or new_impurity < best_impurity:
                    best_pair = (i, t)
                    best_impurity = new_impurity

        return best_pair

    def _make_leaf(self, y):
        """Turn this node into a leaf predicting from the targets ``y``."""
        self.is_leaf = True
        if self.impurity == 'gini':
            self.c = Counter(y).most_common(1)[0][0]
        else:
            self.average = np.mean(y)

    def fit(self, X, y):
        """Recursively grow the subtree rooted at this node on ``(X, y)``."""
        if self.stopping_criteria(X, y):
            self._make_leaf(y)
            return

        predicate = self.get_predicate(X, y)
        if predicate is None:
            # No feature has two distinct values: nothing left to split on.
            self._make_leaf(y)
            return

        self.feature, self.threshold = predicate

        goes_left = X[:, self.feature] <= self.threshold
        goes_right = X[:, self.feature] > self.threshold

        self.left = Node(self.depth + 1, max_depth=self.max_depth, impurity=self.impurity)
        self.left.fit(X[goes_left], y[goes_left])

        self.right = Node(self.depth + 1, max_depth=self.max_depth, impurity=self.impurity)
        self.right.fit(X[goes_right], y[goes_right])

    def predict(self, X):
        """Return a list with one prediction per row of ``X``.

        Rows are routed to the children in bulk with a boolean mask instead
        of one recursive call per row; the original row order is preserved.
        """
        if self.is_leaf:
            value = self.c if self.impurity == 'gini' else self.average
            return [value] * len(X)

        X = np.asarray(X)
        n = X.shape[0]
        if n == 0:
            return []

        goes_left = X[:, self.feature] <= self.threshold
        pred = [None] * n
        # Everything that does not satisfy the predicate goes right, exactly
        # like the `else` branch of a per-row test would.
        routes = ((np.flatnonzero(goes_left), self.left),
                  (np.flatnonzero(np.logical_not(goes_left)), self.right))
        for rows, child in routes:
            if rows.size:
                for row, value in zip(rows, child.predict(X[rows])):
                    pred[row] = value
        return pred
        
class CART:
    """Decision tree estimator that wraps a root ``Node``.

    ``impurity`` selects the split criterion ('gini' or 'mse') and
    ``max_depth`` bounds the depth of the tree; the root is created at
    depth 1.
    """

    def __init__(self, impurity='gini', max_depth=5):
        supported = ('gini', 'mse')
        if impurity not in supported:
            raise ValueError("Only gini and mse criteria are supported")
        self.impurity = impurity
        self.max_depth = max_depth

    def fit(self, X, y):
        """Grow a fresh tree on ``(X, y)`` and return ``self``."""
        self.root = Node(1, impurity=self.impurity, max_depth=self.max_depth)
        self.root.fit(X, y)
        return self

    def predict(self, X):
        """Predictions for the rows of ``X`` as a numpy array."""
        per_row = self.root.predict(X)
        return np.array(per_row)

    def score(self, X, y):
        """Fraction of rows of ``X`` whose prediction equals ``y``."""
        hits = self.predict(X) == y
        return np.mean(hits)

In [3]:
# Read the iris CSV and keep everything from row 50 onwards; the preview
# below shows that the kept rows start with Iris-versicolor samples.
iris_all = pd.read_csv('iris.data', sep=',')
df = iris_all.iloc[50:]
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
50,7.0,3.2,4.7,1.4,Iris-versicolor
51,6.4,3.2,4.5,1.5,Iris-versicolor
52,6.9,3.1,4.9,1.5,Iris-versicolor
53,5.5,2.3,4.0,1.3,Iris-versicolor
54,6.5,2.8,4.6,1.5,Iris-versicolor


In [4]:
# Binary target: Iris-versicolor is the positive class (1), the other two
# species map to 0.
classes = {"Iris-setosa": 0, "Iris-versicolor": 1, "Iris-virginica": 0}

# A label missing from `classes` raises KeyError.
df["num_class"] = [classes[label] for label in df["class"]]

In [5]:
# Summary statistics of the numeric columns, including the new num_class target.
df.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,num_class
count,100.0,100.0,100.0,100.0,100.0
mean,6.262,2.872,4.906,1.676,0.5
std,0.662834,0.332751,0.825578,0.424769,0.502519
min,4.9,2.0,3.0,1.0,0.0
25%,5.8,2.7,4.375,1.3,0.0
50%,6.3,2.9,4.9,1.6,0.5
75%,6.7,3.025,5.525,2.0,1.0
max,7.9,3.8,6.9,2.5,1.0


In [6]:
def shuffle(df, train_percent=0.8, random_state=None):
    """Shuffle the rows of ``df`` and split them into train and test parts.

    The last column of ``df`` is taken as the target, all other columns as
    features. ``df`` itself is left untouched (a copy of its values is
    shuffled).

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose last column holds the labels.
    train_percent : float
        Fraction of rows placed in the training part (truncated to an int).
    random_state : int or None
        Seed for a private ``numpy.random.RandomState``. With None (the
        default) the global numpy generator is used, as before, so the split
        changes from call to call.

    Returns
    -------
    X_train, y_train, X_test, y_test : numpy arrays
    """
    if random_state is None:
        rng = np.random
    else:
        rng = np.random.RandomState(random_state)

    data = np.copy(df.values)
    rng.shuffle(data)  # shuffles whole rows, keeping features and label together

    X, y = data[:, :-1], data[:, -1]

    train_size = int(X.shape[0] * train_percent)

    X_train, y_train = X[:train_size, :], y[:train_size]
    X_test, y_test = X[train_size:, :], y[train_size:]

    return X_train, y_train, X_test, y_test

In [7]:
def sigmoid(x):
    """Logistic function 1 / (1 + exp(-x)), applied element-wise by numpy."""
    decay = np.exp(-x)
    return 1. / (1. + decay)

In [25]:
# Hold out 40% of the rows for testing. The text "class" column is dropped so
# that num_class is the last column, which shuffle() uses as the label.
X_train, y_train, X_test, y_test = shuffle(df.drop("class", axis=1), train_percent=0.6)
# print(...) with one parenthesised argument behaves the same on Python 2 and 3.
print("Train size: {}; Test size: {};".format(X_train.shape[0], X_test.shape[0]))

Train size: 60; Test size: 40;


In [26]:
# Train the hand-written CART classifier (default 'gini' criterion) with a
# depth limit of 10 on the training split.
cart_clf = CART(max_depth=10)
cart_clf.fit(X_train, y_train)

<__main__.CART instance at 0x7f9313d33bd8>

In [27]:
# Class labels predicted by the custom tree for the held-out rows.
pred = cart_clf.predict(X_test)

In [28]:
# NOTE(review): `pred` holds predicted class labels, not probabilities, so the
# log loss below is only a rough error-rate proxy - confirm this is intended.
# print(...) with one parenthesised argument behaves the same on Python 2 and 3.
print("Log loss: {}".format(log_loss(y_test, pred)))
print("Accuracy: {}".format(cart_clf.score(X_test, y_test)))

Log loss: 2.59044820949
Accuracy: 0.925


In [29]:
# Reference model: scikit-learn's decision tree with default settings,
# trained on the same split as the custom CART.
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier()
tree.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [30]:
# Predictions of the scikit-learn tree for the held-out rows.
pr = tree.predict(X_test)

In [31]:
# Same two metrics as for the custom tree, computed on the scikit-learn
# tree's predicted labels.
# print(...) with one parenthesised argument behaves the same on Python 2 and 3.
print("Log loss: {}".format(log_loss(y_test, pr)))
print("Accuracy: {}".format(tree.score(X_test, y_test)))

Log loss: 2.59044820949
Accuracy: 0.925


In [32]:
class RandomForest:
    """Majority-vote ensemble of CART classifiers for binary 0/1 labels.

    Parameters
    ----------
    n_estimators : int
        Number of trees.
    max_depth : int
        Depth limit passed to every tree.
    poi : float
        Each tree is trained on ``int(n_rows * poi)`` rows drawn with
        replacement.
    pof : float
        Each tree is trained on ``int(n_columns * pof)`` columns drawn
        without replacement.
    """

    def __init__(self, n_estimators=10, max_depth=10, poi=1.0, pof=1.0):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.poi = poi
        self.pof = pof

    def fit(self, X, y):
        """Train ``n_estimators`` trees on random row/column subsets.

        Returns ``self`` (like ``CART.fit``).
        """
        n, m = X.shape
        # numpy requires integer sample sizes; keep at least one row/column
        # and never ask for more columns than exist (drawn without replacement).
        n_items = max(1, int(n * self.poi))
        n_features = min(m, max(1, int(m * self.pof)))

        self.estimators = []
        # Column subset of every tree, needed again at prediction time.
        self.estimator_features = []
        for _ in range(self.n_estimators):
            items = np.sort(np.random.choice(n, n_items))
            features = np.sort(np.random.choice(m, n_features, replace=False))

            estimator = CART(max_depth=self.max_depth)
            estimator.fit(X[:, features][items, :], y[items])

            self.estimators.append(estimator)
            self.estimator_features.append(features)
        return self

    def predict(self, X):
        """Majority vote of the trees; returns a float array of 0./1.

        Every tree votes +1 for class 1 and -1 for class 0. A tie (or no
        trees at all) yields class 0.
        """
        X = np.asarray(X)
        votes = np.zeros(X.shape[0])
        for estimator, features in zip(self.estimators, self.estimator_features):
            # Each tree only knows the columns it was trained on.
            p = np.asarray(estimator.predict(X[:, features]), dtype=float)
            votes += np.where(p == 0, -1.0, p)
        # Only the sign of the vote matters, so no averaging is needed.
        return (votes > 0).astype(float)

    def score(self, X, y):
        """Accuracy of ``predict(X)`` against ``y``."""
        pred = self.predict(X)
        return np.mean(pred == y)

In [33]:
# Train the custom random forest: 100 trees, each limited to depth 10;
# poi and pof stay at their 1.0 defaults.
forest_clf = RandomForest(n_estimators=100, max_depth=10)
forest_clf.fit(X_train, y_train)



In [34]:
# Accuracy of the custom forest on the held-out rows.
forest_clf.score(X_test, y_test)

0.92500000000000004

In [35]:
# Reference model: scikit-learn's random forest with the same number of trees
# and depth limit as the custom one.
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators=100, max_depth=10)
forest.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [36]:
# Accuracy of the scikit-learn forest on the held-out rows.
forest.score(X_test, y_test)

0.92500000000000004

In [37]:
import scipy

class GradientBoosting:
    """Gradient boosting for binary 0/1 labels with the logistic loss.

    The model keeps a raw score ``h(x)``; ``sigmoid(h)`` is the estimated
    probability of class 1. The score starts from the 0/1 prediction of an
    initial 'gini' tree and every boosting round adds ``mu`` times an 'mse'
    tree fitted to the current residuals ``y - sigmoid(h)``.

    Parameters
    ----------
    n_estimators : int
        Number of boosting rounds (regression trees).
    max_depth : int
        Depth limit passed to every tree.
    mu : float
        Step size (learning rate) applied to every regression tree.
    """

    def __init__(self, n_estimators=10, max_depth=5, mu=0.1):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.mu = mu

    def fit(self, X, y):
        """Fit the initial tree and ``n_estimators`` residual trees.

        Returns ``self`` (like ``CART.fit``).
        """
        self.estimators = []
        self.weights = []
        # Legacy alias: the attribute used to be spelled `weigths`.
        self.weigths = self.weights

        # The initial model is kept because predict() has to start from the
        # same score the residual trees were fitted against.
        self.init_estimator = CART(max_depth=self.max_depth, impurity='gini')
        self.init_estimator.fit(X, y)

        target = np.asarray(y, dtype=float)
        h = np.asarray(self.init_estimator.predict(X), dtype=float)
        for _ in range(self.n_estimators):
            # Negative gradient of the log loss with respect to the score.
            g = target - sigmoid(h)

            estimator = CART(max_depth=self.max_depth, impurity='mse')
            estimator.fit(X, g)

            b = self.mu
            # Move the score along the fitted gradient so that the next round
            # sees new residuals; without this every tree would be identical.
            h = h + b * np.asarray(estimator.predict(X), dtype=float)

            self.estimators.append(estimator)
            self.weights.append(b)
        return self

    def _raw_score(self, X):
        """Score h(X): initial prediction plus all weighted tree outputs."""
        score = np.asarray(self.init_estimator.predict(X), dtype=float)
        for b, estimator in zip(self.weights, self.estimators):
            score = score + b * np.asarray(estimator.predict(X), dtype=float)
        return score

    def predict(self, X):
        """Return a float array of 0./1.: class 1 where the score is positive.

        A positive score is the same as ``sigmoid(score) > 0.5``.
        """
        return (self._raw_score(X) > 0).astype(float)

    def score(self, X, y):
        """Accuracy of ``predict(X)`` against ``y``."""
        pred = self.predict(X)
        return np.mean(pred == y)

In [38]:
# Train the custom gradient boosting model with 100 rounds; max_depth=5 and
# mu=0.1 are the constructor defaults.
grad_clf = GradientBoosting(n_estimators=100)
grad_clf.fit(X_train, y_train)

In [39]:
# Accuracy of the custom boosting model on the held-out rows.
grad_clf.score(X_test, y_test)

0.92500000000000004

In [40]:
# Reference model: scikit-learn's gradient boosting with the same number of
# rounds and tree depth as the custom one.
from sklearn.ensemble import GradientBoostingClassifier
grad = GradientBoostingClassifier(n_estimators=100, max_depth=5)
grad.fit(X_train, y_train)

GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=5, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [41]:
# Accuracy of the scikit-learn boosting model on the held-out rows.
grad.score(X_test, y_test)

0.92500000000000004

In [49]:
class VotingClassifier:
    """Weighted majority vote over binary 0/1 classifiers.

    Parameters
    ----------
    estimators : sequence
        Objects providing ``fit(X, y)`` and ``predict(X)``.
    weigths : sequence of numbers
        Vote weight of each estimator, in the same order. The parameter name
        keeps its historical spelling so keyword callers continue to work.

    Raises
    ------
    ValueError
        If the number of weights differs from the number of estimators
        (``zip`` would otherwise silently ignore the surplus).
    """

    def __init__(self, estimators, weigths):
        if len(estimators) != len(weigths):
            raise ValueError("estimators and weigths must have the same length")
        self.estimators = estimators
        self.weights = weigths

    def fit(self, X, y):
        """Fit every estimator on ``(X, y)`` and return ``self``."""
        for estimator in self.estimators:
            estimator.fit(X, y)
        return self

    def predict(self, X):
        """Return a float array of 0./1. chosen by the weighted vote.

        Every estimator votes +weight for class 1 and -weight for class 0.
        A tie yields class 0.
        """
        votes = np.zeros(len(X))
        for weight, estimator in zip(self.weights, self.estimators):
            p = np.asarray(estimator.predict(X), dtype=float)
            # np.where builds a new array, so the estimator's own output is
            # never modified.
            votes += weight * np.where(p == 0, -1.0, p)
        # Only the sign of the vote matters, so no averaging is needed.
        return (votes > 0).astype(float)

    def score(self, X, y):
        """Accuracy of ``predict(X)`` against ``y``."""
        pred = self.predict(X)
        return np.mean(pred == y)

In [50]:
# Combine the custom forest and boosting models with equal weights; fit()
# refits both of them on the training split.
vote_clf = VotingClassifier([forest_clf, grad_clf], [1.0, 1.0])
vote_clf.fit(X_train, y_train)



In [51]:
# Accuracy of the voting ensemble on the held-out rows.
vote_clf.score(X_test, y_test)

0.92500000000000004