In [1]:
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn import tree

from matplotlib import pyplot
import numpy as np
import random
%matplotlib inline
pd.options.mode.chained_assignment = None

In [38]:
class BoostedRandomForest :
    def __init__(self, T=50, sample_portion=0.6, depth_max=5, criterion='entropy', weight_update=True, boosting=True, debug_msg=False, verbose=False) :
        # Inputs 
        # Max number of trees to be trained
        self.T = T
        # Portion of sampled subet from training data
        self.sample_portion = sample_portion
        # Max depth of each tree
        self.depth_max = depth_max
        # Determine if tree weights are updated during training
        self.weight_update = weight_update
        # Criterion to train a random tree
        self.criterion = criterion
        # Determine if boosting is applied during training
        self.boosting = boosting
        # Determine if debug messages are printed
        self.debug_msg = debug_msg
        # Enable verbose output of training process
        self.verbose = verbose
        
        # List for training results
        # List of trained randome tree classifiers
        self.clfs = []
        # List of weights to trained trees
        self.alphas = []
        # features selected for each tree in forest
        self.feature_record = pd.DataFrame()
    
    
    
    # Train boosted random forest
    def fit(self, X, y) :
        # Number of features
        m = X.shape[0]
        # Number of examples
        N = X.shape[1]
        feature_portion = (np.round(np.sqrt(N))/N) *3
        
        # Initialize training sample weights
        W = [1.0/m for i in range(0,m,1)]
        W = pd.DataFrame({'Weight':list(W)}, index=X.index)
        
        # Print debug messages
        if self.debug_msg :
            print("Weight Update:", self.weight_update)
            print("Tree Boosting in forming forest:", self.boosting)
            print("max depthmax:", self.depth_max)
            print("feature_sampling:", feature_portion)
            print("training_sample:", self.sample_portion)
            print("--------------------------")
            
        for i in range(1,self.T+1):
            print("in loop ", i )
            # Prepare training sample subset (bagging)
            X_train_sample, X_test_sample, y_train_sample, y_test_sample = train_test_split(X, y, test_size=self.sample_portion)
            # Sample feautres to be used for current tree
            selected_features =  [random.randint(0,N-1) for j in range(0,int(round(N*feature_portion)))]
            # Save selected features for the tree
            self.feature_record = self.feature_record.append(pd.DataFrame([selected_features]), ignore_index=True)    
            X_train_sample = X_train_sample.iloc[:,selected_features]
            X_test_sample = X_test_sample.iloc[:,selected_features]
            
            # Prepare tree classifier
            clf = tree.DecisionTreeClassifier(criterion=self.criterion, max_depth=self.depth_max)
            # Weight of training samples
            w_ = W.loc[X_train_sample.index,"Weight"].tolist()

            # Train decision tree
            clf.fit(X=X_train_sample, y=y_train_sample,sample_weight=w_)
            # Make prediction
            pred = clf.predict(X_train.iloc[:,selected_features])

            # Calculate weighted error rate of current tree
            eps = sum(np.array(W)[(np.ravel(pred) != np.ravel(y))]) / sum(np.array(W))
            if self.debug_msg :
                print("eps: ", eps)
            
            # Stop training if the error rate is too small
            if eps < 1e-20 :
                if self.debug_msg :
                    print("eps == {}. Break".format(eps))
                break
                
            # Compute weight of decision tree
            alpha = (0.5)*np.log( (n_class-1)*(1-eps)/eps )
            if self.debug_msg:
                print("Alpha:", alpha)

            # Update weight of training sample
            if alpha > 0 :
                # Calculate alphas according to correctness of predictions
                exp_alphas = [ np.exp(-alpha) if a==p else np.exp(alpha) for a,p in zip(np.ravel(y), pred) ]
                
                # Update training sample weights
                if self.weight_update==True:
                    #with updating
                    W = m*np.multiply(W, exp_alphas) / np.sum(np.multiply(W, exp_alphas))
                else:
                    #weihtout updating
                    W = [1.0/m for i in range(0,m,1)]
                    W = pd.DataFrame({'Weight':list(W)}, index=X_train.index)

                # Save trained tree to list 
                self.clfs.append(clf)
                # Save alpha to list
                self.alphas.append(alpha)

                if self.boosting==True:
                    alphas_ = self.alphas/sum(self.alphas)

            else :
                # If alpha < 0, reject the tree
                if self.debug_msg :
                    print("Tree {} is rejected.".format(i))
    
    
    def ensemble_predict(self, X) :
        # Normalize alphas
        alphas_ = self.alphas / sum(self.alphas)
        
        # Calculate class probabilities
        prob_mat=np.empty([0, X.shape[0]])
        for i in range(0,len(self.clfs)):
            prob = self.clfs[i].predict_proba(X.iloc[:, list(self.feature_record.iloc[i,:])])[:,1]
            prob_mat = np.vstack((prob_mat,prob))
        prob_mat = np.transpose(prob_mat)
        ensemble_prob = np.matmul(prob_mat,np.array(alphas_))
        
        # Give predictions
        ensemble_pred = ensemble_prob
        ensemble_pred[ensemble_prob>=0.5] = 1
        ensemble_pred[ensemble_prob<0.5] = 0
        
        return ensemble_pred
    
        
    # Give predictions with random trees
    def RF_predict(self, X) :
        # Calculate class probabilities
        prob_mat=np.empty([0, X.shape[0]])
        for i in range(0,len(self.clfs)):
            prob = self.clfs[i].predict_proba(X.iloc[:,list(self.feature_record_.iloc[i,:])])[:,1]
            prob_mat = np.vstack ((prob_mat,prob))
        prob_mat = np.transpose(prob_mat)
        
        if len(self.clfs)>1:
            ensemble_prob = np.mean(prob_mat,axis=1)
        else:
            ensemble_prob = prob_mat
            
        # Give predictions
        ensemble_pred= ensemble_prob
        ensemble_pred[ensemble_prob>=0.5]=1
        ensemble_pred[ensemble_prob<0.5]=0
        
        return ensemble_pred
        

In [3]:
#bank = pd.read_csv('bank-full.csv',sep=";")
bank = pd.read_csv('spambase.csv',sep=",")

In [4]:
m = bank.shape[1]

# Remove unwanted features
X = bank.iloc[:,0:48]
#X = bank.iloc[:,0:(m-1)]
y = bank.iloc[:,(m-1):]
n_class = len(np.unique(y))

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)

In [6]:
X_train_onehot = pd.get_dummies(X_train)
X_test_onehot = pd.get_dummies(X_test)
#y_train.loc[:,'y'] = y_train.loc[:,'y'].map({'no':0,'yes':1})
#y_test.loc[:,'y'] = y_test.loc[:,'y'].map({'no':0,'yes':1})

In [39]:
brf = BoostedRandomForest(debug_msg=True, verbose=True)

In [28]:
brf.fit(X_train_onehot, y_train)

Weight Update: True
Tree Boosting in forming forest: True
max depthmax: 5
feature_sampling: 0.4375
training_sample: 0.6
--------------------------
in loop  1
eps:  [0.16985507]
Alpha: [0.79332737]
in loop  2
eps:  [0.20164464]
Alpha: [0.68802345]
in loop  3
eps:  [0.39779004]
Alpha: [0.20734092]
in loop  4
eps:  [0.31746802]
Alpha: [0.38271614]
in loop  5
eps:  [0.33583793]
Alpha: [0.34094875]
in loop  6
eps:  [0.30510466]
Alpha: [0.41155319]
in loop  7
eps:  [0.37844238]
Alpha: [0.24808239]
in loop  8
eps:  [0.35838571]
Alpha: [0.29118877]
in loop  9
eps:  [0.37402548]
Alpha: [0.25749287]
in loop  10
eps:  [0.3776413]
Alpha: [0.24978591]
in loop  11
eps:  [0.27346912]
Alpha: [0.48854615]
in loop  12
eps:  [0.36935998]
Alpha: [0.26748175]
in loop  13
eps:  [0.43880191]
Alpha: [0.12301292]
in loop  14
eps:  [0.37116538]
Alpha: [0.26361027]
in loop  15
eps:  [0.38312091]
Alpha: [0.2381612]
in loop  16
eps:  [0.42349206]
Alpha: [0.15422718]
in loop  17
eps:  [0.41501036]
Alpha: [0.1716453

In [42]:
pred = brf.ensemble_predict(X_test)


TypeError: unsupported operand type(s) for /: 'list' and 'int'

In [31]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, pred))

0.9391833188531712


In [47]:
pred_ = brf.RF_predict(X_test)

In [48]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, pred_))


ValueError: Classification metrics can't handle a mix of binary and unknown targets

0.36750651607298


[]
