<a href="https://colab.research.google.com/github/dvircohen0/Machine-Learning-Algorithms-From-Scratch/blob/main/random_forrest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [25]:
import pandas as  pd
import numpy as np
from math import sqrt
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer

In [77]:
# Load the breast-cancer dataset shipped with sklearn.datasets.
breast_cancer = load_breast_cancer()

data=breast_cancer.data
target=breast_cancer.target

# A fixed random_state makes the train/test partition identical on every run,
# so accuracies computed further down can be compared between runs.
X_train,X_test,y_train,y_test= train_test_split(data,target, random_state=0)

# The tree-building helpers below read the label from the LAST column of their
# input, so the training labels are appended to the feature matrix.
# After this cell: y_train has shape (1, n_train) and X_train has one extra
# (label) column; X_test / y_test are left untouched.
y_train=y_train.reshape(1,len(y_train))
X_train =np.concatenate((X_train,y_train.T), axis=1)

In [61]:
# Hyperparameters for the hand-written random forest.
# Number of feature columns drawn (without replacement) for each tree;
# passed to create_subsample() from random_forest().
number_of_random_features = 4
# Number of trees built by random_forest().
number_of_trees =50
# A node holding fewer rows than this becomes a leaf in decision_tree_algorithm().
min_samples=2
# Recursion depth at which decision_tree_algorithm() stops splitting.
max_depth=4

In [28]:
def create_subsample(data,number_of_features):
    """Pick a random subset of feature columns.

    The last column of ``data`` is excluded from the draw (it is treated as
    the label column by the rest of this notebook).

    Parameters
    ----------
    data : 2-D array whose last column is not a candidate feature.
    number_of_features : how many distinct feature columns to draw.

    Returns
    -------
    dict mapping position ``0..number_of_features-1`` to the drawn column
    index in ``data``, in draw order.
    """
    n_feature_columns = data.shape[1] - 1
    chosen = np.random.choice(n_feature_columns,
                              size=number_of_features,
                              replace=False)
    return dict(enumerate(chosen))

In [29]:
def split(data,i,value):
    """Partition the rows of ``data`` on column ``i`` at threshold ``value``.

    Returns
    -------
    (left, right) : ``left`` holds the rows with ``data[:, i] >= value`` and
    ``right`` the rows with ``data[:, i] < value``. Rows for which neither
    comparison is true (e.g. NaN) appear in neither part.
    """
    column = data[:, i]
    at_or_above = column >= value
    below = column < value
    return data[at_or_above], data[below]

In [30]:
def check_purity(X):
    """Return True when every row of ``X`` carries the same label.

    The label is read from the last column of ``X``. An input with no rows
    has zero distinct labels and is therefore reported as not pure.
    """
    distinct_labels = np.unique(X[:, -1])
    return distinct_labels.size == 1

In [31]:
def classify_data(data):
    """Return the most frequent label among the rows of ``data``.

    The label is read from the last column. On a tie the smallest label wins,
    because ``np.unique`` returns labels sorted and ``argmax`` takes the first
    maximum.
    """
    labels, frequencies = np.unique(data[:, -1], return_counts=True)
    return labels[np.argmax(frequencies)]

In [32]:
def gini(Y1,Y2):
    """Size-weighted Gini impurity of a two-way split.

    Parameters
    ----------
    Y1, Y2 : 1-D label arrays for the two sides of the split. Any number of
        distinct label values is supported.

    Returns
    -------
    float : ``(n1 * G(Y1) + n2 * G(Y2)) / (n1 + n2)`` where
    ``G(Y) = 1 - sum_k p_k**2`` over the class frequencies ``p_k`` in ``Y``.
    An empty side contributes nothing (instead of producing NaN through a
    0/0 division), and two empty sides give ``0.0``.
    """
    def _impurity(labels):
        # Gini impurity of a single label array; empty arrays count as pure.
        labels = np.asarray(labels)
        if labels.size == 0:
            return 0.0
        _, counts = np.unique(labels, return_counts=True)
        fractions = counts / labels.size
        return 1.0 - float(np.sum(fractions ** 2))

    size_L = len(Y1)
    size_R = len(Y2)
    total = size_L + size_R
    if total == 0:
        return 0.0
    return (_impurity(Y1) * size_L + _impurity(Y2) * size_R) / total

In [33]:
def find_best_gini(X):
    """Find the feature whose mean-threshold split has the lowest Gini score.

    Every column of ``X`` except the last (the label column) is tried once,
    using that column's mean as the threshold.

    Returns
    -------
    (column_index, threshold) of the first column achieving the minimum
    score returned by ``gini``.
    """
    thresholds = []
    scores = []
    for col in range(X.shape[1] - 1):
        threshold = X[:, col].mean()
        upper, lower = split(X, col, threshold)
        thresholds.append(threshold)
        scores.append(gini(upper[:, -1], lower[:, -1]))
    best = scores.index(min(scores))
    return best, thresholds[best]

In [52]:
def decision_tree_algorithm(data,ind_map, counter, min_samples, max_depth):
    """Recursively grow a decision tree on ``data``.

    Parameters
    ----------
    data : 2-D array; the last column is the label, the others are features.
    ind_map : mapping from a column position in ``data`` to the feature
        identifier that is written into the node's question string.
    counter : current depth (callers start at 0).
    min_samples : a node with fewer rows than this becomes a leaf.
    max_depth : depth at which splitting stops.

    Returns
    -------
    Either a bare label (leaf) or a dict ``{question: [yes, no]}`` where
    ``question`` is ``"<feature> <= <threshold>"``. Note the branch order:
    ``yes`` (index 0) is built from the rows ``split`` returns first, i.e.
    those with feature >= threshold, and ``no`` (index 1) from the rows with
    feature < threshold.
    """
    # Leaf conditions: a single class left, too few rows, or depth exhausted.
    if check_purity(data) or len(data) < min_samples or counter == max_depth:
        return classify_data(data)

    counter += 1
    index, value = find_best_gini(data)
    L, R = split(data, index, value)

    # A degenerate split (e.g. a constant or NaN column) leaves one side with
    # no rows; recursing into it would fail when taking the majority label of
    # an empty array, so this node is turned into a leaf instead.
    if len(L) == 0 or len(R) == 0:
        return classify_data(data)

    question = "{} <= {}".format(ind_map[index], value)
    yes_answer = decision_tree_algorithm(L,ind_map, counter, min_samples, max_depth)
    no_answer = decision_tree_algorithm(R,ind_map, counter, min_samples, max_depth)

    # Both branches agree: the question carries no information, collapse it.
    if yes_answer == no_answer:
        return yes_answer
    return {question: [yes_answer, no_answer]}

In [53]:
def random_forest(data,n):
    """Build ``n`` decision trees, each on a random subset of feature columns.

    ``data`` must carry the label in its last column. The subset size, the
    minimum node size and the depth limit are read from the module-level
    ``number_of_random_features``, ``min_samples`` and ``max_depth``.

    Returns
    -------
    list of the ``n`` trees produced by ``decision_tree_algorithm``.
    """
    forest = []
    for _ in range(n):
        feature_map = create_subsample(data, number_of_random_features)
        chosen_columns = list(feature_map.values())
        # Re-attach the label column behind the selected features.
        label_column = data[:, -1].reshape(len(data[:, -1]), 1)
        subset = np.concatenate((data[:, chosen_columns], label_column), axis=1)
        grown = decision_tree_algorithm(subset,
                                        feature_map,
                                        0,
                                        min_samples,
                                        max_depth)
        forest.append(grown)
    return forest

In [54]:
def classify(example, tree):
    """Predict the label of one example by walking ``tree``.

    Parameters
    ----------
    example : indexable feature vector; it is indexed by the feature
        identifier stored in each question string.
    tree : either a bare label or a dict ``{"<feature> <= <threshold>":
        [branch0, branch1]}`` as produced by ``decision_tree_algorithm``.
        ``branch0`` was trained on rows with feature >= threshold and
        ``branch1`` on rows with feature < threshold.

    Returns
    -------
    The label stored in the leaf that the example reaches.
    """
    # A tree may have collapsed to a single label (no question at all).
    if not isinstance(tree, dict):
        return tree

    question = next(iter(tree))
    feature_name, _, value = question.split(" ")
    branches = tree[question]

    # Mirror the training-time partition exactly: strictly-below goes to
    # branch 1, everything else (including equality) to branch 0.
    if example[int(feature_name)] < float(value):
        answer = branches[1]
    else:
        answer = branches[0]
    return classify(example, answer)

In [55]:
def test_forrest(data,trees):
    """Predict a label for every row of ``data`` by majority vote of ``trees``.

    Parameters
    ----------
    data : 2-D array of examples (one per row).
    trees : list of trees as returned by ``random_forest``.

    Returns
    -------
    list with one predicted label per row, as a non-negative integer. On a
    tied vote the smallest label wins (first maximum of the bin counts).
    """
    result = []
    for i in range(len(data)):
        votes = [classify(data[i, :], tree) for tree in trees]
        # np.bincount needs integer input. Votes can be floats when the label
        # column lives in a float array, so cast explicitly instead of relying
        # on an implicit float -> int conversion (deprecated in recent NumPy).
        counts = np.bincount(np.asarray(votes).astype(np.intp))
        result.append(np.argmax(counts))
    return result

In [56]:
def accuracy(predict,actual):
    """Fraction of positions where ``predict`` and ``actual`` agree.

    Both arguments are indexed from ``0`` to ``len(predict) - 1``; the result
    is the number of equal pairs divided by ``len(predict)``.
    """
    total = len(predict)
    hits = sum(1 for idx in range(total) if predict[idx] == actual[idx])
    return hits / total

In [80]:

# create_subsample() draws feature columns from NumPy's global RNG; seeding it
# here makes the forest built in this cell the same every time the cell runs.
np.random.seed(0)

result = []
# Train on X_train (features + label in the last column), then vote on X_test.
trees=random_forest(X_train,number_of_trees)
y_pred = test_forrest(X_test,trees)

print('accuracy percentage {:.2%}'.format(accuracy(y_pred,y_test)))

accuracy percentage 96.50%


In [79]:
from sklearn.ensemble import RandomForestClassifier

# Reference model: scikit-learn's random forest with max_depth=4, the same
# value the max_depth setting above gives the hand-written forest.
RF = RandomForestClassifier(max_depth=4, random_state=0)
# X_train had the label appended as its last column earlier, so that column is
# sliced off here; y_train was reshaped to (1, n) and is flattened back to 1-D.
RF.fit(X_train[:,:-1], np.ravel(y_train))

print('Sklearn accuracy percentage {:.2%}'.format(RF.score(X_test, y_test)))

Sklearn accuracy percentage 97.20%
