In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import random
from pprint import pprint

##  Helper Functions

### Getting the potential split

In [2]:
def get_potential_splits(data,random_subspace):
    """Collect the candidate split values for each feature column of ``data``.

    Parameters
    ----------
    data : 2-D array
        Every column except the last one is treated as a feature.
    random_subspace : int or None
        When truthy and smaller than the number of columns, only that many
        randomly sampled feature columns are considered.

    Returns
    -------
    dict
        Maps column index -> list of candidate split values. Columns flagged
        'Continious' in the global ``FEATURE_TYPES`` get the midpoints between
        consecutive sorted unique values; all other columns get their
        distinct values.
    """
    feature_indices = list(range(data.shape[1] - 1))

    # Optionally narrow the search to a random subset of the features.
    if random_subspace and random_subspace < data.shape[1]:
        feature_indices = random.sample(population=feature_indices, k=random_subspace)

    candidates = {}
    for feature in feature_indices:
        column = data[:, feature]

        if FEATURE_TYPES[feature] != 'Continious':
            # Non-continuous feature: every distinct value is a candidate.
            candidates[feature] = list(set(column))
            continue

        # Continuous feature: split halfway between neighbouring unique values.
        distinct = np.unique(column)
        candidates[feature] = [
            (distinct[pos] + distinct[pos + 1]) / 2
            for pos in range(len(distinct) - 1)
        ]

    return candidates

### Checking type of features

In [3]:
def determine_type_of_feature(df):
    """Label every column of ``df`` except the last as categorical or continuous.

    A column is 'Categorical' when it has at most 15 distinct values or when
    its first distinct value is a string; otherwise it is 'Continious'
    (spelling kept, other functions compare against this exact string).

    Returns a list with one label per inspected column, in column order.
    """
    max_categorical_levels = 15

    def label_for(levels):
        # The length test comes first so an empty column never gets indexed.
        if len(levels) <= max_categorical_levels or isinstance(levels[0], str):
            return 'Categorical'
        return 'Continious'

    return [label_for(df[name].unique()) for name in list(df.columns)[:-1]]

### Split Function

In [4]:
def split_data(data,split_column,split_value):
    """Partition the rows of ``data`` on one column.

    For a column flagged 'Continious' in the global ``FEATURE_TYPES`` the
    "below" part holds rows with value <= ``split_value`` and the "above"
    part rows with value > ``split_value``. For any other column "below"
    holds rows equal to ``split_value`` and "above" the rows that differ.

    Returns the tuple ``(data_below, data_above)``.
    """
    column = data[:, split_column]

    if FEATURE_TYPES[split_column] == 'Continious':
        below_mask, above_mask = column <= split_value, column > split_value
    else:
        below_mask, above_mask = column == split_value, column != split_value

    return data[below_mask], data[above_mask]

## Metric Functions

### Gini Index

In [5]:
def gini(data):
    """Return the Gini impurity of the labels stored in the last column.

    Computed as ``1 - sum(p_i ** 2)`` where ``p_i`` is the relative frequency
    of each distinct label.
    """
    class_counts = np.unique(data[:, -1], return_counts=True)[1]
    class_probs = class_counts / class_counts.sum()

    return 1 - np.dot(class_probs, class_probs)

### Entropy

In [6]:
def entropy(data):
    """Return the Shannon entropy (base 2) of the labels in the last column.

    Computed as ``sum(p_i * -log2(p_i))`` over the relative frequency ``p_i``
    of each distinct label.
    """
    class_counts = np.unique(data[:, -1], return_counts=True)[1]
    class_probs = class_counts / class_counts.sum()
    surprisal = -np.log2(class_probs)

    return sum(class_probs * surprisal)

### Overall Metric

In [7]:
def overall_metric(data_below,data_above,metric_function):
    """Return the size-weighted impurity of a two-way split.

    Each partition's ``metric_function`` value is weighted by the fraction of
    all rows that fall into that partition, and the two terms are added.
    """
    total_rows = len(data_above) + len(data_below)

    def weighted(partition):
        # Impurity of one partition scaled by its share of the rows.
        return (len(partition) / total_rows) * metric_function(partition)

    return weighted(data_above) + weighted(data_below)

## Getting the best split

In [8]:
def get_best_split(data,potential_splits,metric_function = gini):
    """Pick the (column, value) pair whose split has the lowest weighted metric.

    Parameters
    ----------
    data : 2-D array
        Rows to split; passed through to ``split_data``.
    potential_splits : dict
        Maps column index -> iterable of candidate split values.
    metric_function : callable
        Impurity measure applied to each partition (defaults to ``gini``).

    Returns
    -------
    tuple
        ``(best_column, best_value)``.

    Raises
    ------
    ValueError
        If ``potential_splits`` contains no candidate value at all. The
        original code failed with an ``UnboundLocalError`` in that case.
    """
    best_metric = None
    best_column = None
    best_value = None

    for column_index, candidate_values in potential_splits.items():
        for value in candidate_values:
            data_below, data_above = split_data(
                data, split_column=column_index, split_value=value
            )
            # Arguments follow overall_metric's (data_below, data_above) order.
            current_metric = overall_metric(data_below, data_above, metric_function)

            # "<=" keeps the original tie-breaking: among equally good
            # candidates the one visited last wins.
            if best_metric is None or current_metric <= best_metric:
                best_metric = current_metric
                best_column = column_index
                best_value = value

    if best_metric is None:
        raise ValueError("potential_splits contains no candidate split values")

    return best_column, best_value

### Check Purity

In [9]:
def check_purity(data):
    """Return True when the last column of ``data`` holds exactly one distinct label."""
    distinct_labels = np.unique(data[:, -1])
    return len(distinct_labels) == 1

### Creating Leaf

In [10]:
def create_leaf(data):
    """Return the most frequent label found in the last column of ``data``.

    Ties are resolved by ``argmax``, i.e. the first label in the sorted
    order produced by ``np.unique`` wins.
    """
    labels, frequencies = np.unique(data[:, -1], return_counts=True)
    return labels[frequencies.argmax()]

### Train Test Split

In [11]:
def train_test_split(data,split_ratio = 0.7,random_state=123):
    """Randomly partition ``data`` into a train part and a test part.

    Seeds NumPy's global random generator with ``random_state``, draws one
    uniform number per row and sends the row to the train part when that
    number is below ``split_ratio``; so the ratio holds only in expectation.

    Returns the tuple ``(train, test)``.
    """
    np.random.seed(random_state)
    in_train = np.random.rand(len(data)) < split_ratio

    train_part = data[in_train]
    test_part = data[~in_train]
    return train_part, test_part

### Bootstrapping

In [12]:
def bootstrap(df,n_bootstrap,random_state = 1729):
    """Draw ``n_bootstrap`` rows from ``df`` with replacement.

    NumPy's global random generator is re-seeded with ``random_state`` on
    every call, so identical arguments always yield the identical sample.
    """
    np.random.seed(random_state)
    sampled_rows = np.random.randint(low=0, high=len(df), size=n_bootstrap)

    return df.iloc[sampled_rows, :]

## Decision Tree Algorithm

In [13]:
def decision_tree_algorithm(df,counter =0, max_depth =5,min_samples = 10,random_subspace=None):
    """Recursively grow a decision tree.

    Parameters
    ----------
    df : pandas.DataFrame on the top-level call (``counter == 0``); the
        recursive calls pass the underlying 2-D array instead. The last
        column is used as the label.
    counter : int
        Current depth; callers should leave it at 0.
    max_depth : int
        A node at this depth becomes a leaf.
    min_samples : int
        A node with fewer rows than this becomes a leaf.
    random_subspace : int or None
        Number of randomly chosen features examined per split
        (forwarded to ``get_potential_splits``).

    Returns
    -------
    Either a leaf label, or a dict ``{question: [yes_answer, no_answer]}``
    where ``question`` is ``"<column index> <= <value>"`` for continuous
    features and ``"<column index> = <value>"`` otherwise.

    Side effects
    ------------
    The top-level call (re)assigns the globals ``COLUMN_NAMES`` and
    ``FEATURE_TYPES``.
    """
    global COLUMN_NAMES,FEATURE_TYPES

    if counter == 0:
        COLUMN_NAMES = list(df.columns)
        FEATURE_TYPES = determine_type_of_feature(df)
        data = df.values
    else:
        data = df

    # Stop criteria: pure node, depth limit reached, or too few rows.
    if check_purity(data) or counter == max_depth or len(data) < min_samples:
        return create_leaf(data)

    counter += 1
    potential_splits = get_potential_splits(data, random_subspace)

    # All examined features may be constant (no midpoint to split on);
    # there is nothing to choose from, so this node becomes a leaf.
    if not any(len(values) > 0 for values in potential_splits.values()):
        return create_leaf(data)

    column_index, split_value = get_best_split(data, potential_splits)
    data_below, data_above = split_data(data, column_index, split_value)

    if len(data_below) == 0 or len(data_above) == 0:
        return create_leaf(data)

    if FEATURE_TYPES[column_index] == 'Continious':
        question = "{} <= {}".format(column_index, split_value)
    else:
        question = "{} = {}".format(column_index, split_value)

    # Forward the hyper-parameters by keyword. The original passed
    # random_subspace positionally, where it landed in the max_depth slot,
    # so max_depth/min_samples/random_subspace were lost below the root.
    yes_answer = decision_tree_algorithm(
        data_below,
        counter,
        max_depth=max_depth,
        min_samples=min_samples,
        random_subspace=random_subspace,
    )
    no_answer = decision_tree_algorithm(
        data_above,
        counter,
        max_depth=max_depth,
        min_samples=min_samples,
        random_subspace=random_subspace,
    )

    # Both branches predict the same thing: the question is redundant.
    if yes_answer == no_answer:
        return yes_answer

    return {question: [yes_answer, no_answer]}

### Decision Tree Classifier

In [14]:
def decision_tree_classifer(example,tree):
    """Classify one example by walking a tree built by ``decision_tree_algorithm``.

    Parameters
    ----------
    example : indexable
        Feature values addressed by integer column index.
    tree : dict or leaf label
        Either ``{question: [yes_answer, no_answer]}`` with ``question``
        formatted as ``"<column index> <= <value>"`` or
        ``"<column index> = <value>"``, or a bare leaf label.

    Returns
    -------
    The label stored at the leaf that the example reaches. A ``tree`` that is
    not a dict is treated as a leaf and returned unchanged.
    """
    node = tree

    # Descend until something that is not a question node is reached. This
    # also covers a tree that collapsed to a single leaf label.
    while isinstance(node, dict):
        question = next(iter(node))

        # Split on the first two spaces only, so a categorical value that
        # contains spaces (or is empty) stays intact.
        column_index, comparison_operator, value = question.split(" ", 2)
        column_index = int(column_index)

        if comparison_operator == "<=":
            is_yes = example[column_index] <= float(value)
        else:
            is_yes = str(example[column_index]) == value

        yes_answer, no_answer = node[question][0], node[question][1]
        node = yes_answer if is_yes else no_answer

    return node

## Random Forest Algorithm

In [15]:
def random_forest_algorithm(train_df,n_trees,n_bootstrap,n_features,max_depth=5,random_state=1729):
    """Train ``n_trees`` decision trees, each on its own bootstrap sample.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training data; the last column is the label.
    n_trees : int
        Number of trees in the forest.
    n_bootstrap : int
        Number of rows drawn (with replacement) for every tree.
    n_features : int or None
        Size of the random feature subset examined per split.
    max_depth : int
        Depth limit handed to ``decision_tree_algorithm``.
    random_state : int
        Base seed; tree ``i`` is bootstrapped with ``random_state + i``.

    Returns
    -------
    list
        The trained trees, in training order.
    """
    forest = []
    for tree_index in range(n_trees):
        # bootstrap() re-seeds NumPy on every call; with its default seed all
        # trees would receive the very same sample, so vary the seed per tree.
        df_bootstrapped = bootstrap(
            train_df, n_bootstrap, random_state=random_state + tree_index
        )
        tree = decision_tree_algorithm(
            df=df_bootstrapped, random_subspace=n_features, max_depth=max_depth
        )
        forest.append(tree)

    return forest

### Random Forest Classifier

In [28]:
def random_tree_classifier(example,forest):
    """Return the majority vote of every tree in ``forest`` for ``example``.

    Each tree is queried through ``decision_tree_classifer``; the label that
    occurs most often among the individual predictions is returned.
    """
    votes = [decision_tree_classifer(example, tree) for tree in forest]

    return max(set(votes), key=votes.count)

## Accuracy

In [65]:
def classify_data(test_df,forest):
    """Predict a label for every row of ``test_df`` using ``forest``.

    Rows are handed to ``random_tree_classifier`` as raw arrays
    (``raw=True``); the result of ``DataFrame.apply`` is returned as is.
    """
    return test_df.apply(random_tree_classifier, axis=1, raw=True, args=(forest,))

In [66]:
def calculate_Accuracy(labels,predictions):
    """Return the fraction of positions where ``labels`` equals ``predictions``."""
    is_correct = np.array(labels == predictions)
    return is_correct.mean()