<a href="https://colab.research.google.com/github/gourab-sinha/Machine_Learning/blob/master/Decision%20Tree/Project_Decision_Tree.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Load Packages
from sklearn import datasets
import pandas as pd
import numpy as np
import math

In [0]:
# Decision Tree Class
class DecisionTree:
  classes = ['setosa', 'versicolor', 'virginica']
  
  # Information gain
  def __information_gain(self,X,feature):

    # Continous values
    X_new = X[feature].values
    X_new.sort()
    total = X.shape[0]
    class_with_count = dict(X['target'].value_counts().items())

    # Node Entropy
    level_entropy = 0
    for key,val in class_with_count.items():
      level_entropy += (val/total)*math.log2(val/total)
    
    level_entropy = -level_entropy
                      
    # Initialize required parameters
    max_info_gain = 0
    level_threshold = 0
    max_gain_ratio = 0

    for i in range(1,len(X_new)):
      # Define threshold 
      threshold = (X_new[i-1]+X_new[i])/2
      
      # Data points below or equal to threshold
      X_below_threshold = X[X[feature]<threshold]

      # Data points above threshold 
      X_above_threshold = X[X[feature]>=threshold]


      # # Data points below and above
      total_below = X_below_threshold.shape[0]
      total_above = X_above_threshold.shape[0]
      # # print(total_below,total_above)

      info_required_below = 0
      if total_below>0:
        # Indices of target values for below threshold
        Y_below = X_below_threshold['target']

        class_with_count = dict(Y_below.value_counts().items())

        info_required_below = 0
        # Information required below threshold
        for key,val in class_with_count.items():
          info_required_below += (val/total_below)*math.log2(val/total_below)
        
        info_required_below = -info_required_below
      
      info_required_above = 0
      if total_above>0:
        # Indices of target values for above threshold
        Y_above = X_above_threshold['target']

        class_with_count = dict(Y_above.value_counts().items())

        for key,val in class_with_count.items():
          info_required_above+= (val/total_above)*math.log2(val/total_above)

        info_required_above = -info_required_above

      
      
      # Information required
      info_require = (total_below/total)*info_required_below + (total_above/total)*info_required_above

      # # Current gain
      current_info_gain = level_entropy - info_require

      # Split Information
      current_split_info = 0 
      if total_below>0:
        current_split_info += (total_below/total)*math.log2(total_below/total)
      if total_above>0:
        current_split_info += (total_above/total)*math.log2(total_above/total)
      
      current_split_info = -current_split_info

      # Current Gain Ratio 
      current_gain_ratio = 0
      if current_split_info>0:
        current_gain_ratio = current_info_gain/current_split_info
      # Check with previous Max Gain Ratio
      if max_gain_ratio <= current_gain_ratio :
        level_threshold = threshold
        max_info_gain  = current_info_gain
        max_gain_ratio = current_gain_ratio

    
    return max_info_gain, max_gain_ratio, level_entropy, level_threshold




  # Decision Tree 
  def decision_tree(self,X, features,level,targets):  
    types = dict(X[targets].value_counts().items())
    # print(len(types),len(features))
    if len(features)==0 or len(types)==1:
      print("Level ",level)
      for key,val in types.items():
        print("Count of ",self.classes[key],val)
      
      entropy = 0
      total = X.shape[0]
      if total!=0:
        for key,val in types.items():
          entropy -= (val/total)*math.log2(val/total)
      
      print("Current Entropy is", entropy)
      print("Reached leaf Node")
      return 
    
    selected_feature = ""
    max_info_gain = 0
    level_gain_ratio = 0
    level_entropy = 0
    level_threshold = 0 # for continuous
    for feature in features:
      gain, gain_ratio, entropy, threshold = self.__information_gain(X,feature)
      if gain_ratio>=level_gain_ratio:
        selected_feature = feature
        max_gain = gain
        level_gain_ratio = gain_ratio 
        level_entropy = entropy
        level_threshold = threshold  

    
    count_with_classes = dict(X[targets].value_counts().items())
    print("Level ", level)
    for key,val in count_with_classes.items():
      print("Count of "+self.classes[key],val)
    print("Current Entropy is ",level_entropy)
    print("Splitting on feature "+str(selected_feature)+ " with gain ratio",level_gain_ratio)
    features.remove(selected_feature)


    X1 = X[X[selected_feature]<=level_threshold].copy()
    X2 = X[X[selected_feature]>level_threshold].copy()
    self.decision_tree(X1,features,level+1,targets)
    self.decision_tree(X2,features,level+1,targets)
    

In [259]:
# Load data
iris = datasets.load_iris()
X = pd.DataFrame(iris.data,columns = iris.feature_names)
X['target'] = iris.target
features = iris.feature_names

# Decision Tree object and function call
dc_tree = DecisionTree()
dc_tree.decision_tree(X,features,0,"target")

Level  0
Count of virginica 50
Count of versicolor 50
Count of setosa 50
Current Entropy is  1.584962500721156
Splitting on feature petal width (cm) with gain ratio 0.9999999999999999
Level  1
Count of setosa 50
Count of versicolor 7
Current Entropy is  0.5373760853377336
Splitting on feature petal length (cm) with gain ratio 1.0
Level  2
Count of  setosa 50
Current Entropy is 0.0
Reached leaf Node
Level  2
Count of  versicolor 7
Current Entropy is 0.0
Reached leaf Node
Level  1
Count of virginica 50
Count of versicolor 43
Current Entropy is  0.9959094138937685
Splitting on feature sepal length (cm) with gain ratio 0.9258200994482485
Level  2
Count of versicolor 43
Count of virginica 8
Current Entropy is  0.6267511370265895
Splitting on feature sepal width (cm) with gain ratio 0.42022930815923804
Level  3
Count of  versicolor 43
Count of  virginica 7
Current Entropy is 0.584238811642856
Reached leaf Node
Level  3
Count of  virginica 1
Current Entropy is 0.0
Reached leaf Node
Level  2
C