<a href="https://colab.research.google.com/github/gourab-sinha/Machine_Learning/blob/master/Decision%20Tree/Project_Decision_Tree.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Load Packages
from sklearn import datasets
import pandas as pd
import numpy as np
import math
from sklearn.model_selection import train_test_split

# Iris Dataset 
#### Iris Dataset has high impurity that means our entropy for a node can go more than 1 which is fine and as it not the binary classifier and only four feature present so some cases my Decision tree may not predict more accurate classes.

In [0]:
# Decision Tree Class
class DecisionTree:
  # Types of classes present in iris dataset
  classes = ['setosa', 'versicolor', 'virginica']
  def __get_class(self,store_detail,X,features):
    # Convert Numpy array to pandas DataFrame ease of access.
    X = pd.DataFrame([X],columns=features)

    # Dynamic list to get the threshold values depending on conditions
    store_detail_new = store_detail

    # If type is not dict that means we reach to leaf node
    while type(store_detail_new[0])==dict:

      # Get the threshold value from the dictionary present at 0th index of store_detail_new
      feature_threshold = store_detail_new[0]

      # Access key val pair 
      for feature,threshold in feature_threshold.items():

        # if thresold value is greater then data point value then go leftsubtree otherwise rightsubtree or if only one direction exists
        if X[feature].iloc[0]<=threshold or len(store_detail_new)<=2:
          store_detail_new = store_detail_new[1]
        else:
          store_detail_new = store_detail_new[2]
        break
    
    return store_detail_new[0]




  def predict_class(self,store_detail,X,features):

    # Store prediction
    classified_result = []

    # Iterate over each data point
    for i in X:
      class_type = self.__get_class(store_detail,i,features)
      classified_result.append(class_type)

    return classified_result

  # Information gain
  def __information_gain(self,X,feature):

    # Continous values
    X_new = X[feature].values
    X_new.sort()
    total = X.shape[0]
    class_with_count = dict(X['target'].value_counts().items())

    # Node Entropy
    level_entropy = 0
    for key,val in class_with_count.items():
      level_entropy -= (val/total)*math.log2(val/total)
    
                      
    # Initialize required parameters
    max_info_gain = 0
    level_threshold = 0
    max_gain_ratio = 0

    for i in range(1,len(X_new)):
      # Define threshold 
      threshold = (X_new[i-1]+X_new[i])/2
      
      # Data points below or equal to threshold
      X_below_threshold = X[X[feature]<threshold]

      # Data points above threshold 
      X_above_threshold = X[X[feature]>=threshold]

      # Data points below and above
      total_below = X_below_threshold.shape[0]
      total_above = X_above_threshold.shape[0]

      info_required_below = 0
      if total_below>0:
        # Target values for below threshold
        Y_below = X_below_threshold['target']

        class_with_count = dict(Y_below.value_counts().items())

        info_required_below = 0
        # Information required below threshold
        for key,val in class_with_count.items():
          info_required_below += (val/total_below)*math.log2(val/total_below)
        
        info_required_below = -info_required_below
      
      info_required_above = 0
      if total_above>0:
        # Target values for above threshold
        Y_above = X_above_threshold['target']

        class_with_count = dict(Y_above.value_counts().items())

        for key,val in class_with_count.items():
          info_required_above+= (val/total_above)*math.log2(val/total_above)

        info_required_above = -info_required_above

      
      
      # Information required
      info_require = (total_below/total)*info_required_below + (total_above/total)*info_required_above

      # # Current gain
      current_info_gain = level_entropy - info_require

      # Split Information
      current_split_info = 0 
      if total_below>0:
        current_split_info += (total_below/total)*math.log2(total_below/total)
      if total_above>0:
        current_split_info += (total_above/total)*math.log2(total_above/total)
      
      current_split_info = -current_split_info

      # Current Gain Ratio 
      current_gain_ratio = 0
      if current_split_info>0:
        current_gain_ratio = current_info_gain/current_split_info
      # Check with previous Max Gain Ratio
      if max_gain_ratio <= current_gain_ratio :
        level_threshold = threshold
        max_info_gain  = current_info_gain
        max_gain_ratio = current_gain_ratio

    
    return max_info_gain, max_gain_ratio, level_entropy, level_threshold




  # Decision Tree 
  def decision_tree(self,X, features,level,targets):
    # Maintain details of threshold and selected feature to split upon
    store_info = []  

    types = dict(X[targets].value_counts().items())
    # print(len(types),len(features))
    if len(features)==0 or len(types)==1:
      print("Level ",level)
      for key,val in types.items():
        print("Count of ",self.classes[key],val)
      
      entropy = 0
      total = X.shape[0]
      majority = 0
      result_class = ""
      if total!=0:
        for key,val in types.items():
          entropy -= (val/total)*math.log2(val/total)
          if val>majority:
            majority = val
            result_class = key
      
      print("Current Entropy is", entropy)
      print("Reached leaf Node")
      return [result_class]
    
    # Parameters initialize with default values
    selected_feature = ""
    max_info_gain = 0
    level_gain_ratio = 0
    level_entropy = 0
    level_threshold = 0

    # Iterate over each feature and pick best one 
    for feature in features:
      gain, gain_ratio, entropy, threshold = self.__information_gain(X,feature)
      if gain_ratio>=level_gain_ratio:
        selected_feature = feature
        max_gain = gain
        level_gain_ratio = gain_ratio 
        level_entropy = entropy
        level_threshold = threshold  

    # Information frequency with target value. 
    count_with_classes = dict(X[targets].value_counts().items())
    print("Level ", level)
    for key,val in count_with_classes.items():
      print("Count of "+self.classes[key],val)
    print("Current Entropy is ",level_entropy)
    print("Splitting on feature "+str(selected_feature)+ " with gain ratio",level_gain_ratio)

    # Copy of feature to avoid reference issue
    # Removing may cause removing from actual features which may cause problem to other recursion call
    new_features = features.copy()
    new_features.remove(selected_feature)

    store_info.append({selected_feature:level_threshold})
    X1 = X[X[selected_feature]<=level_threshold].copy()
    X2 = X[X[selected_feature]>level_threshold].copy()

    # If there exists at least one point which goes in leftsubtree then store the returned result
    if(X1.shape[0]>0):
      print()
      store_info.append(self.decision_tree(X1,new_features,level+1,targets))

    # If there exists at least one point which goes in rightsubtree then store the returned result
    if X2.shape[0]>0:
      print()
      store_info.append(self.decision_tree(X2,new_features,level+1,targets))
    return store_info 

In [88]:
# Load data
iris = datasets.load_iris()
X = pd.DataFrame(iris.data,columns = iris.feature_names)

# Added target for future use
X['target'] = iris.target
features = iris.feature_names

# Target values
Y = pd.DataFrame(iris.target,columns=['Target'])
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size = 0.20)

# Decision Tree object and function call
dc_tree = DecisionTree()
store_detail = dc_tree.decision_tree(X_train,features,0,"target")

Level  0
Count of virginica 42
Count of versicolor 40
Count of setosa 38
Current Entropy is  1.583759753416363
Splitting on feature petal length (cm) with gain ratio 0.20040451915865418

Level  1
Count of virginica 42
Count of versicolor 40
Count of setosa 37
Current Entropy is  1.5830175105308546
Splitting on feature sepal width (cm) with gain ratio 0.2039216215090233

Level  2
Count of setosa 2
Count of virginica 1
Count of versicolor 1
Current Entropy is  1.5
Splitting on feature sepal length (cm) with gain ratio 0.5

Level  3
Count of virginica 1
Count of setosa 1
Current Entropy is  1.0
Splitting on feature petal width (cm) with gain ratio 0

Level  4
Count of  virginica 1
Count of  setosa 1
Current Entropy is 1.0
Reached leaf Node

Level  3
Count of versicolor 1
Count of setosa 1
Current Entropy is  1.0
Splitting on feature petal width (cm) with gain ratio 0

Level  4
Count of  versicolor 1
Count of  setosa 1
Current Entropy is 1.0
Reached leaf Node

Level  2
Count of virginica 4

In [0]:
# Features
iris = datasets.load_iris()
features = iris.feature_names

# Drop target column as predict function doesn't require
X_test.drop(['target'],axis = 1, inplace=True)

# Predict class
Predicted_class = dc_tree.predict_class(store_detail,X_test.values,features)

In [0]:
Y_test = Y_test.values
for i in range(len(Y_test)):
  print(Predicted_class[i],Y_test[i])