<a href="https://colab.research.google.com/github/farahelmashad/ML_Models/blob/main/DescisionTree.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter


In [12]:
class Node:
    """A single node of the decision tree.

    An internal node carries a split (``feature``, ``threshold``) and its two
    subtrees (``left``, ``right``).  A leaf carries only ``value``, the label
    to predict.  ``value`` is keyword-only so a leaf must be built explicitly
    as ``Node(value=...)``.
    """

    def __init__(self, feature=None, threshold=None, left=None, right=None, *, value=None):
        # Split description used by internal nodes.
        self.feature, self.threshold = feature, threshold
        # Subtrees followed for "<= threshold" (left) and "> threshold" (right).
        self.left, self.right = left, right
        # Stored label; stays None on internal nodes.
        self.value = value

    def is_leaf_node(self):
        """Return True when this node stores a value, i.e. it is a leaf."""
        has_value = self.value is not None
        return has_value

class DecisionTree:
    """Binary decision-tree classifier that splits on information gain (entropy).

    Parameters
    ----------
    min_samples_split : int
        A node holding fewer samples than this becomes a leaf.
    max_depth : int
        A node at this depth (the root is depth 0) becomes a leaf.
    n_features : int or None
        Number of features drawn at random, without replacement, as split
        candidates at every node.  A falsy value (``None`` or 0) means "use
        every feature of the training data".
    """

    def __init__(self, min_samples_split=2, max_depth=100, n_features=None):
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.n_features = n_features
        self.root = None  # set by fit()

    def fit(self, X, y):
        """Grow the tree from a 2-D sample matrix ``X`` and its labels ``y``."""
        # Work on ndarrays so that all indexing below is positional; a pandas
        # Series would otherwise be indexed by label and a DataFrame would not
        # support ``X[:, feature]`` at all.
        X = np.asarray(X)
        y = np.asarray(y)
        # NOTE: this overwrites the constructor argument with the effective
        # count, so a later fit() on wider data stays capped at this value.
        self.n_features = X.shape[1] if not self.n_features else min(X.shape[1], self.n_features)
        self.root = self._grow_tree(X, y)

    def _grow_tree(self, X, y, depth=0):
        """Recursively build the subtree for the samples ``X``/``y`` and return its root."""
        n_samples, n_feats = X.shape
        n_labels = len(np.unique(y))

        # Stopping criteria: depth limit reached, node is pure, or too few samples.
        if depth >= self.max_depth or n_labels == 1 or n_samples < self.min_samples_split:
            return Node(value=self._most_common_label(y))

        # Pick the candidate features for this node and search for the best split.
        features = np.random.choice(n_feats, self.n_features, replace=False)
        best_threshold, best_feature = self._best_split(X, y, features)
        if best_feature is None:
            # No candidate feature can separate the samples (e.g. identical
            # rows with different labels): predict the majority label.
            return Node(value=self._most_common_label(y))

        left_indices, right_indices = self._split(X[:, best_feature], best_threshold)
        if len(left_indices) == 0 or len(right_indices) == 0:
            # Defensive: never recurse into an empty child, which would have
            # no label to predict.
            return Node(value=self._most_common_label(y))

        left = self._grow_tree(X[left_indices, :], y[left_indices], depth + 1)
        right = self._grow_tree(X[right_indices, :], y[right_indices], depth + 1)
        return Node(best_feature, best_threshold, left, right)

    def _best_split(self, X, y, features):
        """Find the split with the highest information gain among ``features``.

        Returns ``(threshold, feature)`` -- note the order -- or
        ``(None, None)`` when none of the candidate features has at least two
        distinct values, i.e. no split can separate the samples.
        """
        best_ig = -float('inf')
        best_feature, best_threshold = None, None
        for feature in features:
            X_column = X[:, feature]
            # np.unique returns the values sorted.  The largest one is dropped
            # because "x <= max" sends every sample left and leaves the right
            # child empty, which is not a real split.
            thresholds = np.unique(X_column)[:-1]

            for threshold in thresholds:
                ig = self._information_gain(y, X_column, threshold)
                if ig > best_ig:
                    best_ig = ig
                    best_feature = feature
                    best_threshold = threshold

        return best_threshold, best_feature

    def _information_gain(self, y, X_column, threshold):
        """Return parent entropy minus the size-weighted entropy of the two children."""
        parent_entropy = self._entropy(y)
        left_indices, right_indices = self._split(X_column, threshold)
        if len(left_indices) == 0 or len(right_indices) == 0:
            # Useless split: one child is the whole parent, so nothing is gained.
            return 0.0

        n_total = len(y)
        left_entropy = self._entropy(y[left_indices])
        right_entropy = self._entropy(y[right_indices])
        weighted_entropy = (
            (len(left_indices) / n_total) * left_entropy
            + (len(right_indices) / n_total) * right_entropy
        )
        return parent_entropy - weighted_entropy

    def _split(self, X_column, threshold):
        """Return the positions of values ``<= threshold`` and of values ``> threshold``."""
        left_indices = np.flatnonzero(X_column <= threshold)
        right_indices = np.flatnonzero(X_column > threshold)
        return left_indices, right_indices

    def _entropy(self, y):
        """Return the Shannon entropy (base 2) of the label distribution in ``y``."""
        _, counts = np.unique(y, return_counts=True)
        # Every count is >= 1, so no probability is zero and log2 is finite.
        probs = counts / counts.sum()
        return (-probs * np.log2(probs)).sum()

    def _most_common_label(self, y):
        """Return the most frequent label in ``y`` (``y`` must not be empty)."""
        counter = Counter(y)
        return counter.most_common(1)[0][0]

    def predict(self, X):
        """Return an array with the predicted label for every row of ``X``."""
        # Iterate rows positionally even when X is a list or a DataFrame.
        X = np.asarray(X)
        return np.array([self._traverse_tree(x, self.root) for x in X])

    def _traverse_tree(self, x, node):
        """Walk from ``node`` down to a leaf for sample ``x`` and return the leaf's value."""
        # Iterative descent: go left when the sample's feature value is
        # <= the node's threshold, right otherwise.
        while not node.is_leaf_node():
            node = node.left if x[node.feature] <= node.threshold else node.right
        return node.value









In [13]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Load the iris dataset and hold out 20% of the samples for evaluation.
data = datasets.load_iris()
X = data.data
y = data.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1234
)

# Train the tree on the training split, then score it on the held-out split.
clf = DecisionTree()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(acc)

0.9666666666666667
