# Decision Trees

In [21]:
import pandas as pd
import numpy as np

### Gini Index



In [50]:
class DecisionTree:

    def __init__(self):
        # Initialize constructor with the following objects:
        self.max_depth = 100
        self.tree = None # To store the fitted decision tree

    
    def gini_index(self, y):
        """
        Compute the gini index of a given set of labels.
        Parameters:
            y: array-like.
                The response variable that you are trying to predict
        Returns
            float:
                The Gini Index for a given y.
        """
        gini = 1
        classes = np.unique(y)
        for c in classes:
            pi_c =  len(y[y==c])/len(y)
            gini -= pi_c**2

        return gini

    def find_feature_type(self, feature):
        if feature.dtype == 'object' or (feature.dtype == np.int_ and len(np.unique(feature)) < 5):
            return 'categorical'
        else:
            return 'continuous'

    def find_split(self, X, y):
        """
        Find the best feature-threshold to split a given node.

        Parameters:
            X: array-like
                Set of explanatory features
            y: array-like
                Response variable.

        Returns:
            selected_features: int
                The feature that provides the best split (that is, that 
                minimizes the Gini Index)
            selected_threshold: float
                The thresgold that provides the best split
        """

        n_features = X.shape[1]
        gini = float('inf')
        selected_feature = None
        selected_threshold = None

        for j in range(0, n_features):
            feature = X[:,j]
            thresholds = np.sort(np.unique(feature), axis = 0)  #sort:
            
            feature_type = self.find_feature_type(feature)

            for t in thresholds:

                if feature_type == "continuous":
                    y_left  = y[ feature  < t ]
                    y_right = y[ feature >= t ]
                elif feature_type =="categorical":
                    y_left  = y[ feature == t ]
                    y_right = y[ feature != t ]
                # Continue to next threshold if one of the splits is empty
                if len(y_left) == 0 or len(y_right) == 0:
                    continue

                weighted_gini = ( self.gini_index(y=y_left)*len(y_left) +
                                  self.gini_index(y=y_right)*len(y_right) )/len(y)

                if weighted_gini < gini:
                    gini = weighted_gini
                    selected_feature = j 
                    selected_threshold = t

        return selected_feature, selected_threshold


    def make_split(self, X, y, selected_feature, selected_threshold):
        """ 
        Split the data into left and right branches

        Returns 
        
        """

        left_mask = X[:,selected_feature] < selected_threshold
        y_left, X_left = y[left_mask], X[left_mask,:]
        y_right, X_right = y[~left_mask], X[~left_mask,:]

        return X_left, y_left, X_right, y_right


    def fit(self, X, y, max_depth=100, depth=0):
        X = np.array(X)
        y = np.array(y)

        if depth == max_depth or len(y) < 2:
            unique_classes, counts = np.unique(y, return_counts=True)
            return unique_classes[np.argmax(counts)]
        else: 

            # Find best [feature, threshold] for this node
            selected_feature, selected_threshold = self.find_split(X, y)

            # Split the data
            X_left, y_left, X_right, y_right = self.make_split(X, y,
                                            selected_feature, selected_threshold)
               # If one of the splits is empty, return the most common class of the current node
            if len(y_left) == 0 or len(y_right) == 0:
                unique_classes, counts = np.unique(y, return_counts=True)
                return unique_classes[np.argmax(counts)]
    
        #RECURSION
            left_subtree = self.fit(X_left,   y_left,  depth + 1)
            right_subtree = self.fit(X_right, y_right, depth + 1)
        # Return the current node with the left and right subtrees
        
         # Return the current node with the left and right subtrees
        return {
            'feature': selected_feature,
            'threshold': selected_threshold,
            'left': left_subtree,
            'right': right_subtree
        }

    def predict(self,X_test):
        """
            Predict y class given the observations for the set of explanatory 
            featyres X
            Parameters:
                X_test: array-like
                    New observations of the explanatory features
                Returns
                y_pred: array-like
                    Predictions for y
        """
        X = np.array(X_test)






In [65]:
df = pd.read_csv('mock_decision_tree_dataset.csv')
tree = DecisionTree()
tree.fit(X = df.drop('y', axis =1), y=df['y'])

'class_2'