In [1]:
#https://towardsdatascience.com/decision-tree-from-scratch-in-python-46e99dfea775
import numpy as np
import pandas as pd

In [2]:
class Node:
    """One node of a binary decision tree.

    Attributes are set in the constructor:
      predicted_class -- the class value passed in by the caller.
      feature_index   -- index of the feature used by this node's split
                         (initialised to 0).
      threshold       -- split threshold for that feature (initialised to 0).
      left, right     -- child nodes; both start as None.
    """

    def __init__(self, predicted_class):
        self.predicted_class = predicted_class
        # Split rule placeholders; overwritten by whoever splits this node.
        self.feature_index = self.threshold = 0
        # No children until the node is split.
        self.left = self.right = None
        
class DecisionTreeClassifier:
    """Binary decision tree classifier that chooses splits by Gini impurity.

    Class labels must be the integers ``0 .. n_classes - 1``: they are used
    directly as indices into the per-class count lists.

    Parameters
    ----------
    max_depth : int or None, default=None
        Maximum depth of the tree. ``None`` means no depth limit; a node is
        then split for as long as some split lowers its Gini impurity.
    """

    def __init__(self, max_depth=None):
        self.max_depth = max_depth

    def fit(self, X, y):
        """Grow the tree from the training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Feature matrix. Values only need to support ordering and
            averaging (thresholds are midpoints of adjacent sorted values).
        y : array-like of shape (n_samples,)
            Class labels; must be exactly the integers ``0 .. n_classes - 1``.
            Integer-valued floats (e.g. ``1.0``) are accepted and converted.

        Raises
        ------
        ValueError
            If ``y`` is empty, contains non-integer values, or its labels are
            not exactly ``0 .. n_classes - 1``.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if y.size == 0:
            raise ValueError("y must contain at least one sample")
        if y.dtype.kind not in "iu":
            # Labels index plain lists below, so they must be real integers.
            y_int = y.astype(int)
            if not np.array_equal(y_int, y):
                raise ValueError("class labels must be integers")
            y = y_int
        self.n_classes_ = int(np.unique(y).size)
        if y.min() < 0 or y.max() >= self.n_classes_:
            # A gap or a negative label would index the wrong count slot.
            raise ValueError("class labels must be exactly 0 .. n_classes - 1")
        self.n_features_ = X.shape[1]
        self.tree_ = self._grow_tree(X, y)

    def predict(self, X):
        """Return a list holding the predicted class for each row of ``X``."""
        return [self._predict(inputs) for inputs in X]

    def _best_split(self, X, y):
        """Find the split that minimises the weighted Gini impurity.

        Returns
        -------
        (best_idx, best_thr)
            Feature index and threshold of the best split, or ``(None, None)``
            when there are fewer than two samples or no split has a lower
            impurity than the unsplit node.
        """
        m = y.size
        if m <= 1:
            return None, None
        num_parent = [np.sum(y == c) for c in range(self.n_classes_)]
        # Impurity of the unsplit node; a split must beat this to be chosen.
        best_gini = 1.0 - sum((n / m) ** 2 for n in num_parent)
        best_idx, best_thr = None, None
        for idx in range(self.n_features_):
            thresholds, classes = zip(*sorted(zip(X[:, idx], y)))
            num_left = [0] * self.n_classes_
            num_right = num_parent.copy()
            # Sweep the split position i: samples [0, i) go left, [i, m) right.
            for i in range(1, m):
                c = classes[i - 1]
                num_left[c] += 1
                num_right[c] -= 1
                # Equal neighbouring values cannot be separated by a threshold;
                # the counts above are still updated, only the scoring is skipped.
                if thresholds[i] == thresholds[i - 1]:
                    continue
                gini_left = 1.0 - sum(
                    (num_left[x] / i) ** 2 for x in range(self.n_classes_)
                )
                gini_right = 1.0 - sum(
                    (num_right[x] / (m - i)) ** 2 for x in range(self.n_classes_)
                )
                gini = (i * gini_left + (m - i) * gini_right) / m
                if gini < best_gini:
                    best_gini = gini
                    best_idx = idx
                    best_thr = (thresholds[i] + thresholds[i - 1]) / 2
        return best_idx, best_thr

    def _grow_tree(self, X, y, depth=0):
        """Recursively build the subtree for the samples ``(X, y)``.

        Every node stores the majority class of its samples. The node is
        split only while the depth limit allows it and ``_best_split`` finds
        an improving split.
        """
        num_samples_per_class = [np.sum(y == i) for i in range(self.n_classes_)]
        predicted_class = np.argmax(num_samples_per_class)
        node = Node(predicted_class=predicted_class)
        # max_depth=None means "unlimited"; comparing an int with None would
        # raise TypeError, so it is checked explicitly.
        if self.max_depth is None or depth < self.max_depth:
            idx, thr = self._best_split(X, y)
            if idx is not None:
                indices_left = X[:, idx] < thr
                X_left, y_left = X[indices_left], y[indices_left]
                X_right, y_right = X[~indices_left], y[~indices_left]
                node.feature_index = idx
                node.threshold = thr
                node.left = self._grow_tree(X_left, y_left, depth + 1)
                node.right = self._grow_tree(X_right, y_right, depth + 1)
        return node

    def _predict(self, inputs):
        """Walk from the root to a leaf and return that leaf's class."""
        node = self.tree_
        # Nodes are either leaves (no children) or have both children set.
        while node.left is not None:
            if inputs[node.feature_index] < node.threshold:
                node = node.left
            else:
                node = node.right
        return node.predicted_class

In [3]:
# Read the raw table and discard the ID column in one expression
# (no in-place mutation), then display the frame.
df = pd.read_csv("data.csv").drop(columns=["ID"])
df

Unnamed: 0,Age,Income,Gender,Marital Status,Buys
0,<21,High,Male,Single,No
1,<21,High,Male,Married,No
2,21-35,High,Male,Single,Yes
3,>35,Medium,Male,Single,Yes
4,>35,Low,Female,Single,Yes
5,>35,Low,Female,Married,No
6,21-35,Low,Female,Married,Yes
7,<21,Medium,Male,Single,No
8,<21,Low,Female,Married,Yes
9,>35,Medium,Female,Single,Yes


In [4]:
# One-hot encode all categorical feature columns in a single call. The
# non-encoded column (Buys) stays first, followed by the dummy columns in the
# order listed here. dtype=int keeps the dummies as 0/1 integers on every
# pandas version (pandas >= 2.0 would otherwise produce booleans).
df = pd.get_dummies(
    df,
    columns=["Age", "Income", "Gender", "Marital Status"],
    dtype=int,
)
# Encode the target as 0/1. Any label other than "No"/"Yes" maps to NaN, and
# astype(int) then fails loudly instead of passing a bad label through.
df["Buys"] = df["Buys"].map({"No": 0, "Yes": 1}).astype(int)
df

Unnamed: 0,Buys,Age_21-35,Age_<21,Age_>35,Income_High,Income_Low,Income_Medium,Gender_Female,Gender_Male,Marital Status_Married,Marital Status_Single
0,0,0,1,0,1,0,0,0,1,0,1
1,0,0,1,0,1,0,0,0,1,1,0
2,1,1,0,0,1,0,0,0,1,0,1
3,1,0,0,1,0,0,1,0,1,0,1
4,1,0,0,1,0,1,0,1,0,0,1
5,0,0,0,1,0,1,0,1,0,1,0
6,1,1,0,0,0,1,0,1,0,1,0
7,0,0,1,0,0,0,1,0,1,0,1
8,1,0,1,0,0,1,0,1,0,1,0
9,1,0,0,1,0,0,1,1,0,0,1


In [5]:
# Split the encoded frame into labels (first column, Buys) and features
# (every remaining column).
data = df.values
y = data[:, 0]   # labels are expected to be 0 .. n_classes - 1
x = data[:, 1:]
print(y)

# Fit a tree of depth at most 3 on the whole table.
clf = DecisionTreeClassifier(max_depth=3)
clf.fit(x, y)

# Query '<21', 'Low', 'Female', 'Married', encoded in feature-column order.
# (The original notebook notes [[0,1,0,1,0,0,0,1,0,1]] as a "not buy" example.)
y_pred = clf.predict([[0, 1, 0, 0, 1, 0, 1, 0, 1, 0]])
print(y_pred)
print("Will not Buy" if y_pred[0] == 0 else "Will Buy")


[0 0 1 1 1 0 1 0 1 1 1 1 1 0]
[1]
Will Buy
