In [1]:
import numpy as np

In [2]:
# Import StratifiedKFold
from sklearn.model_selection import StratifiedKFold
# Import confusion matrix, accuracy score
from sklearn.metrics import confusion_matrix, accuracy_score

def cross_validate(model, X, y, cv=10):
    """Evaluate ``model`` with stratified k-fold cross-validation.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)`` on numpy arrays.
        The same instance is re-fitted on every fold, so after the call it
        holds the fit from the last fold.
    X : pandas.DataFrame
        Feature table; rows are selected positionally with ``.iloc``.
    y : pandas.Series
        Target labels, positionally aligned with ``X``.
    cv : int, default 10
        Number of stratified folds.

    Returns
    -------
    dict
        Maps ``'Fold-1'`` .. ``'Fold-<cv>'`` to a dict with the keys
        ``'accuracy'`` (float) and ``'confusion_matrix'`` (2-D array).
    """
    # Split the data into `cv` shuffled folds with stratified sampling
    kf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=0)

    # Pin the label set once so that every fold's confusion matrix has the
    # same shape and the same row/column order, even if some class happens to
    # be absent from a fold's true and predicted labels.
    labels = np.unique(y)

    # Per-fold results, keyed by fold name
    scores = {}

    for fold_no, (train_index, test_index) in enumerate(kf.split(X, y), start=1):

        # Positional selection of this fold's train and test rows
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Fit on the training part and predict the held-out part
        model.fit(X_train.values, y_train.values)
        y_pred = model.predict(X_test.values)

        # Score the held-out predictions
        acc = accuracy_score(y_test, y_pred)
        cm = confusion_matrix(y_test, y_pred, labels=labels)

        # Store this fold's metrics in the result dictionary
        scores[f'Fold-{fold_no}'] = {'accuracy': acc, 'confusion_matrix': cm}

    return scores

In [3]:
# This class is built for datasets with Ordinal or Numerical features
class DecisionTreeClassification:
    """Binary-split decision tree classifier for ordinal or numerical features.

    Every internal node is a dict ``{'index', 'value', 'left', 'right'}``:
    rows whose feature ``index`` is strictly less than ``value`` go left, all
    others go right. A child is either another node dict or a class label
    (terminal).

    Parameters
    ----------
    max_depth : int, default 5
        Depth at which both children of a node are forced to be terminals.
    min_size : int, default 10
        A child group with at most this many rows becomes a terminal.
    metric : {'gini', 'entropy'}, default 'gini'
        Impurity measure minimised when choosing a split. It is validated
        lazily, the first time a split is searched.
    """

    def __init__(self, max_depth=5, min_size=10, metric='gini'):
        self.max_depth = max_depth
        self.min_size = min_size
        self.root = None  # No tree yet; set by fit()
        self.metric = metric

    # Split a dataset based on an attribute and an attribute value
    def test_split(self, index, value, dataset):
        """Partition ``dataset`` into (rows with row[index] < value, the rest)."""
        left, right = list(), list()
        for row in dataset:
            if row[index] < value:
                left.append(row)
            else:
                right.append(row)
        return left, right

    @staticmethod
    def _class_counts(group):
        """Count how often each label (last element of a row) occurs in ``group``."""
        counts = {}
        for row in group:
            label = row[-1]
            counts[label] = counts.get(label, 0) + 1
        return counts

    # Calculate the entropy for a split dataset
    def entropy(self, groups, classes):
        """Return the size-weighted entropy (base 2) of ``groups``.

        ``groups`` is an iterable of row lists and ``classes`` the labels to
        score. Empty groups contribute nothing.
        """
        # count all samples at split point
        n_instances = float(sum(len(group) for group in groups))
        # sum weighted entropy for each group
        entropy = 0.0
        for group in groups:
            size = float(len(group))
            # avoid divide by zero
            if size == 0:
                continue
            # count labels once per group instead of once per class
            counts = self._class_counts(group)
            score = 0.0
            for class_val in classes:
                p = counts.get(class_val, 0) / size
                # skip absent classes: log2(0) is undefined
                if p != 0:
                    score += p * np.log2(p)
            # weight the group score by its relative size
            entropy += (-1 * score) * (size / n_instances)
        return entropy

    # Calculate the Gini index for a split dataset
    def gini_index(self, groups, classes):
        """Return the size-weighted Gini impurity of ``groups``.

        ``groups`` is an iterable of row lists and ``classes`` the labels to
        score. Empty groups contribute nothing.
        """
        # count all samples at split point
        n_instances = float(sum(len(group) for group in groups))
        # sum weighted Gini index for each group
        gini = 0.0
        for group in groups:
            size = float(len(group))
            # avoid divide by zero
            if size == 0:
                continue
            # count labels once per group instead of once per class
            counts = self._class_counts(group)
            score = 0.0
            for class_val in classes:
                p = counts.get(class_val, 0) / size
                score += p * p
            # weight the group score by its relative size
            gini += (1.0 - score) * (size / n_instances)
        return gini

    def get_split(self, dataset):
        """Find the (feature, threshold) pair with the lowest impurity.

        ``dataset`` is a non-empty list of rows whose last element is the
        label. Returns ``{'index', 'value', 'groups'}`` where ``groups`` is the
        (left, right) partition produced by the winning split.

        Raises ``ValueError('Invalid Metric Error')`` for an unknown metric.
        """
        # Resolve the impurity function once instead of on every candidate
        if self.metric == 'gini':
            score_fn = self.gini_index
        elif self.metric == 'entropy':
            score_fn = self.entropy
        else:
            raise ValueError('Invalid Metric Error')

        class_values = list(set(row[-1] for row in dataset))

        b_index, b_value, b_score, b_groups = None, None, float('inf'), None

        for index in range(len(dataset[0]) - 1):
            # Each distinct value is scored once, in first-occurrence order.
            # Repeated values produce the same partition and score, and only a
            # strictly better score replaces the incumbent, so skipping the
            # repeats selects exactly the same split as scoring every row.
            for value in dict.fromkeys(row[index] for row in dataset):
                groups = self.test_split(index, value, dataset)
                score = score_fn(groups, class_values)
                if score < b_score:
                    b_index, b_value, b_score, b_groups = index, value, score, groups

        return {'index': b_index, 'value': b_value, 'groups': b_groups}

    # Create a terminal node value
    def to_terminal(self, group):
        """Return the most frequent label among the rows of ``group``."""
        counts = self._class_counts(group)
        # The majority class is the classification output
        return max(set(row[-1] for row in group), key=counts.get)

    # Create child splits for a node or make terminal
    def split(self, node, max_depth, min_size, depth):
        """Recursively grow the tree below ``node`` in place.

        ``node`` must carry the 'groups' entry produced by ``get_split``; it is
        removed and replaced by 'left' and 'right' children.
        """
        left, right = node['groups']
        del node['groups']
        # check for a no split
        if not left or not right:
            node['left'] = node['right'] = self.to_terminal(left + right)
            return
        # check for max depth
        if depth >= max_depth:
            node['left'], node['right'] = self.to_terminal(left), self.to_terminal(right)
            return
        # process left child
        if len(left) <= min_size:
            node['left'] = self.to_terminal(left)
        else:
            node['left'] = self.get_split(left)
            self.split(node['left'], max_depth, min_size, depth + 1)
        # process right child
        if len(right) <= min_size:
            node['right'] = self.to_terminal(right)
        else:
            node['right'] = self.get_split(right)
            self.split(node['right'], max_depth, min_size, depth + 1)

    def fit(self, X, y):
        """Build the tree from features ``X`` and labels ``y``.

        Parameters
        ----------
        X : numpy.ndarray of shape (n_samples, n_features)
        y : numpy.ndarray with one label per row of ``X``

        Returns
        -------
        self

        Raises
        ------
        AssertionError
            If ``X`` or ``y`` is not a numpy array.
        Exception
            If ``X`` and ``y`` disagree on the number of rows.
        ValueError
            If ``X`` is not 2-D, has no rows or no columns, or ``y`` does not
            hold exactly one label per row.
        """
        # X and y must be numpy arrays
        assert type(X) == np.ndarray, "X must be a numpy array"
        assert type(y) == np.ndarray, "y must be a numpy array"

        # Raise Error if invalid shape
        if X.shape[0] != y.shape[0]:
            raise Exception('Number of rows in X and y must be equal')
        if X.ndim != 2 or X.shape[1] == 0:
            raise ValueError('X must be a 2-D array with at least one feature column')
        if X.shape[0] == 0:
            raise ValueError('Cannot fit a tree on an empty dataset')

        labels = y.reshape(-1).tolist()
        if len(labels) != X.shape[0]:
            raise ValueError('y must contain exactly one label per row of X')

        # Append each label to its feature row. Building the rows in Python
        # (rather than concatenating the arrays) keeps features and labels in
        # their own types: string labels no longer turn the features into
        # strings, and integer labels are not cast to float when X is float.
        train = [features + [label] for features, label in zip(X.tolist(), labels)]

        root = self.get_split(train)

        self.split(root, self.max_depth, self.min_size, 1)

        self.root = root

        return self

    def predict_one(self, row, node=None):
        """Return the predicted label for a single feature ``row``.

        ``node`` is the subtree to start from; the root is used when omitted.
        Raises ``RuntimeError`` if the model has not been fitted.
        """
        # Root Node
        if node is None:
            node = self.root
            if node is None:
                raise RuntimeError('This model is not fitted yet; call fit() before predicting')

        # Walk down until a terminal (non-dict) child is reached
        while True:
            branch = 'left' if row[node['index']] < node['value'] else 'right'
            child = node[branch]
            if not isinstance(child, dict):
                return child
            node = child

    def predict(self, X):
        """Return a numpy array with the predicted label for each row of ``X``."""
        return np.array([self.predict_one(x) for x in X])

In [4]:
import pandas as pd

In [5]:
# Load the training set from a CSV file in the working directory and preview its first rows
data = pd.read_csv('./training_data.csv')
data.head()

Unnamed: 0,Buying_Cost,Maintainance_Cost,Number_of_doors,Number_of_Passenger,Luggage_Space,Safety_Features,How_is_the_deal
0,vhigh,med,2,4,small,low,Bad_deal
1,vhigh,med,5more,4,small,low,Bad_deal
2,med,vhigh,5more,4,small,low,Bad_deal
3,high,high,3,2,med,med,Bad_deal
4,vhigh,vhigh,5more,4,small,med,Bad_deal


In [6]:
# Hold out 20% of the rows as a test set, keeping the class proportions of the
# target (the last column) the same in both parts.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    data.iloc[:, :-1],            # every column except the last: features
    data.iloc[:, -1],             # last column: target
    test_size=0.2,
    random_state=0,
    stratify=data.iloc[:, -1],
)

# Encoding the categorical data

In [7]:
# Ordinal codes for the two cost columns (higher cost -> larger code)
dict1 = {'vhigh': 3, 'high': 2, 'med': 1, 'low': 0}

# Map and assign back instead of calling replace(..., inplace=True) on a
# column selection: the in-place call acts on a temporary object, which pandas
# warns about and which has no effect under Copy-on-Write.
# The int64 cast fails loudly if a category is missing from the mapping.
X_train = X_train.assign(
    Buying_Cost=X_train["Buying_Cost"].map(dict1).astype("int64"),
    Maintainance_Cost=X_train["Maintainance_Cost"].map(dict1).astype("int64"),
)
X_test = X_test.assign(
    Buying_Cost=X_test["Buying_Cost"].map(dict1).astype("int64"),
    Maintainance_Cost=X_test["Maintainance_Cost"].map(dict1).astype("int64"),
)

In [8]:
# Ordinal codes for the door count (the raw values are strings)
dict2 = {'2' : 0, '3' : 1, '4' : 2, '5more' : 3}

# Map and assign back; a chained replace(..., inplace=True) on a column
# selection is not guaranteed to modify the frame. The int64 cast fails loudly
# if a category is missing from the mapping.
X_train = X_train.assign(Number_of_doors=X_train["Number_of_doors"].map(dict2).astype("int64"))
X_test = X_test.assign(Number_of_doors=X_test["Number_of_doors"].map(dict2).astype("int64"))

In [9]:
# Ordinal codes for the passenger capacity (the raw values are strings)
dict3 = {'2' : 0, '4' : 1, 'more' : 2}

# Map and assign back; a chained replace(..., inplace=True) on a column
# selection is not guaranteed to modify the frame. The int64 cast fails loudly
# if a category is missing from the mapping.
X_train = X_train.assign(Number_of_Passenger=X_train["Number_of_Passenger"].map(dict3).astype("int64"))
X_test = X_test.assign(Number_of_Passenger=X_test["Number_of_Passenger"].map(dict3).astype("int64"))

In [10]:
# Ordinal codes for the luggage space
dict4 = {'small' : 0, 'med' : 1, 'big' : 2}

# Map and assign back; a chained replace(..., inplace=True) on a column
# selection is not guaranteed to modify the frame. The int64 cast fails loudly
# if a category is missing from the mapping.
X_train = X_train.assign(Luggage_Space=X_train["Luggage_Space"].map(dict4).astype("int64"))
X_test = X_test.assign(Luggage_Space=X_test["Luggage_Space"].map(dict4).astype("int64"))

In [11]:
# Ordinal codes for the safety rating
dict5 = {'low' : 0, 'med' : 1, 'high' : 2}

# Map and assign back; a chained replace(..., inplace=True) on a column
# selection is not guaranteed to modify the frame. The int64 cast fails loudly
# if a category is missing from the mapping.
X_train = X_train.assign(Safety_Features=X_train["Safety_Features"].map(dict5).astype("int64"))
X_test = X_test.assign(Safety_Features=X_test["Safety_Features"].map(dict5).astype("int64"))

In [12]:
# Preview the first rows of the training features
X_train.head()

Unnamed: 0,Buying_Cost,Maintainance_Cost,Number_of_doors,Number_of_Passenger,Luggage_Space,Safety_Features
453,0,3,1,2,1,0
689,3,0,1,0,0,0
522,3,1,2,1,0,2
124,0,3,2,2,0,1
1478,3,3,0,1,2,0


In [13]:
# Discard the shuffled row labels left by the split so the training features
# are indexed 0..n-1 (the old index is dropped, not kept as a column)
X_train = X_train.reset_index(drop=True)

In [14]:
# Preview again to check that the index now starts at 0
X_train.head()

Unnamed: 0,Buying_Cost,Maintainance_Cost,Number_of_doors,Number_of_Passenger,Luggage_Space,Safety_Features
0,0,3,1,2,1,0
1,3,0,1,0,0,0
2,3,1,2,1,0,2
3,0,3,2,2,0,1
4,3,3,0,1,2,0


In [15]:
# Display the training labels (still string-valued at this point)
y_train

453     Bad_deal
689     Bad_deal
522     Bad_deal
124     Bad_deal
1478    Bad_deal
          ...   
761     Bad_deal
1311    Bad_deal
588     Bad_deal
862     Bad_deal
1216    Bad_deal
Name: How_is_the_deal, Length: 1243, dtype: object

In [16]:
# Binary codes for the target: 1 = Nice_deal, 0 = Bad_deal
dict6 = {'Nice_deal' : 1, 'Bad_deal' : 0}

# map + explicit cast gives integer labels without relying on replace()
# silently downcasting an object column (deprecated in pandas). The cast
# fails loudly if a label is missing from the mapping.
y_train = y_train.map(dict6).astype("int64")
y_test = y_test.map(dict6).astype("int64")

In [17]:
# Preview the first encoded training labels
y_train.head()

453     0
689     0
522     0
124     0
1478    0
Name: How_is_the_deal, dtype: int64

In [18]:
# Instantiate the custom tree: depth limit 5, minimum node size 10, entropy as the split criterion
model = DecisionTreeClassification(max_depth=5, min_size=10, metric='entropy')

In [19]:
# 5-fold stratified cross-validation on the training split;
# `scores` maps each fold name to its accuracy and confusion matrix
scores = cross_validate(model, X_train, y_train, cv=5)

In [20]:
# Print each fold's accuracy and confusion matrix while accumulating the
# accuracies, then turn the running total into the mean over all folds.
avg_accuracy = 0

for fold_name, fold_result in scores.items():
    fold_accuracy = fold_result['accuracy']
    avg_accuracy += fold_accuracy

    print(fold_name)
    print('Accuracy: %.3f%%' % (fold_accuracy*100))
    print('Confusion Matrix: \n', fold_result['confusion_matrix'])
    print()

avg_accuracy /= len(scores)

Fold-1
Accuracy: 92.369%
Confusion Matrix: 
 [[215  15]
 [  4  15]]

Fold-2
Accuracy: 97.992%
Confusion Matrix: 
 [[227   3]
 [  2  17]]

Fold-3
Accuracy: 97.189%
Confusion Matrix: 
 [[227   2]
 [  5  15]]

Fold-4
Accuracy: 96.371%
Confusion Matrix: 
 [[229   0]
 [  9  10]]

Fold-5
Accuracy: 96.371%
Confusion Matrix: 
 [[229   0]
 [  9  10]]



In [21]:
# Report the mean accuracy over all folds as a percentage
print('Average Accuracy: %.3f%%' % (avg_accuracy*100))

Average Accuracy: 96.058%
