Created on Wed Nov  14 20:51:59 2022

@author: Ipsit Sahoo

ASU Id: 1224872415


In [303]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import category_encoders as ce
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import accuracy_score

In [304]:
# Read data set
file_path = "data/car.data"
dataset_columns = ["buying", "maint", "doors", "persons", "lug_boot", "safety", "sale_condition"]
df_data = pd.read_csv(file_path)

print("Data type: ", type(df_data), "Data Shape:", df_data.shape)

Data type:  <class 'pandas.core.frame.DataFrame'> Data Shape: (1727, 7)


In [305]:
# Head
columns = df_data.columns

res = dict()

for i in range(0, len(columns)):
    res[columns[i]] = dataset_columns[i]

df_data = df_data.rename(columns=res)

In [306]:
df_data.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,sale_condition
0,vhigh,vhigh,2,2,small,med,unacc
1,vhigh,vhigh,2,2,small,high,unacc
2,vhigh,vhigh,2,2,med,low,unacc
3,vhigh,vhigh,2,2,med,med,unacc
4,vhigh,vhigh,2,2,med,high,unacc


In [307]:
df_data.describe()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,sale_condition
count,1727,1727,1727,1727,1727,1727,1727
unique,4,4,4,3,3,3,4
top,high,high,3,4,med,med,unacc
freq,432,432,432,576,576,576,1209


# Label Data-points

In [308]:
le = preprocessing.LabelEncoder()

df_data_label_encoded = df_data.copy()

df_data_label_encoded.columns

Index(['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety',
       'sale_condition'],
      dtype='object')

In [309]:
# Label Buying

df_data_label_encoded['buying'] = le.fit_transform(df_data['buying'])
df_data_label_encoded['maint'] = le.fit_transform(df_data['maint'])
df_data_label_encoded['doors'] = le.fit_transform(df_data['doors'])
df_data_label_encoded['persons'] = le.fit_transform(df_data['persons'])
df_data_label_encoded['lug_boot'] = le.fit_transform(df_data['lug_boot'])
df_data_label_encoded['safety'] = le.fit_transform(df_data['safety'])
df_data_label_encoded['sale_condition'] = le.fit_transform(df_data['sale_condition'])

df_data_label_encoded.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,sale_condition
0,3,3,0,0,2,2,2
1,3,3,0,0,2,0,2
2,3,3,0,0,1,1,2
3,3,3,0,0,1,2,2
4,3,3,0,0,1,0,2


In [310]:
df_data_label_encoded.values

array([[3, 3, 0, ..., 2, 2, 2],
       [3, 3, 0, ..., 2, 0, 2],
       [3, 3, 0, ..., 1, 1, 2],
       ...,
       [1, 1, 3, ..., 0, 1, 2],
       [1, 1, 3, ..., 0, 2, 1],
       [1, 1, 3, ..., 0, 0, 3]])

In [311]:
def AF_algorithm(D_values):
    af_set = []
    C_values = D_values[:, -1]
    AF_values = D_values[:, :-1]
    
    print(AF_values.shape, C_values.shape)
    
    for i in range(0, AF_values.shape[1]):
        print("Unique values: ", np.unique(AF_values[:, i]))
        af_set.append(
            np.sum(
                np.abs(AF_values[:, i] - C_values), axis=0) / len(np.unique(AF_values[:, i])))
    
    return af_set

def calculate_normalized_AF(AF_values):
    AF_normalized = []
    
    for i in range(AF_values.shape[0]):
        AF_normalized.append(AF_values[i] / np.sum(AF_values, axis=0))
    
    return AF_normalized
    
AF_values = np.array(AF_algorithm(df_data_label_encoded.values))
print(AF_values.shape)
normalized_AF_values = calculate_normalized_AF(AF_values) 

print(normalized_AF_values)

(1727, 6) (1727,)
Unique values:  [0 1 2 3]
Unique values:  [0 1 2 3]
Unique values:  [0 1 2 3]
Unique values:  [0 1 2]
Unique values:  [0 1 2]
Unique values:  [0 1 2]
(6,)
[0.1445828331332533, 0.1450330132052821, 0.15006002400960383, 0.21608643457382953, 0.16886754701880752, 0.1753701480592237]


In [312]:
# Normalization of the AF relation



## Prepare training and testing set

In [313]:
X = df_data.drop(columns=["sale_condition"])
X.head()

X = X.values

In [314]:
Y = df_data["sale_condition"]
Y = Y.values.reshape(-1, 1)
Y.shape

(1727, 1)

In [315]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

print("X_train shape : ", X_train.shape)
print("Y_train shape : ", Y_train.shape)
print("X_test shape : ", X_test.shape)
print("Y_test shape : ", Y_test.shape)

X_train shape :  (1381, 6)
Y_train shape :  (1381, 1)
X_test shape :  (346, 6)
Y_test shape :  (346, 1)


## Node Class

In [316]:
class Node():
    def __init__(
        self,
        feature_index=None,
        threshold=None,
        left=None,
        right=None,
        info_gain=None,
        value=None,
        normalized_cr = None):
        '''Constructor for the DT Nodes'''
        
        self.feature_index = feature_index
        self.threshold = threshold
        self.left = left
        self.right = right
        self.info_gain = info_gain
        self.normalized_correlation = normalized_cr
        
        # for leaf node
        self.value = value

## Tree Class

In [317]:
class DecisionTreeClassifier():
    def __init__(
        self,
        min_samples_split=2,
        max_depth=2,
        normalized_correlation_values=None,
        information_gain_mode="entropy"):
        self.root = None
        
        #Base Case
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.normalized_correlation_values = normalized_correlation_values
        self.information_gain_mode = information_gain_mode
        
    def build_tree(self, dataset, curr_depth=0):
        X, Y = dataset[:,:-1], dataset[:,-1]
        num_samples, num_features = np.shape(X)
        
        # print("Number of samples:", num_samples, "Number of features:", num_features)
        
        if num_samples >=  self.min_samples_split and curr_depth <= self.max_depth:
            
            # Find the best split
            best_split = self.get_best_split(dataset, num_samples, num_features)
            
            if best_split["info_gain"] > 0:
                # Recurrence in the left SubTree
                left_subtree = self.build_tree(best_split["dataset_left"], curr_depth + 1)
                
                # Recurrence in right subtree
                right_subtree = self.build_tree(best_split["dataset_right"], curr_depth + 1)
                
                # return decision node
                return Node(best_split["feature_index"], best_split["threshold"], 
                            left_subtree, right_subtree, best_split["info_gain"])
            
        # compute leaf node
        leaf_value = self.calculate_leaf_value(Y)
        # return leaf node
        return Node(value=leaf_value)
    
    def get_best_split(self, dataset, num_samples, num_features):
        
        best_split = {}
        max_info_gain = -float("inf")
        # print("Dataset shape:", dataset.shape)
        for feature_index in range(num_features):
            # print(feature_index)
            feature_values = dataset[:, feature_index]
            possible_thresholds = np.unique(feature_values)
            
            # Loop over all feature values in the dat
            for threshold in possible_thresholds:
                dataset_left, dataset_right = self.split(dataset, feature_index, threshold)
                
                if len(dataset_left) > 0 and len(dataset_right) > 0:
                    Y, Y_left, Y_right = dataset[:, -1], dataset_left[:, -1], dataset_right[:, -1]
                    
                    # Compute Information Gain using X's and Y's
                    if self.information_gain_mode == "entropy_af":
                        curr_info_gain = self.information_gain_af(Y, Y_left, Y_right, feature_index)
                    elif self.information_gain_mode == "entropy_chi":
                        chi_square_left_node = self.information_gain_chi_squared(Y, Y_left)
                        chi_square_right_node = self.information_gain_chi_squared(Y, Y_right)
                        
                        curr_info_gain = chi_square_left_node if chi_square_left_node > chi_square_right_node else chi_square_right_node
                    else:
                        curr_info_gain = self.information_gain(Y, Y_left, Y_right)
                    
                    # print("Current Info Gain", curr_info_gain)
                    
                    # Update the best split if needed
                    if curr_info_gain > max_info_gain:
                        best_split["feature_index"] = feature_index
                        best_split["threshold"] = threshold
                        best_split["dataset_left"] = dataset_left
                        best_split["dataset_right"] = dataset_right
                        best_split["info_gain"] = curr_info_gain
                        max_info_gain = curr_info_gain
                
        return best_split
    
    def split(self, dataset, feature_index, threshold):
        ''' function to split the data '''
        # print("Feature Index:", feature_index, "Threshold:", threshold)
        
        dataset_left = np.array([row for row in dataset if row[feature_index]<=threshold])
        dataset_right = np.array([row for row in dataset if row[feature_index]>threshold])
        
        # print("Y:", Y_left.shape, Y_right.shape)
        return dataset_left, dataset_right
    
    def information_gain_af(self, parent, l_child, r_child, f_index):
        p_l = len(l_child) / len(parent)
        p_r = len(r_child) / len(parent)
        
        return self.entropy(parent) - p_l * self.entropy(l_child) * self.normalized_correlation_values[f_index] - p_r * self.entropy(r_child) * self.normalized_correlation_values[f_index]
    
    def information_gain(self, parent, l_child, r_child):
        p_l = len(l_child) / len(parent)
        p_r = len(r_child) / len(parent)

        return self.entropy(parent) - p_l * self.entropy(l_child) - p_r * self.entropy(r_child)
    
    def information_gain_chi_squared(self, parent, child):
        unique_parent, counts_parent = np.unique(parent, return_counts=True)
        total_parent = np.sum(counts_parent)
        
        expected = np.mean([c / total_parent for c in counts_parent])
        # print("Expected: ", expected)
        
        unique_child, counts_child = np.unique(child, return_counts=True)
        total_child = sum(counts_child)
        
        actual = np.mean([c / total_child for c in counts_child])
        
        # print("Actual: ", actual)
        
        return np.sqrt((expected - actual) ** 2 / expected)
        
        
        
    def entropy(self, Y):
        class_labels = np.unique(Y)
        entropy = 0
        
        for cls in class_labels:
            p_cls = len(Y[Y == cls]) / len(Y)
            entropy += -p_cls * np.log2(p_cls)
            
        return entropy
    
    def calculate_leaf_value(self, Y):
        ''' function to compute leaf node '''
        
        Y = list(Y)
        return max(Y, key=Y.count)
    
    def print_tree(self, tree=None, indent=" "):
        
        if not tree:
            tree = self.root
            
        if tree.value is not None:
            print(tree.value)
            
        else:
            print("X_"+str(tree.feature_index), "<=", tree.threshold, "?", tree.info_gain)
            print("%sleft:" % (indent), end="")
            self.print_tree(tree.left, indent + indent)
            print("%sright:" % (indent), end="")
            self.print_tree(tree.right, indent + indent)
            
    def predict(self, X):
        ''' function to predict new dataset '''
        predictions = []
        
        for x in X:
            # print(x)
            predictions.append(
                self.make_prediction(x, self.root))
            
        return predictions
    
    def fit(self, X, Y):
        ''' function to train the tree '''
        dataset = np.concatenate((X, Y), axis=1)
        
        self.root = self.build_tree(dataset)
    
    def make_prediction(self, x, tree):
        ''' function to predict a single data point '''
        
        if tree.value != None:
            return tree.value
        # print(x)
        feature_val = x[tree.feature_index]
        
        if feature_val<=tree.threshold:
            return self.make_prediction(x, tree.left)
        else:
            return self.make_prediction(x, tree.right)

## Running Decision Tree on Normal Information Gain with Chi-squared function

In [318]:
classifier_chi2 = DecisionTreeClassifier(
    min_samples_split=5,
    max_depth=5,
    information_gain_mode="entropy_chi")

classifier_chi2.fit(X_train,Y_train)
classifier_chi2.print_tree()

X_3 <= 2 ? 1.5
 left:unacc
 right:X_0 <= high ? 0.5
  left:X_1 <= med ? 0.7071067811865476
    left:acc
    right:unacc
  right:X_0 <= med ? 0.5
    left:X_1 <= med ? 0.5
        left:X_1 <= high ? 0.16666666666666663
                left:X_0 <= low ? 0.2886751345948129
                                left:acc
                                right:unacc
                right:X_4 <= med ? 0.16666666666666663
                                left:unacc
                                right:unacc
        right:unacc
    right:X_1 <= high ? 0.7071067811865476
        left:unacc
        right:X_1 <= med ? 0.7071067811865476
                left:acc
                right:unacc


In [319]:
Y_pred = classifier_chi2.predict(X_test)

In [320]:
accuracy_score(Y_test, Y_pred)

0.6820809248554913

## Running Decision Tree on Normal Information Gain with Entropy function

In [321]:
classifier = DecisionTreeClassifier(
    min_samples_split=5,
    max_depth=5,
    information_gain_mode="entropy")

classifier.fit(X_train,Y_train)
classifier.print_tree()

X_3 <= 2 ? 0.21444533317462178
 left:unacc
 right:X_5 <= high ? 0.2082948718612473
  left:X_0 <= med ? 0.16803030425623927
    left:X_0 <= high ? 0.2715517177987672
        left:X_1 <= med ? 0.6268051352479396
                left:X_2 <= 2 ? 0.09478254050054474
                                left:acc
                                right:acc
                right:unacc
        right:X_4 <= med ? 0.4064816845876311
                left:X_1 <= med ? 0.3967568485930254
                                left:vgood
                                right:acc
                right:X_2 <= 2 ? 0.3577824752084572
                                left:unacc
                                right:acc
    right:X_1 <= high ? 0.3201008967002281
        left:unacc
        right:X_1 <= med ? 0.7173155132713623
                left:X_2 <= 2 ? 0.09139023062145002
                                left:acc
                                right:acc
                right:unacc
  right:X_5 <= low ? 0.361694413567

In [322]:
Y_pred = classifier.predict(X_test)

In [323]:
accuracy_score(Y_test, Y_pred)

0.869942196531792

## Running Decision Tree on Normal Information Gain with Entropy function with Correlation Function

In [324]:
classifier_af = DecisionTreeClassifier(
    min_samples_split=5,
    max_depth=5,
    normalized_correlation_values=normalized_AF_values,
    information_gain_mode="entropy_af")

classifier_af.fit(X_train,Y_train)
classifier_af.print_tree()

X_1 <= med ? 1.0239412477617063
 left:X_0 <= high ? 1.140071605998169
  left:X_1 <= low ? 0.7933024553768427
    left:X_5 <= high ? 0.7994503252960464
        left:X_3 <= 2 ? 0.8829064196363225
                left:unacc
                right:X_2 <= 2 ? 0.24513109762833554
                                left:acc
                                right:acc
        right:X_5 <= low ? 0.6022220281225843
                left:unacc
                right:X_4 <= med ? 0.8380847578297481
                                left:acc
                                right:unacc
    right:X_5 <= high ? 0.7807444845960463
        left:X_3 <= 2 ? 0.8866832830233871
                left:unacc
                right:X_2 <= 2 ? 0.2571342430561564
                                left:acc
                                right:acc
        right:X_5 <= low ? 0.6076682816825143
                left:unacc
                right:X_4 <= med ? 0.851957724698707
                                left:acc
                

In [325]:
Y_pred = classifier_af.predict(X_test)
accuracy_score(Y_test, Y_pred)

0.7630057803468208