In [2]:
import pandas as pd
import numpy as np
from sklearn.tree import export_graphviz
import graphviz



In [3]:


# Load the already-encoded dataset and preview its first rows.
data = pd.read_csv('encoded_data.csv')
print(data.head())

# Y is the X18 column. X is every remaining column except X2, X16 and X17,
# which are dropped together with X18.
Y = data['X18']
X = data.drop(columns=['X2', 'X16', 'X17', 'X18'])

# Preview the resulting feature frame and label series.
print(X.head())
print(Y.head())


   X2      X16  X17  X18  X4_Male  X7_Everyday  X8_Everyday  X8_Often  \
0  20  Nothing  Yes   No     True         True        False      True   
1  69     nope  Yes  Yes     True         True         True     False   

   X9_No change  X9_Weight gain  ...  X12_Everyday  X12_Sometimes  \
0         False            True  ...          True          False   
1          True           False  ...         False           True   

   X13_Everyday  X14_Everyday  X14_Rarely  X15_No change  X3_Bhubaneswar  \
0          True         False        True           True            True   
1          True          True       False           True           False   

   X3_Liquid  X5_Bachelor's 3rd Year  X6_Computer Science  
0      False                    True                 True  
1       True                    True                 True  

[2 rows x 23 columns]
   X4_Male  X7_Everyday  X8_Everyday  X8_Often  X9_No change  X9_Weight gain  \
0     True         True        False      True         False

In [6]:

class TreeNode:
    """One node of a binary decision tree.

    A node either describes a split (``feature`` together with its ``left``
    and ``right`` children) or, for a leaf, stores the prediction in
    ``value``. All four fields default to ``None``.
    """

    def __init__(self, feature=None, left=None, right=None, value=None):
        # Prediction value; only meaningful for leaf nodes.
        self.value = value
        # Feature this node splits on.
        self.feature = feature
        # Child nodes hanging below this split.
        self.left, self.right = left, right

class DecisionTree:
    """Binary decision-tree classifier that splits on information gain.

    Every internal node tests a single column of ``X``: rows whose value in
    that column is truthy go to the left child, all other rows go to the
    right child. A leaf predicts the most frequent label among the training
    rows that reached it.

    Parameters
    ----------
    max_depth : int or None, default None
        Maximum number of splits between the root and any leaf. With
        ``None`` the tree grows until every node is pure or no feature
        offers a positive information gain.
    """

    # Gains at or below this threshold count as "no improvement"; this absorbs
    # floating-point noise so that useless splits are never taken.
    _MIN_GAIN = 1e-12

    def __init__(self, max_depth=None):
        self.max_depth = max_depth
        self.tree = None  # root node; populated by fit()

    def fit(self, X, y):
        """Grow the tree from feature frame ``X`` and label series ``y``.

        ``X`` is expected to be a pandas DataFrame and ``y`` a pandas Series
        sharing the same index (boolean masks built from ``X`` are applied
        to ``y``).

        Raises
        ------
        ValueError
            If ``y`` is empty, since no prediction can be derived.
        """
        if len(y) == 0:
            raise ValueError("Cannot fit a decision tree on an empty dataset")
        self.tree = self._grow_tree(X, y)

    def _grow_tree(self, X, y, depth=0):
        """Recursively build the subtree for the rows in ``X`` / ``y``."""
        leaf_value = self._majority_class(y)

        # Stopping criteria: depth budget used up, or the node is already pure.
        depth_exhausted = self.max_depth is not None and depth >= self.max_depth
        if depth_exhausted or y.nunique(dropna=False) <= 1:
            return TreeNode(value=leaf_value)

        entropy_parent = self._entropy(y)
        best_feature = None
        best_info_gain = self._MIN_GAIN
        for feature in X.columns:
            info_gain = self._information_gain(X[feature], y, entropy_parent)
            if info_gain > best_info_gain:
                best_info_gain = info_gain
                best_feature = feature

        # No feature separates the classes any further: stop here instead of
        # splitting on a zero-gain column, which would leave one child empty.
        if best_feature is None:
            return TreeNode(value=leaf_value)

        # Cast to bool so that ``~`` is a logical (not bitwise) negation even
        # when the column is stored as integers.
        goes_left = X[best_feature].astype(bool)
        if goes_left.all() or not goes_left.any():
            return TreeNode(value=leaf_value)
        goes_right = ~goes_left

        left = self._grow_tree(X[goes_left], y[goes_left], depth + 1)
        right = self._grow_tree(X[goes_right], y[goes_right], depth + 1)

        return TreeNode(feature=best_feature, left=left, right=right)

    @staticmethod
    def _majority_class(y):
        """Return the most frequent label in ``y`` (smallest label on ties)."""
        modes = y.mode()
        # mode() ignores missing values; fall back to the first label if
        # nothing else is available.
        return modes.iloc[0] if len(modes) else y.iloc[0]

    def _entropy(self, y):
        """Shannon entropy (in bits) of the label distribution of ``y``."""
        class_probabilities = y.value_counts(normalize=True).to_numpy(dtype=float)
        # Zero-probability classes (e.g. unused categories of a categorical
        # series) would give 0 * log2(0) = NaN, so drop them first.
        class_probabilities = class_probabilities[class_probabilities > 0]
        return -np.sum(class_probabilities * np.log2(class_probabilities))

    def _information_gain(self, feature, y, entropy_parent):
        """Entropy reduction obtained by partitioning ``y`` on ``feature``.

        ``entropy_parent`` is the entropy of ``y`` before the split; the
        weighted entropy of each distinct feature value's subset is
        subtracted from it.
        """
        total = len(y)
        if total == 0:
            return 0.0
        entropy_children = 0.0
        for value in feature.unique():
            subset_y = y[feature == value]
            entropy_children += len(subset_y) / total * self._entropy(subset_y)
        return entropy_parent - entropy_children

    def _fitted_root(self):
        """Return the root node, or raise if ``fit`` has not been called."""
        root = getattr(self, "tree", None)
        if root is None:
            # AttributeError matches the failure type of using the tree
            # before fitting in the original implementation.
            raise AttributeError("DecisionTree is not fitted yet; call fit(X, y) first")
        return root

    def predict(self, X):
        """Predict a label for every row of DataFrame ``X``.

        Returns a NumPy array with one prediction per row, in row order.

        Raises
        ------
        AttributeError
            If the tree has not been fitted.
        """
        root = self._fitted_root()
        return np.array([self._traverse_tree(x, root) for _, x in X.iterrows()])

    def _traverse_tree(self, x, node):
        """Walk from ``node`` down to a leaf for row ``x``; return its value."""
        while node.value is None:
            node = node.left if x[node.feature] else node.right
        return node.value

    def export_tree_graphviz(self, feature_names, class_names, file_path):
        """Write the fitted tree to ``file_path`` in Graphviz DOT format.

        ``feature_names`` and ``class_names`` are accepted for interface
        compatibility; node labels are taken from the tree itself.
        """
        dot_data = self._build_dot_data(feature_names, class_names)
        # DOT files are read as UTF-8 by default, so write them that way.
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(dot_data)

    def _build_dot_data(self, feature_names, class_names):
        """Return the DOT source describing the fitted tree."""
        root = self._fitted_root()
        dot_data = "digraph decision_tree {\n"
        dot_data += 'node [fontname="Arial"];\n'
        # Strings are immutable: the traversal returns the extended text.
        dot_data = self._traverse_tree_for_dot(root, dot_data)
        dot_data += "}\n"
        return dot_data

    @staticmethod
    def _dot_escape(text):
        """Escape backslashes and double quotes for a quoted DOT string."""
        return str(text).replace("\\", "\\\\").replace('"', '\\"')

    def _traverse_tree_for_dot(self, node, dot_data):
        """Append the DOT statements for ``node`` and its subtree.

        Returns ``dot_data`` extended with one node statement per tree node
        and one edge statement per parent/child link ("True" for the left
        branch, "False" for the right branch).
        """
        if node.value is not None:
            label = self._dot_escape(node.value)
            dot_data += f'"{id(node)}" [label="{label}", fillcolor="#e5813960", style="filled, rounded"];\n'
        else:
            label = self._dot_escape(node.feature)
            dot_data += f'"{id(node)}" [label="{label}", fillcolor="#81e33960", style="filled, rounded"];\n'
            if node.left is not None:
                dot_data += f'"{id(node)}" -> "{id(node.left)}" [label="True"];\n'
                dot_data = self._traverse_tree_for_dot(node.left, dot_data)
            if node.right is not None:
                dot_data += f'"{id(node)}" -> "{id(node.right)}" [label="False"];\n'
                dot_data = self._traverse_tree_for_dot(node.right, dot_data)
        return dot_data


In [8]:
"""
# Initialize and fit the decision tree
tree = DecisionTree(max_depth=3)
tree.fit(X, Y)

# Define feature names and class names
feature_names = X.columns.tolist()  
class_names = Y.unique().tolist()

# Export the tree to Graphviz format
tree.export_tree_graphviz(feature_names=feature_names, class_names=class_names, file_path="decision_tree.dot")

# Render the decision tree using Graphviz
graph = graphviz.Source.from_file("decision_tree.dot")
graph.render(filename="decision_tree", format="pdf", cleanup=True)
"""


'\n# Initialize and fit the decision tree\ntree = DecisionTree(max_depth=3)\ntree.fit(X, Y)\n\n# Define feature names and class names\nfeature_names = X.columns.tolist()  \nclass_names = Y.unique().tolist()\n\n# Export the tree to Graphviz format\ntree.export_tree_graphviz(feature_names=feature_names, class_names=class_names, file_path="decision_tree.dot")\n\n# Render the decision tree using Graphviz\ngraph = graphviz.Source.from_file("decision_tree.dot")\ngraph.render(filename="decision_tree", format="pdf", cleanup=True)\n'