<a href="https://colab.research.google.com/github/gbarbosa99/Decision-Tree-From-Scratch/blob/main/Decision_Tree.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import sys
import math
import numpy as np
from collections import Counter

class Node:
    """One node of the decision tree.

    Attributes:
        X: attribute data for the instances that reached this node.
        Y: labels of those instances.
        i: index of the attribute this node splits on (None until chosen).
        C: mapping from attribute value to child Node.
        isleaf: True when the node is a leaf.
        p: label predicted at this node.
    """

    def __init__(self, X, Y, i=None, C=None, isleaf=False, p=None):
        # A fresh dict per node so children are never shared between
        # nodes created without an explicit C.
        if C is None:
            C = {}
        self.X, self.Y = X, Y
        self.i, self.C = i, C
        self.isleaf, self.p = isleaf, p

class Tree:
    """ID3-style decision tree for categorical data.

    Layout convention used by every method: ``X`` is a 2-D NumPy array with
    one row per attribute and one column per instance, and ``Y`` is a 1-D
    NumPy array holding one label per instance.
    """

    @staticmethod
    def entropy(Y):
        """Return the Shannon entropy (in bits) of the labels in ``Y``.

        An empty ``Y`` yields 0.
        """
        total_count = len(Y)
        e = 0
        for count in Counter(Y).values():
            probability = count / total_count
            e -= probability * math.log2(probability)
        return e

    @staticmethod
    def conditional_entropy(Y, X):
        """Return the entropy of ``Y`` conditioned on one attribute.

        ``X`` holds that attribute's value for each instance, aligned with
        ``Y``.  The result is the entropy of each value's label subset,
        weighted by the fraction of instances having that value.
        """
        total_count = len(X)
        ce = 0
        for x_value in np.unique(X):
            subset_Y = [Y[i] for i in range(total_count) if X[i] == x_value]
            weight = len(subset_Y) / total_count
            ce += weight * Tree.entropy(subset_Y)
        return ce

    @staticmethod
    def information_gain(Y, X):
        """Return entropy(Y) minus the entropy of ``Y`` conditioned on ``X``."""
        return Tree.entropy(Y) - Tree.conditional_entropy(Y, X)

    @staticmethod
    def best_attribute(X, Y):
        """Return the row index of ``X`` with the highest information gain.

        Ties go to the lowest index because only a strictly larger gain
        replaces the current best.  Returns -1 when ``X`` has no rows.
        """
        best_gain = -1
        best_attr = -1
        for i in range(X.shape[0]):
            gain = Tree.information_gain(Y, X[i])
            if gain > best_gain:
                best_gain, best_attr = gain, i
        return best_attr

    @staticmethod
    def _split_attribute(X, Y):
        """Pick the attribute ``build_tree`` should split on.

        Starts from ``best_attribute``.  If that attribute has a single
        value across all instances (possible when every gain ties at zero),
        splitting on it would reproduce the same node forever, so the
        highest-gain attribute with at least two distinct values is used
        instead (lowest index on ties).  When no such attribute exists the
        ``best_attribute`` result is returned unchanged.
        """
        best_attr = Tree.best_attribute(X, Y)
        if best_attr < 0 or len(np.unique(X[best_attr])) > 1:
            return best_attr
        candidates = [i for i in range(X.shape[0]) if len(np.unique(X[i])) > 1]
        if not candidates:
            return best_attr
        # max() keeps the first maximal element, i.e. the lowest index.
        return max(candidates, key=lambda i: Tree.information_gain(Y, X[i]))

    @staticmethod
    def split(X, Y, i):
        """Partition the instances by their value of attribute ``i``.

        Returns a dict mapping each distinct value of ``X[i]`` to a new
        ``Node`` holding the matching columns of ``X`` and entries of ``Y``.
        """
        C = {}
        for value in np.unique(X[i]):
            indices = np.flatnonzero(X[i] == value)
            C[value] = Node(X[:, indices], Y[indices])
        return C

    @staticmethod
    def stop1(Y):
        """Return True when every label in ``Y`` equals the first one."""
        return np.all(Y == Y[0])

    @staticmethod
    def stop2(X):
        """Return True when all instances share the same attribute values.

        A 1-D ``X`` is treated as satisfying the condition.
        """
        if X.ndim == 1:
            return True
        # Compare every column against the first column, attribute by attribute.
        return np.all(X == X[:, 0][:, np.newaxis], axis=1).all()

    @staticmethod
    def most_common(Y):
        """Return the most frequent label in ``Y``."""
        return Counter(Y).most_common(1)[0][0]

    @staticmethod
    def build_tree(t):
        """Recursively grow the subtree rooted at node ``t`` in place.

        ``t`` becomes a leaf when its labels are all equal (``stop1``) or
        its instances are indistinguishable (``stop2``).  Otherwise it is
        split on the attribute chosen by ``_split_attribute`` and every
        child is built the same way.  Internal nodes also store the majority
        label in ``t.p``.
        """
        if Tree.stop1(t.Y):
            t.isleaf = True
            t.p = t.Y[0]
            return

        if Tree.stop2(t.X):
            t.isleaf = True
            t.p = Tree.most_common(t.Y)
            return

        t.p = Tree.most_common(t.Y)
        # stop2 was False, so at least one attribute has two or more values
        # here; _split_attribute never returns a constant one in that case,
        # which guarantees every child is strictly smaller than ``t``.
        t.i = Tree._split_attribute(t.X, t.Y)
        t.C = Tree.split(t.X, t.Y, t.i)

        for child_node in t.C.values():
            Tree.build_tree(child_node)

    @staticmethod
    def train(X, Y):
        """Build a tree from ``X`` and ``Y`` and return its root node."""
        t = Node(X, Y)
        Tree.build_tree(t)
        return t

    @staticmethod
    def inference(t, x):
        """Predict the label of a single instance ``x``.

        ``x`` holds one value per attribute.  Returns a one-element array
        with the prediction.  If ``x`` carries an attribute value that has
        no child at some node, that node's own label ``p`` is returned.
        """
        if t.isleaf:
            return np.array([t.p])

        attribute_value = x[t.i]
        child_node = t.C.get(attribute_value)

        if child_node is None:
            return np.array([t.p])

        return Tree.inference(child_node, x)

    @staticmethod
    def predict(t, X):
        """Predict a label for every column (instance) of ``X``."""
        predictions = [Tree.inference(t, X[:, i])[0] for i in range(X.shape[1])]
        return np.array(predictions)

    @staticmethod
    def load_dataset(filename):
        """Load a whitespace-separated dataset with one header line.

        The first column is skipped, the last column becomes ``Y`` and the
        columns in between become the rows of ``X``.  Blank lines are
        ignored.

        Returns:
            ``(X, Y)`` as NumPy arrays of strings.
        """
        with open(filename, 'r') as file:
            rows = [line.split() for line in file]

        # A blank line splits to an empty list; indexing it would fail.
        rows = [row for row in rows if row]
        header = rows[0]
        records = rows[1:]

        Y = np.array([row[-1] for row in records])
        X = np.array([[row[i] for row in records] for i in range(1, len(header) - 1)])
        return X, Y

# Redirect output to a file
# From this point on, print() writes into output.txt instead of the
# console/notebook.
# NOTE(review): the previous sys.stdout is not saved before being replaced.
output_file = 'output.txt'
sys.stdout = open(output_file, 'w')

# Load the credit dataset
# X gets one row per attribute column of the file; Y gets the last column.
filename = 'credit.txt'
X, Y = Tree.load_dataset(filename)

# Ensure the dataset is loaded correctly
# (these lines go into output.txt because of the redirection above)
print("X shape:", X.shape)
print("Y shape:", Y.shape)
print("X:", X)
print("Y:", Y)

# Train the decision tree
# root is the top Node of the tree built from the full dataset.
root = Tree.train(X, Y)

# Visualize the decision tree
def print_tree(node, level=0):
    """Print the subtree rooted at ``node`` as an indented outline.

    Each nesting level adds one ``'|   '`` prefix.  A leaf prints its
    prediction; an internal node prints its attribute index, then each
    branch value followed by that branch's subtree two levels deeper.
    """
    indent = '|   ' * level
    if node.isleaf:
        print(f"{indent}Prediction: {node.p}")
        return

    print(f"{indent}Attribute {node.i}")
    branch_indent = '|   ' * (level + 1)
    for value in node.C:
        print(f"{branch_indent}Value: {value}")
        print_tree(node.C[value], level + 2)

# Dump the trained tree (written to the redirected stdout).
print_tree(root)

# Predict the credit risk of Tom and Ana
# Each array holds one value per attribute, indexed the same way as the
# rows of X, because inference looks values up by attribute index.
tom = np.array(['low', 'low', 'no', 'yes', 'male'])
ana = np.array(['low', 'medium', 'yes', 'yes', 'female'])

# Tree.inference returns a one-element array, hence the [0] below.
tom_prediction = Tree.inference(root, tom)
ana_prediction = Tree.inference(root, ana)

print(f"Tom's predicted credit risk: {tom_prediction[0]}")
print(f"Ana's predicted credit risk: {ana_prediction[0]}")

# Modify Sofia's credit risk
def modify_dataset(data):
    """Return a copy of ``data`` with Sofia's credit risk changed to 'high'.

    ``data`` is a list of text lines.  On every line containing 'Sofia',
    the last whitespace-separated field is changed from 'low' to 'high'.
    Only that trailing field is touched, so attribute columns that also
    read 'low' keep their value, and the original line ending is preserved.
    All other lines are passed through unchanged.
    """
    modified_data = []
    for line in data:
        if 'Sofia' in line:
            content = line.rstrip()
            fields = content.split()
            if fields and fields[-1] == 'low':
                # content ends with the label, so slice it off and append
                # the new label plus whatever trailing whitespace/newline
                # the original line had.
                line = content[:-len('low')] + 'high' + line[len(content):]
        modified_data.append(line)
    return modified_data

# Read the dataset and modify Sofia's credit risk
with open(filename, 'r') as file:
    data = file.readlines()
data = modify_dataset(data)

# Save the modified dataset to a new file
modified_filename = 'modified_credit.txt'
with open(modified_filename, 'w') as file:
    file.writelines(data)

# Load the modified dataset
X_modified, Y_modified = Tree.load_dataset(modified_filename)

# Ensure the modified dataset is loaded correctly
print("Modified X shape:", X_modified.shape)
print("Modified Y shape:", Y_modified.shape)
print("Modified X:", X_modified)
print("Modified Y:", Y_modified)

# Train the decision tree with the modified dataset
root_modified = Tree.train(X_modified, Y_modified)

# Visualize the modified decision tree
print_tree(root_modified)

# Close the output file and stop redirecting print() into it.
# Rebinding sys.stdout first means a later print() never hits a closed
# file (which would raise ValueError).  The stream that was active before
# the redirection was not saved, so the interpreter's original stdout is
# used as the replacement.
redirected_stdout = sys.stdout
sys.stdout = sys.__stdout__
redirected_stdout.close()


In [None]:
pip install fpdf

Collecting fpdf
  Downloading fpdf-1.7.2.tar.gz (39 kB)
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: fpdf
  Building wheel for fpdf (setup.py) ... done
  Created wheel for fpdf: filename=fpdf-1.7.2-py2.py3-none-any.whl size=40702 sha256=328038ff33496c8a6345a73ec0d04cfc75b457dc9cb0b1feb6ddcfb520d42999
  Stored in directory: /root/.cache/pip/wheels/f9/95/ba/f418094659025eb9611f17cbcaf2334236bf39a0c3453ea455
Successfully built fpdf
Installing collected packages: fpdf
Successfully installed fpdf-1.7.2


In [None]:
from fpdf import FPDF

class PDF(FPDF):
    """FPDF document with a fixed title header and a page-number footer."""

    def header(self):
        """Draw the centred, bold document title (FPDF page-header hook)."""
        self.set_font('Arial', style='B', size=12)
        self.cell(0, 10, 'Decision Tree Output', border=0, ln=1, align='C')

    def footer(self):
        """Draw the centred page number (FPDF page-footer hook)."""
        # Per the FPDF API, a negative y is measured from the bottom of the page.
        self.set_y(-15)
        self.set_font('Arial', style='I', size=8)
        page_label = f'Page {self.page_no()}'
        self.cell(0, 10, page_label, border=0, ln=0, align='C')

    def chapter_title(self, title):
        """Write ``title`` as a bold, left-aligned line followed by a gap."""
        self.set_font('Arial', style='B', size=12)
        self.cell(0, 10, title, border=0, ln=1, align='L')
        self.ln(10)

    def chapter_body(self, body):
        """Write ``body`` as wrapped regular text followed by a line break."""
        self.set_font('Arial', style='', size=12)
        self.multi_cell(0, 10, body)
        self.ln()

# Create a PDF document
# add_page() starts the first page so content can be written to it.
pdf = PDF()
pdf.add_page()

# Load the text file and add its contents to the PDF
# output.txt is the file the earlier cell redirected its print() output to.
with open('output.txt', 'r') as file:
    content = file.read()

# The whole file is written as a single wrapped text body.
pdf.chapter_body(content)

# Save the PDF to a file
pdf.output('output.pdf')


''