In [2]:
from numba import jit, njit, vectorize, float64, int64, boolean
import random
import warnings
from typing import Tuple, Iterable
from sting.data import FeatureType

import numpy as np

import sys
import os

# Resolve the project's ``src`` directory, which sits one level above the
# directory the notebook kernel was started in.  abspath collapses the '..'
# segment so the stored path is canonical and compares equal across re-runs.
notebook_dir = os.getcwd()
src_dir = os.path.abspath(os.path.join(notebook_dir, '..', 'src'))

# Register it on the import path only once: re-running this cell must not
# pile up duplicate entries in sys.path.
if src_dir not in sys.path:
    sys.path.append(src_dir)

# Now you can import the dtree module or any other modules in the src directory
import dtree
import optimized_dtree

import util
import optimized_util

from sting.data import Feature, FeatureType
from sting.data import parse_c45

In [3]:
# Define the path to your data
data_path = "../440data/toy"  # replace with your data path

path = os.path.expanduser(data_path).split(os.sep)
file_base = path[-1]  # -1 accesses the last entry of an iterable in Python

# Load your data
# This function might need to be replaced depending on how your data is formatted
schema, X, y = parse_c45(file_base, data_path)

print(schema)

%time decision_tree = dtree.dtree(data_path, 5) 

#dtree('../440data/toy', 5)  # Obviously this will cause an error because you haven't implemented it yet!


[Feature(name='Color', ftype=FeatureType.NOMINAL, values=[<Color.red: 1>, <Color.blue: 2>, <Color.green: 3>]), Feature(name='Area', ftype=FeatureType.CONTINUOUS), Feature(name='Shape', ftype=FeatureType.NOMINAL, values=[<Shape.circle: 1>, <Shape.triangle: 2>, <Shape.square: 3>])]
Accuracy: 1.00
Precision: 1.00
Recall: 1.00
First Feature: Area
CPU times: user 8 ms, sys: 1.18 ms, total: 9.18 ms
Wall time: 9.63 ms


In [4]:
# Baseline entropy check: time a single call to util.calculate_entropy on y
# (loaded by parse_c45 above), keep the result in label_entropy, and print it.
# %time reports CPU and wall time for one execution of the statement it prefixes.
%time label_entropy = util.calculate_entropy(y)
print(label_entropy)

CPU times: user 330 µs, sys: 95 µs, total: 425 µs
Wall time: 367 µs
0.9182958340544896


In [5]:
# Testing multiple entropy calculation
%time entropies = util.calculate_column_entropy(schema, X, y, decision_tree._determine_split_criterion(X, y, schema))
print(entropies)

CPU times: user 1.46 ms, sys: 904 µs, total: 2.36 ms
Wall time: 1.83 ms
[0.612197222702993, 0.4601899388973658, 0.612197222702993]


In [6]:
# Testing infogain calculation
%time infogains = util.infogain(schema, X, y, decision_tree._determine_split_criterion(X, y, schema))
print(infogains)

CPU times: user 2.62 ms, sys: 2.36 ms, total: 4.98 ms
Wall time: 3.59 ms
[0.30609861 0.4581059  0.30609861]


In [9]:
# Same entropy check against optimized_util.calculate_entropy, timed on the same y.
# Note: this rebinds label_entropy, replacing the value produced by the util version.
%time label_entropy = optimized_util.calculate_entropy(y)
print(label_entropy)

CPU times: user 17 µs, sys: 0 ns, total: 17 µs
Wall time: 21 µs
0.9182958340544896


In [12]:
# Testing optimized multiple entropy calculation
%time entropies = optimized_util.calculate_column_entropy(schema, X, y, decision_tree._determine_split_criterion(X, y, schema))
print(entropies)

CPU times: user 365 µs, sys: 201 µs, total: 566 µs
Wall time: 502 µs
[0.612197222702993, 0.4601899388973658, 0.612197222702993]


In [14]:
# Testing optimized infogain calculation
%time infogains = optimized_util.infogain(schema, X, y, decision_tree._determine_split_criterion(X, y, schema))
print(infogains)

CPU times: user 338 µs, sys: 126 µs, total: 464 µs
Wall time: 401 µs
[0.30609861 0.4581059  0.30609861]


In [18]:
# Define the path to your data
data_path = "../440data/toy"  # replace with your data path

path = os.path.expanduser(data_path).split(os.sep)
file_base = path[-1]  # -1 accesses the last entry of an iterable in Python

# Load your data
# This function might need to be replaced depending on how your data is formatted
schema, X, y = parse_c45(file_base, data_path)

print(schema)

%time optimized_decision_tree = optimized_dtree.dtree(data_path, 5) 

#dtree('../440data/toy', 5)  # Obviously this will cause an error because you haven't implemented it yet!


[Feature(name='Color', ftype=FeatureType.NOMINAL, values=[<Color.red: 1>, <Color.blue: 2>, <Color.green: 3>]), Feature(name='Area', ftype=FeatureType.CONTINUOUS), Feature(name='Shape', ftype=FeatureType.NOMINAL, values=[<Shape.circle: 1>, <Shape.triangle: 2>, <Shape.square: 3>])]
Accuracy: 1.00
Precision: 1.00
Recall: 1.00
First Feature: Area
CPU times: user 5.6 ms, sys: 362 µs, total: 5.96 ms
Wall time: 5.93 ms
