In [5]:
import numpy as np

# Provided dataset
data = """Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Pass_Quality_Control
1.2,3.4,5.6,7.8,9.0,0
2.3,4.5,6.7,8.9,10.1,1
3.4,5.6,7.8,9.0,11.2,0
4.5,6.7,8.9,10.1,12.3,0
5.6,7.8,9.0,11.2,13.4,1
6.7,8.9,10.1,12.3,14.5,1
7.8,9.0,11.2,13.4,15.6,0
8.9,10.1,12.3,14.5,16.7,1
9.0,11.2,13.4,15.6,17.8,0
10.1,12.3,14.5,16.7,18.9,1"""

# Convert data to numpy array
data = np.genfromtxt(data.splitlines(), delimiter=',', skip_header=1)

# Split features (X) and labels (y)
X = data[:, :-1]
y = data[:, -1]

# Assuming X_train and y_train are your training data
# X_test is your test data

# Create a decision tree classifier
dt = DecisionTree(max_depth=5)

# Train the classifier
dt.fit(X, y)

# Print the tree structure
# dt.print_tree()


TypeError: Cannot cast array data from dtype('float64') to dtype('int64') according to the rule 'safe'

In [6]:
import numpy as np

class DecisionTree:
    def __init__(self, max_depth=None):
        self.max_depth = max_depth

    def _entropy(self, y):
        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / len(y)
        entropy = -np.sum(probabilities * np.log2(probabilities))
        return entropy

    def _split_dataset(self, X, y, feature_index, threshold):
        left_mask = X[:, feature_index] <= threshold
        right_mask = ~left_mask
        return X[left_mask], X[right_mask], y[left_mask], y[right_mask]

    def _find_best_split(self, X, y):
        best_gain = -np.inf
        best_feature_index = None
        best_threshold = None
        base_entropy = self._entropy(y)
        n_features = X.shape[1]

        for feature_index in range(n_features):
            thresholds = np.unique(X[:, feature_index])
            for threshold in thresholds:
                X_left, X_right, y_left, y_right = self._split_dataset(X, y, feature_index, threshold)

                if len(y_left) == 0 or len(y_right) == 0:
                    continue

                n_total = len(y_left) + len(y_right)
                left_weight = len(y_left) / n_total
                right_weight = len(y_right) / n_total
                gain = base_entropy - (left_weight * self._entropy(y_left) +
                                       right_weight * self._entropy(y_right))

                if gain > best_gain:
                    best_gain = gain
                    best_feature_index = feature_index
                    best_threshold = threshold

        return best_feature_index, best_threshold

    def _build_tree(self, X, y, depth):
        if depth == self.max_depth or len(np.unique(y)) == 1:
            return np.argmax(np.bincount(y))  # Return the most common class label

        best_feature_index, best_threshold = self._find_best_split(X, y)

        if best_feature_index is None:
            return np.argmax(np.bincount(y))  # Return the most common class label

        X_left, X_right, y_left, y_right = self._split_dataset(X, y, best_feature_index, best_threshold)

        subtree = {}
        subtree['feature_index'] = best_feature_index
        subtree['threshold'] = best_threshold
        subtree['left'] = self._build_tree(X_left, y_left, depth + 1)
        subtree['right'] = self._build_tree(X_right, y_right, depth + 1)

        return subtree

    def fit(self, X, y):
        self.tree_ = self._build_tree(X, y, depth=0)

    def _print_tree(self, tree, depth=0, indent="  "):
        if isinstance(tree, int):
            print(indent * depth + "Predict:", tree)
            return

        print(indent * depth + f"Feature {tree['feature_index']} <= {tree['threshold']}")
        print(indent * (depth + 1) + "--> True:")
        self._print_tree(tree['left'], depth + 1, indent)
        print(indent * (depth + 1) + "--> False:")
        self._print_tree(tree['right'], depth + 1, indent)

    def print_tree(self):
        print("Decision Tree:")
        self._print_tree(self.tree_)

    def _predict_sample(self, x, tree):
        if isinstance(tree, int):
            return tree

        feature_index = tree['feature_index']
        threshold = tree['threshold']

        if x[feature_index] <= threshold:
            return self._predict_sample(x, tree['left'])
        else:
            return self._predict_sample(x, tree['right'])

    def predict(self, X):
        predictions = []
        for x in X:
            predictions.append(self._predict_sample(x, self.tree_))
        return np.array(predictions)


In [7]:
import numpy as np

class DecisionTree:
    def __init__(self, max_depth=None):
        self.max_depth = max_depth

    def _entropy(self, y):
        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / len(y)
        entropy = -np.sum(probabilities * np.log2(probabilities))
        return entropy

    def _split_dataset(self, X, y, feature_index, threshold):
        left_mask = X[:, feature_index] <= threshold
        right_mask = ~left_mask
        return X[left_mask], X[right_mask], y[left_mask], y[right_mask]

    def _find_best_split(self, X, y):
        best_gain = -np.inf
        best_feature_index = None
        best_threshold = None
        base_entropy = self._entropy(y)
        n_features = X.shape[1]

        for feature_index in range(n_features):
            thresholds = np.unique(X[:, feature_index])
            for threshold in thresholds:
                X_left, X_right, y_left, y_right = self._split_dataset(X, y, feature_index, threshold)

                if len(y_left) == 0 or len(y_right) == 0:
                    continue

                n_total = len(y_left) + len(y_right)
                left_weight = len(y_left) / n_total
                right_weight = len(y_right) / n_total
                gain = base_entropy - (left_weight * self._entropy(y_left) +
                                       right_weight * self._entropy(y_right))

                if gain > best_gain:
                    best_gain = gain
                    best_feature_index = feature_index
                    best_threshold = threshold

        return best_feature_index, best_threshold

    def _build_tree(self, X, y, depth):
        if depth == self.max_depth or len(np.unique(y)) == 1:
            return int(np.argmax(np.bincount(y.astype(int))))  # Convert y to integers
            
        best_feature_index, best_threshold = self._find_best_split(X, y)

        if best_feature_index is None:
            return int(np.argmax(np.bincount(y.astype(int))))  # Convert y to integers

        X_left, X_right, y_left, y_right = self._split_dataset(X, y, best_feature_index, best_threshold)

        subtree = {}
        subtree['feature_index'] = best_feature_index
        subtree['threshold'] = best_threshold
        subtree['left'] = self._build_tree(X_left, y_left, depth + 1)
        subtree['right'] = self._build_tree(X_right, y_right, depth + 1)

        return subtree

    def fit(self, X, y):
        self.tree_ = self._build_tree(X, y, depth=0)

    def _print_tree(self, tree, depth=0, indent="  "):
        if isinstance(tree, int):
            print(indent * depth + "Predict:", tree)
            return

        print(indent * depth + f"Feature {tree['feature_index']} <= {tree['threshold']}")
        print(indent * (depth + 1) + "--> True:")
        self._print_tree(tree['left'], depth + 1, indent)
        print(indent * (depth + 1) + "--> False:")
        self._print_tree(tree['right'], depth + 1, indent)

    def print_tree(self):
        print("Decision Tree:")
        self._print_tree(self.tree_)

    def _predict_sample(self, x, tree):
        if isinstance(tree, int):
            return tree

        feature_index = tree['feature_index']
        threshold = tree['threshold']

        if x[feature_index] <= threshold:
            return self._predict_sample(x, tree['left'])
        else:
            return self._predict_sample(x, tree['right'])

    def predict(self, X):
        predictions = []
        for x in X:
            predictions.append(self._predict_sample(x, self.tree_))
        return np.array(predictions)

# Provided dataset
data = """Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Pass_Quality_Control
1.2,3.4,5.6,7.8,9.0,0
2.3,4.5,6.7,8.9,10.1,1
3.4,5.6,7.8,9.0,11.2,0
4.5,6.7,8.9,10.1,12.3,0
5.6,7.8,9.0,11.2,13.4,1
6.7,8.9,10.1,12.3,14.5,1
7.8,9.0,11.2,13.4,15.6,0
8.9,10.1,12.3,14.5,16.7,1
9.0,11.2,13.4,15.6,17.8,0
10.1,12.3,14.5,16.7,18.9,1"""

# Convert data to numpy array
data = np.genfromtxt(data.splitlines(), delimiter=',', skip_header=1)

# Split features (X) and labels (y)
X = data[:, :-1]
y = data[:, -1]

# Create a decision tree classifier
dt = DecisionTree(max_depth=5)

# Train the classifier
dt.fit(X, y)

# Print the tree structure
dt.print_tree()



Decision Tree:
Feature 0 <= 4.5
  --> True:
  Feature 0 <= 2.3
    --> True:
    Feature 0 <= 1.2
      --> True:
      Predict: 0
      --> False:
      Predict: 1
    --> False:
    Predict: 0
  --> False:
  Feature 0 <= 6.7
    --> True:
    Predict: 1
    --> False:
    Feature 0 <= 7.8
      --> True:
      Predict: 0
      --> False:
      Feature 0 <= 8.9
        --> True:
        Predict: 1
        --> False:
        Feature 0 <= 9.0
          --> True:
          Predict: 0
          --> False:
          Predict: 1


In [9]:
import numpy as np

class DecisionTree:
    def __init__(self, max_depth=None):
        self.max_depth = max_depth

    def _entropy(self, y):
        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / len(y)
        entropy = -np.sum(probabilities * np.log2(probabilities))
        return entropy

    def _split_dataset(self, X, y, feature_index, threshold):
        left_mask = X[:, feature_index] <= threshold
        right_mask = ~left_mask
        return X[left_mask], X[right_mask], y[left_mask], y[right_mask]

    def _find_best_split(self, X, y):
        best_gain = -np.inf
        best_feature_index = None
        best_threshold = None
        base_entropy = self._entropy(y)
        n_features = X.shape[1]

        for feature_index in range(n_features):
            thresholds = np.unique(X[:, feature_index])
            for threshold in thresholds:
                X_left, X_right, y_left, y_right = self._split_dataset(X, y, feature_index, threshold)

                if len(y_left) == 0 or len(y_right) == 0:
                    continue

                n_total = len(y_left) + len(y_right)
                left_weight = len(y_left) / n_total
                right_weight = len(y_right) / n_total
                gain = base_entropy - (left_weight * self._entropy(y_left) +
                                       right_weight * self._entropy(y_right))

                if gain > best_gain:
                    best_gain = gain
                    best_feature_index = feature_index
                    best_threshold = threshold

        return best_feature_index, best_threshold, best_gain, base_entropy

    def _build_tree(self, X, y, depth):
        if depth == self.max_depth or len(np.unique(y)) == 1:
            most_common_label = np.argmax(np.bincount(y.astype(int)))  # Convert y to integers
            print(f"Leaf Node: Predicted Class: {most_common_label}")
            return most_common_label
            
        best_feature_index, best_threshold, best_gain, base_entropy = self._find_best_split(X, y)
        
        print(f"Node at depth {depth}:")
        print(f"  - Base Entropy: {base_entropy:.4f}")
        print(f"  - Best Gain: {best_gain:.4f}")
        print(f"  - Best Feature Index: {best_feature_index}")
        print(f"  - Best Threshold: {best_threshold:.4f}")

        if best_feature_index is None:
            most_common_label = np.argmax(np.bincount(y.astype(int)))  # Convert y to integers
            print(f"  - Leaf Node: Predicted Class: {most_common_label}")
            return most_common_label

        X_left, X_right, y_left, y_right = self._split_dataset(X, y, best_feature_index, best_threshold)

        subtree = {}
        subtree['feature_index'] = best_feature_index
        subtree['threshold'] = best_threshold
        subtree['left'] = self._build_tree(X_left, y_left, depth + 1)
        subtree['right'] = self._build_tree(X_right, y_right, depth + 1)

        return subtree

    def fit(self, X, y):
        self.tree_ = self._build_tree(X, y, depth=0)

    def _print_tree(self, tree, depth=0, indent="  "):
        if isinstance(tree, int):
            print(indent * depth + "Predict:", tree)
            return

        print(indent * depth + f"Feature {tree['feature_index']} <= {tree['threshold']}")
        print(indent * (depth + 1) + "--> True:")
        self._print_tree(tree['left'], depth + 1, indent)
        print(indent * (depth + 1) + "--> False:")
        self._print_tree(tree['right'], depth + 1, indent)

    def print_tree(self):
        print("Decision Tree:")
        self._print_tree(self.tree_)

    def _predict_sample(self, x, tree):
        if isinstance(tree, int):
            return tree

        feature_index = tree['feature_index']
        threshold = tree['threshold']

        if x[feature_index] <= threshold:
            return self._predict_sample(x, tree['left'])
        else:
            return self._predict_sample(x, tree['right'])

    def predict(self, X):
        predictions = []
        for x in X:
            predictions.append(self._predict_sample(x, self.tree_))
        return np.array(predictions)

# Provided dataset
data = """Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Pass_Quality_Control
1.2,3.4,5.6,7.8,9.0,0
2.3,4.5,6.7,8.9,10.1,1
3.4,5.6,7.8,9.0,11.2,0
4.5,6.7,8.9,10.1,12.3,0
5.6,7.8,9.0,11.2,13.4,1
6.7,8.9,10.1,12.3,14.5,1
7.8,9.0,11.2,13.4,15.6,0
8.9,10.1,12.3,14.5,16.7,1
9.0,11.2,13.4,15.6,17.8,0
10.1,12.3,14.5,16.7,18.9,1"""

# Convert data to numpy array
data = np.genfromtxt(data.splitlines(), delimiter=',', skip_header=1)

# Split features (X) and labels (y)
X = data[:, :-1]
y = data[:, -1]

# Create a decision tree classifier
dt = DecisionTree(max_depth=5)

# Train the classifier
dt.fit(X, y)

# Print the tree structure
# dt.print_tree()



Node at depth 0:
  - Base Entropy: 1.0000
  - Best Gain: 0.1245
  - Best Feature Index: 0
  - Best Threshold: 4.5000
Node at depth 1:
  - Base Entropy: 0.8113
  - Best Gain: 0.3113
  - Best Feature Index: 0
  - Best Threshold: 2.3000
Node at depth 2:
  - Base Entropy: 1.0000
  - Best Gain: 1.0000
  - Best Feature Index: 0
  - Best Threshold: 1.2000
Leaf Node: Predicted Class: 0
Leaf Node: Predicted Class: 1
Leaf Node: Predicted Class: 0
Node at depth 1:
  - Base Entropy: 0.9183
  - Best Gain: 0.2516
  - Best Feature Index: 0
  - Best Threshold: 6.7000
Leaf Node: Predicted Class: 1
Node at depth 2:
  - Base Entropy: 1.0000
  - Best Gain: 0.3113
  - Best Feature Index: 0
  - Best Threshold: 7.8000
Leaf Node: Predicted Class: 0
Node at depth 3:
  - Base Entropy: 0.9183
  - Best Gain: 0.2516
  - Best Feature Index: 0
  - Best Threshold: 8.9000
Leaf Node: Predicted Class: 1
Node at depth 4:
  - Base Entropy: 1.0000
  - Best Gain: 1.0000
  - Best Feature Index: 0
  - Best Threshold: 9.0000
L

In [11]:
import numpy as np

class DecisionTree:
    def __init__(self, max_depth=None):
        self.max_depth = max_depth
        self.feature_names = None

    def _entropy(self, y):
        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / len(y)
        entropy = -np.sum(probabilities * np.log2(probabilities))
        return entropy

    def _split_dataset(self, X, y, feature_index, threshold):
        left_mask = X[:, feature_index] <= threshold
        right_mask = ~left_mask
        return X[left_mask], X[right_mask], y[left_mask], y[right_mask]

    def _find_best_split(self, X, y):
        best_gain = -np.inf
        best_feature_index = None
        best_threshold = None
        base_entropy = self._entropy(y)
        n_features = X.shape[1]

        for feature_index in range(n_features):
            thresholds = np.unique(X[:, feature_index])
            for threshold in thresholds:
                X_left, X_right, y_left, y_right = self._split_dataset(X, y, feature_index, threshold)

                if len(y_left) == 0 or len(y_right) == 0:
                    continue

                n_total = len(y_left) + len(y_right)
                left_weight = len(y_left) / n_total
                right_weight = len(y_right) / n_total
                gain = base_entropy - (left_weight * self._entropy(y_left) +
                                       right_weight * self._entropy(y_right))

                if gain > best_gain:
                    best_gain = gain
                    best_feature_index = feature_index
                    best_threshold = threshold

        if self.feature_names is not None:
            best_feature_name = self.feature_names[best_feature_index]
        else:
            best_feature_name = f"Feature_{best_feature_index}"
            
        return best_feature_index, best_threshold, best_gain, base_entropy, best_feature_name

    def _build_tree(self, X, y, depth):
        if depth == self.max_depth or len(np.unique(y)) == 1:
            most_common_label = np.argmax(np.bincount(y.astype(int)))  # Convert y to integers
            print(f"Leaf Node: Predicted Class: {most_common_label}")
            return most_common_label
            
        best_feature_index, best_threshold, best_gain, base_entropy, best_feature_name = self._find_best_split(X, y)
        
        print(f"Node at depth {depth}:")
        print(f"  - Base Entropy: {base_entropy:.4f}")
        print(f"  - Best Gain: {best_gain:.4f}")
        print(f"  - Best Feature: {best_feature_name} <= {best_threshold:.4f}")

        if best_feature_index is None:
            most_common_label = np.argmax(np.bincount(y.astype(int)))  # Convert y to integers
            print(f"  - Leaf Node: Predicted Class: {most_common_label}")
            return most_common_label

        X_left, X_right, y_left, y_right = self._split_dataset(X, y, best_feature_index, best_threshold)

        subtree = {}
        subtree['feature_index'] = best_feature_index
        subtree['threshold'] = best_threshold
        subtree['left'] = self._build_tree(X_left, y_left, depth + 1)
        subtree['right'] = self._build_tree(X_right, y_right, depth + 1)

        return subtree

    def fit(self, X, y, feature_names=None):
        self.feature_names = feature_names
        self.tree_ = self._build_tree(X, y, depth=0)

    def _print_tree(self, tree, depth=0, indent="  "):
        if isinstance(tree, int):
            print(indent * depth + "Predict:", tree)
            return

        print(indent * depth + f"{self.feature_names[tree['feature_index']]} <= {tree['threshold']}")
        print(indent * (depth + 1) + "--> True:")
        self._print_tree(tree['left'], depth + 1, indent)
        print(indent * (depth + 1) + "--> False:")
        self._print_tree(tree['right'], depth + 1, indent)

    def print_tree(self):
        print("Decision Tree:")
        self._print_tree(self.tree_)

    def _predict_sample(self, x, tree):
        if isinstance(tree, int):
            return tree

        feature_index = tree['feature_index']
        threshold = tree['threshold']

        if x[feature_index] <= threshold:
            return self._predict_sample(x, tree['left'])
        else:
            return self._predict_sample(x, tree['right'])

    def predict(self, X):
        predictions = []
        for x in X:
            predictions.append(self._predict_sample(x, self.tree_))
        return np.array(predictions)

# Provided dataset
data = """Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Pass_Quality_Control
1.2,3.4,5.6,7.8,9.0,0
2.3,4.5,6.7,8.9,10.1,1
3.4,5.6,7.8,9.0,11.2,0
4.5,6.7,8.9,10.1,12.3,0
5.6,7.8,9.0,11.2,13.4,1
6.7,8.9,10.1,12.3,14.5,1
7.8,9.0,11.2,13.4,15.6,0
8.9,10.1,12.3,14.5,16.7,1
9.0,11.2,13.4,15.6,17.8,0
10.1,12.3,14.5,16.7,18.9,1"""

# Convert data to numpy array
data = np.genfromtxt(data.splitlines(), delimiter=',', skip_header=1)

# Split features (X) and labels (y)
X = data[:, :-1]
y = data[:, -1]

# Feature names
feature_names = ["Feature_1", "Feature_2", "Feature_3", "Feature_4", "Feature_5"]

# Create a decision tree classifier
dt = DecisionTree(max_depth=5)

# Train the classifier
dt.fit(X, y, feature_names=feature_names)

# Print the tree structure
# dt.print_tree()


Node at depth 0:
  - Base Entropy: 1.0000
  - Best Gain: 0.1245
  - Best Feature: Feature_1 <= 4.5000
Node at depth 1:
  - Base Entropy: 0.8113
  - Best Gain: 0.3113
  - Best Feature: Feature_1 <= 2.3000
Node at depth 2:
  - Base Entropy: 1.0000
  - Best Gain: 1.0000
  - Best Feature: Feature_1 <= 1.2000
Leaf Node: Predicted Class: 0
Leaf Node: Predicted Class: 1
Leaf Node: Predicted Class: 0
Node at depth 1:
  - Base Entropy: 0.9183
  - Best Gain: 0.2516
  - Best Feature: Feature_1 <= 6.7000
Leaf Node: Predicted Class: 1
Node at depth 2:
  - Base Entropy: 1.0000
  - Best Gain: 0.3113
  - Best Feature: Feature_1 <= 7.8000
Leaf Node: Predicted Class: 0
Node at depth 3:
  - Base Entropy: 0.9183
  - Best Gain: 0.2516
  - Best Feature: Feature_1 <= 8.9000
Leaf Node: Predicted Class: 1
Node at depth 4:
  - Base Entropy: 1.0000
  - Best Gain: 1.0000
  - Best Feature: Feature_1 <= 9.0000
Leaf Node: Predicted Class: 0
Leaf Node: Predicted Class: 1
