In [None]:
import numpy as np

# Node type used to assemble the decision tree
class DecisionTreeNode:
    """A single node of a binary decision tree.

    An internal node carries a split rule (``feature_idx`` / ``threshold``)
    and two children; a leaf carries only ``value``. ``predict`` treats any
    node whose ``value`` is not ``None`` as a leaf.
    """

    def __init__(self, feature_idx=None, threshold=None, left=None, right=None, value=None):
        # Split rule: column index to test and the cut-off it is compared to.
        self.feature_idx, self.threshold = feature_idx, threshold
        # Child nodes: left child first, then right child.
        self.left, self.right = left, right
        # Class label stored on a leaf (None on internal nodes).
        self.value = value

# Partition samples and labels around a threshold on one feature
def split_data(X, y, feature_idx, threshold):
    """Split ``X`` and ``y`` by comparing one feature column to a threshold.

    Rows whose value in column ``feature_idx`` is ``<= threshold`` form the
    left part; all remaining rows form the right part.

    Returns:
        Tuple ``(X_left, y_left, X_right, y_right)``.
    """
    goes_left = X[:, feature_idx] <= threshold
    goes_right = ~goes_left
    return X[goes_left], y[goes_left], X[goes_right], y[goes_right]

# Shannon entropy (base 2) of a label array
def entropy(y):
    """Return the base-2 entropy of the class distribution in ``y``.

    Class frequencies are obtained with ``np.unique(..., return_counts=True)``
    and turned into probabilities by dividing by ``len(y)``.
    """
    _, class_counts = np.unique(y, return_counts=True)
    p = class_counts / len(y)
    return -np.sum(p * np.log2(p))

# Entropy reduction obtained by splitting on (feature_idx, threshold)
def information_gain(X, y, feature_idx, threshold):
    """Return the information gain of one candidate split.

    The gain is the entropy of ``y`` minus the size-weighted entropies of
    the two label subsets produced by ``split_data``.

    Raises:
        ZeroDivisionError: if ``y`` is empty (weights divide by ``len(y)``).
    """
    _, left_labels, _, right_labels = split_data(X, y, feature_idx, threshold)
    total = len(y)
    # Weights are computed before any entropy call, as in the original order.
    w_left = len(left_labels) / total
    w_right = len(right_labels) / total
    children_entropy = w_left * entropy(left_labels) + w_right * entropy(right_labels)
    return entropy(y) - children_entropy

# Recursive (serial) construction of the decision tree
def decision_tree_basic(X, y):
    """Build a decision tree for ``X`` / ``y`` and return its root node.

    Every unique value of every feature column is tried as a threshold; the
    split with the largest information gain is kept, provided the gain is
    strictly positive. Ties keep the first candidate encountered (features in
    column order, thresholds in ascending order). When no split helps, a leaf
    holding the majority class (``np.bincount(y).argmax()``) is returned.
    """
    # Leaf: every sample carries the same label.
    labels = np.unique(y)
    if len(labels) == 1:
        return DecisionTreeNode(value=labels[0])

    # Leaf: X has no columns to split on.
    n_features = X.shape[1]
    if n_features == 0:
        return DecisionTreeNode(value=np.bincount(y).argmax())

    # Exhaustive search over (feature, threshold) candidates.
    top_gain = 0
    chosen = None
    for col in range(n_features):
        for cut in np.unique(X[:, col]):
            gain = information_gain(X, y, col, cut)
            if gain > top_gain:
                top_gain, chosen = gain, (col, cut)

    # Leaf: no candidate produced a positive gain.
    if chosen is None:
        return DecisionTreeNode(value=np.bincount(y).argmax())

    col, cut = chosen
    X_left, y_left, X_right, y_right = split_data(X, y, col, cut)

    # Keyword arguments are evaluated in order, so the left subtree is still
    # built before the right one.
    return DecisionTreeNode(
        feature_idx=col,
        threshold=cut,
        left=decision_tree_basic(X_left, y_left),
        right=decision_tree_basic(X_right, y_right),
    )

# Classify one sample by walking the tree from the root to a leaf
def predict(x, tree):
    """Return the leaf value reached by sample ``x`` in ``tree``.

    At each internal node the sample goes left when
    ``x[node.feature_idx] <= node.threshold`` and right otherwise. A node is
    a leaf as soon as its ``value`` is not ``None``.
    """
    node = tree
    while node.value is None:
        node = node.left if x[node.feature_idx] <= node.threshold else node.right
    return node.value


In [None]:
import pandas as pd

# Load the diabetes dataset and preview its first rows.
# NOTE(review): the path is a hard-coded absolute location under /content —
# adjust it when running elsewhere.
csv_path = '/content/diabetes (1).csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [None]:
# Separate the target column from the remaining columns and convert both
# to NumPy arrays for the tree-building functions.
target_column = 'Outcome'
y = df[target_column].to_numpy()
X = df.drop(columns=target_column).to_numpy()

In [None]:
import numpy as np
import time

# Time the serial tree construction.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed intervals; time.time() is wall-clock time and can jump if the system
# clock is adjusted during the run.
start = time.perf_counter()

tree = decision_tree_basic(X, y)

end = time.perf_counter()
print(f'waktu yang diperlukan untuk menjalankan code secara serial adalah : {end-start} detik')


waktu yang diperlukan untuk menjalankan code secara serial adalah : 3.8157615661621094 detik


In [None]:
import numpy as np
from joblib import Parallel, delayed
import multiprocessing

# Class representing one node of the tree.
# NOTE: this repeats the DecisionTreeNode definition from the first cell.
class DecisionTreeNode:
    """A single node of a binary decision tree.

    Internal nodes hold a split rule and two children; leaves hold ``value``.
    """

    def __init__(self, feature_idx=None, threshold=None, left=None, right=None, value=None):
        # Column index of X used for the split, and the cut-off value.
        self.feature_idx, self.threshold = feature_idx, threshold
        # Left and right child nodes.
        self.left, self.right = left, right
        # Leaf value / final result (None on internal nodes).
        self.value = value

# Function that splits the data around a threshold.
# NOTE: this repeats the split_data definition from the first cell.
def split_data(X, y, feature_idx, threshold):
    """Split ``X`` and ``y`` on ``X[:, feature_idx] <= threshold``.

    Returns:
        Tuple ``(X_left, y_left, X_right, y_right)``; rows satisfying the
        comparison go left, all others go right.
    """
    left_mask = X[:, feature_idx] <= threshold
    left_part = (X[left_mask], y[left_mask])
    right_mask = ~left_mask
    right_part = (X[right_mask], y[right_mask])
    return (*left_part, *right_part)

# Function that computes the entropy (label impurity) of y.
# NOTE: this repeats the entropy definition from the first cell.
def entropy(y):
    """Return the base-2 entropy of the class distribution in ``y``."""
    class_counts = np.unique(y, return_counts=True)[1]
    probabilities = class_counts / len(y)
    return -np.sum(probabilities * np.log2(probabilities))

# Function that computes the information gain of a split.
# NOTE: this repeats the information_gain definition from the first cell.
def information_gain(X, y, feature_idx, threshold):
    """Return entropy(y) minus the size-weighted entropy of the two halves.

    The halves are the label subsets returned by ``split_data`` for the given
    feature and threshold.

    Raises:
        ZeroDivisionError: if ``y`` is empty (weights divide by ``len(y)``).
    """
    halves = split_data(X, y, feature_idx, threshold)
    y_left, y_right = halves[1], halves[3]
    n_samples = len(y)
    left_weight = len(y_left) / n_samples
    right_weight = len(y_right) / n_samples
    weighted_children = left_weight * entropy(y_left) + right_weight * entropy(y_right)
    return entropy(y) - weighted_children

# Function that builds the decision tree with parallel split evaluation
def decision_tree_paralel(X, y):
    """Build a decision tree, scoring candidate splits on a thread pool.

    The resulting tree follows the same rules as the serial builder: every
    unique value of every feature is tried as a threshold, the split with the
    largest strictly positive information gain wins (first candidate on ties),
    and a majority-class leaf is returned when no split helps.

    Changes from the previous version:
      * ``parallel_backend`` was referenced without being imported, which
        raises ``NameError`` on a fresh kernel.
      * ``Parallel()`` was created with no arguments outside the backend
        context, so the threading / ``n_jobs`` settings were not reliably
        applied to it. They are now passed to ``Parallel`` directly.
      * One worker pool is opened here and reused for every node instead of
        being set up again at each level of the recursion.

    Args:
        X: 2-D array of samples (rows) by features (columns).
        y: 1-D array of labels. ``np.bincount`` is used for the majority
            vote, so labels are expected to be non-negative integers.

    Returns:
        The root ``DecisionTreeNode`` of the fitted tree.
    """
    # Use one worker per CPU core reported by the OS.
    num_cores = multiprocessing.cpu_count()
    # Using Parallel as a context manager keeps the same pool alive for all
    # calls made inside the block.
    with Parallel(n_jobs=num_cores, backend='threading') as parallel:
        return _grow_tree_paralel(X, y, parallel)


def _grow_tree_paralel(X, y, parallel):
    """Recursively grow the tree, dispatching split scoring to ``parallel``."""
    # Base case: all samples share the same class.
    labels = np.unique(y)
    if len(labels) == 1:
        return DecisionTreeNode(value=labels[0])

    # Base case: X has no feature columns left to split on.
    n_features = X.shape[1]
    if n_features == 0:
        return DecisionTreeNode(value=np.bincount(y).argmax())

    def evaluate_split(feature_idx, threshold):
        current_gain = information_gain(X, y, feature_idx, threshold)
        return current_gain, feature_idx, threshold

    # Score every (feature, threshold) candidate; results come back in
    # submission order, so tie-breaking matches the serial scan.
    results = parallel(
        delayed(evaluate_split)(feature_idx, threshold)
        for feature_idx in range(n_features)
        for threshold in np.unique(X[:, feature_idx])
    )

    best_gain = 0
    best_feature_idx = None
    best_threshold = None
    for current_gain, feature_idx, threshold in results:
        if current_gain > best_gain:
            best_gain = current_gain
            best_feature_idx = feature_idx
            best_threshold = threshold

    # No candidate produced a positive gain: return a majority-class leaf.
    if best_feature_idx is None:
        return DecisionTreeNode(value=np.bincount(y).argmax())

    X_left, y_left, X_right, y_right = split_data(X, y, best_feature_idx, best_threshold)

    # Build the left and right subtrees recursively. The parallel call above
    # has already returned, so the pool is idle and safe to reuse.
    left_subtree = _grow_tree_paralel(X_left, y_left, parallel)
    right_subtree = _grow_tree_paralel(X_right, y_right, parallel)

    return DecisionTreeNode(
        feature_idx=best_feature_idx,
        threshold=best_threshold,
        left=left_subtree,
        right=right_subtree,
    )

def predict(x, tree):
    """Return the leaf value that sample ``x`` reaches in ``tree``.

    Internal nodes (``value is None``) route the sample left when
    ``x[feature_idx] <= threshold`` and right otherwise.
    NOTE: this repeats the predict definition from the first cell.
    """
    if tree.value is None:
        goes_left = x[tree.feature_idx] <= tree.threshold
        child = tree.left if goes_left else tree.right
        return predict(x, child)
    return tree.value


In [None]:
import numpy as np
import time


# Time the parallel tree construction on the same X and y as the serial run.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed intervals; time.time() is wall-clock time and can jump if the system
# clock is adjusted during the run.
start = time.perf_counter()
tree = decision_tree_paralel(X, y)
end = time.perf_counter()
print(f'waktu yang diperlukan untuk menjalankan code secara paralel adalah : {end-start} detik')


waktu yang diperlukan untuk menjalankan code secara paralel adalah : 5.413621425628662 detik


In [None]:
# Show how many CPU cores multiprocessing reports on this machine; the same
# call is what decision_tree_paralel uses to choose its number of workers.
num_cores = multiprocessing.cpu_count()
print(num_cores)

2
