In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the dataset from 'diabetes.csv' (path relative to the working directory)
# into a DataFrame; the bare `df` on the last line makes the notebook display it.
df = pd.read_csv('diabetes.csv')
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [None]:
# Feature matrix: every column of the frame except the 'Outcome' label,
# converted to a NumPy array.
x = df.drop(columns='Outcome').to_numpy()
# Target vector: the 'Outcome' column as a NumPy array.
y = df['Outcome'].to_numpy()

In [None]:
# Hold-out split: 80% of the rows for training, the remaining 20% for testing.
#
# A seeded random permutation of the row indices is cut at the 80% mark, and
# each part is then used to index both the feature matrix and the label vector.

np.random.seed(42)  # fixed seed so the shuffle is reproducible
indices = np.random.permutation(len(x))  # shuffled row indices 0 .. len(x) - 1
split_point = int(0.8 * len(x))  # number of rows that go to the training set
train_idx = indices[:split_point]
test_idx = indices[split_point:]
x_train, x_test = x[train_idx], x[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

In [None]:
def split(x,y,feature_index, threshold):
  """Partition the rows of ``x`` (and the matching entries of ``y``) on one feature.

  Rows whose value in column ``feature_index`` is ``<= threshold`` go to the
  left partition; every other row goes to the right partition.

  Args:
    x: 2-D NumPy array of samples (rows) by features (columns).
    y: 1-D NumPy array of labels aligned with the rows of ``x``.
    feature_index: column of ``x`` to compare.
    threshold: value the column is compared against.

  Returns:
    Tuple ``(x_left, y_left, x_right, y_right)``.
  """
  left_mask = x[:,feature_index] <= threshold
  # The right side is the exact complement of the left side. Computing it as
  # `> threshold` would leave rows whose value is NaN in neither partition
  # (both comparisons are False for NaN), silently dropping them; the
  # complement sends them right, which is also where the `else` branch of
  # predict_one routes them.
  right_mask = ~left_mask
  return x[left_mask],y[left_mask],x[right_mask],y[right_mask]

In [None]:
def gini_impurity(y):
  """Return the Gini impurity of a label array: 1 minus the sum of squared class shares.

  The result is 0 when every label is identical and grows as the labels are
  spread more evenly across classes.
  """
  # Only the per-class counts matter; the class values themselves are unused.
  _, class_counts = np.unique(y, return_counts=True)
  class_shares = class_counts / class_counts.sum()
  return 1 - np.sum(class_shares * class_shares)

In [None]:
def best_split(x,y):
  """Exhaustively search for the split with the lowest weighted Gini impurity.

  Every distinct value of every column of ``x`` is tried as a threshold
  (rows ``<= threshold`` go left, the rest go right, via ``split``).

  Returns:
    ``(feature_index, threshold)`` of the best split found, or ``(None, None)``
    when no candidate leaves rows on both sides.
  """
  n_samples = len(y)
  chosen_feature, chosen_threshold = None, None
  lowest_gini = float('inf')

  for column in range(x.shape[1]):
    for candidate in np.unique(x[:, column]):
      _, labels_left, _, labels_right = split(x, y, column, candidate)
      # A candidate that leaves one side empty separates nothing; ignore it.
      if not (len(labels_left) and len(labels_right)):
        continue

      # Impurity of the two children, each weighted by its number of rows.
      weighted_gini = (
        (len(labels_left) * gini_impurity(labels_left))
        + (len(labels_right) * gini_impurity(labels_right))
      ) / n_samples

      # Strict '<' keeps the first candidate encountered when scores tie.
      if weighted_gini < lowest_gini:
        lowest_gini = weighted_gini
        chosen_feature, chosen_threshold = column, candidate
  return chosen_feature, chosen_threshold

In [None]:
# class node
class node:
  """A single node of the binary decision tree.

  Internal nodes are created with a split (``feature_index``, ``threshold``)
  and two children and leave ``value`` as None; leaves are created with only
  ``value`` set. Prediction treats any node whose ``value`` is not None as a leaf.
  """
  def __init__(self, feature_index = None, threshold = None, left = None, right = None, value = None):
    self.feature_index = feature_index  # column compared at this node
    self.threshold = threshold          # samples with feature <= threshold go left
    self.left = left                    # child for the "<= threshold" side
    self.right = right                  # child for the other side
    self.value = value                  # predicted label; None for internal nodes

In [None]:
# Tree fn
def build_tree(x,y,depth=0,max_depth = 5):
  """Recursively grow a binary decision tree using the best Gini split at each node.

  Args:
    x: 2-D NumPy array of samples (rows) by features (columns).
    y: 1-D NumPy array of non-negative integer labels aligned with ``x``
       (``np.bincount`` is used for the majority vote).
    depth: depth of the node being built; 0 for the root.
    max_depth: a node at this depth or deeper becomes a leaf. ``None``
       disables the limit.

  Returns:
    The root ``node`` of the (sub)tree, or ``None`` when ``y`` is empty.
  """
  def majority_leaf():
    # Leaf that predicts the most frequent label among the rows at this node.
    return node(value = np.bincount(y).argmax())

  # Nothing to learn from. This only happens for a direct call with no rows:
  # best_split never proposes a split that leaves one side empty.
  if len(y) == 0:
    return None

  # Pure node: every remaining row has the same label.
  if len(np.unique(y)) == 1:
    return node(value = y[0])

  # `>=` rather than `==` so the cap still applies when the caller starts
  # at a depth beyond max_depth instead of being skipped forever.
  if max_depth is not None and depth >= max_depth:
    return majority_leaf()

  feature_index, threshold = best_split(x,y)

  # No threshold separates the rows (e.g. identical feature rows with mixed labels).
  if feature_index is None:
    return majority_leaf()

  x_left,y_left, x_right, y_right = split(x,y,feature_index,threshold)

  left_node = build_tree(x_left,y_left, depth + 1, max_depth)
  right_node = build_tree(x_right,y_right, depth + 1, max_depth)

  return node(feature_index, threshold, left_node, right_node)

In [None]:
# prediction fn
def predict_one(node, input):
  """Walk one sample down the tree and return the value stored at the leaf it reaches.

  Args:
    node: root of the tree; any node whose ``value`` is not None is a leaf.
    input: one sample, indexable by feature position.
  """
  current = node
  while current.value is None:
    # Same rule as training: "<= threshold" descends left, anything else right.
    goes_left = input[current.feature_index] <= current.threshold
    current = current.left if goes_left else current.right
  return current.value


def predict_All(node,x):
  """Predict a label for every row of ``x`` and return the results as a NumPy array.

  Args:
    node: root of the tree, passed unchanged to ``predict_one`` for each row.
    x: iterable of samples (e.g. a 2-D array, one sample per row).
  """
  predictions = []
  for sample in x:
    predictions.append(predict_one(node, sample))
  return np.array(predictions)

In [None]:
# accuracy
def accuracy(y_true, y_pred):
    """Return the fraction of positions where ``y_pred`` equals ``y_true``.

    Args:
        y_true: array-like of reference labels.
        y_pred: array-like of predicted labels with the same shape as ``y_true``.

    Returns:
        The mean of the element-wise matches, between 0.0 and 1.0. For empty
        inputs this is NumPy's mean of an empty array (nan).

    Raises:
        ValueError: if the two inputs do not have the same shape.
    """
    # Coerce first: with plain Python lists `==` compares the whole lists and
    # yields a single bool, which would make the "accuracy" either 0.0 or 1.0.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    # Reject mismatched shapes instead of letting them broadcast silently
    # (e.g. (n,) against (n, 1) would compare every pair of elements).
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, got {y_true.shape} and {y_pred.shape}"
        )
    return np.mean(y_true == y_pred)

In [None]:
# ----------------------------
# Train and Test
# ----------------------------
# Build a tree on the training split (depth capped at 5), predict a label for
# every held-out test row, and report the fraction predicted correctly.
tree = build_tree(x_train, y_train, max_depth=5)
y_pred = predict_All(tree, x_test)
acc = accuracy(y_test, y_pred)

print("Accuracy on Test Set:", acc)

Accuracy on Test Set: 0.7532467532467533


In [None]:
# Input test: manually provide one row of feature values (the count must match
# the number of feature columns the tree was trained on). The array is 2-D
# because predict_All iterates over rows.
# NOTE(review): these example values equal the first row displayed when the
# dataset was loaded, so this row may be part of the training split — confirm
# before treating the result as a check on unseen data.
user_input = np.array([[6, 148, 72, 35, 0, 33.6, 0.627, 50]])  # Example

user_prediction = predict_All(tree, user_input)
print("Non-diabetic(0) | Diabetic(1)")  # legend for the label printed on the next line
print(user_prediction[0])

Non-diabetic(0) | Diabetic(1)
1
