In [1]:
import pandas as pd
import random

from sklearn.model_selection import train_test_split
from decision_tree import DecisionTree

# Disclaimer

### This is just an example notebook for the self implemented simple decision and regression tree algorithm. It shows most of the implemented functionality of the algorithms.

# Decision Tree

In [2]:
def create_dataset():

    # Set the random seed for reproducibility
    random.seed(42)

    # Define the number of rows
    num_rows = 15000

    # Create a dictionary to store the data for each column
    data = {
        'chest_pain': [random.choice(['Yes', 'Yes', 'No']) for _ in range(num_rows)],
        'good_blood_circulation': [random.choice(['Yes', 'No', 'No']) for _ in range(num_rows)],
        'blocked_arteries': [random.choice(['Yes', 'No', 'No']) for _ in range(num_rows)],
        'overweight': [random.choice(['Yes', 'Yes', 'No']) for _ in range(num_rows)],
        'high_blood_pressure': [random.choice(['Yes', 'Yes', 'Yes', 'No']) for _ in range(num_rows)],
        'heart_disease': [random.choice(['Yes', 'No']) for _ in range(num_rows)]
    }

    # Create a pandas DataFrame from the dictionary
    df = pd.DataFrame(data)

    return df

In [3]:
df = create_dataset()
df.head()

Unnamed: 0,chest_pain,good_blood_circulation,blocked_arteries,overweight,high_blood_pressure,heart_disease
0,No,Yes,No,Yes,No,Yes
1,Yes,No,Yes,Yes,No,No
2,Yes,Yes,No,No,No,No
3,No,No,Yes,No,No,Yes
4,Yes,No,No,Yes,Yes,No


In [4]:
X_train, X_test, y_train, y_test = train_test_split(df.drop('heart_disease', axis=1), df['heart_disease'], test_size=0.2, random_state=42)

In [5]:
decision_tree = DecisionTree(max_depth = 4, num_min_samples = 20)
decision_tree.fit(X_train, y_train)

In [6]:
decision_tree.grow_tree()

In [7]:
decision_tree.print_tree()

Node Type: root
   | Best Feature: blocked_arteries
   | GINI impurity of the node: 0.5
   | Class distribution in the node: Counter({'Yes': 6050, 'No': 5950})
   | Predicted class: Yes
   | Number of samples in the node: 12000
   | Leaf node: No

|-------- Node Type: left at depth 1
           | Best Feature: high_blood_pressure
           | GINI impurity of the node: 0.5
           | Class distribution in the node: Counter({'Yes': 2040, 'No': 1947})
           | Predicted class: Yes
           | Number of samples in the node: 3987
           | Leaf node: No

|---------------- Node Type: left at depth 2
                   | Best Feature: good_blood_circulation
                   | GINI impurity of the node: 0.5
                   | Class distribution in the node: Counter({'Yes': 1519, 'No': 1463})
                   | Predicted class: Yes
                   | Number of samples in the node: 2982
                   | Leaf node: No

|------------------------ Node Type: left at depth 3
  

In [8]:
predictions = decision_tree.predict(df.drop('heart_disease', axis=1))

# compare the predictions to the actual values
accuracy = (predictions == df['heart_disease']).mean()
print(f"The accuracy of the model is {accuracy * 100:.2f}%")

The accuracy of the model is 51.34%


# Regression Tree

In [9]:
df = pd.read_csv("../datasets/drug200.csv")

# get unique values for each column
for col in df.columns:
    print(col, df[col].unique())
    print()

Age [23 47 28 61 22 49 41 60 43 34 74 50 16 69 32 57 63 48 33 31 39 45 18 65
 53 46 15 73 58 66 37 68 67 62 24 26 40 38 29 17 54 70 36 19 64 59 51 42
 56 20 72 35 52 55 30 21 25]

Sex ['F' 'M']

BP ['HIGH' 'LOW' 'NORMAL']

Cholesterol ['HIGH' 'NORMAL']

Na_to_K [25.355 13.093 10.114  7.798 18.043  8.607 16.275 11.037 15.171 19.368
 11.767 19.199 15.376 20.942 12.703 15.516 11.455 13.972  7.298 25.974
 19.128 25.917 30.568 15.036 33.486 18.809 30.366  9.381 22.697 17.951
  8.75   9.567 11.014 31.876 14.133  7.285  9.445 13.938  9.709  9.084
 19.221 14.239 15.79  12.26  12.295  8.107 13.091 10.291 31.686 19.796
 19.416 10.898 27.183 18.457 10.189 14.16  11.34  27.826 10.091 18.703
 29.875  9.475 20.693  8.37  13.303 27.05  12.856 10.832 24.658 24.276
 13.967 19.675 10.605 22.905 17.069 20.909 11.198 19.161 13.313 10.84
 13.934  7.761  9.712 11.326 10.067 13.935 13.597 15.478 23.091 17.211
 16.594 15.156 29.45  29.271 15.015 11.424 38.247 25.395 35.639 16.725
 11.871 12.854 13.127  8.966 

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df.drop('Drug', axis=1), df['Drug'], test_size=0.2, random_state=42)