##### Importing necessary Libraries.

In [None]:
# Importing all necessary libraries.

import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt

import os
import math
import scipy
import pprint

# import scikit-learn as sk

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the breast-cancer dataset (CSV file) into a DataFrame.

df = pd.read_csv(
    filepath_or_buffer='E:\\Govind_Work_Folder\\Career247_(Data_Science_Course)\\03. Data Science Course (Career247)\\Module 4 (Machine Learning)\\DecisionTreesImplementaion\\DecisionTreesFoundations\\breast_cancer_data.csv'
)

# Preview the first rows of the loaded table.
df.head()

##### Step 1: EDA (Exploratory Data Analysis)

In [None]:
# 1. Check basic information: dimensions, column labels, dtypes and summary statistics.

print("Shape:", df.shape)
print("*" * 80)

print("Columns:", df.columns)
print("*" * 80)

# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would add a stray "None" line after the report.
df.info()
print("*" * 80)

print(df.describe())
print("*" * 80)

In [None]:
# 2. Count the missing values in every column.

print(f"Null Value Check: {df.isna().sum()}")

In [None]:
# 3. Correlation matrix, restricted to the numeric columns.

print("Correlation Matrix:", df.corr(numeric_only=True), sep="\n")

In [None]:
# 4. Visualize the data: annotated heatmap of the numeric correlation matrix.

# Use a large canvas (same size as the later target-correlation heatmap) so
# the per-cell annotations stay legible with this many columns.
plt.figure(figsize=(20, 20))
sns.heatmap(df.corr(numeric_only=True), annot=True, fmt=".2f", cmap="coolwarm")
plt.title("Correlation Matrix Heatmap")
plt.show()

In [None]:
# Toy data for a Plotly Express line plot: one (x, y) pair per row.

df10 = pd.DataFrame(
    data=[(1, 10), (2, 15), (3, 13), (4, 17), (5, 20)],
    columns=["x-axis", "y-axis"],
)

# Draw the line plot and render it.

fig = px.line(
    data_frame=df10,
    x="x-axis",
    y="y-axis",
    title="Simple Line Plot (Using Plotly Express)",
)
fig.show()

In [None]:
# Toy data for a categorised scatter plot: one (x, y, category) record per row.

df20 = pd.DataFrame(
    data=[
        (1, 10, "A"),
        (2, 14, "B"),
        (3, 12, "A"),
        (4, 18, "B"),
        (5, 22, "A"),
        (6, 19, "B"),
    ],
    columns=["x", "y", "category"],
)

# Draw the scatter plot; marker colour follows the category, marker size follows y.

fig = px.scatter(
    data_frame=df20,
    x="x",
    y="y",
    color="category",
    size="y",
    title="Scatter Plot with Categories (Using Plotly Express)",
)
fig.show()

In [None]:
# Deleting columns: remove the 'id' and 'Unnamed: 32' columns from df in place.

print("Dropping the redundant\n")

# `columns=` already selects the axis, so no separate `axis` argument is needed.
# errors='ignore' keeps this cell re-runnable: on a second run the columns are
# already gone and a plain drop would raise KeyError.
df.drop(columns=['id', 'Unnamed: 32'], inplace=True, errors='ignore')
df.shape

In [None]:
# Show the distinct values present in the target column.
print(
    "Unique value of Diagnosis column in the output label: \n",
    df['diagnosis'].unique(),
    sep="\n",
)

In [None]:
# Target variable (output label / y) distribution.
# Pie chart: proportion of malignant (M) versus benign (B) rows.

px.pie(
    data_frame=df,
    names='diagnosis',
    color='diagnosis',
    color_discrete_sequence=['#007500', '#5CFF5C'],
    title="Data Distribution",
)

# Inferences:
# - The dataset is imbalanced (B : M ~ 63:37), i.e. there are more benign
#   tumours than malignant ones.
# - On imbalanced data, plain accuracy can be a misleading metric: if 90% of
#   the cases were benign, a model that always predicts "benign" would still
#   score 90%.
# - In such cases "balanced accuracy" is the better measure.

In [None]:
# Visually compare the distribution of each feature for malignant versus
# benign tumours: for a given feature, do its values tend to differ between
# the two diagnoses?

# Iterate over the first five feature columns (the target column is excluded).
for column in df.columns.drop("diagnosis")[:5]:
    fig = px.box(
        df,
        x='diagnosis',
        y=column,
        color='diagnosis',
        orientation='v',
        color_discrete_sequence=['#007500', '#5CFF5C'],
    )
    fig.show()

In [None]:
# Iterate over feature columns 5 to 9 (slice [5:10], target column excluded)
# and draw one scatter plot per feature, coloured by diagnosis.
for column in df.columns.drop("diagnosis")[5:10]:
    fig = px.scatter(
        df,
        x=column,
        color='diagnosis',
        orientation='v',
        color_discrete_sequence=['#007500', '#5CFF5C'],
    )
    fig.show()

##### Step 2: Computing the correlation with the target variable.

In [None]:
# diagnosis is categorical ('M' or 'B'); encode it as M = 1 and B = 0.

# Encode only while the column still holds the raw labels. Once it is numeric,
# comparing against 'M' again (e.g. when this cell is re-run) would silently
# turn every row into 0.
if not pd.api.types.is_numeric_dtype(df['diagnosis']):
    df['diagnosis'] = (df['diagnosis'] == 'M').astype(int)

# Pairwise correlation of all columns, including the encoded target.
corr = df.corr()
plt.figure(figsize = (20,20))

# Annotated heatmap of the correlation matrix.
sns.heatmap(corr , cmap = 'viridis_r' , annot = True)

plt.show()
# Correlation values range from -1 to 1.

In [None]:
# The correlation matrix can also be inspected as a plain table, without a chart.
df.corr(method='pearson')

##### Step 3: Feature Selection (Feature Engineering)

In [None]:
# Feature filtering: keep only the features whose absolute correlation with
# the target is strong enough to make them useful predictors.

# Absolute correlation of every column with the target.
cor_target = corr['diagnosis'].abs()

# 0.25 is a user-chosen cut-off (a hyper-parameter of this filtering step).
relevant_features = cor_target[cor_target > 0.25]

# Collect the names of the selected columns ...
names = relevant_features.index.tolist()

# ... and take the target itself out of the feature list.
names.remove("diagnosis")

pprint.pprint(names)

##### Step 4: Assign Training Data and Training Labels

In [None]:
# Feature matrix: the selected columns as a 2-D array.
X = df[names].to_numpy()

# Target vector: the 1-D array of diagnosis values reshaped into a single
# column, i.e. shape (n_rows, 1).
y = df['diagnosis'].to_numpy().reshape(-1, 1)

print("Input features are:", X.shape,"Output Label shape is: ", y.shape)

In [None]:
# we need to scale
# Standardize / Z-score normalization
# apply on X

import numpy as np

def scale(X):
    '''
    Standardize each column of X (z-score normalization).

    Every column is shifted by its mean and divided by its standard
    deviation, both computed along axis 0.

    Parameters : X (numpy.ndarray) - input data, one column per feature
    Returns : numpy.ndarray - the standardized data, same shape as X
    '''

    # Compute the per-column mean and standard deviation
    mean = np.mean(X, axis=0)
    std = np.std(X, axis=0)

    # A constant column has std == 0; dividing by it would give 0/0 = NaN.
    # Dividing by 1 instead leaves such a column at exactly 0 after centering.
    safe_std = np.where(std == 0, 1.0, std)

    # Standardize the data
    return (X - mean) / safe_std

# Replace the feature matrix with its standardized version.
X = scale(X)

##### Step 5: Model Implementation

In [None]:
# We will start with all the examples at the Root Nodes
# Then we will calculate the Information Gain for each feature / Gini Index for each feature
# then we will pick the feature with the highest Information Gain / Gini Index
# then we will split the data according to selected feature
# we will repeat this process until we reach the stopping criteria

# Node Class
class Node:
    '''
    A single node of a decision tree.

    The node is a plain container: it stores whatever is passed to the
    constructor and performs no validation.
    '''

    def __init__(self, feature=None, threshold=None, left=None, right=None, gain=None, value=None):
        '''
        Create a node; every argument is optional and defaults to None.

        Parameters:
        - feature: The index of the feature to split on.
        - threshold: The threshold value for the split.
        - left: The left child Node.
        - right: The right child Node.
        - gain: The gain recorded for the split at this node.
        - value: The class label if it's a leaf node.
        '''
        # The question asked at this node
        self.feature, self.threshold = feature, threshold
        # The child nodes
        self.left, self.right = left, right
        # Split quality and leaf prediction
        self.gain, self.value = gain, value

'''
Explanation : 
self.threshold = threshold
self.feature = feature 
The above two are used by Decision Nodes.
They store the question being asked at this node.
For example, "Is radius_mean <= 15.5?"

self.left = left and self.right = right
Used by decision nodes to point to the left and right child nodes.  
They are also called pointer nodes.
       
self.value = value
Used by leaf nodes to store the class label.
A leaf node is a final endpoint: it does not ask any question,
it holds the predicted class label (the prediction) for its branch.
self.value will be 0 (Benign) or 1 (Malignant) for leaf nodes.

self.gain = gain
Used by Decision Nodes to store the Information Gain or Gini Index of the split.
'''

##### Step 6: Building the Decision Trees

In [1]:
class DecisionTree:
    '''
    This is a decision tree classifier.

    Parameters:
    - min_samples: stored on the instance as `min_samples` (default 2).
    - max_depth: stored on the instance as `max_depth` (default 3).
    '''

    def __init__(self, min_samples=2, max_depth=3):
        # Hyper-parameters intended to control the growth of the tree and
        # prevent overfitting.
        self.min_samples = min_samples
        self.max_depth = max_depth

    def split_data(self, dataset, feature, threshold):
        '''
        Splits the given dataset based on the feature and threshold.

        Parameters:
        - dataset: The dataset to split (2-D array-like, one row per sample).
        - feature: Index of the feature (column) to split on.
        - threshold: The threshold value for the split.

        Returns:
        - left_dataset: rows whose feature value is less than or equal to the threshold
        - right_dataset: all remaining rows

        Both results are numpy arrays. When `dataset` is 2-D, an empty side
        keeps the column dimension (shape (0, n_columns)) so it can still be
        column-indexed by the caller.
        '''
        rows = np.asarray(dataset)

        # Nothing to split: hand back two independent empty arrays.
        if rows.size == 0:
            return rows.copy(), rows.copy()

        # One vectorised comparison instead of a Python-level loop over rows.
        goes_left = rows[:, feature] <= threshold

        return rows[goes_left], rows[~goes_left]

##### Step 7: Entropy calculation

In [3]:
# write function to calculate Entropy

def entropy(self, y):
    '''
    Computes the entropy (in bits) of the given labels.
    Entropy measures the impurity or disorder of the label set.

    NOTE(review): this function takes `self` but is defined at module level
    in this cell; presumably it is meant to be a `DecisionTree` method -
    confirm and attach/move it accordingly. `self` is not used here.

    Parameters:
    - y: array-like of labels; any shape, it is flattened before counting.

    Returns : float : Entropy value (0.0 for empty or single-class input)
    '''
    # Accept lists as well as arrays, and treat (n, 1) columns like flat vectors.
    labels = np.asarray(y).ravel()
    if labels.size == 0:
        return 0.0

    # Occurrences of every distinct label, then their relative frequencies.
    _, counts = np.unique(labels, return_counts=True)
    proportions = counts / labels.size

    # H = -sum(p * log2(p)); every p is > 0 here, so log2 is always defined.
    bits = -np.sum(proportions * np.log2(proportions))

    # Adding 0.0 turns the -0.0 produced by a pure node into a plain 0.0.
    return float(bits) + 0.0

##### Step 8: Gini Index / Information Gain

In [4]:
# write function to calculate Gini Index/Information Gain
   
def information_gain(self, parent, left, right):
    '''
    Computes the information gain from splitting the parent dataset into two datasets.

    The gain is the parent's entropy minus the size-weighted entropy of the
    two children, where entropies come from `self.entropy`.

    NOTE(review): this function takes `self` but is defined at module level
    in this cell; presumably it is meant to be a `DecisionTree` method -
    confirm and attach/move it accordingly.

    Parameters:
    parent(ndarray): Input parent dataset
    left: subset of parent dataset after the split on the feature
    right: subset of parent dataset after the split on the feature

    Returns:
    Information Gain on the split: float (0.0 when parent is empty)
    '''
    parent_size = len(parent)

    # An empty parent cannot be split; report no gain instead of dividing by zero.
    if parent_size == 0:
        return 0.0

    # compute the entropy of the parent dataset
    parent_entropy = self.entropy(parent)

    # share of the parent's samples that ends up in each child
    weight_left = len(left) / parent_size
    weight_right = len(right) / parent_size

    # size-weighted entropy of the two children
    weighted_entropy = (weight_left * self.entropy(left)) + (weight_right * self.entropy(right))

    return parent_entropy - weighted_entropy