In [None]:
# Importing all necessary libraries.

import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt

import os
import math
import scipy
import pprint

# import scikit-learn as sk

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Fetching the dataset (csv file).
#
# The CSV location can be overridden with the BREAST_CANCER_CSV environment
# variable, so the notebook also runs on machines that do not share this
# folder layout. When the variable is unset, the original path is used.

DATA_PATH = os.environ.get(
    'BREAST_CANCER_CSV',
    'E:\\Govind_Work_Folder\\Career247_(Data_Science_Course)\\03. Data Science Course (Career247)\\Module 4 (Machine Learning)\\DecisionTreesImplementaion\\DecisionTreesFoundations\\breast_cancer_data.csv',
)

df = pd.read_csv(DATA_PATH)
df.head()

##### Step 1: EDA (Exploratory Data Analysis)

In [None]:
# 1. Check basic information: shape, column labels, dtypes and summary statistics.

print("Shape:", df.shape)
print("*" * 80)

print("Columns:", df.columns)
print("*" * 80)

# DataFrame.info() writes its report to stdout itself and returns None;
# wrapping it in print() would emit a stray "None" line after the report.
df.info()
print("*" * 80)

print(df.describe())
print("*" * 80)

In [None]:
# 2. Count the missing values in every column.

null_counts = df.isna().sum()
print("Null Value Check:", null_counts)

In [None]:
# 3. Pairwise correlation between the numeric columns.

print("Correlation Matrix:")
numeric_corr = df.corr(numeric_only=True)
print(numeric_corr)

In [None]:
# 4. Visualize the numeric correlations as an annotated heatmap.

heatmap_options = {"annot": True, "fmt": ".2f", "cmap": "coolwarm"}
sns.heatmap(df.corr(numeric_only=True), **heatmap_options)
plt.title("Correlation Matrix Heatmap")
plt.show()

In [None]:
# Sample data for the line plot: five (x, y) points.

line_points = {
    "x-axis": [1, 2, 3, 4, 5],
    "y-axis": [10, 15, 13, 17, 20],
}
df10 = pd.DataFrame(line_points)

# Draw the points as a single line and render the figure.

fig = px.line(
    data_frame=df10,
    x="x-axis",
    y="y-axis",
    title="Simple Line Plot (Using Plotly Express)",
)
fig.show()

In [None]:
# Sample data for the scatter plot: six points, alternating between categories A and B.

scatter_points = {
    "x": [1, 2, 3, 4, 5, 6],
    "y": [10, 14, 12, 18, 22, 19],
    "category": ["A", "B", "A", "B", "A", "B"],
}
df20 = pd.DataFrame(scatter_points)

# Marker colour follows the category; marker size follows the y value.

fig = px.scatter(
    data_frame=df20,
    x="x",
    y="y",
    color="category",
    size="y",
    title="Scatter Plot with Categories (Using Plotly Express)",
)
fig.show()

In [None]:
# Deleting columns: 'id' and 'Unnamed: 32' are removed before further analysis.
#
# The result is reassigned rather than dropped in place, and errors='ignore'
# skips columns that are already gone, so re-running this cell does not raise
# a KeyError.

print("Dropping the redundant\n")
df = df.drop(columns=['id', 'Unnamed: 32'], errors='ignore')
df.shape

In [None]:
# List the distinct labels present in the target column.
print("Unique value of Diagnosis column in the output label: \n")
diagnosis_labels = df['diagnosis'].unique()
print(diagnosis_labels)

In [None]:
# Target variable (output label / y-label): class distribution.
# Pie chart showing the proportion of M versus B.

px.pie(
    data_frame=df,
    names='diagnosis',
    color='diagnosis',
    color_discrete_sequence=['#007500', '#5CFF5C'],
    title="Data Distribution",
)

# Inferences:
# - The dataset is imbalanced (B : M = 63:37); there are more benign cases
#   than malignant ones.
# - For imbalanced datasets, accuracy can be a misleading metric.
#   For example, if 90% of the cases were benign, a model that always
#   predicted "benign" would still reach 90% accuracy.
# - In such cases, "balanced accuracy" is the better metric.

In [None]:
# Visually compare the distribution of each feature for malignant versus
# benign tumours: for a given feature, do its values tend to differ between
# the two diagnoses?

# Only the first five feature columns (target excluded) are plotted here.
box_columns = df.drop(columns="diagnosis").columns[:5]
box_colors = ['#007500', '#5CFF5C']

for column in box_columns:
    # One vertical box plot per feature, split and coloured by diagnosis.
    fig = px.box(
        data_frame=df,
        x='diagnosis',
        y=column,
        color='diagnosis',
        color_discrete_sequence=box_colors,
        orientation='v',
    )
    fig.show()

In [None]:
# Scatter plots for the next five feature columns (positions 5-9 once the
# target is excluded), coloured by diagnosis.
scatter_columns = df.drop(columns="diagnosis").columns[5:10]
scatter_colors = ['#007500', '#5CFF5C']

for column in scatter_columns:
    # Only x is supplied to px.scatter; no y column is specified.
    fig = px.scatter(
        data_frame=df,
        x=column,
        color='diagnosis',
        color_discrete_sequence=scatter_colors,
        orientation='v',
    )
    fig.show()

##### Step 2: Creating correlation with the target variable.

In [None]:
# diagnosis : M or B : categorical
# encoded   : 1 or 0 : numerical (M = 1, B = 0)

# Convert the categorical label into a number, but only while the column still
# holds the original labels. Without this guard, re-running the cell would
# compare the already-encoded integers with 'M' and overwrite every label with 0.
if not pd.api.types.is_numeric_dtype(df['diagnosis']):
    df['diagnosis'] = (df['diagnosis'] == 'M').astype(int)

# Take the correlation.
# numeric_only=True matches the earlier correlation cells and keeps this call
# working if a non-numeric column is still present in the frame.
corr = df.corr(numeric_only=True)
plt.figure(figsize=(20, 20))

# Heatmap of the full correlation matrix, including the encoded target.
sns.heatmap(corr, cmap='viridis_r', annot=True)

plt.show()
# Correlation values range from -1 to 1.

In [None]:
# The correlations can also be inspected as a plain table, without a chart.
# "pearson" is the default method, spelled out here for clarity.
df.corr(method="pearson")

##### Step 3: Feature Selection (Feature Engineering)

In [None]:
# Choose which features are good enough predictors to train the model on.

# Filtering step: take the absolute correlation of every column with the
# target, so that strong negative and strong positive relationships both count.
cor_target = corr['diagnosis'].abs()

# 0.25 is user defined. It is the hyper-parameter value.
relevant_features = cor_target.loc[cor_target > 0.25]

# Collect the names of the retained columns.
names = list(relevant_features.index)

# Drop the target variable from the results.
names.remove("diagnosis")

pprint.pprint(names)