# Imports

In [1]:
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier



# Load and explore the data

In [2]:
# Load the breast cancer dataset once and derive every name from that single call
# (previously the loader was invoked three separate times).
bunch = load_breast_cancer()
X, y = bunch.data, bunch.target
dataset = (X, y)  # same (features, target) pair that return_X_y=True yields; kept for compatibility
feature_names = bunch.feature_names

In [3]:
# Display the feature names obtained from the dataset loader
feature_names

array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error',
       'fractal dimension error', 'worst radius', 'worst texture',
       'worst perimeter', 'worst area', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst concave points',
       'worst symmetry', 'worst fractal dimension'], dtype='<U23')

In [4]:
# Wrap the raw feature matrix in a DataFrame with one named column per feature
X = pd.DataFrame(data=X, columns=feature_names)

In [5]:
# Convert the target vector to a pandas Series
y = pd.Series(data=y)

In [6]:
# Class balance: number of samples per target label
y.value_counts()

Unnamed: 0,count
1,357
0,212


# Recap - working without cross validation

In [7]:
# Two-stage stratified split: hold out 20% of the rows, then cut that
# hold-out in half to get validation and test sets (80/10/10 overall).
X_train, X_val_test, y_train, y_val_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_val_test,
    y_val_test,
    test_size=0.5,
    stratify=y_val_test,
    random_state=42,
)

In [None]:
# Sanity check: shape of each of the three feature subsets
X_train.shape, X_val.shape, X_test.shape

((455, 30), (57, 30), (57, 30))

# Single Decision Tree

In [8]:
# Fit a single decision tree: Gini criterion ('entropy' is the alternative),
# depth capped at 5 to limit overfitting, fixed seed for repeatable results.
tree = DecisionTreeClassifier(criterion='gini', max_depth=5, random_state=432).fit(X_train, y_train)

# Score the fitted tree on the validation split
y_pred = tree.predict(X_val)

val_report = classification_report(y_val, y_pred)
print("\nClassification Report:\n", val_report)

val_confusion = confusion_matrix(y_val, y_pred)
print("\nConfusion Matrix:\n", val_confusion)


Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.96      0.92        23
           1       0.97      0.91      0.94        34

    accuracy                           0.93        57
   macro avg       0.92      0.93      0.93        57
weighted avg       0.93      0.93      0.93        57


Confusion Matrix:
 [[22  1]
 [ 3 31]]


# Random Forest

In [9]:
# Random forest with 5 trees: no depth limit (trees grow fully), a random
# 'sqrt'-sized subset of features considered per split, fixed seed.
rf = RandomForestClassifier(
    n_estimators=5,
    max_depth=None,
    max_features='sqrt',
    random_state=432,
).fit(X_train, y_train)

# Predict on the validation split and report the results
y_pred = rf.predict(X_val)

val_report = classification_report(y_val, y_pred)
print("\nClassification Report:\n", val_report)

val_confusion = confusion_matrix(y_val, y_pred)
print("\nConfusion Matrix:\n", val_confusion)


Classification Report:
               precision    recall  f1-score   support

           0       0.92      1.00      0.96        23
           1       1.00      0.94      0.97        34

    accuracy                           0.96        57
   macro avg       0.96      0.97      0.96        57
weighted avg       0.97      0.96      0.97        57


Confusion Matrix:
 [[23  0]
 [ 2 32]]


In [10]:
# Random forest with 10 trees: no depth limit (trees grow fully), a random
# 'sqrt'-sized subset of features considered per split, fixed seed.
rf = RandomForestClassifier(
    n_estimators=10,
    max_depth=None,
    max_features='sqrt',
    random_state=432,
).fit(X_train, y_train)

# Predict on the validation split and report the results
y_pred = rf.predict(X_val)

val_report = classification_report(y_val, y_pred)
print("\nClassification Report:\n", val_report)

val_confusion = confusion_matrix(y_val, y_pred)
print("\nConfusion Matrix:\n", val_confusion)


Classification Report:
               precision    recall  f1-score   support

           0       0.92      1.00      0.96        23
           1       1.00      0.94      0.97        34

    accuracy                           0.96        57
   macro avg       0.96      0.97      0.96        57
weighted avg       0.97      0.96      0.97        57


Confusion Matrix:
 [[23  0]
 [ 2 32]]


In [11]:
# Random forest with 100 trees: no depth limit (trees grow fully), a random
# 'sqrt'-sized subset of features considered per split, fixed seed.
# max_samples=None means each tree draws X.shape[0] samples.
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    max_features='sqrt',
    max_samples=None,
    random_state=432,
).fit(X_train, y_train)

# Predict on the validation split and report the results
y_pred = rf.predict(X_val)

val_report = classification_report(y_val, y_pred)
print("\nClassification Report:\n", val_report)

val_confusion = confusion_matrix(y_val, y_pred)
print("\nConfusion Matrix:\n", val_confusion)


Classification Report:
               precision    recall  f1-score   support

           0       0.92      1.00      0.96        23
           1       1.00      0.94      0.97        34

    accuracy                           0.96        57
   macro avg       0.96      0.97      0.96        57
weighted avg       0.97      0.96      0.97        57


Confusion Matrix:
 [[23  0]
 [ 2 32]]


# Review separate trees

In [None]:
# Inspect the individual fitted estimators held by the forest
rf.estimators_

# Feature importances

In [12]:
# Rank the features by the importance scores of the fitted forest and keep
# the 10 highest. Nothing is plotted here; this cell only builds the table.
importances = rf.feature_importances_
feat_importance = pd.Series(importances, index=feature_names).sort_values(ascending=False)
top_10 = feat_importance.head(10)  # sorted descending, so these are the 10 largest (no reversal)
top_10

Unnamed: 0,0
worst perimeter,0.139171
mean concave points,0.136892
worst area,0.12802
worst concave points,0.11446
worst radius,0.069989
mean area,0.057895
mean concavity,0.050244
mean radius,0.047577
worst concavity,0.036596
mean perimeter,0.028055


In [13]:
# Horizontal bar chart of the top-10 importances, drawn with Plotly Express
fig = px.bar(
    x=top_10.values,
    y=top_10.index,
    orientation='h',
    title='Top 10 Feature Importances (Random Forest)',
    labels={'x': 'Importance', 'y': 'Feature'},
    template='plotly_white',
)

# Reverse the y-axis so the first entry of top_10 sits at the top
# (like matplotlib's invert_yaxis) and set explicit plot margins.
fig.update_layout(
    margin={'l': 100, 'r': 20, 't': 40, 'b': 40},
    yaxis={'autorange': 'reversed'},
)

fig.show()