In [2]:
# Importing Libraries
import numpy as np
import pandas as pd

In [3]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [4]:
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report
)

In [5]:
# Load Dataset
data = load_breast_cancer()
X = data.data
y = data.target

In [6]:
print("Dataset Shape: ",X.shape)
print("Classes: ",np.unique(y))

Dataset Shape:  (569, 30)
Classes:  [0 1]


In [7]:
# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Logistic Regression

In [8]:
# Feature Scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [9]:
# Create and train Logistic Regression Model
model = LogisticRegression()
model.fit(X_train, y_train)

In [10]:
# Predictions
model_pred = model.predict(X_test)

In [11]:
model_prob = model.predict_proba(X_test)[:,1]

# Decision Tree

In [12]:
# Create and train Decision Tree Model
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)

In [13]:
# Prediction
dt_pred = dt.predict(X_test)

In [14]:
dt_prob = dt.predict_proba(X_test)[:,1]

# Random Forest

In [15]:
# Create and train Random Forest Model
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)

In [16]:
# Prediction
rf_pred = rf.predict(X_test)

In [17]:
rf_prob = rf.predict_proba(X_test)[:,1]

# Evaluation

In [18]:
def evaluate_model(name, y_test, y_pred, y_prob):
  print(f"\n{name} Evaluation")
  print("Accuracy: ",accuracy_score(y_test, y_pred))
  print("Precision: ",precision_score(y_test, y_pred))
  print("Recall: ",recall_score(y_test, y_pred))
  print("F1 Score: ",f1_score(y_test, y_pred))
  print("\nConfusion Matrix:\n",confusion_matrix(y_test, y_pred))
  print("\nClassification Report:\n",classification_report(y_test, y_pred))

In [19]:
evaluate_model("Logistic Regression: ",y_test, model_pred, model_prob)
evaluate_model("Decision Tree: ",y_test, dt_pred, dt_prob)
evaluate_model("Random Forest: ",y_test, rf_pred, rf_prob)


Logistic Regression:  Evaluation
Accuracy:  0.9824561403508771
Precision:  0.9861111111111112
Recall:  0.9861111111111112
F1 Score:  0.9861111111111112

Confusion Matrix:
 [[41  1]
 [ 1 71]]

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.98      0.98        42
           1       0.99      0.99      0.99        72

    accuracy                           0.98       114
   macro avg       0.98      0.98      0.98       114
weighted avg       0.98      0.98      0.98       114


Decision Tree:  Evaluation
Accuracy:  0.9122807017543859
Precision:  0.9558823529411765
Recall:  0.9027777777777778
F1 Score:  0.9285714285714286

Confusion Matrix:
 [[39  3]
 [ 7 65]]

Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.93      0.89        42
           1       0.96      0.90      0.93        72

    accuracy                           0.91       114
   macro avg       0.90    

# Comparison Table for the Three Models

In [1]:
def comparison_table(y_test, y_pred):
    return {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1 Score": f1_score(y_test, y_pred)
    }

In [20]:
results = {
    "Logistic Regression": comparison_table(y_test, model_pred),
    "Decision Tree": comparison_table(y_test, dt_pred),
    "Random Forest": comparison_table(y_test, rf_pred)
}

In [21]:
comparison_df = pd.DataFrame(results).T
comparison_df

Unnamed: 0,Accuracy,Precision,Recall,F1 Score
Logistic Regression,0.982456,0.986111,0.986111,0.986111
Decision Tree,0.912281,0.955882,0.902778,0.928571
Random Forest,0.95614,0.958904,0.972222,0.965517


# Comparison Between Models

Logistic Regression gave the best performance with highest accuracy, precision and recall, followed by Random Forest, and Decision Tree performed least effectively with lowest metrice score

- Logistic Regression has performed well as given dataset is linearly separable but in real-world data is present in non-linear format which might effects it performance.
- Decision Tree might overfit as tree depth increases.
- Random Forest is much better choice for real-world applications as it avoids overfitting and works perfectly well with non-linear data.