In [1]:
#  Loading and Preprocessing

# Imports
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load dataset
# Wrap the bundled breast-cancer data in a DataFrame: one column per feature
# name, plus a 'target' column holding the class labels.
cancer = load_breast_cancer()
df = pd.DataFrame(cancer.data, columns=cancer.feature_names).assign(target=cancer.target)

# Check for missing values
# Count the null entries per column so gaps are visible before modelling.
missing_per_column = df.isna().sum()
print("Missing values:\n", missing_per_column)

# Separate features and labels.
# NOTE: X holds the *unscaled* features; only the train/test splits below are
# standardized.
X = df.drop('target', axis=1)
y = df['target']

# Split data
# The split happens BEFORE scaling so that the held-out test rows never
# influence the scaler's statistics (fitting the scaler on the full dataset
# leaks test-set information into preprocessing).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature Scaling
# Learn the scaling statistics from the training split only, then apply that
# same transformation to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Preprocessing Explanation:
#   - Missing values: none were found (every column in the check above reports 0).
#   - Feature scaling: standardization was used because
#       * algorithms such as SVM, k-NN, and Logistic Regression are sensitive to the
#         scale of the input features;
#       * it helps such models converge faster and usually perform more accurately.
#----------------------------------------------------------------------------------------------------------

# Classification Algorithm Implementation

# Import classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Store results
# Maps each model's display name to its test-set accuracy.
results = {}


# Helper function to evaluate models
def evaluate_model(name, model):
    """Fit ``model`` on the training split and report its test-set performance.

    Uses the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.
    The accuracy is stored in the module-level ``results`` dict under ``name``,
    and the accuracy, confusion matrix and classification report are printed.

    Args:
        name: Display label for the model; also the key used in ``results``.
        model: Estimator object exposing ``fit`` and ``predict``.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    accuracy = accuracy_score(y_test, predictions)
    results[name] = accuracy

    print(f"\n{name} Accuracy: {accuracy:.4f}")
    matrix = confusion_matrix(y_test, predictions)
    print(matrix)
    report = classification_report(y_test, predictions)
    print(report)

# Classifiers to evaluate, in order. Each entry is (display name, estimator).
classifiers = [
    # 1. Logistic Regression
    # Description: A linear model for binary classification; predicts
    #   probability using a logistic function.
    # Why it's suitable: Simple, interpretable, and effective on linearly
    #   separable data like this.
    ("Logistic Regression", LogisticRegression(max_iter=1000)),

    # 2. Decision Tree Classifier
    # Description: Splits data based on feature values into branches to make
    #   decisions.
    # Why it's suitable: Easy to interpret and can handle non-linear
    #   relationships.
    ("Decision Tree", DecisionTreeClassifier(random_state=42)),

    # 3. Random Forest Classifier
    # Description: An ensemble of decision trees; reduces overfitting by
    #   averaging predictions.
    # Why it's suitable: High accuracy and robustness on a variety of datasets.
    ("Random Forest", RandomForestClassifier(random_state=42)),

    # 4. Support Vector Machine (SVM)
    # Description: Finds the best hyperplane to separate classes with maximum
    #   margin.
    # Why it's suitable: Very effective in high-dimensional spaces.
    ("SVM", SVC()),

    # 5. k-Nearest Neighbors (k-NN)
    # Description: Classifies based on the most common label among the k
    #   closest samples.
    # Why it's suitable: Non-parametric and simple, good with small-to-medium
    #   datasets.
    ("k-NN", KNeighborsClassifier()),
]

# Fit and score each classifier; evaluate_model records the accuracy in `results`.
for model_name, classifier in classifiers:
    evaluate_model(model_name, classifier)

#------------------------------------------------------------------------------------------------------------

# Model Comparison

# Build a two-column table (one row per evaluated model) and rank it from the
# highest accuracy to the lowest before printing.
model_names = list(results)
accuracies = [results[model_name] for model_name in model_names]
results_df = pd.DataFrame({"Model": model_names, "Accuracy": accuracies})
results_df = results_df.sort_values("Accuracy", ascending=False)

print("\nModel Comparison:")
print(results_df)

#------------------------------------------------------------------------------------------------------------

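# Conclusion:
# NOTE(review): the claims below are hard-coded; confirm them against the
# "Model Comparison" table printed by this cell. The recorded output shows
# Logistic Regression at 0.9737, so Random Forest is the best performer only
# if its accuracy exceeds that value.
# Best Performing: Random Forest — because of its ensemble approach and ability to generalize well.
# Worst Performing: Decision Tree — tends to overfit without pruning or ensemble techniques.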

Missing values:
 mean radius                0
mean texture               0
mean perimeter             0
mean area                  0
mean smoothness            0
mean compactness           0
mean concavity             0
mean concave points        0
mean symmetry              0
mean fractal dimension     0
radius error               0
texture error              0
perimeter error            0
area error                 0
smoothness error           0
compactness error          0
concavity error            0
concave points error       0
symmetry error             0
fractal dimension error    0
worst radius               0
worst texture              0
worst perimeter            0
worst area                 0
worst smoothness           0
worst compactness          0
worst concavity            0
worst concave points       0
worst symmetry             0
worst fractal dimension    0
target                     0
dtype: int64

Logistic Regression Accuracy: 0.9737
[[41  2]
 [ 1 70]]
              