In [2]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [3]:
X, y = load_breast_cancer(return_X_y = True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

scaler = StandardScaler()

# Only features is needed to be scaled
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [4]:
from sklearn.neighbors import KNeighborsClassifier

clf = KNeighborsClassifier()

clf.fit(X_train_scaled, y_train) 

# The model is trained, as can see below!

0,1,2
,n_neighbors,5
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [5]:
# Below to evaluate the model, on how well it performed.
clf.score(X_test_scaled, y_test)

# 1. What it does:
#    - Internally, `score` first calls `predict(X_test_scaled)`
#      → this gives predicted labels (ŷ) for the test data.
#    - Then, since this is a classifier, it compares those predictions (ŷ)
#      with the true labels (y_test).
#    - Finally, it computes the accuracy = (correct predictions / total samples).

# 2. Why we use X_test_scaled instead of X_test:
#    - We scaled the training data (X_train_scaled) before fitting.
#    - The KNN algorithm is distance-based → scaling ensures fair distance calculation.
#    - So we must also transform the test set (X_test_scaled) with the SAME scaler
#      (using .transform, not .fit_transform) to keep consistency.

# 3. Why only y_test is passed (not scaled):
#    - `y` (labels) are target values like 0 = malignant, 1 = benign (or vice versa).
#    - These are already categorical labels, not features.
#    - Scaling is only for features (X), not for labels (y).
#    - So we just pass y_test as-is, to compare with predicted labels.

0.9736842105263158

In [6]:
# Get the first test instance (raw features as a 1D array)
single_instance = X_test[0]        # use [0], not (0)
print(single_instance.shape)       # (30,) → 30 features for 1 patient

# ⚠️ scikit-learn requires 2D input: (n_samples, n_features)
# For a single sample, reshape or wrap in a list:
#   - clf.predict([single_instance]) 
#   - clf.predict(single_instance.reshape(1, -1))

# 👉 Why (1, -1)? 
# - 1 = number of samples (we only have 1 patient here)
# - -1 = "infer the correct number of features automatically" (here it becomes 30)
# So single_instance.reshape(1, -1) → shape (1, 30) = 1 sample, 30 features.

# Scale the sample using the SAME scaler (use .transform, not .fit_transform)
single_instance_scaled = scaler.transform(single_instance.reshape(1, -1))
print(single_instance_scaled.shape)  # (1, 30) → 1 sample, 30 features

# Predict the class (returns numeric label: 0 = malignant, 1 = benign)
pred_label = clf.predict(single_instance_scaled)
print("raw numeric prediction:", pred_label)

# Get probabilities (vote fractions from neighbors)
# Example: [[0.8 0.2]] → 80% malignant, 20% benign
proba = clf.predict_proba(single_instance_scaled)  # shape (1, n_classes)
print("predicted probabilities:", proba)

(30,)
(1, 30)
raw numeric prediction: [1]
predicted probabilities: [[0. 1.]]


In [19]:
# Test the model with one instance

# 1️⃣ Pick a sample from the test set
single_instance = X_test_scaled[3]  
# ⚠️ Important: use X_test_scaled (not X_test), because the model was trained on scaled data.
# single_instance is just the feature values for patient #1 in the test set.

# 2️⃣ Predict the label for this one patient
pred = clf.predict([single_instance])  
# - clf.predict expects 2D input: (n_samples, n_features)
# - Wrapping single_instance in [...] turns shape (30,) → (1, 30)

# 3️⃣ Get the true label from the test set
true = y_test[1]  
# - y_test stores the actual ground truth labels for each patient in the test split.
# - This is what we compare our prediction against.

# 4️⃣ Interpretation
# - If pred == true → the model got this sample correct
# - If pred != true → the model misclassified it
print("Pred: ",pred)
print("True: ",true)

Pred:  [0]
True:  0


In [16]:
# 🔄 Apart from KNeighborsClassifier, we can use other classifiers.
# Example: Logistic Regression (another supervised learning algorithm for classification).
# The steps (fit, predict, score) stay exactly the same, only the model (clf) changes.

from sklearn.linear_model import LogisticRegression  

# 1️⃣ Create the model (classifier)
clf = LogisticRegression()  

# 2️⃣ Train (fit) the model on the scaled training data
clf.fit(X_train_scaled, y_train)  

# 3️⃣ Evaluate performance on the test set
# .score() = accuracy → % of correct predictions
print("Test Accuracy:", clf.score(X_test_scaled, y_test))

# 4️⃣ Test with a single instance
single_instance = X_test_scaled[2]       # scaled test sample
pred = clf.predict([single_instance])    # model's prediction
true = y_test[1]                         # actual ground-truth label

print("Prediction:", pred, " | True label:", true)
# If pred == true → model is correct
# If pred != true → model made a misclassification

Test Accuracy: 0.956140350877193
Prediction: [0]  | True label: 0


In [18]:
# 🔄 Apart from KNeighborsClassifier and LogisticRegression, 
# we can also use DecisionTreeClassifier (a tree-based model).
# Again, the workflow (fit, score, predict) is identical — only the algorithm differs.

from sklearn.tree import DecisionTreeClassifier  

# 1️⃣ Create the model (classifier)
clf = DecisionTreeClassifier()  

# 2️⃣ Train (fit) the model on the scaled training data
# (Decision Trees don’t strictly require scaling, but using scaled data keeps consistency)
clf.fit(X_train_scaled, y_train)  

# 3️⃣ Evaluate performance on the test set
# .score() gives the accuracy (fraction of correctly predicted samples)
print("Test Accuracy:", clf.score(X_test_scaled, y_test))

# 4️⃣ Test with a single instance
single_instance = X_test_scaled[3]       # scaled test sample
pred = clf.predict([single_instance])    # model's prediction
true = y_test[1]                         # actual ground-truth label

print("Prediction:", pred, " | True label:", true)
# If pred == true → model is correct
# If pred != true → model made a misclassification

Test Accuracy: 0.9210526315789473
Prediction: [0]  | True label: 0


In [23]:
# 🔄 Apart from KNeighborsClassifier and LogisticRegression, 
# we can also use Support Vector Classifier (SVC), which tries to find the best hyperplane 
# that separates the classes with the maximum margin.
# The workflow (fit, score, predict) is still the same — only the algorithm differs.

from sklearn.svm import SVC  

# 1️⃣ Create the model (classifier)
clf = SVC(kernel='rbf', probability=True, random_state=42)  
# kernel='rbf' (default) → nonlinear boundary
# probability=True → allows probability estimates (slower, but useful for predict_proba)

# 2️⃣ Train (fit) the model on the scaled training data
# (SVMs are sensitive to feature scale, so scaling is essential)
clf.fit(X_train_scaled, y_train)  

# 3️⃣ Evaluate performance on the test set
# .score() gives the accuracy (fraction of correctly predicted samples)
print("Test Accuracy:", clf.score(X_test_scaled, y_test))

# 4️⃣ Test with a single instance
single_instance = X_test_scaled[3]       # scaled test sample
pred = clf.predict([single_instance])    # model's prediction
true = y_test[3]                         # actual ground-truth label

print("Prediction:", pred, " | True label:", true)
# If pred == true → model is correct
# If pred != true → model made a misclassification


Test Accuracy: 0.9736842105263158
Prediction: [0]  | True label: 0


In [30]:
# 🔄 Apart from KNN, Logistic Regression, Decision Tree, and SVC, 
# we can also use RandomForestClassifier — an ensemble method that builds many decision trees 
# and combines their predictions (majority vote) for higher accuracy and robustness.
# The workflow (fit, score, predict) is still the same — only the algorithm differs.

from sklearn.ensemble import RandomForestClassifier  

# 1️⃣ Create the model (classifier)
clf = RandomForestClassifier(
    n_estimators=100,       # number of trees in the forest
    random_state=42
)  

# 2️⃣ Train (fit) the model on the scaled training data
# (Random Forests are not sensitive to feature scaling, 
# but we use scaled data for consistency with other models)
clf.fit(X_train_scaled, y_train)  

# 3️⃣ Evaluate performance on the test set
# .score() gives the accuracy (fraction of correctly predicted samples)
print("Test Accuracy:", clf.score(X_test_scaled, y_test))

# 4️⃣ Test with a single instance
single_instance = X_test_scaled[3]       # scaled test sample
pred = clf.predict([single_instance])    # model's prediction
true = y_test[3]                         # actual ground-truth label

print("Prediction:", pred, " | True label:", true)
print("Class probabilities:", clf.predict_proba([single_instance]))
# If pred == true → model is correct
# If pred != true → model made a misclassification


Test Accuracy: 0.9298245614035088
Prediction: [0]  | True label: 0
Class probabilities: [[1. 0.]]


In [32]:
# 🔄 Apart from KNN, Logistic Regression, Decision Tree, SVC, and Random Forest, 
# we can also use GaussianNB (Gaussian Naive Bayes) — a probabilistic classifier 
# based on Bayes’ theorem, assuming features follow a normal (Gaussian) distribution.
# It is very fast, works well with high-dimensional data, and often used as a baseline model.

from sklearn.naive_bayes import GaussianNB  

# 1️⃣ Create the model (classifier)
clf = GaussianNB()  

# 2️⃣ Train (fit) the model on the training data
# (Naive Bayes does not require feature scaling, but we still use scaled data 
# for consistency across all models in this project)
clf.fit(X_train_scaled, y_train)  

# 3️⃣ Evaluate performance on the test set
# .score() gives the accuracy (fraction of correctly predicted samples)
print("Test Accuracy:", clf.score(X_test_scaled, y_test))

# 4️⃣ Test with a single instance
single_instance = X_test_scaled[3]       # scaled test sample
pred = clf.predict([single_instance])    # model's prediction
true = y_test[3]                         # actual ground-truth label

print("Prediction:", pred, " | True label:", true)
print("Class probabilities:", clf.predict_proba([single_instance]))
# If pred == true → model is correct
# If pred != true → model made a misclassification


Test Accuracy: 0.8947368421052632
Prediction: [0]  | True label: 0
Class probabilities: [[1.00000000e+000 1.41067133e-180]]


In [35]:
# 📊 Compare KNN, Logistic Regression, Decision Tree, SVC, Random Forest, and GaussianNB Classifiers
# We'll train all 6 on the same dataset and print confusion matrices + reports

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, classification_report

# 1️⃣ Initialize all classifiers in a dictionary (EACH HAVE ITS OWN HYPER-PARAMETER, CAN TEST LATER)
models = {
    "KNN": KNeighborsClassifier(),
    "Logistic Regression": LogisticRegression(max_iter=5000),  # higher max_iter for convergence
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "SVC": SVC(kernel='rbf', probability=True, random_state=42),  # Support Vector Classifier
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),  # Ensemble of decision trees
    "GaussianNB": GaussianNB()  # Probabilistic classifier based on Bayes' theorem
}

# 2️⃣ Train, predict, and report for each model
for name, clf in models.items():
    print(f"\n=== {name} ===")
    
    # Train (fit) on training data
    clf.fit(X_train_scaled, y_train)
    
    # Predict on test data
    y_pred = clf.predict(X_test_scaled)
    
    # Confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:\n", cm)
    
    # Classification report (precision, recall, F1, accuracy)
    print("Classification Report:\n", classification_report(y_test, y_pred))


# 📌 HOW TO READ CONFUSION MATRIX
# Confusion Matrix Format:
# [[TN  FP]
#  [FN  TP]]

# TN (True Negative)  = correctly predicted class 0 (benign predicted benign)
# FP (False Positive) = predicted 1 but actually 0 (benign predicted malignant)
# FN (False Negative) = predicted 0 but actually 1 (malignant predicted benign)
# TP (True Positive)  = correctly predicted class 1 (malignant predicted malignant)

# 👉 Precision = % of positive predictions that were actually correct  
# 👉 Recall    = % of actual positives correctly identified  
# 👉 F1-Score  = balance between precision & recall  
# 👉 Accuracy  = overall % correct


=== KNN ===
Confusion Matrix:
 [[34  3]
 [ 0 77]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.92      0.96        37
           1       0.96      1.00      0.98        77

    accuracy                           0.97       114
   macro avg       0.98      0.96      0.97       114
weighted avg       0.97      0.97      0.97       114


=== Logistic Regression ===
Confusion Matrix:
 [[34  3]
 [ 2 75]]
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.92      0.93        37
           1       0.96      0.97      0.97        77

    accuracy                           0.96       114
   macro avg       0.95      0.95      0.95       114
weighted avg       0.96      0.96      0.96       114


=== Decision Tree ===
Confusion Matrix:
 [[33  4]
 [ 4 73]]
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.89      0