In [1]:
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from scipy.stats import uniform
import numpy as np

In [2]:
from sklearn.datasets import load_breast_cancer
import pandas as pd

# Load the breast-cancer dataset bundled with scikit-learn and wrap it in
# pandas objects so the feature columns keep their names.
data = load_breast_cancer()
X = pd.DataFrame(data=data.data, columns=data.feature_names)
y = pd.Series(data=data.target, name='target')

# Quick sanity check of the dimensions: features first, then labels.
print(X.shape, y.shape, sep="\n")

(569, 30)
(569,)


In [3]:
# Split BEFORE scaling. Fitting StandardScaler on the full matrix lets the
# held-out rows contribute to the mean/variance used to transform the training
# data, which leaks test information into every downstream model.
# random_state is unchanged, so the split is still deterministic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling statistics from the training rows only, then apply the
# same (already fitted) transform to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full matrix transformed with the train-fitted scaler. Kept only so that any
# cell still referring to X_scaled keeps working; it contains the test rows,
# so do not use it for fitting or evaluation.
X_scaled = scaler.transform(X)

# Baseline classifiers, all at library defaults except where noted.
# The tree-based models draw random numbers while fitting (bootstrap samples,
# feature subsets, tie-breaking between splits), so they are seeded to make
# the baseline table reproducible across kernel restarts.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),  # higher cap than the default iteration limit
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM": SVC(),
    "Decision Tree": DecisionTreeClassifier(random_state=42)
}

In [4]:
# Fit every baseline model on the training split and score it on the test
# split. The metric table below fixes both the column labels and their order.
baseline_metrics = {
    "Accuracy": accuracy_score,
    "Precision": precision_score,
    "Recall": recall_score,
    "F1 Score": f1_score,
}

results = {}
for name, model in models.items():
    # fit() returns the fitted estimator, so prediction can be chained.
    y_pred = model.fit(X_train, y_train).predict(X_test)
    results[name] = {
        label: metric(y_test, y_pred)
        for label, metric in baseline_metrics.items()
    }

# One row per model, one column per metric.
results_df = pd.DataFrame(results).transpose()
print("Baseline Models:\n", results_df)

Baseline Models:
                      Accuracy  Precision    Recall  F1 Score
Logistic Regression  0.973684   0.972222  0.985915  0.979021
Random Forest        0.964912   0.958904  0.985915  0.972222
SVM                  0.973684   0.972222  0.985915  0.979021
Decision Tree        0.947368   0.957746  0.957746  0.957746


In [5]:
# Exhaustive grid search over random-forest hyperparameters, selecting by
# 5-fold cross-validated F1 on the training split.
param_grid_rf = {
    "n_estimators": [50, 100, 150],
    "max_depth": [None, 10, 20],
    "min_samples_split": [2, 5, 10]
}
grid_rf = GridSearchCV(
    # Seed the forest: without a fixed random_state each candidate is fitted
    # with different randomness, so the selected parameters and the reported
    # scores change from run to run.
    RandomForestClassifier(random_state=42),
    param_grid_rf,
    scoring='f1',
    cv=5,
    n_jobs=-1,  # candidates/folds are independent; use all available cores
)
grid_rf.fit(X_train, y_train)

In [6]:
# Randomized search over SVM hyperparameters: 20 sampled candidates, selected
# by 5-fold cross-validated F1 on the training split.
param_dist_svm = {
    # scipy's uniform is parameterised as (loc, scale) and samples from
    # [loc, loc + scale], NOT (low, high). uniform(0.1, 10) therefore covered
    # [0.1, 10.1]; scale=9.9 gives the intended range [0.1, 10].
    "C": uniform(loc=0.1, scale=9.9),
    "gamma": ["scale", "auto"],
    "kernel": ["rbf", "linear"]
}
rand_svm = RandomizedSearchCV(
    SVC(),
    param_distributions=param_dist_svm,
    n_iter=20,
    scoring='f1',
    cv=5,
    random_state=42,  # fixes which candidates are sampled
)
rand_svm.fit(X_train, y_train)

In [7]:
# Evaluate
def evaluate(model, name, X=None, y=None):
    """Score a fitted classifier and return its metrics as one table row.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    name
        Label stored under the ``"Model"`` key of the returned row.
    X, y
        Optional features and true labels to score against. When omitted,
        the notebook-level ``X_test`` / ``y_test`` are used, which is the
        behaviour existing two-argument calls rely on.

    Returns
    -------
    dict
        Keys ``"Model"``, ``"Accuracy"``, ``"Precision"``, ``"Recall"`` and
        ``"F1 Score"``; the metrics are computed with each scikit-learn
        metric function's default settings.
    """
    # Fall back to the held-out split defined earlier in the notebook.
    features = X_test if X is None else X
    labels = y_test if y is None else y

    y_pred = model.predict(features)
    return {
        "Model": name,
        "Accuracy": accuracy_score(labels, y_pred),
        "Precision": precision_score(labels, y_pred),
        "Recall": recall_score(labels, y_pred),
        "F1 Score": f1_score(labels, y_pred)
    }

# Score the best estimator found by each search on the test split and collect
# the rows into a table indexed by model label.
tuned_searches = [
    ("Tuned Random Forest", grid_rf),
    ("Tuned SVM", rand_svm),
]
tuned_rows = [
    evaluate(search.best_estimator_, label)
    for label, search in tuned_searches
]
tuned_results = pd.DataFrame(tuned_rows).set_index("Model")

In [8]:
# Stack the baseline and tuned score tables, then show them best F1 first.
final_results = pd.concat([results_df, tuned_results], axis=0)
ranked_results = final_results.sort_values(by="F1 Score", ascending=False)
print("\nAll Model Comparison:\n", ranked_results)


All Model Comparison:
                      Accuracy  Precision    Recall  F1 Score
Tuned SVM            0.982456   0.972603  1.000000  0.986111
Logistic Regression  0.973684   0.972222  0.985915  0.979021
SVM                  0.973684   0.972222  0.985915  0.979021
Random Forest        0.964912   0.958904  0.985915  0.972222
Tuned Random Forest  0.956140   0.958333  0.971831  0.965035
Decision Tree        0.947368   0.957746  0.957746  0.957746
