<a href="https://colab.research.google.com/github/ghost5053/dsml-course/blob/main/Supervised_mini_project_(classification).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from sklearn.datasets import load_breast_cancer
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Load the dataset
data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target, name="target")


In [3]:
# Check for missing values
missing_values = X.isnull().sum().sum()

# Handle missing values (if any)
if missing_values > 0:
    X.fillna(X.mean(), inplace=True)  # Impute missing values with the column mean


In [4]:
# Scale features using StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


#Many machine learning algorithms, such as SVMs and logistic regression, are sensitive to the scale of features. Standardization ensures that the optimization algorithm converges faster and prevents features with large magnitudes from dominating the model.

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Logistic Regression
log_reg = LogisticRegression(random_state=42)
log_reg.fit(X_train, y_train)
y_pred_lr = log_reg.predict(X_test)

# Accuracy
accuracy_lr = accuracy_score(y_test, y_pred_lr)
print(f"Logistic Regression Accuracy: {accuracy_lr:.2f}")


Logistic Regression Accuracy: 0.97


 # Simple and interpretable, works well for linearly separable data.


In [6]:
from sklearn.tree import DecisionTreeClassifier

# Decision Tree Classifier
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train, y_train)
y_pred_dt = dt_classifier.predict(X_test)

# Accuracy
accuracy_dt = accuracy_score(y_test, y_pred_dt)
print(f"Decision Tree Accuracy: {accuracy_dt:.2f}")


Decision Tree Accuracy: 0.95


#Captures non-linear patterns and is easy to understand.

In [7]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest Classifier
rf_classifier = RandomForestClassifier(random_state=42, n_estimators=100)
rf_classifier.fit(X_train, y_train)
y_pred_rf = rf_classifier.predict(X_test)

# Accuracy
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Random Forest Accuracy: {accuracy_rf:.2f}")


Random Forest Accuracy: 0.96


#Improves over decision trees by reducing overfitting and boosting accuracy.

In [8]:
from sklearn.svm import SVC

# Support Vector Machine
svm_classifier = SVC(kernel='linear', random_state=42)
svm_classifier.fit(X_train, y_train)
y_pred_svm = svm_classifier.predict(X_test)

# Accuracy
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print(f"Support Vector Machine Accuracy: {accuracy_svm:.2f}")


Support Vector Machine Accuracy: 0.96


#Effective for high-dimensional data and ensures a clear margin of separation.

In [9]:
from sklearn.neighbors import KNeighborsClassifier

# k-Nearest Neighbors
knn_classifier = KNeighborsClassifier(n_neighbors=5)
knn_classifier.fit(X_train, y_train)
y_pred_knn = knn_classifier.predict(X_test)

# Accuracy
accuracy_knn = accuracy_score(y_test, y_pred_knn)
print(f"k-Nearest Neighbors Accuracy: {accuracy_knn:.2f}")


k-Nearest Neighbors Accuracy: 0.95


# A lazy learner, works well with datasets having well-separated classes.

In [10]:
# Collect accuracies
model_accuracies = {
    "Logistic Regression": accuracy_lr,
    "Decision Tree": accuracy_dt,
    "Random Forest": accuracy_rf,
    "Support Vector Machine": accuracy_svm,
    "k-Nearest Neighbors": accuracy_knn,
}

# Find the best and worst models
best_model = max(model_accuracies, key=model_accuracies.get)
worst_model = min(model_accuracies, key=model_accuracies.get)

# Display comparison
print("Model Performance Comparison:")
for model, accuracy in model_accuracies.items():
    print(f"{model}: Accuracy = {accuracy:.2f}")

print(f"\nBest Performing Model: {best_model} (Accuracy = {model_accuracies[best_model]:.2f})")
print(f"Worst Performing Model: {worst_model} (Accuracy = {model_accuracies[worst_model]:.2f})")


Model Performance Comparison:
Logistic Regression: Accuracy = 0.97
Decision Tree: Accuracy = 0.95
Random Forest: Accuracy = 0.96
Support Vector Machine: Accuracy = 0.96
k-Nearest Neighbors: Accuracy = 0.95

Best Performing Model: Logistic Regression (Accuracy = 0.97)
Worst Performing Model: Decision Tree (Accuracy = 0.95)


###Best Performing Model: Logistic Regression (Accuracy = 0.97)
###Worst Performing Model: Decision Tree (Accuracy = 0.95)