# Linear Classification for Breast Cancer Diagnosis

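This notebook trains a linear classifier (Logistic Regression) on the preprocessed breast cancer dataset. The sections below fit the model on the full training split and estimate its accuracy with 5-fold cross-validation and an 80/20 stratified holdout split.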


## Load Preprocessed Data


In [10]:
import pandas as pd

# Locations of the preprocessed CSV files, relative to this notebook.
# NOTE(review): the "sol" file presumably holds the ground-truth labels for
# the test split — confirm against the preprocessing step.
train_path = "../../data/processed/breast-cancer-preprocessed-train.csv"
test_path = "../../data/processed/breast-cancer-preprocessed-test.csv"
sol_path = "../../data/processed/breast-cancer-preprocessed-sol.csv"

# Read the three files in order; each frame is bound to its matching name.
train_df, test_df, sol_df = (
    pd.read_csv(csv_path) for csv_path in (train_path, test_path, sol_path)
)


## Prepare Features and Target


In [23]:
# Names of the label column and the row-identifier column.
target_col = "class"
id_col = "ID"

# Training split: features are every column except the label and the ID;
# the label column on its own becomes the target.
X_train = train_df.drop([target_col, id_col], axis="columns")
y_train = train_df.loc[:, target_col]

# Test split: only the ID column is removed here.
X_test = test_df.drop([id_col], axis="columns")


## Linear Classifier (Logistic Regression)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score, train_test_split
import numpy as np

# Fit the classifier on every row of the training split (nothing is held out
# in this cell). fit() returns the estimator itself, so building and fitting
# can be chained into one expression.
logistic_model = LogisticRegression(max_iter=1000, random_state=42).fit(
    X_train, y_train
)

print("Model trained successfully!")

# n_iter_ is array-valued; only its first entry is reported.
iterations_used = logistic_model.n_iter_[0]
print(f"Number of iterations: {iterations_used}")


Model trained successfully!
Number of iterations: 16


## Cross-Validation Evaluation


In [None]:
# A fresh, unfitted estimator with the same settings as the main model;
# cross_val_score fits its own copies on each fold.
cv_model = LogisticRegression(max_iter=1000, random_state=42)

# One accuracy value per fold, over 5 folds of the training split.
cv_scores = cross_val_score(
    cv_model,
    X_train,
    y_train,
    scoring="accuracy",
    cv=5,
)

# Summarise the fold scores once and reuse the numbers in the report.
cv_mean = cv_scores.mean()
cv_std = cv_scores.std()

print("Cross-Validation Results:")
print(f"CV Scores: {cv_scores}")
# The +/- figure is two standard deviations of the fold scores.
print(f"Mean CV Accuracy: {cv_mean:.4f} (+/- {cv_std * 2:.4f})")
print(f"Standard Deviation: {cv_std:.4f}")


## Holdout Validation


In [None]:
# Carve a 20% validation slice out of the training split, stratified on the
# label so both parts keep the same class proportions.
X_train_holdout, X_val_holdout, y_train_holdout, y_val_holdout = train_test_split(
    X_train,
    y_train,
    stratify=y_train,
    random_state=42,
    test_size=0.2,
)

# Fit a separate model on the remaining 80% only.
holdout_model = LogisticRegression(max_iter=1000, random_state=42).fit(
    X_train_holdout, y_train_holdout
)

# Predict the held-out rows and score them against their true labels.
y_val_pred = holdout_model.predict(X_val_holdout)
val_accuracy = accuracy_score(y_val_holdout, y_val_pred)

# Row counts of the two parts of the split.
n_train_rows = len(X_train_holdout)
n_val_rows = len(X_val_holdout)

print("Holdout Validation Results:")
print(f"Training set size: {n_train_rows}")
print(f"Validation set size: {n_val_rows}")
print(f"Validation Accuracy: {val_accuracy:.4f}")

print("\nClassification Report:")
report_text = classification_report(y_val_holdout, y_val_pred)
print(report_text)

print("\nConfusion Matrix:")
val_confusion = confusion_matrix(y_val_holdout, y_val_pred)
print(val_confusion)
