<a href="https://colab.research.google.com/github/debashisdotchatterjee/biopsy-breast-cancer-ML-1/blob/main/biopsy_breast_cancer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pydataset



In [None]:
pip install pandas numpy matplotlib seaborn scikit-learn statsmodels




In [None]:
from pydataset import data

# Fetch the 'biopsy' dataset through pydataset.
biopsy_data = data('biopsy')

# Print a plain-text preview of the first five rows.
print(biopsy_data.head(5))


        ID  V1  V2  V3  V4  V5    V6  V7  V8  V9   class
1  1000025   5   1   1   1   2   1.0   3   1   1  benign
2  1002945   5   4   4   5   7  10.0   3   2   1  benign
3  1015425   3   1   1   1   2   2.0   3   1   1  benign
4  1016277   6   8   8   1   3   4.0   3   7   1  benign
5  1017023   4   1   1   3   2   1.0   3   1   1  benign


In [None]:
pip install pydataset statsmodels seaborn




In [None]:
# Import Necessary Libraries
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (confusion_matrix, classification_report,
                             roc_auc_score, roc_curve, accuracy_score,
                             precision_score, recall_score, f1_score)
from sklearn.pipeline import Pipeline
import statsmodels.api as sm
from scipy.stats import chi2

# Suppress warnings for clean output.
# NOTE(review): this blanket filter silences every warning category for the rest
# of the session, including any raised while the models below are fitted —
# consider narrowing it to specific categories.
import warnings
warnings.filterwarnings('ignore')

# Create the results directory. exist_ok=True makes the call idempotent on
# re-runs and removes the separate exists-then-create check.
results_dir = 'results'
os.makedirs(results_dir, exist_ok=True)

# 1. Data Loading and Preprocessing
# -----------------------------------

from pydataset import data

# Pull the biopsy dataset and show its first rows.
biopsy_data = data('biopsy')

print("Initial Data Overview:")
print(biopsy_data.head())

# V6 (Bare Nuclei) contains missing entries: force the column to numeric so that
# anything unparseable becomes NaN and can be imputed.
biopsy_data['V6'] = pd.to_numeric(biopsy_data['V6'], errors='coerce')

# Impute with IterativeImputer (MICE) on the predictor columns only; the sample
# ID and the target are left out of the imputation model.
# NOTE(review): the imputer is fitted on every row before the train/test split
# performed later in this cell, so test rows influence the imputation model.
feature_frame = biopsy_data.drop(columns=['class', 'ID'])
imputer = IterativeImputer(random_state=42)
X_imputed = pd.DataFrame(imputer.fit_transform(feature_frame),
                         columns=feature_frame.columns)

# Put the target back (positionally, via .values) and encode it:
# 'benign' = 0, 'malignant' = 1.
X_imputed['class'] = biopsy_data['class'].values
X_imputed['class'] = X_imputed['class'].map({'benign': 0, 'malignant': 1})

# Confirm that nothing is missing after imputation.
print("\nMissing Values After Imputation:")
print(X_imputed.isna().sum())

# Separate predictors from the target and remember the predictor names.
y = X_imputed['class']
X = X_imputed.drop(columns=['class'])
feature_names = list(X.columns)

# 2. Exploratory Data Analysis (EDA)
# -----------------------------------

# Per-feature summary built as a single chain: start from describe(), add the
# variance and the coefficient of variation (std / mean), keep the columns of
# interest and relabel the 50th percentile as the median.
descriptive_stats = (
    X.describe().T
    .assign(**{
        'Variance': X.var(),
        'Coefficient of Variation': lambda stats: stats['std'] / stats['mean'],
    })
    .loc[:, ['mean', '50%', 'std', 'Variance', 'Coefficient of Variation']]
    .rename(columns={'50%': 'Median'})
)

# Persist the table, then echo it.
descriptive_stats.to_csv(os.path.join(results_dir, 'descriptive_statistics.csv'))
print("\nDescriptive Statistics:")
print(descriptive_stats)

# Visualizations Directory
# exist_ok=True keeps the cell idempotent on re-runs without a separate
# existence check.
viz_dir = os.path.join(results_dir, 'visualizations')
os.makedirs(viz_dir, exist_ok=True)

# Histograms and Density Plots
# The hue column is mapped to readable labels so seaborn builds the legend
# itself. Overriding the hue legend with plt.legend(['Benign', 'Malignant'])
# assigns the labels to artists in draw order, which is not guaranteed to match
# the hue order and can swap the two class labels.
class_labels = {0: 'Benign', 1: 'Malignant'}
hist_data = X_imputed.assign(Class=X_imputed['class'].map(class_labels))

for feature in feature_names:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(data=hist_data, x=feature, hue='Class',
                 hue_order=['Benign', 'Malignant'], kde=True, stat="density",
                 common_norm=False, palette='coolwarm', ax=ax)
    ax.set_title(f'Distribution of {feature} by Class')
    ax.set_xlabel(feature)
    ax.set_ylabel('Density')
    fig.tight_layout()
    fig.savefig(os.path.join(viz_dir, f'histogram_density_{feature}.png'))
    # Close each figure so the loop does not accumulate open figures.
    plt.close(fig)

# Pairwise Scatter Plot Matrix
sns.pairplot(X_imputed, vars=feature_names, hue='class', palette='coolwarm', diag_kind='kde')
# The title is placed slightly above the figure canvas (y=1.02) ...
plt.suptitle('Pairwise Scatter Plot Matrix', y=1.02)
plt.tight_layout()
# ... so the file is saved with a tight bounding box; otherwise artists lying
# outside the canvas, such as this title, are cropped from the image.
plt.savefig(os.path.join(viz_dir, 'pairwise_scatter_matrix.png'), bbox_inches='tight')
plt.close()

# Correlation Heatmap
# Pairwise correlations between the predictors, drawn on an explicit Axes.
corr = X.corr()
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr, annot=True, fmt=".2f", cmap='coolwarm', square=True, ax=ax)
ax.set_title('Correlation Heatmap of Features')
fig.tight_layout()
fig.savefig(os.path.join(viz_dir, 'correlation_heatmap.png'))
plt.close(fig)

# 3. Data Transformation
# ------------------------

# Box-Cox Transformation for skewed features
# Box-Cox requires strictly positive inputs, so the candidate features are
# selected from — and the transform is applied to — the raw feature values,
# before standardization. A standardized column has mean 0 and therefore always
# contains non-positive values; checking positivity after scaling selected no
# feature at all and the transformation was silently skipped.
pt = PowerTransformer(method='box-cox', standardize=False)

# Features whose values are all strictly positive, then those with |skew| > 1
positive_features = X.columns[(X > 0).all()]
skewed_features = X[positive_features].skew().abs()
skewed_features = skewed_features[skewed_features > 1].index.tolist()

# Apply Box-Cox to the skewed features on a copy so the raw X stays untouched
X_transformed = X.copy()
if skewed_features:
    X_transformed[skewed_features] = pt.fit_transform(X_transformed[skewed_features])
    print(f"\nBox-Cox Transformation applied to: {skewed_features}")

# Standardization (after Box-Cox, so every feature ends up centred and scaled)
# NOTE(review): both transformers are fitted on all rows before the train/test
# split performed later in this cell, so test rows influence the fitted parameters.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_transformed)
X_scaled = pd.DataFrame(X_scaled, columns=feature_names)

# 4. Feature Variability Measures
# --------------------------------

# Compute Variance and Coefficient of Variation for each feature.
# The coefficient of variation is the standard deviation divided by the mean
# (not the variance divided by the mean); this matches the definition used for
# the descriptive statistics table earlier in this cell.
variance = X.var()
cv = X.std() / X.mean()

# Note: Variance and CV are per-feature statistics and should not be added as per-sample columns.
# Instead, they can be used separately for analysis.

# Save Variability Measures (one row per feature)
variability_measures = pd.DataFrame({'Feature': feature_names,
                                    'Variance': variance.values,
                                    'Coefficient of Variation': cv.values})
variability_measures.to_csv(os.path.join(results_dir, 'feature_variability.csv'), index=False)
print("\nFeature Variability Measures:")
print(variability_measures)

# 5. Train-Test Split
# ---------------------

# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both parts, and the fixed seed makes the split repeatable.
split_parts = train_test_split(X_scaled, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# 6. Objective 1: Modeling Non-Linear Relationships with Kernel Methods
# ------------------------------------------------------------------------

# Pipeline: standardize, then an RBF-kernel SVM with probability estimates enabled.
svm_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('svm', SVC(kernel='rbf', probability=True, random_state=42)),
])

# Candidate values for the SVM's C and gamma (addressed through the 'svm' step).
param_grid_svm = dict(
    svm__C=[0.1, 1, 10, 100],
    svm__gamma=[0.001, 0.01, 0.1, 1],
)

# 5-fold stratified, shuffled cross-validation; this object is reused by the
# Lasso grid search further down in this cell.
cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Exhaustive search over the grid, scored by accuracy, using all cores.
grid_svm = GridSearchCV(
    estimator=svm_pipeline,
    param_grid=param_grid_svm,
    scoring='accuracy',
    cv=cv_strategy,
    n_jobs=-1,
    verbose=1,
)
grid_svm.fit(X_train, y_train)

# Report the winning hyperparameters and keep the refitted best pipeline.
best_params_svm = grid_svm.best_params_
print("\nBest Parameters for SVM:")
print(best_params_svm)

best_svm = grid_svm.best_estimator_

# Test-set predictions: hard labels, plus the probability of the positive class
# (second column of predict_proba).
y_pred_svm = best_svm.predict(X_test)
y_proba_svm = best_svm.predict_proba(X_test)[:, 1]

# Label-based metrics share the same call signature, so compute them in one pass;
# AUC is computed from the probabilities instead.
accuracy_svm, precision_svm, recall_svm, f1_svm = (
    metric_fn(y_test, y_pred_svm)
    for metric_fn in (accuracy_score, precision_score, recall_score, f1_score)
)
auc_svm = roc_auc_score(y_test, y_proba_svm)

# Confusion matrix heatmap (rows: actual, columns: predicted)
conf_matrix_svm = confusion_matrix(y_test, y_pred_svm)
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(conf_matrix_svm, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=['Benign', 'Malignant'], yticklabels=['Benign', 'Malignant'],
            ax=ax)
ax.set_title('Confusion Matrix - SVM with RBF Kernel')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
fig.tight_layout()
fig.savefig(os.path.join(viz_dir, 'confusion_matrix_svm.png'))
plt.close(fig)

# ROC curve with the chance diagonal for reference
fpr_svm, tpr_svm, _ = roc_curve(y_test, y_proba_svm)
fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(fpr_svm, tpr_svm, label=f'SVM (AUC = {auc_svm:.2f})', color='blue')
ax.plot([0, 1], [0, 1], 'k--')
ax.set_title('ROC Curve - SVM with RBF Kernel')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.legend(loc='lower right')
fig.tight_layout()
fig.savefig(os.path.join(viz_dir, 'roc_curve_svm.png'))
plt.close(fig)

# Text classification report: printed and written to the results folder
report_svm = classification_report(y_test, y_pred_svm, target_names=['Benign', 'Malignant'])
print("\nClassification Report - SVM:")
print(report_svm)

with open(os.path.join(results_dir, 'classification_report_svm.txt'), 'w') as f:
    f.write(report_svm)

# 7. Objective 2: Feature Selection with Lasso Regularization
# -------------------------------------------------------------

# L1-penalized logistic regression fitted with the saga solver.
logreg_lasso = LogisticRegression(
    penalty='l1',
    solver='saga',
    max_iter=5000,
    random_state=42,
)

# Candidate values for C (inverse of regularization strength).
param_grid_lasso = dict(C=[0.01, 0.1, 1, 10, 100])

# Same cross-validation splitter and scoring as the SVM search above.
grid_lasso = GridSearchCV(
    estimator=logreg_lasso,
    param_grid=param_grid_lasso,
    scoring='accuracy',
    cv=cv_strategy,
    n_jobs=-1,
    verbose=1,
)
grid_lasso.fit(X_train, y_train)

# Report the winning C and keep the refitted best model.
best_params_lasso = grid_lasso.best_params_
print("\nBest Parameters for Lasso Logistic Regression:")
print(best_params_lasso)

best_lasso = grid_lasso.best_estimator_

# Feature selection: a feature counts as selected when its coefficient is non-zero.
lasso_coef = pd.Series(best_lasso.coef_[0], index=X_train.columns)
selected_features = lasso_coef.loc[lasso_coef.ne(0)].index.tolist()
print("\nSelected Features by Lasso:")
print(selected_features)

# Test-set predictions: hard labels, plus the probability of the positive class
# (second column of predict_proba).
y_pred_lasso = best_lasso.predict(X_test)
y_proba_lasso = best_lasso.predict_proba(X_test)[:, 1]

# Label-based metrics share the same call signature, so compute them in one pass;
# AUC is computed from the probabilities instead.
accuracy_lasso, precision_lasso, recall_lasso, f1_lasso = (
    metric_fn(y_test, y_pred_lasso)
    for metric_fn in (accuracy_score, precision_score, recall_score, f1_score)
)
auc_lasso = roc_auc_score(y_test, y_proba_lasso)

# Confusion matrix heatmap (rows: actual, columns: predicted)
conf_matrix_lasso = confusion_matrix(y_test, y_pred_lasso)
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(conf_matrix_lasso, annot=True, fmt='d', cmap='Greens', cbar=False,
            xticklabels=['Benign', 'Malignant'], yticklabels=['Benign', 'Malignant'],
            ax=ax)
ax.set_title('Confusion Matrix - Lasso Logistic Regression')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
fig.tight_layout()
fig.savefig(os.path.join(viz_dir, 'confusion_matrix_lasso.png'))
plt.close(fig)

# ROC curve with the chance diagonal for reference
fpr_lasso, tpr_lasso, _ = roc_curve(y_test, y_proba_lasso)
fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(fpr_lasso, tpr_lasso, label=f'Lasso Logistic Regression (AUC = {auc_lasso:.2f})', color='green')
ax.plot([0, 1], [0, 1], 'k--')
ax.set_title('ROC Curve - Lasso Logistic Regression')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.legend(loc='lower right')
fig.tight_layout()
fig.savefig(os.path.join(viz_dir, 'roc_curve_lasso.png'))
plt.close(fig)

# Text classification report: printed and written to the results folder
report_lasso = classification_report(y_test, y_pred_lasso, target_names=['Benign', 'Malignant'])
print("\nClassification Report - Lasso Logistic Regression:")
print(report_lasso)

with open(os.path.join(results_dir, 'classification_report_lasso.txt'), 'w') as f:
    f.write(report_lasso)

# Export the non-zero coefficients as a two-column table (Feature, Coefficient)
lasso_features_df = (
    lasso_coef[lasso_coef != 0]
    .rename_axis('Feature')
    .reset_index(name='Coefficient')
)
lasso_features_df.to_csv(os.path.join(results_dir, 'lasso_selected_features.csv'), index=False)

# 8. Objective 3: Assessing the Impact of Feature Variability
# ---------------------------------------------------------------

# Variance and CV are per-feature statistics, so they cannot be added as
# per-sample features. Instead, the CV of each Lasso-selected feature is compared
# with the magnitude of its Lasso coefficient to see whether more variable
# features carry larger coefficients.

# Align CV and coefficient on the selected features
cv_values = variability_measures.set_index('Feature').loc[selected_features, 'Coefficient of Variation']
lasso_coefficients = lasso_features_df.set_index('Feature').loc[selected_features, 'Coefficient']

# Create a DataFrame for correlation analysis (one row per selected feature)
correlation_df = pd.DataFrame({
    'CV': cv_values,
    'Lasso Coefficient': lasso_coefficients.abs()
})

# A correlation or a regression needs at least two points. If Lasso kept fewer
# than two features, report NaN and skip the regression instead of failing
# inside statsmodels on an empty or single-row design matrix.
enough_features = len(correlation_df) >= 2

# Compute correlation
correlation = correlation_df.corr().iloc[0, 1] if enough_features else np.nan
print(f"\nCorrelation between CV and Absolute Lasso Coefficients: {correlation:.2f}")

# Save Correlation Analysis
correlation_df.to_csv(os.path.join(results_dir, 'cv_vs_lasso_coefficients.csv'))
with open(os.path.join(results_dir, 'cv_vs_lasso_coefficients.txt'), 'w') as f:
    f.write(f"Correlation between CV and Absolute Lasso Coefficients: {correlation:.2f}\n")

# Regress the absolute Lasso coefficients on CV (OLS with an intercept)
if enough_features:
    X_corr = sm.add_constant(correlation_df['CV'])
    model_corr = sm.OLS(correlation_df['Lasso Coefficient'], X_corr).fit()
    regression_summary = model_corr.summary().as_text()
else:
    X_corr = None
    model_corr = None
    regression_summary = "Regression skipped: fewer than two features were selected by Lasso."
print("\nRegression Analysis between CV and Absolute Lasso Coefficients:")
print(regression_summary)

# Save Regression Analysis Summary
with open(os.path.join(results_dir, 'regression_cv_lasso_summary.txt'), 'w') as f:
    f.write(regression_summary)

# 9. Comparison of Models
# -------------------------

# Side-by-side test-set metrics for the two fitted models
# (SVM with RBF kernel and Lasso logistic regression).
model_comparison = pd.DataFrame({
    'Model': ['SVM with RBF Kernel', 'Lasso Logistic Regression'],
    'Accuracy': [accuracy_svm, accuracy_lasso],
    'Precision': [precision_svm, precision_lasso],
    'Recall': [recall_svm, recall_lasso],
    'F1-Score': [f1_svm, f1_lasso],
    'AUC': [auc_svm, auc_lasso]
})

# Display copy: the rate metrics become percentage strings with two decimals,
# AUC a plain two-decimal string. The numeric frame above is left untouched.
model_comparison_display = model_comparison.copy()
for pct_column in ['Accuracy', 'Precision', 'Recall', 'F1-Score']:
    model_comparison_display[pct_column] = (
        model_comparison_display[pct_column].apply(lambda x: f"{x*100:.2f}%")
    )
model_comparison_display['AUC'] = model_comparison_display['AUC'].apply(lambda x: f"{x:.2f}")

# Save the formatted comparison table
model_comparison_display.to_csv(os.path.join(results_dir, 'model_comparison.csv'), index=False)

# One bar chart per metric, drawn from the numeric (unformatted) frame
metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'AUC']
for metric in metrics:
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.barplot(x='Model', y=metric, data=model_comparison, palette='viridis', ax=ax)
    ax.set_title(f'Comparison of Models: {metric}')
    ax.set_ylim(0, 1.1)
    ax.set_xlabel('Model')
    ax.set_ylabel(metric)
    plt.setp(ax.get_xticklabels(), rotation=45)
    fig.tight_layout()
    fig.savefig(os.path.join(viz_dir, f'model_comparison_{metric.lower()}.png'))
    plt.close(fig)

# 10. Save All Tables and Reports
# --------------------------------

# (table, file name, write the index?) — written in this order. The descriptive
# statistics keep their index because it holds the feature names; the other
# tables carry the names in a regular column.
for table, csv_name, keep_index in (
    (descriptive_stats, 'descriptive_statistics.csv', True),
    (variability_measures, 'feature_variability.csv', False),
    (lasso_features_df, 'lasso_selected_features.csv', False),
    (model_comparison, 'model_comparison_full.csv', False),
):
    table.to_csv(os.path.join(results_dir, csv_name), index=keep_index)

# 11. Summary of Results
# ------------------------

# Show the formatted comparison table
print("\nModel Comparison:")
print(model_comparison_display)

# 12. Conclusion
# ----------------

print("\nAll models have been trained and evaluated. Results and visualizations have been saved in the 'results' folder.")


Initial Data Overview:
        ID  V1  V2  V3  V4  V5    V6  V7  V8  V9   class
1  1000025   5   1   1   1   2   1.0   3   1   1  benign
2  1002945   5   4   4   5   7  10.0   3   2   1  benign
3  1015425   3   1   1   1   2   2.0   3   1   1  benign
4  1016277   6   8   8   1   3   4.0   3   7   1  benign
5  1017023   4   1   1   3   2   1.0   3   1   1  benign

Missing Values After Imputation:
V1       0
V2       0
V3       0
V4       0
V5       0
V6       0
V7       0
V8       0
V9       0
class    0
dtype: int64

Descriptive Statistics:
        mean  Median       std   Variance  Coefficient of Variation
V1  4.417740     4.0  2.815741   7.928395                  0.637371
V2  3.134478     1.0  3.051459   9.311403                  0.973514
V3  3.207439     1.0  2.971913   8.832265                  0.926569
V4  2.806867     1.0  2.855379   8.153191                  1.017283
V5  3.216023     2.0  2.214300   4.903124                  0.688521
V6  3.526567     1.0  3.618696  13.094962    