In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
import sklearn.model_selection
import sklearn.preprocessing
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    matthews_corrcoef, roc_auc_score, confusion_matrix, classification_report
)
import joblib
import warnings
warnings.filterwarnings('ignore')

In [82]:
# Load the dataset
df = pd.read_csv("data.csv")


In [25]:
# Check for missing values
print("Missing Values:")
print(df.isnull().sum())


Missing Values:
id                           0
diagnosis                    0
radius_mean                  0
texture_mean                 0
perimeter_mean               0
area_mean                    0
smoothness_mean              0
compactness_mean             0
concavity_mean               0
concave points_mean          0
symmetry_mean                0
fractal_dimension_mean       0
radius_se                    0
texture_se                   0
perimeter_se                 0
area_se                      0
smoothness_se                0
compactness_se               0
concavity_se                 0
concave points_se            0
symmetry_se                  0
fractal_dimension_se         0
radius_worst                 0
texture_worst                0
perimeter_worst              0
area_worst                   0
smoothness_worst             0
compactness_worst            0
concavity_worst              0
concave points_worst         0
symmetry_worst               0
fractal_dimension_worst

In [60]:
# Manual column-by-column approach
print("\n=== MANUAL COLUMN-BY-COLUMN IMPUTATION ===")

for col in df.columns:
    if df[col].dtype in ['int64', 'float64']:  # Numerical columns only
        if df[col].isnull().sum() > 0:
            mean_value = df[col].mean()
            missing_count = df[col].isnull().sum()
            df[col].fillna(mean_value, inplace=True)
           # df[col] = df[col].method(value)
            print(f" {col}: {missing_count} values replaced with mean {mean_value:.4f}")

print("Manual imputation completed!")


=== MANUAL COLUMN-BY-COLUMN IMPUTATION ===
 Unnamed: 32: 569 values replaced with mean nan
Manual imputation completed!


In [64]:
# Feature Scaling Implementation
# StandardScaler (Z-score normalization)

print("=== FEATURE SCALING IMPLEMENTATION ===")
# Step 1: CREATE the encoded column first
df['diagnosis_encoded'] = df['diagnosis'].map({'M': 1, 'B': 0})
# Step 1: Prepare features and target
print("STEP 1: Dataset Preparation")
X = df.drop(['diagnosis', 'diagnosis_encoded'], axis=1)
if 'id' in X.columns:
    X = X.drop('id', axis=1)
y = df['diagnosis_encoded']  # Encoded target (0/1)

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Feature columns: {list(X.columns[:5])}... (showing first 5)")

=== FEATURE SCALING IMPLEMENTATION ===
STEP 1: Dataset Preparation
Features shape: (569, 31)
Target shape: (569,)
Feature columns: ['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean']... (showing first 5)


In [65]:
# Step 2: Analyze feature scales BEFORE scaling
print("\nSTEP 2: Feature Statistics BEFORE Scaling")
print("Key statistics for first 4 features:")
stats_before = X.describe()
print(stats_before.iloc[:, :4].round(2))

print(f"\nFeature ranges (min to max):")
for col in X.columns[:4]:
    print(f"  {col}: {X[col].min():.2f} to {X[col].max():.2f}")


STEP 2: Feature Statistics BEFORE Scaling
Key statistics for first 4 features:
       radius_mean  texture_mean  perimeter_mean  area_mean
count       569.00        569.00          569.00     569.00
mean         14.13         19.29           91.97     654.89
std           3.52          4.30           24.30     351.91
min           6.98          9.71           43.79     143.50
25%          11.70         16.17           75.17     420.30
50%          13.37         18.84           86.24     551.10
75%          15.78         21.80          104.10     782.70
max          28.11         39.28          188.50    2501.00

Feature ranges (min to max):
  radius_mean: 6.98 to 28.11
  texture_mean: 9.71 to 39.28
  perimeter_mean: 43.79 to 188.50
  area_mean: 143.50 to 2501.00


In [73]:
# Step 3: Train-Test Split (BEFORE scaling to prevent data leakage)
print("\nSTEP 3: Train-Test Split (Before Scaling)")
train_test_split = sklearn.model_selection.train_test_split
StandardScaler = sklearn.preprocessing.StandardScaler
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2,
    random_state=42,
    stratify=y
)
print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")
print(f"Training target distribution: {np.bincount(y_train)}")
print(f"Test target distribution: {np.bincount(y_test)}")


STEP 3: Train-Test Split (Before Scaling)
Training set: 455 samples
Test set: 114 samples
Training target distribution: [285 170]
Test target distribution: [72 42]


In [85]:
X_train.isnull().sum()
X_train.nunique()

radius_mean                375
texture_mean               396
perimeter_mean             426
area_mean                  436
smoothness_mean            399
compactness_mean           438
concavity_mean             431
concave points_mean        434
symmetry_mean              360
fractal_dimension_mean     413
radius_se                  434
texture_se                 423
perimeter_se               430
area_se                    431
smoothness_se              443
compactness_se             435
concavity_se               427
concave points_se          414
symmetry_se                405
fractal_dimension_se       441
radius_worst               373
texture_worst              419
perimeter_worst            424
area_worst                 442
smoothness_worst           349
compactness_worst          430
concavity_worst            431
concave points_worst       404
symmetry_worst             411
fractal_dimension_worst    433
Unnamed: 32                  0
dtype: int64

In [87]:
X_train = X_train.fillna(X_train.median())
X_test = X_test.fillna(X_train.median())


In [91]:
# Identify zero-variance columns again
zero_var_cols = X_train.columns[X_train.nunique() <= 1]

print("Zero-variance columns:", list(zero_var_cols))

# Drop them from both train and test
X_train = X_train.drop(columns=zero_var_cols)
X_test = X_test.drop(columns=zero_var_cols)

Zero-variance columns: ['Unnamed: 32']


In [102]:
from sklearn.preprocessing import StandardScaler
# Step 4 Feature scaling
scaler_standard = StandardScaler()
scaler_standard.fit(X_train)

X_train_scaled = scaler_standard.transform(X_train)
X_test_scaled = scaler_standard.transform(X_test)
# Convert to data frames
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X_train.columns)
print(" StandardScaler applied successfully!")

 StandardScaler applied successfully!


In [103]:
# Step 6: Scaling Verification
print("\nSTEP 6: Scaling Verification")
print("Scaled training data statistics (should be ~0 mean, ~1 std):")
print("First 3 features:")
verification_stats = X_train_scaled_df.iloc[:, :3].describe()
print(verification_stats.round(4))


STEP 6: Scaling Verification
Scaled training data statistics (should be ~0 mean, ~1 std):
First 3 features:
       radius_mean  texture_mean  perimeter_mean
count     455.0000      455.0000        455.0000
mean       -0.0000        0.0000          0.0000
std         1.0011        1.0011          1.0011
min        -2.0097       -2.2650         -1.9614
25%        -0.6870       -0.7193         -0.6878
50%        -0.2311       -0.1208         -0.2445
75%         0.4948        0.5628          0.4975
max         3.9002        4.6343          3.8997


In [105]:
means_close_to_zero = np.allclose(
    X_train_scaled_df.mean(), 0, atol=1e-10
)

stds_close_to_one = np.allclose(
    X_train_scaled_df.std(ddof=0), 1, atol=1e-10
)

print(f"Means close to 0: {means_close_to_zero}")
print(f"Standard deviations close to 1: {stds_close_to_one}")

Means close to 0: True
Standard deviations close to 1: True


In [107]:
# Step 8: Save All Processed Data
print("\nSTEP 8: Saving Processed Data")

# Save StandardScaler results (recommended for most models)
X_train_scaled_df.to_csv('X_train_standardscaled.csv', index=False)
X_test_scaled_df.to_csv('X_test_standardscaled.csv', index=False)

# Save MinMaxScaler results ONLY if they exist
if 'X_train_minmax_df' in globals() and 'X_test_minmax_df' in globals():
    X_train_minmax_df.to_csv('X_train_minmaxscaled.csv', index=False)
    X_test_minmax_df.to_csv('X_test_minmaxscaled.csv', index=False)
    print("MinMaxScaler data saved")
else:
    print("MinMaxScaler data not found — skipping MinMax save")

# Save target variables
pd.Series(y_train).to_csv('y_train.csv', index=False, header=['target'])
pd.Series(y_test).to_csv('y_test.csv', index=False, header=['target'])

# Save original unscaled data (for tree-based models)
X_train.to_csv('X_train_original.csv', index=False)
X_test.to_csv('X_test_original.csv', index=False)

print("All available datasets saved successfully!")



STEP 8: Saving Processed Data
MinMaxScaler data not found — skipping MinMax save
All available datasets saved successfully!


In [109]:
# Save original unscaled data (for tree-based models)
X_train.to_csv('X_train_original.csv', index=False)
X_test.to_csv('X_test_original.csv', index=False)

print(" All files saved:")
print("  StandardScaler:")
print("    - X_train_standardscaled.csv")
print("    - X_test_standardscaled.csv")
print("  MinMaxScaler:")
print("    - X_train_minmaxscaled.csv") 
print("    - X_test_minmaxscaled.csv")
print("  Target variables:")
print("    - y_train.csv")
print("    - y_test.csv")
print("  Original (unscaled):")
print("    - X_train_original.csv")
print("    - X_test_original.csv")

 All files saved:
  StandardScaler:
    - X_train_standardscaled.csv
    - X_test_standardscaled.csv
  MinMaxScaler:
    - X_train_minmaxscaled.csv
    - X_test_minmaxscaled.csv
  Target variables:
    - y_train.csv
    - y_test.csv
  Original (unscaled):
    - X_train_original.csv
    - X_test_original.csv


In [112]:
# Step 9: Assignment Requirements Verification
print("\nSTEP 9: Assignment Requirements Verification")
print(f" Features: {X.shape[1]} (requirement: ≥12)")
print(f" Total instances: {X.shape[0]} (requirement: ≥500)")
print(f" Training instances: {X_train.shape[0]}")
print(f" Test instances: {X_test.shape[0]}")
print(f" Target encoded: Binary (0,1)")
print(f" Features scaled: Both StandardScaler and MinMaxScaler")
print(f" Data leakage prevented: Scalers fit on training data only")

print("\n FEATURE SCALING COMPLETED SUCCESSFULLY!")
print("\n=== READY FOR MODEL IMPLEMENTATION ===")


STEP 9: Assignment Requirements Verification
 Features: 31 (requirement: ≥12)
 Total instances: 569 (requirement: ≥500)
 Training instances: 455
 Test instances: 114
 Target encoded: Binary (0,1)
 Features scaled: Both StandardScaler and MinMaxScaler
 Data leakage prevented: Scalers fit on training data only

 FEATURE SCALING COMPLETED SUCCESSFULLY!

=== READY FOR MODEL IMPLEMENTATION ===


In [3]:
INPUT_CSV = "data.csv"            
DROP_MISSING_ROWS = False         # True: drop rows with any NaNs; False: mean imputation
LOGREG_SOLVER = "liblinear"       # "liblinear" or "lbfgs"
LOGREG_MAX_ITER = 1000
USE_CLASS_WEIGHT_BALANCED = True  # helps if classes slightly imbalanced
TEST_SIZE = 0.2
RANDOM_STATE = 42

In [12]:
# 1) Load features
X_train = pd.read_csv('X_train_standardscaled.csv')
X_test  = pd.read_csv('X_test_standardscaled.csv')

In [13]:
# 2) Load targets (handles both cases: with header 'target' or no header)
def load_target(path):
    df = pd.read_csv(path)
    if 'target' in df.columns:
        return df['target'].astype(int)
    else:
        # fall back to the first column
        return df.iloc[:, 0].astype(int)

y_train = load_target('y_train.csv')
y_test  = load_target('y_test.csv')


In [14]:
# Sanity checks
print("Shapes:")
print(f"  X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"  X_test:  {X_test.shape},  y_test:  {y_test.shape}")


Shapes:
  X_train: (455, 30), y_train: (455,)
  X_test:  (114, 30),  y_test:  (114,)


In [15]:
# Ensure no NaNs slipped through
if X_train.isnull().sum().sum() > 0 or X_test.isnull().sum().sum() > 0:
    raise ValueError("NaNs found in X_train/X_test. Please re-run preprocessing.")
if y_train.isnull().sum() > 0 or y_test.isnull().sum() > 0:
    raise ValueError("NaNs found in y_train/y_test. Please re-run preprocessing.")


In [16]:
# 3) Train Logistic Regression
log_reg = LogisticRegression(
    solver='lbfgs',       # or 'liblinear'; lbfgs works well with scaled numeric features
    max_iter=1000,
    class_weight='balanced',  # helps if classes are slightly imbalanced
    n_jobs=-1
)
log_reg.fit(X_train, y_train)

In [17]:
# 4) Predictions & probabilities
y_pred = log_reg.predict(X_test)
y_proba = log_reg.predict_proba(X_test)[:, 1]


In [18]:
# 5) Metrics
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
mcc = matthews_corrcoef(y_test, y_pred)
auc = roc_auc_score(y_test, y_proba)

print("\n=== Logistic Regression Performance (StandardScaled features) ===")
print(f"Accuracy:  {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall:    {rec:.4f}")
print(f"F1 Score:  {f1:.4f}")
print(f"MCC:       {mcc:.4f}")
print(f"AUC:       {auc:.4f}")



=== Logistic Regression Performance (StandardScaled features) ===
Accuracy:  0.9737
Precision: 0.9756
Recall:    0.9524
F1 Score:  0.9639
MCC:       0.9433
AUC:       0.9954


In [19]:
# 6) Confusion Matrix & Classification Report
cm = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(cm)

print("\nClassification Report:")
print(classification_report(y_test, y_pred, digits=4))

# 7) Save model
joblib.dump(log_reg, 'logreg_model.pkl')
print("\nSaved: logreg_model.pkl")



Confusion Matrix:
[[71  1]
 [ 2 40]]

Classification Report:
              precision    recall  f1-score   support

           0     0.9726    0.9861    0.9793        72
           1     0.9756    0.9524    0.9639        42

    accuracy                         0.9737       114
   macro avg     0.9741    0.9692    0.9716       114
weighted avg     0.9737    0.9737    0.9736       114


Saved: logreg_model.pkl
