**Downloading the Dataset from Kaggle**

In [2]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("uciml/breast-cancer-wisconsin-data")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/breast-cancer-wisconsin-data


**Loading the Dataframe**

In [3]:
import pandas as pd

# Load the dataset
data = pd.read_csv('/kaggle/input/breast-cancer-wisconsin-data/data.csv')

# Inspect the data
print(data.head())
print(data.info())
print(data.describe())
print(data['diagnosis'].value_counts())  # Check class distribution

         id diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0    842302         M        17.99         10.38          122.80     1001.0   
1    842517         M        20.57         17.77          132.90     1326.0   
2  84300903         M        19.69         21.25          130.00     1203.0   
3  84348301         M        11.42         20.38           77.58      386.1   
4  84358402         M        20.29         14.34          135.10     1297.0   

   smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0          0.11840           0.27760          0.3001              0.14710   
1          0.08474           0.07864          0.0869              0.07017   
2          0.10960           0.15990          0.1974              0.12790   
3          0.14250           0.28390          0.2414              0.10520   
4          0.10030           0.13280          0.1980              0.10430   

   ...  texture_worst  perimeter_worst  area_worst  smoothness

**Dropping the unwanted columns**

In [4]:
data = data.drop(['Unnamed: 32', 'id'], axis=1)

**Encoding Target Variable**

In [5]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
data['diagnosis'] = le.fit_transform(data['diagnosis'])

**Drop Collinear Features**

In [6]:
import numpy as np

corr_matrix = data.drop('diagnosis', axis=1).corr().abs()

upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.9)]

data = data.drop(to_drop, axis=1)
print(f"Dropped features due to high correlation: {to_drop}")

Dropped features due to high correlation: ['perimeter_mean', 'area_mean', 'concave points_mean', 'perimeter_se', 'area_se', 'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst', 'concave points_worst']


**Remove outliers**

In [7]:
import numpy as np

def remove_outliers(df, columns):
    for col in columns:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]
    return df

numerical_cols = data.drop('diagnosis', axis=1).columns
data = remove_outliers(data, numerical_cols)

**Normalize numerical columns**

In [8]:
from sklearn.preprocessing import StandardScaler

X = data.drop('diagnosis', axis=1)
y = data['diagnosis']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X = pd.DataFrame(X_scaled, columns=X.columns)

**Data Split**

In [9]:
from sklearn.model_selection import train_test_split

# First split: 70% train, 30% temp (for validation + test)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

# Second split: 50% of temp for validation, 50% for test (15% each of total)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

print(f"Train shape: {X_train.shape}, Validation shape: {X_val.shape}, Test shape: {X_test.shape}")

Train shape: (241, 20), Validation shape: (52, 20), Test shape: (52, 20)


**Apply SMOTE to training data**

In [10]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

print(f"Class distribution after SMOTE: {pd.Series(y_train_smote).value_counts()}")

Class distribution after SMOTE: diagnosis
1    183
0    183
Name: count, dtype: int64


**Random Forest Classifier**

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, roc_auc_score

rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

rf = RandomForestClassifier(random_state=42)

rf_grid = GridSearchCV(rf, rf_param_grid, cv=5, scoring='f1', n_jobs=-1)
rf_grid.fit(X_train_smote, y_train_smote)

rf_best = rf_grid.best_estimator_
print(f"Best Random Forest Parameters: {rf_grid.best_params_}")

Best Random Forest Parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}


**XGBoost Classifier**

In [13]:
from xgboost import XGBClassifier

xgb_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.3]
}

xgb = XGBClassifier(random_state=42, eval_metric='logloss')

xgb_grid = GridSearchCV(xgb, xgb_param_grid, cv=5, scoring='f1', n_jobs=-1)
xgb_grid.fit(X_train_smote, y_train_smote)

xgb_best = xgb_grid.best_estimator_
print(f"Best XGBoost Parameters: {xgb_grid.best_params_}")

Best XGBoost Parameters: {'learning_rate': 0.3, 'max_depth': 3, 'n_estimators': 100}


**Performance Evaluation**

In [14]:
from sklearn.metrics import classification_report, roc_auc_score

def evaluate_model(model, X_val, y_val, X_test, y_test):
    y_val_pred = model.predict(X_val)
    val_f1 = f1_score(y_val, y_val_pred)
    val_roc_auc = roc_auc_score(y_val, model.predict_proba(X_val)[:, 1])

    y_test_pred = model.predict(X_test)
    test_f1 = f1_score(y_test, y_test_pred)
    test_roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])

    print(f"Validation F1 Score: {val_f1:.4f}, Validation AUC-ROC: {val_roc_auc:.4f}")
    print(f"Test F1 Score: {test_f1:.4f}, Test AUC-ROC: {test_roc_auc:.4f}")
    print("\nClassification Report (Test Set):")
    print(classification_report(y_test, y_test_pred))

print("Random Forest Evaluation:")
evaluate_model(rf_best, X_val, y_val, X_test, y_test)

print("\nXGBoost Evaluation:")
evaluate_model(xgb_best, X_val, y_val, X_test, y_test)

Random Forest Evaluation:
Validation F1 Score: 0.7692, Validation AUC-ROC: 0.9586
Test F1 Score: 0.7500, Test AUC-ROC: 0.9240

Classification Report (Test Set):
              precision    recall  f1-score   support

           0       0.93      0.93      0.93        40
           1       0.75      0.75      0.75        12

    accuracy                           0.88        52
   macro avg       0.84      0.84      0.84        52
weighted avg       0.88      0.88      0.88        52


XGBoost Evaluation:
Validation F1 Score: 0.8571, Validation AUC-ROC: 0.9665
Test F1 Score: 0.7407, Test AUC-ROC: 0.9604

Classification Report (Test Set):
              precision    recall  f1-score   support

           0       0.95      0.88      0.91        40
           1       0.67      0.83      0.74        12

    accuracy                           0.87        52
   macro avg       0.81      0.85      0.82        52
weighted avg       0.88      0.87      0.87        52



Using XGBoost as the primary model due to its higher recall for the malignant class and better AUC-ROC, which are critical for breast cancer diagnosis.

**Threshold Tuning**

In [17]:
from sklearn.metrics import precision_recall_curve
import numpy as np

y_probs = xgb_best.predict_proba(X_test)[:, 1]

# Compute precision-recall curve
precision, recall, thresholds = precision_recall_curve(y_test, y_probs)

# Find threshold that balances precision and recall
f1_scores = 2 * (precision * recall) / (precision + recall)
optimal_threshold = thresholds[np.argmax(f1_scores)]
print(f"Optimal threshold: {optimal_threshold}")

y_pred_custom = (y_probs >= optimal_threshold).astype(int)
print(classification_report(y_test, y_pred_custom))

Optimal threshold: 0.8887889385223389
              precision    recall  f1-score   support

           0       0.95      0.95      0.95        40
           1       0.83      0.83      0.83        12

    accuracy                           0.92        52
   macro avg       0.89      0.89      0.89        52
weighted avg       0.92      0.92      0.92        52

