In [35]:
pip install ucimlrepo

Note: you may need to restart the kernel to use updated packages.


In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from ucimlrepo import fetch_ucirepo

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [37]:
# fetch dataset 
breast_cancer_wisconsin_diagnostic = fetch_ucirepo(id=17) 
  
# data (as pandas dataframes) 
X = breast_cancer_wisconsin_diagnostic.data.features 
y = breast_cancer_wisconsin_diagnostic.data.targets 
  
# metadata 
print(breast_cancer_wisconsin_diagnostic.metadata) 
  
# variable information 
print(breast_cancer_wisconsin_diagnostic.variables) 

{'uci_id': 17, 'name': 'Breast Cancer Wisconsin (Diagnostic)', 'repository_url': 'https://archive.ics.uci.edu/dataset/17/breast+cancer+wisconsin+diagnostic', 'data_url': 'https://archive.ics.uci.edu/static/public/17/data.csv', 'abstract': 'Diagnostic Wisconsin Breast Cancer Database.', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 569, 'num_features': 30, 'feature_types': ['Real'], 'demographics': [], 'target_col': ['Diagnosis'], 'index_col': ['ID'], 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1993, 'last_updated': 'Fri Nov 03 2023', 'dataset_doi': '10.24432/C5DW2B', 'creators': ['William Wolberg', 'Olvi Mangasarian', 'Nick Street', 'W. Street'], 'intro_paper': {'ID': 230, 'type': 'NATIVE', 'title': 'Nuclear feature extraction for breast tumor diagnosis', 'authors': 'W. Street, W. Wolberg, O. Mangasarian', 'venue': 'Electronic imaging', 'year': 1993, 'journal': None, 'DOI': '1

## Preprocessing the Data

In [44]:
if isinstance(y, pd.DataFrame):
    # If y is a DataFrame
    if y.iloc[:, 0].dtype == 'object':
        y = LabelEncoder().fit_transform(y.iloc[:, 0])
else:
    # If y is a Series
    if y.dtype == 'object':
        y = LabelEncoder().fit_transform(y)

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

## Define Splits and Initialize Results

In [45]:
splits = [(0.2, 0.8), (0.5, 0.5), (0.8, 0.2)]
trials = 3  # Number of trials
results = {}

for train_size, test_size in splits:
    split_results = {clf_name: [] for clf_name in ["Random Forest", "Gradient Boosting", "Logistic Regression"]}

    # Perform multiple trials
    for trial in range(trials):
        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X_scaled, y, train_size=train_size, test_size=test_size, random_state=42 + trial
        )

        # Define classifiers
        classifiers = {
            "Random Forest": RandomForestClassifier(n_estimators=100),
            "Gradient Boosting": GradientBoostingClassifier(),
            "Logistic Regression": LogisticRegression(max_iter=1000, solver='saga')
        }

        # Train and evaluate classifiers
        for clf_name, clf in classifiers.items():
            # Cross-validation for hyperparameter tuning
            cv_scores = cross_val_score(clf, X_train, y_train, cv=5)

            # Train the classifier
            clf.fit(X_train, y_train)

            # Evaluate on the test set
            y_pred = clf.predict(X_test)
            test_accuracy = accuracy_score(y_test, y_pred)

            # Store results for this trial
            split_results[clf_name].append({
                "CV Accuracy (Mean)": cv_scores.mean(),
                "CV Accuracy (Std)": cv_scores.std(),
                "Test Accuracy": test_accuracy
            })

    # Average results across trials
    results[f"Train {int(train_size*100)}% / Test {int(test_size*100)}%"] = {
        clf_name: {
            metric: np.mean([trial_result[metric] for trial_result in metrics])
            for metric in metrics[0]
        }
        for clf_name, metrics in split_results.items()
    }

## Display Results

In [46]:
for split, split_results in results.items():
    print(f"Results for {split} split:")
    for clf_name, metrics in split_results.items():
        print(f"  {clf_name}:")
        for metric, value in metrics.items():
            print(f"    {metric}: {value:.4f}")
    print("\n")

Results for Train 20% / Test 80% split:
  Random Forest:
    CV Accuracy (Mean): 0.9200
    CV Accuracy (Std): 0.0487
    Test Accuracy: 0.9620
  Gradient Boosting:
    CV Accuracy (Mean): 0.9112
    CV Accuracy (Std): 0.0402
    Test Accuracy: 0.9152
  Logistic Regression:
    CV Accuracy (Mean): 0.9704
    CV Accuracy (Std): 0.0243
    Test Accuracy: 0.9649


Results for Train 50% / Test 50% split:
  Random Forest:
    CV Accuracy (Mean): 0.9495
    CV Accuracy (Std): 0.0284
    Test Accuracy: 0.9637
  Gradient Boosting:
    CV Accuracy (Mean): 0.9505
    CV Accuracy (Std): 0.0292
    Test Accuracy: 0.9602
  Logistic Regression:
    CV Accuracy (Mean): 0.9777
    CV Accuracy (Std): 0.0167
    Test Accuracy: 0.9743


Results for Train 80% / Test 20% split:
  Random Forest:
    CV Accuracy (Mean): 0.9531
    CV Accuracy (Std): 0.0184
    Test Accuracy: 0.9737
  Gradient Boosting:
    CV Accuracy (Mean): 0.9531
    CV Accuracy (Std): 0.0166
    Test Accuracy: 0.9708
  Logistic Regressio