<a href="https://colab.research.google.com/github/freak-dev/Machine-Learning-Lab-Exp./blob/main/exp2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [3]:
from ucimlrepo import fetch_ucirepo

# Download UCI repository dataset #17 (network access required)
breast_cancer_wisconsin_diagnostic = fetch_ucirepo(id=17)

# Pull out the feature table and the target table in one step
X, y = (
    breast_cancer_wisconsin_diagnostic.data.features,
    breast_cancer_wisconsin_diagnostic.data.targets,
)

# Show the dataset-level metadata record
print(breast_cancer_wisconsin_diagnostic.metadata)

# Show the per-variable description table
print(breast_cancer_wisconsin_diagnostic.variables)


{'uci_id': 17, 'name': 'Breast Cancer Wisconsin (Diagnostic)', 'repository_url': 'https://archive.ics.uci.edu/dataset/17/breast+cancer+wisconsin+diagnostic', 'data_url': 'https://archive.ics.uci.edu/static/public/17/data.csv', 'abstract': 'Diagnostic Wisconsin Breast Cancer Database.', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 569, 'num_features': 30, 'feature_types': ['Real'], 'demographics': [], 'target_col': ['Diagnosis'], 'index_col': ['ID'], 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1993, 'last_updated': 'Fri Nov 03 2023', 'dataset_doi': '10.24432/C5DW2B', 'creators': ['William Wolberg', 'Olvi Mangasarian', 'Nick Street', 'W. Street'], 'intro_paper': {'ID': 230, 'type': 'NATIVE', 'title': 'Nuclear feature extraction for breast tumor diagnosis', 'authors': 'W. Street, W. Wolberg, O. Mangasarian', 'venue': 'Electronic imaging', 'year': 1993, 'journal': None, 'DOI': '1

In [4]:
# 1. Install and Import the Library
!pip install ucimlrepo -q
from ucimlrepo import fetch_ucirepo
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

def run_assignment_pipeline(model_type='logistic', dataset=None):
    """Train one classifier on UCI dataset 17 and print its evaluation report.

    The pipeline encodes the first target column ('M' -> 1, 'B' -> 0), makes an
    80/20 train/test split (random_state=42), standardizes the features with a
    scaler fitted on the training split only, fits the selected model, and
    prints train/test error, the generalization gap, accuracy, precision,
    recall, F1 and a labelled confusion matrix for the test split.

    Args:
        model_type: 'logistic' selects LogisticRegression; any other value
            selects DecisionTreeClassifier(random_state=42).
        dataset: Optional already-fetched dataset object exposing
            `.data.features` and `.data.targets` (e.g. the result of an earlier
            `fetch_ucirepo(id=17)` call). When None, the dataset is downloaded.

    Returns:
        A tuple `(name, train_err, test_err)`: the display name of the model
        and its error rates (1 - accuracy) on the train and test splits.

    Raises:
        ValueError: if the target column contains a label other than 'M'/'B'.
    """
    # 2. Fetch dataset (skipped when the caller supplies one, which avoids a
    #    second download when several models are compared)
    if dataset is None:
        print("Fetching dataset from UCI...")
        dataset = fetch_ucirepo(id=17)

    # 3. Data (as pandas dataframes)
    X = dataset.data.features
    raw_target = dataset.data.targets.iloc[:, 0]

    # Encode the diagnosis labels: 'M' -> 1, 'B' -> 0.
    y = raw_target.map({'M': 1, 'B': 0})

    # Series.map turns every label missing from the mapping into NaN; fail
    # here with a clear message instead of deep inside model fitting.
    unmapped = raw_target[y.isna()]
    if not unmapped.empty:
        raise ValueError(
            "Unexpected target labels (expected 'M' or 'B'): "
            f"{unmapped.unique().tolist()}"
        )

    # 4. Split into train/test (80/20)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # 5. Standardize features; the scaler is fitted on the training split only
    #    so no statistics from the test split leak into training.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # 6. Model Training
    if model_type == 'logistic':
        model = LogisticRegression()
        name = "Logistic Regression (Baseline)"
    else:
        # Every non-'logistic' value falls back to the decision tree.
        model = DecisionTreeClassifier(random_state=42)
        name = "Decision Tree (Non-Linear)"

    model.fit(X_train_scaled, y_train)

    # 7. Predictions
    train_preds = model.predict(X_train_scaled)
    test_preds = model.predict(X_test_scaled)

    # 8. Evaluation Metrics (error = 1 - accuracy)
    test_acc = accuracy_score(y_test, test_preds)
    train_err = 1 - accuracy_score(y_train, train_preds)
    test_err = 1 - test_acc

    print(f"\n--- {name} Results ---")
    print(f"Train Error: {train_err:.4f}")
    print(f"Test Error:  {test_err:.4f}")
    print(f"Generalization Gap: {abs(test_err - train_err):.4f}")
    print("-" * 30)
    print(f"Accuracy:  {test_acc:.4f}")
    print(f"Precision: {precision_score(y_test, test_preds):.4f}")
    print(f"Recall:    {recall_score(y_test, test_preds):.4f}")
    print(f"F1-score:  {f1_score(y_test, test_preds):.4f}")

    # 9. Confusion Matrix
    print("\nConfusion Matrix:")
    # Pin the label order so the matrix is always 2x2 and lines up with the
    # hard-coded row/column names below, even if a split lacks one class.
    cm = confusion_matrix(y_test, test_preds, labels=[0, 1])
    cm_df = pd.DataFrame(cm, index=['Actual Benign', 'Actual Malignant'],
                         columns=['Predicted Benign', 'Predicted Malignant'])
    print(cm_df)

    return name, train_err, test_err

# Evaluate the linear baseline first, then the decision tree; each call
# returns a (name, train_err, test_err) summary tuple.
log_results = run_assignment_pipeline('logistic')
dt_results = run_assignment_pipeline('tree')

Fetching dataset from UCI...

--- Logistic Regression (Baseline) Results ---
Train Error: 0.0132
Test Error:  0.0263
Generalization Gap: 0.0131
------------------------------
Accuracy:  0.9737
Precision: 0.9762
Recall:    0.9535
F1-score:  0.9647

Confusion Matrix:
                  Predicted Benign  Predicted Malignant
Actual Benign                   70                    1
Actual Malignant                 2                   41
Fetching dataset from UCI...

--- Decision Tree (Non-Linear) Results ---
Train Error: 0.0000
Test Error:  0.0526
Generalization Gap: 0.0526
------------------------------
Accuracy:  0.9474
Precision: 0.9302
Recall:    0.9302
F1-score:  0.9302

Confusion Matrix:
                  Predicted Benign  Predicted Malignant
Actual Benign                   68                    3
Actual Malignant                 3                   40
