In [None]:
!pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [None]:
"""
CSCI218 Group Project: Dry Bean Dataset Classification (3 models, fixed CV)
==========================================================================
Fixes vs typical template:
- Uses sklearn Pipeline for scaling (prevents CV leakage)
- Trains exactly 3 models: Logistic Regression, SVM-RBF, Random Forest
- Uses macro-F1 for fair multi-class evaluation
- Selects best model by CV score (not by test)
"""

import os
import warnings
import numpy as np
import pandas as pd

import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline

from sklearn.metrics import (
    accuracy_score, f1_score, precision_score, recall_score,
    classification_report, confusion_matrix
)

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

warnings.filterwarnings("ignore")

# ============================================================
# Configuration
# ============================================================
# Project directory: the folder of this script when it runs as a file,
# otherwise (no __file__, e.g. in a notebook) the current working directory.
if "__file__" in globals():
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))
else:
    BASE_DIR = os.getcwd()

# Every generated figure, report and table is written below this folder.
OUTPUT_DIR = os.path.join(BASE_DIR, "output")
os.makedirs(OUTPUT_DIR, exist_ok=True)

RANDOM_STATE = 42  # seed shared by the split, the CV folds and the models
TEST_SIZE = 0.2    # fraction of the samples held out as the test set
N_SPLITS = 5       # number of stratified cross-validation folds

# ============================================================
# 1. Load Dataset
# ============================================================
print("=" * 60)
print("CSCI218 Group Project: Dry Bean Classification (3 models)")
print("=" * 60)

X, y = None, None

# Primary source: the UCI ML Repository through the ucimlrepo package.
# Fallback: a file named Dry_Bean_Dataset.csv located in BASE_DIR.
try:
    from ucimlrepo import fetch_ucirepo
    print("\n[1] Loading Dry Bean dataset from UCI ML Repository (ucimlrepo)...")
    dataset = fetch_ucirepo(id=602)
    X = dataset.data.features
    y = dataset.data.targets.values.ravel()  # flatten the targets to 1-D
    print("    Loaded successfully via ucimlrepo.")
except Exception as exc:
    # Deliberately broad: a missing package, a network problem or a repository
    # error should all fall back to the local copy. The cause is reported here
    # and chained below instead of being silently discarded.
    print(f"    ucimlrepo load failed ({type(exc).__name__}: {exc}); trying local CSV.")
    local_path = os.path.join(BASE_DIR, "Dry_Bean_Dataset.csv")
    if not os.path.exists(local_path):
        raise FileNotFoundError(
            "Could not load dataset via ucimlrepo and local CSV not found.\n"
            "Place Dry_Bean_Dataset.csv next to this script, or install ucimlrepo."
        ) from exc
    print(f"\n[1] Loading Dry Bean dataset from local CSV: {local_path}")
    df = pd.read_csv(local_path)
    # The last column is taken as the class label, all others as features.
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1].values

print(f"    Samples: {X.shape[0]}, Features: {X.shape[1]}")
print(f"    Classes: {np.unique(y)}")

# ============================================================
# 2. Basic checks + optional simple plot
# ============================================================
print("\n[2] Basic dataset checks...")

# Count the missing cells across the whole feature table.
features_df = pd.DataFrame(X)
missing = int(features_df.isnull().sum().sum())
print(f"    Missing values: {missing}")
if missing > 0:
    # Fill every gap with the median of its own column.
    X = features_df.fillna(features_df.median())
    print("    Filled missing values with median.")

# Bar chart of the number of samples per class, ordered by class label.
class_counts = pd.Series(y).value_counts().sort_index()
fig, ax = plt.subplots(figsize=(9, 5))
ax.bar(class_counts.index.astype(str), class_counts.values, edgecolor="black")
ax.set(title="Class Distribution (Dry Bean)", xlabel="Class", ylabel="Count")
plt.xticks(rotation=25, ha="right")
fig.tight_layout()
distribution_path = os.path.join(OUTPUT_DIR, "class_distribution.png")
fig.savefig(distribution_path, dpi=150)
plt.close(fig)
print("    Saved: class_distribution.png")

# ============================================================
# 3. Encode labels + Train/Test split
# ============================================================
print("\n[3] Preprocessing...")

# Map the class labels to integer codes; le.classes_ holds the original
# label for every code, in code order.
le = LabelEncoder()
y_encoded = le.fit_transform(y)
class_names = le.classes_
label_to_code = {label: code for code, label in enumerate(class_names)}
print(f"    Encoded classes: {label_to_code}")

# Hold out a test set, stratified on the encoded labels.
split_options = {
    "test_size": TEST_SIZE,
    "random_state": RANDOM_STATE,
    "stratify": y_encoded,
}
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, **split_options)
print(f"    Train: {X_train.shape[0]} samples")
print(f"    Test:  {X_test.shape[0]} samples")

# ============================================================
# 4. Models (3 only) + Cross-validation (NO leakage)
# ============================================================
print("\n[4] Training + Cross-validation (no leakage via Pipeline)...")

cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)

# Logistic regression and the SVM each carry their own StandardScaler inside
# a Pipeline, so scaling is fitted together with the model on whatever data
# the pipeline is fitted on. The random forest is used without a scaler.
logreg_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(max_iter=3000, random_state=RANDOM_STATE)),
])
svm_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", SVC(kernel="rbf", C=10, gamma="scale", random_state=RANDOM_STATE)),
])
forest = RandomForestClassifier(n_estimators=300, random_state=RANDOM_STATE, n_jobs=-1)

models = {
    "Logistic Regression": logreg_pipeline,
    "SVM (RBF)": svm_pipeline,
    "Random Forest": forest,
}

results = {}

for model_name, estimator in models.items():
    print(f"\n  Model: {model_name}")

    # Macro-F1 per CV fold, computed on the training split only.
    fold_scores = cross_val_score(estimator, X_train, y_train, cv=cv, scoring="f1_macro")
    cv_mean = float(fold_scores.mean())
    cv_std = float(fold_scores.std())

    # Refit on the complete training split, then predict the test split.
    estimator.fit(X_train, y_train)
    test_pred = estimator.predict(X_test)

    test_acc = float(accuracy_score(y_test, test_pred))
    test_f1_macro = float(f1_score(y_test, test_pred, average="macro"))
    test_f1_weighted = float(f1_score(y_test, test_pred, average="weighted"))
    test_prec_macro = float(
        precision_score(y_test, test_pred, average="macro", zero_division=0)
    )
    test_rec_macro = float(
        recall_score(y_test, test_pred, average="macro", zero_division=0)
    )

    # Keep the metrics, the predictions and the fitted model for later steps.
    results[model_name] = {
        "cv_f1_macro_mean": cv_mean,
        "cv_f1_macro_std": cv_std,
        "test_accuracy": test_acc,
        "test_f1_macro": test_f1_macro,
        "test_f1_weighted": test_f1_weighted,
        "test_precision_macro": test_prec_macro,
        "test_recall_macro": test_rec_macro,
        "y_pred": test_pred,
        "model": estimator,
    }

    print(f"    CV macro-F1: {cv_mean:.4f} (+/- {cv_std:.4f})")
    print(f"    Test Acc:   {test_acc:.4f}")
    print(f"    Test F1 (macro):    {test_f1_macro:.4f}")
    print(f"    Test F1 (weighted): {test_f1_weighted:.4f}")

# The winner is picked by its mean CV macro-F1, not by its test score.
best_model_name, best = max(
    results.items(), key=lambda item: item[1]["cv_f1_macro_mean"]
)
print("\n" + "=" * 60)
print(f"BEST MODEL (by CV macro-F1): {best_model_name}")
print(f"CV macro-F1: {best['cv_f1_macro_mean']:.4f}")
print(f"Test Acc:    {best['test_accuracy']:.4f}")
print(f"Test macro-F1: {best['test_f1_macro']:.4f}")
print("=" * 60)

# ============================================================
# 5. Save confusion matrix + classification report (best model)
# ============================================================
print("\n[5] Saving best model evaluation outputs...")

# Pin the label order so the matrix rows/columns and the report rows always
# line up with class_names, even if a class were absent from y_test or from
# the predictions.
label_ids = np.arange(len(class_names))
cm = confusion_matrix(y_test, best["y_pred"], labels=label_ids)

fig, ax = plt.subplots(figsize=(8, 6))
ax.imshow(cm)
ax.set_title(f"Confusion Matrix - {best_model_name}")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_xticks(range(len(class_names)))
ax.set_yticks(range(len(class_names)))
ax.set_xticklabels(class_names, rotation=30, ha="right")
ax.set_yticklabels(class_names)

# Annotate every cell with its count. The text colour switches at half of the
# largest count so the numbers stay readable on low-count and high-count cells
# (assumes matplotlib's default colormap, where low values are dark).
threshold = cm.max() / 2.0
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        text_color = "white" if cm[i, j] < threshold else "black"
        ax.text(j, i, str(cm[i, j]), ha="center", va="center",
                fontsize=8, color=text_color)

plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, "confusion_matrix_best.png"), dpi=150)
plt.close()
print("    Saved: confusion_matrix_best.png")

report = classification_report(
    y_test, best["y_pred"],
    labels=label_ids, target_names=class_names, zero_division=0,
)
# Explicit encoding so the report file does not depend on the platform default.
report_path = os.path.join(OUTPUT_DIR, "classification_report_best.txt")
with open(report_path, "w", encoding="utf-8") as f:
    f.write(f"Best Model (by CV macro-F1): {best_model_name}\n")
    f.write(f"CV macro-F1 mean: {best['cv_f1_macro_mean']:.4f} (std {best['cv_f1_macro_std']:.4f})\n")
    f.write(f"Test Accuracy: {best['test_accuracy']:.4f}\n")
    f.write(f"Test macro-F1: {best['test_f1_macro']:.4f}\n")
    f.write(f"Test weighted-F1: {best['test_f1_weighted']:.4f}\n\n")
    f.write(report)
print("    Saved: classification_report_best.txt")

# ============================================================
# 6. Save results table CSV
# ============================================================
print("\n[6] Saving results table...")

# CSV column name -> key inside each model's results entry, in output order.
metric_columns = {
    "CV_MacroF1_Mean": "cv_f1_macro_mean",
    "CV_MacroF1_Std": "cv_f1_macro_std",
    "Test_Accuracy": "test_accuracy",
    "Test_Precision_Macro": "test_precision_macro",
    "Test_Recall_Macro": "test_recall_macro",
    "Test_F1_Macro": "test_f1_macro",
    "Test_F1_Weighted": "test_f1_weighted",
}

# One row per model: its name followed by the metrics listed above.
summary_rows = [
    {"Model": label, **{column: entry[key] for column, key in metric_columns.items()}}
    for label, entry in results.items()
]
summary_df = pd.DataFrame(summary_rows)

summary_csv_path = os.path.join(OUTPUT_DIR, "results_summary_3models.csv")
summary_df.to_csv(summary_csv_path, index=False)
print("    Saved: results_summary_3models.csv")

print(f"\nAll outputs saved to: {OUTPUT_DIR}")
print("Done!")


CSCI218 Group Project: Dry Bean Classification (3 models)

[1] Loading Dry Bean dataset from UCI ML Repository (ucimlrepo)...
    Loaded successfully via ucimlrepo.
    Samples: 13611, Features: 16
    Classes: ['BARBUNYA' 'BOMBAY' 'CALI' 'DERMASON' 'HOROZ' 'SEKER' 'SIRA']

[2] Basic dataset checks...
    Missing values: 0
    Saved: class_distribution.png

[3] Preprocessing...
    Encoded classes: {'BARBUNYA': 0, 'BOMBAY': 1, 'CALI': 2, 'DERMASON': 3, 'HOROZ': 4, 'SEKER': 5, 'SIRA': 6}
    Train: 10888 samples
    Test:  2723 samples

[4] Training + Cross-validation (no leakage via Pipeline)...

  Model: Logistic Regression
    CV macro-F1: 0.9353 (+/- 0.0046)
    Test Acc:   0.9207
    Test F1 (macro):    0.9329
    Test F1 (weighted): 0.9209

  Model: SVM (RBF)
    CV macro-F1: 0.9443 (+/- 0.0057)
    Test Acc:   0.9243
    Test F1 (macro):    0.9361
    Test F1 (weighted): 0.9243

  Model: Random Forest
    CV macro-F1: 0.9355 (+/- 0.0052)
    Test Acc:   0.9210
    Test F1 (macro)

In [None]:

# 2b: boxplots of selected features by class

print("\n[2b] Creating feature boxplot...")

# Work on an explicit copy: depending on the pandas version, pd.DataFrame(X)
# may share its data with X, and the 'Class' column added below must never
# end up in the feature matrix itself.
df_X = pd.DataFrame(X).copy()
feature_columns = list(df_X.columns)  # captured before 'Class' is appended
df_X['Class'] = y

MAX_PANELS = 8  # the figure below is a fixed 2 x 4 grid

# A few key features, to avoid overcrowding the figure.
selected_features = ['Area', 'Perimeter', 'MajorAxisLength', 'MinorAxisLength',
                     'AspectRatio', 'Eccentricity', 'Roundness', 'Compactness']

# Use the selected names only if all of them exist; otherwise fall back to
# the first feature columns (never the appended 'Class' column).
available_features = [f for f in selected_features if f in feature_columns]
if len(available_features) < len(selected_features):
    available_features = feature_columns
available_features = available_features[:MAX_PANELS]

# One boxplot panel per feature, grouped by bean class.
fig, axes = plt.subplots(2, 4, figsize=(16, 10))
axes = axes.ravel()

for panel, feature in zip(axes, available_features):
    df_X.boxplot(column=feature, by='Class', ax=panel)
    panel.set_title(f'{feature} by Class')
    panel.set_xlabel('Bean Class')
    panel.tick_params(axis='x', rotation=45)

# Hide any panel that received no feature (fewer than MAX_PANELS features).
for panel in axes[len(available_features):]:
    panel.set_visible(False)

plt.suptitle('Feature Distributions Across Bean Classes', fontsize=16)
plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, "feature_boxplots.png"), dpi=150, bbox_inches='tight')
plt.close()
print("    Saved: feature_boxplots.png")

# Alternative: one large boxplot of all standardized features
print("\n[2c] Creating standardized feature boxplot...")

# Standardize the features so they can share a single value axis. The scaler
# is fitted on all of X; the result is used for this figure only.
from sklearn.preprocessing import StandardScaler
df_scaled = pd.DataFrame(StandardScaler().fit_transform(X), columns=X.columns)
df_scaled['Class'] = y

# Guard only the import: seaborn is optional. Errors raised while plotting
# must surface instead of being reported as "seaborn not available".
try:
    import seaborn as sns
except ImportError:
    print("    Seaborn not available, skipping standardized boxplot")
else:
    # Long format: one row per (sample, feature) pair, as seaborn expects.
    df_melted = df_scaled.melt(id_vars=['Class'], var_name='Feature', value_name='Value')

    plt.figure(figsize=(14, 8))
    sns.boxplot(data=df_melted, x='Feature', y='Value', hue='Class')
    plt.xticks(rotation=45, ha='right')
    plt.title('Standardized Features Distribution by Class')
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')  # legend outside the axes
    plt.tight_layout()
    plt.savefig(os.path.join(OUTPUT_DIR, "standardized_boxplots.png"), dpi=150, bbox_inches='tight')
    plt.close()
    print("    Saved: standardized_boxplots.png")


[2b] Creating feature boxplot...
    Saved: feature_boxplots.png

[2c] Creating standardized feature boxplot...
    Saved: standardized_boxplots.png
