
# Assignment-05 — LightGBM and SVM classifiers (Completed)

**Objective:** Build, tune, evaluate, compare, and save LightGBM and SVM classifiers using cross-validation and GridSearchCV.

**Notes:** The notebook uses `sklearn.datasets.load_breast_cancer` by default. The code first tries to import `lightgbm`; if the import fails (for example, because the package is not installed), it automatically falls back to `sklearn.ensemble.HistGradientBoostingClassifier`, scikit-learn's histogram-based gradient-boosting classifier.


In [None]:
# Step 1: Import Libraries and Load Data
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.datasets import load_breast_cancer
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Try to import lightgbm; fallback to HistGradientBoostingClassifier if not available.
# The catch is intentionally broad: any failure while importing lightgbm should
# trigger the fallback instead of aborting the notebook.
try:
    import lightgbm as lgb
    LGB_AVAILABLE = True
except Exception as exc:
    from sklearn.ensemble import HistGradientBoostingClassifier
    LGB_AVAILABLE = False
    # Report the reason so a broken install can be told apart from a missing one.
    print(f'LightGBM import failed ({type(exc).__name__}: {exc}); using HistGradientBoostingClassifier instead.')

print('LightGBM available:', LGB_AVAILABLE)

# Load the breast-cancer data as a DataFrame and separate the target column
# from the feature columns.
data = load_breast_cancer(as_frame=True)
target_col = data.target.name
y = data.frame[target_col]
X = data.frame.drop(columns=[target_col])

print('Data shape:', X.shape, 'Target distribution:')
# normalize=True reports class shares (fractions) rather than raw counts
class_shares = y.value_counts(normalize=True)
print(class_shares)

In [None]:
# Step 2: Prepare Features and Target
# Preprocessing plan for numeric columns: fill missing values with the column
# median, then standardize.
numeric_cols = list(X.select_dtypes(include=[np.number]).columns)
print('Numeric columns count:', len(numeric_cols))

# Step names ('imputer', 'scaler') are part of the parameter paths used when tuning.
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# Only the numeric columns listed above are routed through the transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
    ]
)

# Hold out 20% of the rows for the final evaluation; tuning only ever sees the
# training part. Stratifying on y keeps the class ratio equal in both parts.
RANDOM_STATE = 42
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=y,
)
X_train, X_test, y_train, y_test = split_parts

print('Train shape:', X_train.shape, 'Test shape:', X_test.shape)

In [None]:
# Step 3: LightGBM (or fallback) + GridSearchCV
# Stratified folds keep the class ratio of y_train in every fold, in line with
# the stratified train/test split; plain KFold ignores the labels.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# Grid keys carry the 'model__' prefix so GridSearchCV routes them to the
# pipeline step named 'model'.
if LGB_AVAILABLE:
    # Use LightGBM scikit-learn API
    model = lgb.LGBMClassifier(random_state=RANDOM_STATE)
    param_grid = {
        'model__n_estimators': [50, 100],
        'model__learning_rate': [0.1, 0.01],
        'model__max_depth': [-1, 6]  # -1 is LightGBM's "no depth limit"
    }
else:
    # Fallback to HistGradientBoostingClassifier.
    # The local import keeps this cell working even if LGB_AVAILABLE was set by
    # hand rather than by a failed lightgbm import.
    from sklearn.ensemble import HistGradientBoostingClassifier
    model = HistGradientBoostingClassifier(random_state=RANDOM_STATE)
    param_grid = {
        'model__max_iter': [100, 200],
        'model__learning_rate': [0.1, 0.05],
        'model__max_depth': [None, 10]  # None means no depth limit
    }

# Preprocessing lives inside the pipeline so it is re-fitted on each CV training fold.
pipeline_lgb = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model)
])

# Candidates are ranked by ROC-AUC; n_jobs=-1 runs the fits on all available cores.
grid_lgb = GridSearchCV(pipeline_lgb, param_grid, cv=cv, scoring='roc_auc', n_jobs=-1, verbose=1)
grid_lgb.fit(X_train, y_train)

print('Best params (LightGBM/fallback):', grid_lgb.best_params_)
print('Best CV ROC-AUC:', grid_lgb.best_score_)

In [None]:
# Step 4: SVM + GridSearchCV
# probability=True makes the fitted SVC expose predict_proba.
pipeline_svc = Pipeline([
    ('preprocessor', preprocessor),
    ('model', SVC(probability=True, random_state=RANDOM_STATE))
])

# One grid per kernel: gamma is ignored by the linear kernel, so searching it
# there would only fit every linear candidate twice with identical results.
param_grid_svc = [
    {
        'model__kernel': ['rbf'],
        'model__C': [0.1, 1, 10],
        'model__gamma': ['scale', 'auto'],
    },
    {
        'model__kernel': ['linear'],
        'model__C': [0.1, 1, 10],
    },
]

# Same CV splitter and metric as the boosting search, so the scores are comparable.
grid_svc = GridSearchCV(pipeline_svc, param_grid_svc, cv=cv, scoring='roc_auc', n_jobs=-1, verbose=1)
grid_svc.fit(X_train, y_train)

print('Best params (SVM):', grid_svc.best_params_)
print('Best CV ROC-AUC (SVM):', grid_svc.best_score_)

In [None]:
# Step 5: Evaluate Best Models on Test Set and Save
def evaluate_model(grid, X_test, y_test, name='model'):
    """Score a fitted search's best estimator on held-out data and print a report.

    Args:
        grid: Fitted search object exposing ``best_estimator_``
            (for example a fitted ``GridSearchCV``).
        X_test: Held-out feature matrix.
        y_test: True binary labels for ``X_test``.
        name: Label shown in the printed report header.

    Returns:
        dict with the keys 'accuracy', 'precision', 'recall', 'f1' and
        'roc_auc'. 'roc_auc' is None only when the estimator exposes neither
        ``predict_proba`` nor ``decision_function``.
    """
    best = grid.best_estimator_
    y_pred = best.predict(X_test)

    # ROC-AUC needs a continuous ranking score. Prefer the positive-class
    # probability; otherwise fall back to the decision function, which ranks
    # samples equally well for this metric.
    if hasattr(best, 'predict_proba'):
        y_score = best.predict_proba(X_test)[:, 1]
    elif hasattr(best, 'decision_function'):
        y_score = best.decision_function(X_test)
    else:
        y_score = None

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc = roc_auc_score(y_test, y_score) if y_score is not None else None

    print(f'---- {name} evaluation ----')
    print('Accuracy:', acc)
    print('Precision:', prec)
    print('Recall:', rec)
    print('F1:', f1)
    if roc is not None:
        print('ROC-AUC:', roc)
    print('Confusion matrix:\n', confusion_matrix(y_test, y_pred))
    print('\nClassification report:\n', classification_report(y_test, y_pred))
    return {'accuracy': acc, 'precision': prec, 'recall': rec, 'f1': f1, 'roc_auc': roc}

# Evaluate both tuned models on the held-out test set.
results = {}
results['lightgbm_or_fallback'] = evaluate_model(grid_lgb, X_test, y_test, name='LightGBM_or_Fallback')
results['svm'] = evaluate_model(grid_svc, X_test, y_test, name='SVM')

# Compare side by side: transpose so each model is a row and each metric a column.
results_df = pd.DataFrame(results).T
print('\nSummary comparison:')
# NOTE: display() is supplied by the Jupyter/IPython environment.
display(results_df)

# Save best models
lgb_model_path = '/mnt/data/assignment05_best_lgb_model.joblib'
svc_model_path = '/mnt/data/assignment05_best_svc_model.joblib'
# joblib.dump does not create missing folders, so make sure the target
# directory exists before writing.
for model_path in (lgb_model_path, svc_model_path):
    Path(model_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(grid_lgb.best_estimator_, lgb_model_path)
joblib.dump(grid_svc.best_estimator_, svc_model_path)
print('Saved LightGBM/fallback model to', lgb_model_path)
print('Saved SVM model to', svc_model_path)


## Notes & Next steps

- To use your own CSV instead, place it in `/mnt/data/`, change the data-loading cell to read it with `pd.read_csv('/mnt/data/yourfile.csv')`, and set `X` and `y` from the resulting DataFrame (the feature columns and the target column, respectively).
- You can reduce GridSearchCV parameter grid sizes to speed up runs, or use `RandomizedSearchCV` for larger spaces.
- If LightGBM is not installed locally but you prefer it, install it via `pip install lightgbm` in your environment.
