# HW2 Playground

Fill in TODOs as you work through the assignment.
Implement the required sections in `model.py`, and use this notebook to orchestrate and run your solution.

In [71]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

from hw2_loader import HW2DataLoader
from model import GradientBoostingModel
from sklearn.preprocessing import LabelEncoder

sns.set_style("whitegrid")
plt.rcParams["figure.figsize"] = (10, 6)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [72]:
# TODO: Load both datasets
loader = HW2DataLoader()

# Heart disease dataset
heart_path = Path('../data/heart.csv')
X_heart, y_heart = loader.get_heart_disease_data(csv_path=heart_path)
print(X_heart.shape, y_heart.value_counts().to_dict())

# Cancer genomics dataset
cancer_path = Path('../data/cancer_genomics.csv')
labels_path = Path('../data/labels_cancer_genomics.csv')
X_cancer, y_cancer = loader.get_cancer_genomics_data(
    csv_path=cancer_path, labels_path=labels_path
)
print(X_cancer.shape, y_cancer.value_counts().to_dict())

Successfully loaded heart disease data with 1025 rows
(1025, 13) {1: 526, 0: 499}
(801, 5479) {'BRCA': 300, 'KIRC': 146, 'LUAD': 141, 'PRAD': 136, 'COAD': 78}


In [73]:
# TODO: Initialize your model (adjust params)
model = GradientBoostingModel(
    task='classification',
    max_depth= 3,
    learning_rate= 0.1,
    n_estimators= 50,
    subsample= 1,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features=None,
    random_state=42,
    use_scaler=True,
)

In [74]:
# TODO: Train/test split + fit (heart)
print('Before dedup X:', X_heart.duplicated().sum())
print('Before dedup y:', y_heart.duplicated().sum())
#TODO ask about deduplicating 
duplicated = X_heart.duplicated()
X_heart = X_heart[~duplicated]
y_heart = y_heart[~duplicated]
print('After dedup X:', X_heart.duplicated().sum())
print('After dedup y:', y_heart.duplicated().sum())


print('Check for NA \n', X_heart.isna().sum())
X_train, X_test, y_train, y_test = model.train_test_split(X_heart, y_heart, random_state=42)
model.fit(X_train, y_train)


Before dedup X: 723
Before dedup y: 1023
After dedup X: 0
After dedup y: 300
Check for NA 
 age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
dtype: int64


<model.GradientBoostingModel at 0x13da893d0>

In [75]:
# TODO: Evaluate (heart)
metrics = model.evaluate(X_test, y_test)
print(metrics)

{'accuracy': 0.7868852459016393, 'precision': 0.7352941176470589, 'recall': 0.8620689655172413, 'f1': 0.7936507936507936, 'roc_auc': 0.8459051724137931}


In [76]:
# TODO: Cross-validation (heart)
cv_results = model.cross_validate(X_heart, y_heart)
print(cv_results)

{'accuracy': {'mean': np.float64(0.8110928961748634), 'std': np.float64(0.05683811425746767)}, 'precision': {'mean': np.float64(0.8044289044289045), 'std': np.float64(0.05482653147641297)}, 'recall': {'mean': np.float64(0.866098484848485), 'std': np.float64(0.05238925308098828)}, 'f1': {'mean': np.float64(0.8334242086415999), 'std': np.float64(0.04755082348585735)}, 'roc_auc': {'mean': np.float64(0.892242364117364), 'std': np.float64(0.044243684238676334)}}


In [77]:
# TODO: Feature importance (heart)
feature_importance = model.get_feature_importance(plot=False)
print(feature_importance.head(10))


          Importance
cp          0.271179
thal        0.152293
oldpeak     0.132348
ca          0.103459
age         0.072785
thalach     0.065468
trestbps    0.055822
chol        0.053773
sex         0.038690
slope       0.023003


In [78]:
# TODO: Hyperparameter tuning (heart)
param_grid = {
    "max_depth": [3, 5, 10, 15, 20, 25],
    "learning_rate": [0.001, 0.01, 0.05, 0.07, 0.1],
    "n_estimators": [50, 75, 100, 125, 150, 175],
    "subsample":[0.25, 0.4, 0.5, 0.6, 0.75, 1.0],
    "min_samples_split":[2, 4, 6, 8, 10, 12],
    "min_samples_leaf":[1, 2, 3,4, 5,6],   
}
tuning_results = model.tune_hyperparameters(X_heart, y_heart, param_grid, cv=3)
print(tuning_results['best_params'])
print(tuning_results['best_score'])


{'learning_rate': 0.01, 'max_depth': 15, 'min_samples_leaf': 6, 'min_samples_split': 6, 'n_estimators': 50, 'subsample': 0.25}
0.9178865954228272


In [82]:
hyper_param_tuned_model = GradientBoostingModel(max_depth=15, learning_rate=0.01, n_estimators=50, subsample=.25, min_samples_split=6,min_samples_leaf=6, random_state=42, use_scaler=True)
hyper_param_tuned_model = hyper_param_tuned_model.fit(X_train, y_train)

metrics = model.evaluate(X_test, y_test)
print("====Metrics===")
print(metrics)

print("===Cross Val===")
cv_results = model.cross_validate(X_heart, y_heart)
print(cv_results)

print("===Feature Importance===")
feature_importance = model.get_feature_importance(plot=False)
print(feature_importance)

====Metrics===
{'accuracy': 0.7868852459016393, 'precision': 0.7352941176470589, 'recall': 0.8620689655172413, 'f1': 0.7936507936507936, 'roc_auc': 0.8459051724137931}
===Cross Val===
{'accuracy': {'mean': np.float64(0.8110928961748634), 'std': np.float64(0.05683811425746767)}, 'precision': {'mean': np.float64(0.8044289044289045), 'std': np.float64(0.05482653147641297)}, 'recall': {'mean': np.float64(0.866098484848485), 'std': np.float64(0.05238925308098828)}, 'f1': {'mean': np.float64(0.8334242086415999), 'std': np.float64(0.04755082348585735)}, 'roc_auc': {'mean': np.float64(0.892242364117364), 'std': np.float64(0.044243684238676334)}}
===Feature Importance===
          Importance
cp          0.271179
thal        0.152293
oldpeak     0.132348
ca          0.103459
age         0.072785
thalach     0.065468
trestbps    0.055822
chol        0.053773
sex         0.038690
slope       0.023003
restecg     0.022784
exang       0.007975
fbs         0.000420


In [80]:
# TODO: Train/evaluate on cancer dataset (multi-class)
cancer_model = GradientBoostingModel(
    task='classification',
    max_depth= 3,
    learning_rate= 0.1,
    n_estimators= 50,
    subsample= 1,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features=None,
    multiclass=True,
    random_state=42,
    use_scaler=True,
)
print('Before dedup X:', X_cancer.duplicated().sum())
print('Before dedup y:', y_cancer.duplicated().sum())
#TODO ask about deduplicating 
duplicated = X_cancer.duplicated()
X_cancer = X_cancer[~duplicated]
y_cancer = y_cancer[~duplicated]
print('After dedup X:', X_cancer.duplicated().sum())
print('After dedup y:', y_cancer.duplicated().sum())


print('Check for NA \n', X_cancer.isna().sum().sum())

print('Class balance: \n', y_heart.value_counts())
print('Dimensionality X: ', X_cancer.shape)
print('Dimensionality y: ', y_cancer.shape)
print('X features: ', len(X_cancer.columns))

X_cancer.describe()

encoder = LabelEncoder()
y_cancer = encoder.fit_transform(y_cancer)

Before dedup X: 0
Before dedup y: 796
After dedup X: 0
After dedup y: 796
Check for NA 
 0
Class balance: 
 target
1    164
0    138
Name: count, dtype: int64
Dimensionality X:  (801, 5479)
Dimensionality y:  (801,)
X features:  5479


In [84]:
#Training and eval logic for cancer model ()
X_train_cancer, X_test_cancer, y_train_cancer, y_test_cancer = cancer_model.train_test_split(X_cancer, y_cancer, test_size=0.2, random_state=42)
cancer_model = cancer_model.fit(X_train_cancer, y_train_cancer)

print("===Metrics===")
cancer_metrics = cancer_model.evaluate(X_test_cancer, y_test_cancer)
print(cancer_metrics)

print("===Cross Val===")
cancer_cross_val = cancer_model.cross_validate(X_cancer, y_cancer)
print(cancer_cross_val)

===Metrics===
{'accuracy': 0.9937888198757764, 'precision': 0.9967741935483871, 'recall': 0.993103448275862, 'f1': 0.9948652118100128, 'roc_auc': 0.9999149665107833}
===Cross Val===
{'accuracy': {'mean': np.float64(0.986273291925466), 'std': np.float64(0.006104854691187139)}, 'precision_macro': {'mean': np.float64(0.991531772226752), 'std': np.float64(0.00326352479973478)}, 'recall_macro': {'mean': np.float64(0.9833128078817734), 'std': np.float64(0.008229881945071774)}, 'f1_macro': {'mean': np.float64(0.9870345257966233), 'std': np.float64(0.005876334713557647)}, 'roc_auc_ovr': {'mean': np.float64(0.9998028291878669), 'std': np.float64(0.0002334600515504683)}}


In [None]:
# Hyperparameter tuning (Cancer)
param_grid = {
    "max_depth": [3, 5, 10, 15],
    "learning_rate": [0.001, 0.01, 0.05, 0.1],
    "n_estimators": [50, 75, 100, 125],
    "subsample":[0.25, 0.5, 0.75, 1.0],
    "min_samples_split":[2, 4, 6, 8],
    "min_samples_leaf":[1,2, 4 ,6]
}
tuning_results = model.tune_hyperparameters(X_cancer, y_cancer, param_grid, cv=3, scoring='roc_auc_ovr')
print(tuning_results['best_params'])
print(tuning_results['best_score'])