# HW2 Playground

Fill in TODOs as you work through the assignment.
Implement the required sections in `model.py`, and use this notebook to orchestrate and run your solution.

In [29]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

from hw2_loader import HW2DataLoader
from model import GradientBoostingModel

# Global plotting defaults for this notebook: seaborn's "whitegrid" style and a
# 10x6 default matplotlib figure size.
sns.set_style("whitegrid")
plt.rcParams["figure.figsize"] = (10, 6)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Read both HW2 datasets from disk through a single HW2DataLoader instance.
loader = HW2DataLoader()

# Heart disease: load features + target, then print shape and class counts.
heart_path = Path('../data/heart.csv')
X_heart, y_heart = loader.get_heart_disease_data(csv_path=heart_path)
heart_class_counts = y_heart.value_counts().to_dict()
print(X_heart.shape, heart_class_counts)

# Cancer genomics: features and labels are stored in two separate CSV files.
cancer_path, labels_path = (
    Path('../data/cancer_genomics.csv'),
    Path('../data/labels_cancer_genomics.csv'),
)
X_cancer, y_cancer = loader.get_cancer_genomics_data(
    labels_path=labels_path,
    csv_path=cancer_path,
)
cancer_class_counts = y_cancer.value_counts().to_dict()
print(X_cancer.shape, cancer_class_counts)


Successfully loaded heart disease data with 1025 rows
(1025, 13) {1: 526, 0: 499}
(801, 5479) {'BRCA': 300, 'KIRC': 146, 'LUAD': 141, 'PRAD': 136, 'COAD': 78}


# Heart Data

In [3]:
# NOTE: everything in this cell is commented out, so no rows are removed and
# the cells below operate on X_heart / y_heart exactly as loaded.
# NOTE(review): np.where yields positional indices while DataFrame.drop removes
# by index label; the two only coincide if X_heart has a default 0..n-1 index —
# confirm before re-enabling this cell.

# #PREPROCESSING

# #Outliers (z-score rule: |z| > 3 in any column marks the row for removal)
# from scipy import stats
# outliers = set()
# for col in X_heart.columns:
#     z = np.abs(stats.zscore(X_heart[col]))
#     outlier_threshold = 3
#     outlier_inds = np.where(z>outlier_threshold)[0]
#     print(col, "number of outliers:", len(outlier_inds))
#     outliers.update(outlier_inds)

# shape_before = X_heart.shape
# X_heart = X_heart.drop(outliers)
# y_heart = y_heart.drop(outliers)
# shape_after = X_heart.shape

# print('number of datapoints removed:', shape_before[0]-shape_after[0])

In [4]:
# Heart-disease classifier. The hyperparameters are collected in one dict so
# they can be inspected or adjusted in a single place before the model is built.
heart_params = dict(
    task='classification',
    max_depth=5,
    learning_rate=0.05,
    n_estimators=100,
    subsample=1,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features=7,
    random_state=42,
    use_scaler=True,
)
model = GradientBoostingModel(**heart_params)


In [5]:
# Split the heart data into train/test parts, then fit on the training part.
heart_split = model.train_test_split(X_heart, y_heart)
X_train, X_test, y_train, y_test = heart_split

# The third positional argument is passed as False, unchanged.
# NOTE(review): its meaning is defined in model.py — consider passing it by
# keyword so the intent is visible here.
model.fit(X_train, y_train, False)


In [6]:
# Evaluate the fitted heart model on the held-out test split and print the
# metrics returned by model.evaluate.
# NOTE(review): near-perfect scores on a 1025-row dataset deserve a sanity
# check — e.g. X_heart.duplicated().sum() — since duplicated rows could end up
# on both sides of the split. TODO confirm.
metrics = model.evaluate(X_test, y_test)
print(metrics)

{'accuracy': 0.9853658536585366, 'precision': 1.0, 'recall': 0.970873786407767, 'f1': 0.9852216748768473, 'roc_auc': 0.9922901199314678}


In [7]:
# Cross-validate on the full heart dataset and show the summary as a table.
cv_results = model.cross_validate(X_heart, y_heart)

# Label the two summary rows.
# NOTE(review): assumes cross_validate returns exactly two values per metric,
# ordered (mean, standard deviation) — confirm against model.py.
cv_df = pd.DataFrame(cv_results).set_axis(['mean', 'stdev'], axis='index')
cv_df

Unnamed: 0,accuracy,precision,recall,f1,roc_auc
mean,0.992195,0.988837,0.996226,0.992488,0.999314
stdev,0.010045,0.013672,0.007547,0.009667,0.001372


In [8]:
# Rank the heart features two ways and print the top 10 of each.

# 1) Importances reported by the fitted gradient-boosting model (plot disabled).
feature_importance = model.get_feature_importance(plot=False)
print('TOP FEATURES VIA GRADIENT BOOSTING')
print(feature_importance.head(10))

# 2) k-feature selection via variance (HW1 method): per-column variance of the
#    raw features, largest first.
#    The result is named `feature_variances` rather than `vars` so the builtin
#    vars() is not shadowed for the rest of the kernel session.
#    NOTE(review): raw variance depends on each feature's units, so large-scale
#    columns rank first regardless of predictive value.
feature_variances = np.var(X_heart, axis=0).sort_values(ascending=False)
print('\n')
print('TOP FEATURES VIA LOGISTIC REGRESSION & VARIANCE')
print(feature_variances.head(10))

TOP FEATURES VIA GRADIENT BOOSTING
   features  importance
0        cp    0.208119
1      thal    0.151756
2        ca    0.143845
3   oldpeak    0.121860
4   thalach    0.084146
5      chol    0.072650
6       age    0.071359
7  trestbps    0.053220
8     slope    0.028025
9       sex    0.026639


TOP FEATURES VIA LOGISTIC REGRESSION & VARIANCE
chol        2659.190244
thalach      528.746971
trestbps     306.536058
age           82.226151
oldpeak        1.379403
ca             1.061507
cp             1.059126
thal           0.384843
slope          0.381249
restecg        0.278383
dtype: float64


In [9]:
# Grid search over tree depth, ensemble size and learning rate on the heart
# data, using 3-fold cross-validation (cv=3).
param_grid = {
    'max_depth': [7, 9, 11],
    'n_estimators': [25, 50, 100],
    'learning_rate': [0.1, 0.2, 0.3],
    # Additional candidates, currently disabled:
    # 'min_samples_split': [1,2,3],
    # 'min_samples_leaf': [1,2,3],
    # 'max_features': [5,10,15],
}
tuning_results = model.tune_hyperparameters(X_heart, y_heart, param_grid, cv=3)

# Print the winning combination first, then its score.
for result_key in ('best_params', 'best_score'):
    print(tuning_results[result_key])


{'learning_rate': 0.3, 'max_depth': 11, 'n_estimators': 50}
0.9969827025654526


In [14]:
# Table of every tried parameter combination with its mean test score.
# This repeats the grid search over the same param_grid, this time with
# plot=True.
# NOTE(review): the name cv_results is reused here and replaces the
# cross-validation summary assigned in the earlier cross-validation cell.
# NOTE(review): relies on the returned mapping exposing 'params' and
# 'mean_test_score' — confirm against model.py.
cv_results = model.tune_hyperparameters(X_heart, y_heart, param_grid, cv=3, plot=True)
df_results = pd.DataFrame(cv_results['params']).assign(
    **{'mean test score': cv_results['mean_test_score']}
)
df_results

{'mean_fit_time': array([0.11778514, 0.26092498, 0.4506096 , 0.11336533, 0.26480707,
       0.55795829, 0.07329297, 0.18065381, 0.36694034, 0.08959492,
       0.17311152, 0.37658572, 0.1003627 , 0.21290692, 0.43152642,
       0.07025663, 0.15745823, 0.29036776, 0.10916011, 0.17707149,
       0.29108922, 0.09298563, 0.24615669, 0.22590629, 0.08555055,
       0.20856587, 0.18309259]), 'std_fit_time': array([0.01158767, 0.04417386, 0.01122523, 0.00628312, 0.01689657,
       0.03371379, 0.00270198, 0.02890978, 0.05077984, 0.00456445,
       0.00995787, 0.074104  , 0.0045043 , 0.00975378, 0.02951961,
       0.00323784, 0.0105459 , 0.04581484, 0.02852937, 0.01919901,
       0.05503234, 0.00644394, 0.04000142, 0.00715169, 0.0139707 ,
       0.05900687, 0.01598167]), 'mean_score_time': array([0.00439024, 0.00552567, 0.00417479, 0.        , 0.00477759,
       0.01413925, 0.01055153, 0.00855573, 0.        , 0.        ,
       0.00011023, 0.01063251, 0.        , 0.01072431, 0.00741855,
       0.0

# Cancer

In [10]:
# Quick overview of the cancer data: dimensionality, class balance, missing values.
n_features = X_cancer.shape[1]
print('Number of features:', n_features)

class_counts = y_cancer.value_counts().to_dict()
print("Class balance:", class_counts)
# The classes are unevenly sized, so a stratified split is probably wise.

# Sum the per-column NaN counts into one total for the whole frame.
n_missing = X_cancer.isna().sum().sum()
print('Number of missing values:', n_missing)

Number of features: 5479
Class balance: {'BRCA': 300, 'KIRC': 146, 'LUAD': 141, 'PRAD': 136, 'COAD': 78}
Number of missing values: 0


In [11]:
# NOTE: everything in this cell is commented out, so no rows are removed from
# X_cancer / y_cancer.
# NOTE(review): the loop below uses `stats`, whose only import in this notebook
# (`from scipy import stats`) sits inside the commented-out heart preprocessing
# cell — add that import before re-enabling this cell.

# # PREPROCESSING

# #OUTLIERS (z-score rule: |z| > 3 in any column marks the row for removal)
# outliers = set()
# for col in X_cancer.columns:
#     z = np.abs(stats.zscore(X_cancer[col]))
#     outlier_threshold = 3
#     outlier_inds = np.where(z>outlier_threshold)[0]
#     outliers.update(outlier_inds)

# shape_before = X_cancer.shape
# X_cancer = X_cancer.drop(outliers)
# y_cancer = y_cancer.drop(outliers)
# shape_after = X_cancer.shape

# print('number of datapoints removed:', shape_before[0]-shape_after[0])

In [12]:
# TODO: Train/evaluate on cancer dataset (multi-class)
# cancer_model = GradientBoostingModel(...)
# cancer_model.train_test_split(...)
# fit
# evaluate 
# print metrics

In [13]:
# Multi-class cancer classifier: configure, split, fit, evaluate.
# NOTE(review): max_features=7 matches the heart model's setting, but this
# dataset has 5479 features — confirm that value is intended here.
cancer_params = dict(
    task='classification',
    max_depth=5,
    learning_rate=0.05,
    n_estimators=50,
    subsample=1,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features=7,
    random_state=42,
    use_scaler=True,
)
cancer_model = GradientBoostingModel(**cancer_params)

# Split, then fit on the training part (third positional argument kept as False).
cancer_split = cancer_model.train_test_split(X_cancer, y_cancer)
X_train_c, X_test_c, y_train_c, y_test_c = cancer_split
cancer_model.fit(X_train_c, y_train_c, False)

# Score on the held-out part and print the returned metrics.
cancer_metrics = cancer_model.evaluate(X_test_c, y_test_c)
print(cancer_metrics)

## Compare with HW1