In [134]:
%load_ext autoreload
%autoreload 2
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score,fbeta_score,f1_score
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit,KFold,GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

from helpers import make_complexity_curve, make_learning_curve,make_timing_curve,make_timing_curve_fixed
warnings.filterwarnings('ignore')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [125]:
# Load the cleaned adult dataset and one-hot encode every non-target column.
adult_df = pd.read_csv(filepath_or_buffer="adult_cleaned.csv")
# `axis` is passed by keyword: the positional form `drop('target', 1)` is
# deprecated in pandas 1.3+ and rejected by pandas >= 2.0.
adult_df_dummies = pd.get_dummies(adult_df.drop('target', axis=1))
adult_x = adult_df_dummies.copy().values
adult_y = adult_df["target"].copy().values

# Hold out 20% of the rows as a test set; random_state pins the shuffle.
X_adult_train, X_adult_test, y_adult_train, y_adult_test = train_test_split(
    adult_x, adult_y, test_size=0.2, random_state=0
)

# Report the shapes and the class balance of the training labels.
print(f"adult_df: {adult_df.shape}")
print(f"X_adult_train: {X_adult_train.shape}")
print(f"X_adult_test: {X_adult_test.shape}")
print(f"y_adult_train: {y_adult_train.shape}")
print(f"y_adult_test: {y_adult_test.shape}")
print(f"y_adult_train pos: {(y_adult_train > 0).sum() / len(y_adult_train)}")
print(f"y_adult_train neg: {(y_adult_train == 0).sum() / len(y_adult_train)}")

# Standardise the features; the scaler is fitted on the training split only.
pipeAdult = Pipeline([('Scale', StandardScaler())])
X_trans_adult_train = pipeAdult.fit_transform(X_adult_train, y_adult_train)
print(f"X_tran_adult_train: {X_trans_adult_train.shape}")

adult_df: (30162, 13)
X_adult_train: (24129, 70)
X_adult_test: (6033, 70)
y_adult_train: (24129,)
y_adult_test: (6033,)
y_adult_train pos: 0.2489535413817398
y_adult_train neg: 0.7510464586182601
X_tran_adult_train: (24129, 70)


In [127]:
# Read the spambase CSV (first row is the header) and separate features from the label column.
spam_df = pd.read_csv(
    'spambase2.csv',
    header=0,
)
spam_x = spam_df.drop('class', axis=1).copy().values
spam_y = spam_df['class'].copy().values

# 80/20 train/test split with a fixed random_state.
X_spam_train, X_spam_test, y_spam_train, y_spam_test = train_test_split(
    spam_x,
    spam_y,
    test_size=0.2,
    random_state=0,
)

# Shapes of the frame and of each split.
print(f"spam_df: {spam_df.shape}")
print(f"X_spam_train: {X_spam_train.shape}")
print(f"X_spam_test: {X_spam_test.shape}")
print(f"y_spam_train: {y_spam_train.shape}")
print(f"y_spam_test: {y_spam_test.shape}")

# Fraction of positive / zero labels in the training split.
print(f"y_spam_train pos: {(y_spam_train > 0).sum() / len(y_spam_train)}")
print(f"y_spam_train neg: {(y_spam_train == 0).sum() / len(y_spam_train)}")

# Single-step pipeline: standardise the training features.
pipeSpam = Pipeline(
    [('Scale', StandardScaler())]
)
X_trans_spam_train = pipeSpam.fit_transform(X_spam_train, y_spam_train)
print(f"X_tran_spam_train: {X_trans_spam_train.shape}")
# The value printed is the fraction (not a percentage) of columns dropped by the pipeline.
print(f"Percentage feature removed: {1 - (X_trans_spam_train.shape[1] / X_spam_train.shape[1])}")

spam_df: (4601, 58)
X_spam_train: (3680, 57)
X_spam_test: (921, 57)
y_spam_train: (3680,)
y_spam_test: (921,)
y_spam_train pos: 0.38858695652173914
y_spam_train neg: 0.6114130434782609
X_tran_spam_train: (3680, 57)
Percentage feature removed: 0.0


In [130]:
# Hyper-parameter grid for the adult dataset. The `base_estimator__` prefix
# addresses a parameter of the nested base estimator.
adult_params = dict(
    n_estimators=[1, 2, 5, 10, 20, 30, 45, 60, 80, 100],
    base_estimator__max_depth=[*np.arange(1, 30, 3)],
    learning_rate=[0.01, 0.05, 0.1, 0.3, 1],
)

# Grid for the spam dataset: same values, but built as a separate dict with
# its own lists so the two grids share no mutable state.
spam_params = dict(
    n_estimators=[1, 2, 5, 10, 20, 30, 45, 60, 80, 100],
    base_estimator__max_depth=[*np.arange(1, 30, 3)],
    learning_rate=[0.01, 0.05, 0.1, 0.3, 1],
)

In [142]:
# Grid search (cv=5) over adult_params for AdaBoost with a decision-tree base
# estimator; both estimators are seeded with random_state=123.
adult_cv = GridSearchCV(
    AdaBoostClassifier(
        base_estimator=DecisionTreeClassifier(random_state=123),
        random_state=123,
    ),
    param_grid=adult_params,
    cv=5,
)
adult_cv.fit(X_trans_adult_train, y_adult_train)
print(f"best params adult: {adult_cv.best_params_}")

best params adult: {'base_estimator__max_depth': 4, 'learning_rate': 0.3, 'n_estimators': 45}


In [143]:
# Grid search (cv=5) over spam_params for AdaBoost with a decision-tree base
# estimator; both estimators are seeded with random_state=123.
spam_cv = GridSearchCV(
    AdaBoostClassifier(
        base_estimator=DecisionTreeClassifier(random_state=123),
        random_state=123,
    ),
    param_grid=spam_params,
    cv=5,
)
spam_cv.fit(X_trans_spam_train, y_spam_train)
# The stray bare `spam_cv.best_params_` expression was dropped: it was not the
# cell's last line, so its value was computed and discarded.
print(f"best params spam: {spam_cv.best_params_}")

best params spam: {'base_estimator__max_depth': 22, 'learning_rate': 1, 'n_estimators': 45}


**Complexity**

In [146]:
# Complexity curve over n_estimators for the adult dataset.
# Starts from the grid-search winner, like the other curve cells in this
# notebook, rather than from an untuned AdaBoost whose base tree has no
# depth limit.
adult_cc_estimators = make_complexity_curve(
    adult_cv.best_estimator_,
    X_trans_adult_train.astype(float),
    y_adult_train.astype(float),
    "n_estimators",
    [1,2,5,10,20,30,45,60,80,100],
    "boosting complexity n_estimators",
    "adult",
)

AttributeError: module 'pandas.core.common' has no attribute '_dict_keys_to_ordered_list'

In [None]:
# Plot the adult n_estimators complexity curve, titled with the curve's own `name`.
adult_cc_estimators.plot(
    title=adult_cc_estimators.name,
)

In [None]:
# Complexity curve over n_estimators for the spam dataset, starting from the
# grid-search winner.
spam_cc_estimators = make_complexity_curve(
    spam_cv.best_estimator_,
    X_trans_spam_train.astype(float),
    y_spam_train.astype(float),
    "n_estimators",
    [1,2,5,10,20,30,45,60,80,100],
    "boosting complexity n_estimators",
    "spam",
)

In [None]:
# Plot the spam n_estimators complexity curve, titled with the curve's own `name`.
spam_cc_estimators.plot(
    title=spam_cc_estimators.name,
)

In [None]:
# Complexity curve over learning_rate for the adult dataset, starting from the
# grid-search winner.
adult_cc_learning_rate = make_complexity_curve(
    adult_cv.best_estimator_,
    X_trans_adult_train.astype(float),
    y_adult_train.astype(float),
    "learning_rate",
    [0.01,0.05,0.1,0.3,1],
    "boosting complexity learning rate",
    "adult",
)

In [None]:
# Plot the adult learning_rate complexity curve, titled with the curve's own `name`.
adult_cc_learning_rate.plot(
    title=adult_cc_learning_rate.name,
)

In [None]:
# Complexity curve over learning_rate for the spam dataset, starting from the
# grid-search winner.
spam_cc_learning_rate = make_complexity_curve(
    spam_cv.best_estimator_,
    X_trans_spam_train.astype(float),
    y_spam_train.astype(float),
    "learning_rate",
    [0.01,0.05,0.1,0.3,1],
    "boosting complexity learning rate",
    "spam",
)

In [None]:
# Plot the spam learning_rate complexity curve, titled with the curve's own `name`.
spam_cc_learning_rate.plot(
    title=spam_cc_learning_rate.name,
)

In [None]:
# Complexity curve over the base tree's max_depth (1..29) for the adult
# dataset, starting from the grid-search winner.
# The title now names the swept parameter; it used to repeat the learning-rate
# curve's title, so the two adult curves carried the same title/dataset pair.
adult_cc_md = make_complexity_curve(
    adult_cv.best_estimator_,
    X_trans_adult_train.astype(float),
    y_adult_train.astype(float),
    "base_estimator__max_depth",
    list(np.arange(1, 30, 1)),
    "boosting complexity max_depth",
    "adult",
)

In [None]:
# Plot the adult max_depth complexity curve, titled with the curve's own `name`.
adult_cc_md.plot(
    title=adult_cc_md.name,
)

In [None]:
# Complexity curve over the base tree's max_depth (1..29) for the spam
# dataset, starting from the grid-search winner.
# The title now names the swept parameter; it used to repeat the learning-rate
# curve's title, so the two spam curves carried the same title/dataset pair.
spam_cc_md = make_complexity_curve(
    spam_cv.best_estimator_,
    X_trans_spam_train.astype(float),
    y_spam_train.astype(float),
    "base_estimator__max_depth",
    list(np.arange(1, 30, 1)),
    "boosting complexity max_depth",
    "spam",
)

In [None]:
# Plot the spam max_depth complexity curve, titled with the curve's own `name`.
spam_cc_md.plot(
    title=spam_cc_md.name,
)

**Timing curve**

In [None]:
# Timing curve for the tuned adult model on the scaled training data.
# NOTE(review): the title literal is misspelled ('boositng'). It is left
# unchanged because the helper may use the title to name its outputs —
# confirm before renaming.
adult_timing = make_timing_curve(
    adult_cv.best_estimator_,
    X_trans_adult_train.astype(float),
    y_adult_train.astype(float),
    'boositng timing',
    'adult',
)

In [None]:
# Plot the adult timing curve, titled with the curve's own `name`.
adult_timing.plot(
    title=adult_timing.name,
)

In [None]:
# Timing curve for the tuned spam model on the scaled training data.
# NOTE(review): the title literal is misspelled ('boositng'). It is left
# unchanged because the helper may use the title to name its outputs —
# confirm before renaming.
spam_timing = make_timing_curve(
    spam_cv.best_estimator_,
    X_trans_spam_train.astype(float),
    y_spam_train.astype(float),
    'boositng timing',
    'spam',
)

In [None]:
# Plot the spam timing curve, titled with the curve's own `name`.
spam_timing.plot(
    title=spam_timing.name,
)

**Learning Curve**

In [None]:
# Learning curve for the tuned adult model at ten evenly spaced values from
# 0.1 to 1.0 (passed to the helper as the training-size argument).
# The title now says "learning curve"; the previous 'boosting timing' label
# was carried over from the timing cells and mislabelled this plot.
adult_lc = make_learning_curve(
    adult_cv.best_estimator_,
    X_trans_adult_train.astype(float),
    y_adult_train.astype(float),
    np.linspace(0.1, 1.0, 10),
    'boosting learning curve',
    'adult',
)

In [None]:
# Plot the adult learning curve, titled with the curve's own `name`.
adult_lc.plot(
    title=adult_lc.name,
)

In [None]:
# Learning curve for the tuned spam model at ten evenly spaced values from
# 0.1 to 1.0 (passed to the helper as the training-size argument).
# The title now says "learning curve"; the previous 'boosting timing' label
# was carried over from the timing cells and mislabelled this plot.
spam_lc = make_learning_curve(
    spam_cv.best_estimator_,
    X_trans_spam_train.astype(float),
    y_spam_train.astype(float),
    np.linspace(0.1, 1.0, 10),
    'boosting learning curve',
    'spam',
)

In [None]:
# Plot the spam learning curve, titled with the curve's own `name`.
spam_lc.plot(
    title=spam_lc.name,
)