## Imports

In [46]:
%%time
#python and datascience imports
import pandas as pd
import numpy as np
import os
import sys
import time
import itertools

from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sagemaker import image_uris


# AWS S3 Imports
import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker.session import Session
from sagemaker.inputs import TrainingInput

session = boto3.Session()
sm = sess.client("sagemaker")
role = get_execution_role()
region = boto3.Session().region_name
account_id = sess.client("sts").get_caller_identity()["Account"]


# AWS Experiment Imports
from smexperiments.tracker import Tracker
from smexperiments.experiment import Experiment
from smexperiments.trial import Trial
from smexperiments.trial_component import TrialComponent


# personal notebook settings
%config Completer.use_jedi = False
nb_dir = os.getcwd()

CPU times: user 99.1 ms, sys: 6.27 ms, total: 105 ms
Wall time: 207 ms


## Pip installs

In [9]:
!{sys.executable} -m pip install sagemaker-experiments==0.1.24
!{sys.executable} -m pip install --upgrade sagemaker

  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
Collecting sagemaker
  Using cached sagemaker-2.65.0-py2.py3-none-any.whl
Installing collected packages: sagemaker
  Attempting uninstall: sagemaker
    Found existing installation: sagemaker 2.59.5
    Uninstalling sagemaker-2.59.5:
      Successfully uninstalled sagemaker-2.59.5
Successfully installed sagemaker-2.65.0
You should consider upgrading via the '/opt/conda/bin/python -m pip install --upgrade pip' command.[0m


## Creating new S3 bucket (does nothing if already there)

In [8]:
%%time
bucket = f"sagemaker-experiments-10mill-{sess.region_name}-{account_id}"
sess.client("s3").create_bucket(Bucket=bucket)

CPU times: user 24.2 ms, sys: 11.4 ms, total: 35.5 ms
Wall time: 629 ms


{'ResponseMetadata': {'RequestId': '5FG2ABNG7DYRE7RV',
  'HostId': 'bXaNEIgtyNgNXQvaF05M73Z5NUdVsQcrlJa+NF1TEpzLu15vJqzJbShrh9I1wsybl+9TZvJ2ub8=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'bXaNEIgtyNgNXQvaF05M73Z5NUdVsQcrlJa+NF1TEpzLu15vJqzJbShrh9I1wsybl+9TZvJ2ub8=',
   'x-amz-request-id': '5FG2ABNG7DYRE7RV',
   'date': 'Sat, 23 Oct 2021 22:25:13 GMT',
   'location': '/sagemaker-experiments-10mill-us-east-1-570124035543',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 0},
 'Location': '/sagemaker-experiments-10mill-us-east-1-570124035543'}

## Creating mock classification data

In [3]:
%%time
X, y = make_classification(n_samples=1_000_000, n_features=100, n_informative=80, 
                          n_redundant=10, n_repeated=5, n_classes=2, weights=[.7, .3], 
                          flip_y=.05, random_state=1)
X = pd.DataFrame(X)
y = pd.Series(y)

display(y.tail(8))
display(X.tail(5))

KeyboardInterrupt: 

In [4]:
%%time
X, y = make_classification(n_samples=30_000, n_features=100, n_informative=80, 
                          n_redundant=10, n_repeated=5, n_classes=2, weights=[.7, .3], 
                          flip_y=.05, random_state=1)

X = pd.DataFrame(X)
y = pd.DataFrame(y, columns=['target'])

display(y.tail(2))
display(X.tail(2))

Unnamed: 0,target
29998,1
29999,0


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
29998,-10.60406,-5.5083,-3.672563,19.609996,-1.246331,2.803028,5.743784,3.896423,-3.067615,-4.010443,...,48.833051,0.290265,-4.454335,3.554395,3.418017,-6.689105,-5.553461,8.472437,8.148694,-1.110202
29999,-0.919847,7.53131,2.78437,-24.321904,2.608576,0.749658,-4.032186,2.05663,5.015172,1.702627,...,28.135466,-0.672655,-11.253521,6.746637,-2.637672,-3.150747,-6.928609,0.945128,2.136493,-0.24829


CPU times: user 290 ms, sys: 142 ms, total: 431 ms
Wall time: 334 ms


## Splitting data down to a fraction to test compute times

In [5]:
# 10%
X_10 = X[:100000][:]
y_10 = y[:100000][:]
print(X_10.shape)
print(y_10.shape)


X_03 = X[:30000][:]
y_03 = y[:30000][:]
print(X_03.shape)
print(y_03.shape)

(100000, 100)
(100000,)
(30000, 100)
(30000,)


## Splitting data into train / val sets

In [6]:
%%time
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.1, stratify=y, random_state=1)

df_train = pd.concat([y_train, X_train], axis=1)
df_val = pd.concat([y_test, X_test], axis=1)

display(df_train.tail(2), df_train.shape)
display(df_val.tail(2), df_val.shape)

Unnamed: 0,target,0,1,2,3,4,5,6,7,8,...,90,91,92,93,94,95,96,97,98,99
16557,0,21.803498,0.751091,3.359956,-39.884887,-1.13928,-2.911881,0.656389,3.968785,2.807057,...,25.106882,-2.09671,-3.50259,1.419765,1.785958,3.374665,2.64138,4.155147,0.505827,-0.213112
20716,0,-33.216827,7.106028,-5.442742,-27.143765,-3.390259,-8.025964,-15.40463,3.327481,5.27556,...,-30.718618,-2.039477,2.655721,10.594298,1.789765,2.293917,4.430483,-4.754043,-1.424962,-1.31171


(27000, 101)

Unnamed: 0,target,0,1,2,3,4,5,6,7,8,...,90,91,92,93,94,95,96,97,98,99
2407,1,-15.044999,4.549208,-13.915035,28.868817,7.869188,-1.289404,2.62344,6.545484,3.020152,...,-24.690075,4.46184,-3.540316,-2.411477,6.48076,-6.974903,4.985769,8.145822,-3.065228,-0.931803
11486,1,-1.238037,1.107638,-8.594236,-13.900457,2.945535,-3.256778,1.366858,2.8768,2.568071,...,20.976927,4.927918,2.62442,-1.284751,-2.403286,0.628983,-8.596169,-2.367029,-7.480295,1.037427


(3000, 101)

CPU times: user 196 ms, sys: 12.7 ms, total: 208 ms
Wall time: 246 ms


## Exporting to CSV to check file size

In [4]:
%%time
X.to_csv(f"{nb_dir}/x_350m_combo_100f_rs1.csv", index=False)
# y.to_csv(f"{nb_dir}/y_350m_combo_100f_rs1.csv", index=False)

CPU times: user 3min 20s, sys: 2.28 s, total: 3min 22s
Wall time: 3min 52s


## Exporting to Parque, and Uploading to S3

In [19]:
%%time
# X.columns = X.columns.map(str)
# X.to_parquet(f"{nb_dir}/x_350m_combo_100f_rs1.parquet", index=False)

df_train.columns = df_train.columns.map(str)
df_val.columns = df_val.columns.map(str)

df_train.to_parquet(f"{nb_dir}/3mill/train/train_3m_100f_rs1.parquet", index=False)
df_val.to_parquet(f"{nb_dir}/3mill/val/val_3m_100f_rs1.parquet", index=False)

# Uploading EFS folder "3mill" to S3 bucket
prefix = "3mill"
tracker_input_location = sagemaker.Session().upload_data(path="3mill", bucket=bucket, key_prefix=prefix)
print(f"input spec: {tracker_input_location}")

# deleting files in EFS
os.remove(f"{nb_dir}/3mill/train/train_3m_100f_rs1.parquet")
os.remove(f"{nb_dir}/3mill/val/val_3m_100f_rs1.parquet")

input spec: s3://sagemaker-experiments-10mill-us-east-1-570124035543/3mill
CPU times: user 480 ms, sys: 195 ms, total: 675 ms
Wall time: 1.35 s


## Importing from CSV files

In [32]:
X = pd.read_csv(f"{nb_dir}/x_100m_combo_100f_rs1.csv")
y = pd.read_csv(f"{nb_dir}/y_100m_combo_100f_rs1.csv")

## Displaying X dataframe after export / import

In [3]:
display(X.tail(5))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
999995,7.638479,0.147993,-0.374315,-1.905672,2.100724,4.876262,3.539063,3.724045,-0.035807,12.583887,...,-27.349655,3.110569,-0.387177,-0.440046,-3.978399,-3.284399,-5.88802,-2.889172,1.054599,1.856324
999996,4.952854,-4.907971,-3.87443,3.285434,1.330459,3.820075,-4.343467,-0.446642,0.937792,-5.886366,...,-17.715663,-1.304413,-1.772847,-7.154627,-8.490058,-0.966438,-4.778945,-3.230334,4.550058,2.558468
999997,11.011109,2.142859,2.234868,6.290689,-1.94966,-11.516185,-1.778527,8.871909,-0.164823,14.109944,...,-23.813726,-4.86241,0.013348,-5.80183,-1.499662,5.745617,7.858887,4.896642,-6.953329,6.986822
999998,3.451271,6.111332,0.207175,-1.531731,-1.647576,-3.079046,0.488566,2.746394,-0.162541,17.769461,...,35.495048,8.325566,-1.25465,-7.690576,-1.708708,1.298043,-8.494639,3.796856,0.623049,0.414741
999999,1.207356,-2.188729,2.292431,-3.228291,-1.41493,2.512614,-0.476338,-1.169076,1.948679,7.196266,...,-10.082629,7.062203,0.874893,-4.480021,-2.348454,5.003265,6.683871,-1.25195,2.403161,5.155323


## Creating Null values

In [3]:
X = X.mask(np.random.random(X.shape) < .001)
print(X.isnull().sum())

0      954
1      893
2     1045
3     1029
4      968
      ... 
95    1026
96    1021
97     969
98    1021
99     985
Length: 100, dtype: int64


## Dropping rows with null values

In [8]:
# X.dropna(inplace=True)
print(X.isnull().sum())
print(X.shape)

0     0
1     0
2     0
3     0
4     0
     ..
95    0
96    0
97    0
98    0
99    0
Length: 100, dtype: int64
(819131, 100)


## Train / Test Split for standardization and normalization datasets

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=1)
display(X_test.tail(5))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
555867,1.55619,0.020741,0.594385,1.602179,-2.687061,7.625087,5.387809,-5.646649,1.606883,-8.464047,...,-19.094069,-3.341379,-0.106984,-1.739371,-5.572937,0.921987,-2.738149,-6.499573,9.266651,0.374013
30004,1.786718,0.842391,3.965639,-3.174068,-5.188345,2.015973,-4.584984,-5.651028,-1.239148,2.752075,...,-8.751383,-7.896658,-1.146884,0.944707,4.804935,3.795107,2.522585,6.395679,-9.984395,4.980221
124730,-3.222011,-2.431973,4.345308,5.767431,0.497222,-1.337974,-3.637511,1.236147,-0.300381,5.997685,...,9.550286,1.383661,-2.262372,-3.513557,2.771777,-4.568016,-8.681522,-6.421727,-1.138215,-1.81473
195783,1.681436,0.11579,-0.730524,5.119406,7.808748,-5.7682,-1.952731,-6.920947,0.432778,11.822049,...,31.97154,11.068442,1.350913,6.44838,4.515823,2.1338,4.039534,0.323831,-5.551352,14.834031
32132,0.118524,1.635251,-2.128314,-5.030704,2.850023,7.090865,-3.217605,-3.202147,1.142939,10.396271,...,27.270797,-4.72712,0.120959,7.104006,6.068772,2.256933,-5.403182,3.784903,-7.736136,-2.790082


## StandardScaling the dataset

In [5]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
X_train_s = pd.DataFrame(scaler.transform(X_train))
X_test_s = pd.DataFrame(scaler.transform(X_test))

display(X_test_s.tail(5))
display(X_train_s.tail(5))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
199995,0.233911,-0.130826,-0.08012,0.499653,-0.343484,1.331276,,-1.220713,1.609098,-0.448855,...,-0.729704,-0.660655,-0.107167,-0.261025,-1.109475,0.173521,-0.463218,-1.44998,1.919288,0.014306
199996,0.280302,0.029519,0.585562,-0.415024,-0.852435,0.252221,-0.856933,-1.221555,-1.238648,0.039377,...,-0.334577,-1.487878,-1.14705,0.260386,0.953966,0.712951,0.533857,1.040999,-1.678566,0.907074
199997,-0.727635,-0.609473,0.660531,1.297321,0.304441,-0.392995,-0.679574,0.102822,-0.299315,0.180656,...,0.36461,0.197396,-2.262521,-0.60568,0.549712,-0.85723,-1.589675,-1.434942,-0.025291,-0.409913
199998,0.259115,-0.112277,-0.341734,1.17322,1.792161,-1.245261,-0.364197,-1.465756,0.434286,0.434187,...,1.22118,1.956119,1.350707,1.329534,0.896482,0.40104,0.821366,-0.131902,-0.850068,2.816924
199999,-0.055399,0.184245,-0.617739,-0.77058,0.78318,1.228505,-0.600971,-0.750645,1.144874,0.372124,...,1.041595,-0.912301,0.120773,1.456896,1.205256,0.424158,-0.968326,0.536675,-1.258386,-0.598954


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
799995,-0.109773,1.311153,0.250414,-0.066658,-1.062258,-1.270989,-2.257541,0.17436,-0.277618,0.509769,...,1.738462,1.029706,0.248588,-1.180291,-0.794314,-0.446755,-1.150908,1.905051,-0.103247,0.427607
799996,-0.242395,-0.683968,-1.038902,-0.798364,0.587775,-0.141138,-0.939603,-2.400898,-0.014131,-0.568141,...,0.029812,-0.203461,-0.066536,1.182214,-1.158379,0.656318,-1.107388,0.202305,-0.534834,-1.292867
799997,1.053083,0.432548,-0.000441,1.199118,-0.300708,-0.782744,1.913385,0.965855,-0.266271,1.847446,...,-0.713098,-0.471744,-1.189857,0.795317,-0.654417,-0.820345,2.053118,0.623389,1.967429,-1.96547
799998,-0.887637,0.062255,1.322206,-0.474792,0.172709,-0.296926,-0.19502,1.331353,-1.431327,0.418113,...,-1.617757,0.085753,0.214746,0.0083,1.25909,0.002474,0.131515,-1.18885,-0.710652,-0.046732
799999,0.357036,0.10445,-0.532406,-0.065055,1.065071,-0.116699,-1.105223,-0.416875,0.50561,-2.26177,...,-0.766303,-1.359875,1.582313,0.02307,0.884665,-0.247674,-0.758388,1.448304,1.018088,-0.641026


## MinMax Scaling the dataset

In [7]:
from sklearn.preprocessing import MinMaxScaler
MMscaler = MinMaxScaler()
MMscaler.fit(X_train_s)

X_train_s_mm = pd.DataFrame(MMscaler.transform(X_train_s))
X_test_s_mm = pd.DataFrame(MMscaler.transform(X_test_s))

display(X_test_s_mm.tail(5))
display(X_train_s_mm.tail(5))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
199995,0.537131,0.454154,0.518492,0.560652,0.494506,0.619928,,0.360612,0.675717,0.455385,...,0.399148,0.464638,0.522966,0.463583,0.390091,0.531989,0.449073,0.363569,0.72555,0.520679
199996,0.541848,0.470346,0.585812,0.466083,0.445064,0.51195,0.393341,0.360525,0.379102,0.50555,...,0.440411,0.382849,0.41056,0.516995,0.610566,0.589461,0.556168,0.615147,0.354379,0.610304
199997,0.43936,0.405818,0.593393,0.643123,0.557449,0.447385,0.412022,0.498258,0.476941,0.520066,...,0.513428,0.549474,0.289984,0.428277,0.567372,0.422169,0.328082,0.365088,0.524938,0.478091
199998,0.539694,0.456027,0.492036,0.630292,0.701974,0.362101,0.445241,0.335128,0.553351,0.546116,...,0.60288,0.723361,0.680554,0.626517,0.604424,0.556229,0.587049,0.496689,0.439851,0.802035
199999,0.507714,0.485971,0.464124,0.429322,0.603956,0.609644,0.420301,0.409499,0.627364,0.539739,...,0.584126,0.439757,0.547605,0.639564,0.637416,0.558692,0.39482,0.564212,0.397727,0.459113


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
799995,0.502185,0.599771,0.551919,0.5021,0.424681,0.359527,0.245814,0.505698,0.479201,0.553882,...,0.6569,0.631765,0.561421,0.369414,0.423765,0.465902,0.375209,0.702412,0.516896,0.56217
799996,0.4887,0.398295,0.421533,0.426449,0.584974,0.472588,0.384633,0.237875,0.506645,0.443128,...,0.478465,0.509841,0.527358,0.611426,0.384865,0.583427,0.379884,0.530442,0.472372,0.389451
799997,0.620425,0.511046,0.52655,0.63297,0.498662,0.408384,0.685139,0.588012,0.480382,0.691327,...,0.400882,0.483316,0.405933,0.571793,0.438713,0.426099,0.71935,0.57297,0.730516,0.321928
799998,0.423091,0.473652,0.660307,0.459903,0.544652,0.456999,0.46306,0.626023,0.359033,0.544465,...,0.306408,0.538436,0.557763,0.491172,0.643168,0.513765,0.512953,0.389942,0.454233,0.514551
799999,0.549651,0.477913,0.472754,0.502266,0.631341,0.475033,0.367188,0.44421,0.56078,0.269109,...,0.395326,0.395505,0.70559,0.492685,0.603161,0.487113,0.417369,0.656282,0.632578,0.454889


## Imputing missing values with iterative imputer on Standardized and Normalized Dataset

In [23]:
imputer = IterativeImputer()
imputer.fit(X_test_s_mm)

# X_train_s_mm_i = imputer.fit_transform(X_train_s_scaled_mm)
# X_train_s_mm_i = pd.DataFrame(X_train_s_mm_i)
# display(X.tail(5))
# print(f'{X_train_s_mm_i.isnull().sum()}')



IterativeImputer(add_indicator=False, estimator=None,
                 imputation_order='ascending', initial_strategy='mean',
                 max_iter=10, max_value=None, min_value=None,
                 missing_values=nan, n_nearest_features=None, random_state=None,
                 sample_posterior=False, skip_complete=False, tol=0.001,
                 verbose=0)

In [25]:
# X_test_s_mm_i = imputer.transform(X_test_s_mm)
X_test_s_mm_i = pd.DataFrame(X_test_s_mm_i)

display(X_test_s_mm_i.tail(5))
print(f'{X_test_s_mm_i.isnull().sum()}')

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
199995,0.537131,0.454154,0.518492,0.560652,0.494506,0.619928,0.589974,0.360612,0.675717,0.455385,...,0.399148,0.464638,0.522966,0.463583,0.390091,0.531989,0.449073,0.363569,0.72555,0.520679
199996,0.541848,0.470346,0.585812,0.466083,0.445064,0.51195,0.393341,0.360525,0.379102,0.50555,...,0.440411,0.382849,0.41056,0.516995,0.610566,0.589461,0.556168,0.615147,0.354379,0.610304
199997,0.43936,0.405818,0.593393,0.643123,0.557449,0.447385,0.412022,0.498258,0.476941,0.520066,...,0.513428,0.549474,0.289984,0.428277,0.567372,0.422169,0.328082,0.365088,0.524938,0.478091
199998,0.539694,0.456027,0.492036,0.630292,0.701974,0.362101,0.445241,0.335128,0.553351,0.546116,...,0.60288,0.723361,0.680554,0.626517,0.604424,0.556229,0.587049,0.496689,0.439851,0.802035
199999,0.507714,0.485971,0.464124,0.429322,0.603956,0.609644,0.420301,0.409499,0.627364,0.539739,...,0.584126,0.439757,0.547605,0.639564,0.637416,0.558692,0.39482,0.564212,0.397727,0.459113


0     0
1     0
2     0
3     0
4     0
     ..
95    0
96    0
97    0
98    0
99    0
Length: 100, dtype: int64


## Iterative Imputer on untransformed dataset

In [5]:
%%time
imputer = IterativeImputer()
imputer.fit(X_test)

########
# X_train_s_mm_i = imputer.fit_transform(X_train_s_scaled_mm)
# X_train_s_mm_i = pd.DataFrame(X_train_s_mm_i)
# display(X.tail(5))
# print(f'{X_train_s_mm_i.isnull().sum()}')

IterativeImputer(add_indicator=False, estimator=None,
                 imputation_order='ascending', initial_strategy='mean',
                 max_iter=10, max_value=None, min_value=None,
                 missing_values=nan, n_nearest_features=None, random_state=None,
                 sample_posterior=False, skip_complete=False, tol=0.001,
                 verbose=0)

In [6]:
X_test_i = imputer.transform(X_test)
X_test_i = pd.DataFrame(X_test_i)

display(X_test_s_mm_i.tail(5))
print(f'{X_test_s_mm_i.isnull().sum()}')

NameError: name 'X_test_s_mm_i' is not defined

## Exporting dataset after stand/norm/II

In [26]:
X_test_s_mm_i.to_csv(f"{nb_dir}/X_test_100m_combo_100f_rs1_smmi.csv")

## Specifying training and validation dataset location in S3

In [17]:
s3_train = TrainingInput(s3_data=f's3://{bucket}/{prefix}/train', content_type="application/x-parquet")
s3_val = TrainingInput(s3_data=f's3://{bucket}/{prefix}/val', content_type="application/x-parquet")
inputs = {'train': s3_train, 'validation': s3_val}

## Track custom metrics in experiment metadata by creating a Tracker object

In [21]:
normalization_mean = 0.1307
normalization_std = 0.3081

with Tracker.create(display_name="Preprocessing", sagemaker_boto_client=sm) as tracker:
    tracker.log_parameters({"normalization_mean": normalization_mean, "normalization_std": normalization_std})
    tracker.log_input(name="10mill-dataset", media_type="s3/uri", value= tracker_input_location)
    
preprocessing_trial_component = tracker.trial_component

## Creating Experiment

In [43]:
exp_10m_datapoints = Experiment.create(experiment_name="10m-datapoints-3", 
                                       description="Trials of experiments using 10 million datapoints or more", 
                                       sagemaker_boto_client=sm)

## Setting up Hyperparameters to check & downloading Xgboost Image

In [35]:
xgboost_container = sagemaker.image_uris.retrieve("xgboost", region, "1.3-1", image_scope='training')


hyperparam_options = {"eta": [0.1, 0.5], "num_round": [10, 20]}
hypnames, hypvalues = zip(*hyperparam_options.items())
trial_hyperparameter_set = [dict(zip(hypnames, h)) for h in itertools.product(*hypvalues)]
trial_hyperparameter_set

INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.


[{'eta': 0.1, 'num_round': 10},
 {'eta': 0.1, 'num_round': 20},
 {'eta': 0.5, 'num_round': 10},
 {'eta': 0.5, 'num_round': 20}]

## Creating TrainingLoop to create multiple Trials and Trial components

In [48]:
run_number = 1

for i in trial_hyperparameter_set:
    hyperparams = i

    # Create unique job name with hyperparameter and time
    time_append = int(time.time())
    hyp_append = "-".join([str(elm).replace(".", "-") for elm in i.values()])
    training_job_name = f"xgboost-credit-risk-training-{hyp_append}-{time_append}"
    trial_name = f"trial-xgboost-credit-risk-training-{hyp_append}-{time_append}"
    trial_desc = f"my-xgboost-credit-risk-run-{run_number}"

    # Create a new Trial
    xgboost_credit_risk_trial = Trial.create(
        trial_name=trial_name,
        experiment_name=exp_10m_datapoints.experiment_name,
        sagemaker_boto_client=sm,
        tags=[{"Key": "trial-desc", "Value": trial_desc}])

    # Create an experiment config that associates training job to the Trial
    experiment_config = {
        "ExperimentName": exp_10m_datapoints.experiment_name,
        "TrialName": xgboost_credit_risk_trial.trial_name,
        "TrialComponentDisplayName": training_job_name}
    
    xgboost_credit_risk_estimator = sagemaker.estimator.Estimator(
                                    xgboost_container,
                                    role, 
                                    instance_count=1, 
                                    instance_type='ml.m4.xlarge',
                                    output_path=f's3://{bucket}/{prefix}/output',
                                    sagemaker_session=sagemaker.Session(),
                                    hyperparameters=hyperparams,
                                    enable_sagemaker_metrics=True,
                                    tags=[{"Key": "trial-desc", "Value": trial_desc}])


    # Launch a training job
    xgboost_credit_risk_estimator.fit(inputs, job_name=training_job_name, experiment_config=experiment_config)

    # give it a while before dispatching the next training job
    time.sleep(2)
    run_number += 1

INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: latest.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating training-job with name: xgboost-credit-risk-training-0-1-10-1635031570


2021-10-23 23:26:11 Starting - Starting the training job...
2021-10-23 23:26:34 Starting - Launching requested ML instancesProfilerReport-1635031570: InProgress
.........
2021-10-23 23:27:54 Starting - Preparing the instances for training.........
2021-10-23 23:29:36 Downloading - Downloading input data...
2021-10-23 23:30:04 Training - Downloading the training image...
2021-10-23 23:30:35 Training - Training image download completed. Training in progress..[34m[2021-10-23 23:30:35.895 ip-10-0-204-210.ec2.internal:1 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2021-10-23:23:30:36:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2021-10-23:23:30:36:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2021-10-23:23:30:36:INFO] Running XGBoost Sagemaker in algorithm mode[0m
[34m[2021-10-23:23:30:36:INFO] Single node training.[0m
[34m[2021-10-23:23:30:36:INFO] Train matrix has 27000 rows and 100 columns[0m
[34m[2021-10-23:23:30:36:I

INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: latest.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating training-job with name: xgboost-credit-risk-training-0-1-20-1635031893


2021-10-23 23:31:34 Starting - Starting the training job...
2021-10-23 23:31:57 Starting - Launching requested ML instancesProfilerReport-1635031893: InProgress
......
2021-10-23 23:32:57 Starting - Preparing the instances for training............
2021-10-23 23:34:58 Downloading - Downloading input data
2021-10-23 23:34:58 Training - Downloading the training image.....
2021-10-23 23:36:03 Uploading - Uploading generated training model
2021-10-23 23:36:03 Completed - Training job completed
[34m[2021-10-23 23:35:42.955 ip-10-0-205-99.ec2.internal:1 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2021-10-23:23:35:43:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2021-10-23:23:35:43:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2021-10-23:23:35:43:INFO] Running XGBoost Sagemaker in algorithm mode[0m
[34m[2021-10-23:23:35:43:INFO] Single node training.[0m
[34m[2021-10-23:23:35:43:INFO] Train matrix has 27000 rows and 100 columns

INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: latest.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating training-job with name: xgboost-credit-risk-training-0-5-10-1635032178


2021-10-23 23:36:19 Starting - Starting the training job...
2021-10-23 23:36:25 Starting - Launching requested ML instancesProfilerReport-1635032178: InProgress
.........
2021-10-23 23:38:12 Starting - Preparing the instances for training.........
2021-10-23 23:39:32 Downloading - Downloading input data...
2021-10-23 23:40:13 Training - Downloading the training image..[34m[2021-10-23 23:40:23.495 ip-10-2-186-199.ec2.internal:1 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2021-10-23:23:40:23:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2021-10-23:23:40:23:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2021-10-23:23:40:23:INFO] Running XGBoost Sagemaker in algorithm mode[0m
[34m[2021-10-23:23:40:23:INFO] Single node training.[0m
[34m[2021-10-23:23:40:23:INFO] Train matrix has 27000 rows and 100 columns[0m
[34m[2021-10-23:23:40:23:INFO] Validation matrix has 3000 rows[0m
[34m[0]#011train-rmse:0.43154#011validation-rmse

INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: latest.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating training-job with name: xgboost-credit-risk-training-0-5-20-1635032463


2021-10-23 23:41:04 Starting - Starting the training job...
2021-10-23 23:41:27 Starting - Launching requested ML instancesProfilerReport-1635032463: InProgress
......
2021-10-23 23:42:27 Starting - Preparing the instances for training.........
2021-10-23 23:43:48 Downloading - Downloading input data...
2021-10-23 23:44:28 Training - Downloading the training image..[34m[2021-10-23 23:44:46.416 ip-10-0-149-98.ec2.internal:1 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2021-10-23:23:44:46:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2021-10-23:23:44:46:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2021-10-23:23:44:46:INFO] Running XGBoost Sagemaker in algorithm mode[0m
[34m[2021-10-23:23:44:46:INFO] Single node training.[0m
[34m[2021-10-23:23:44:46:INFO] Train matrix has 27000 rows and 100 columns[0m
[34m[2021-10-23:23:44:46:INFO] Validation matrix has 3000 rows[0m
[34m[0]#011train-rmse:0.43154#011validation-rmse:0.4

## alternative training loop

In [2]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier

cvs_trial_name_map = {}

for i, num_cv in enumerate([2, 3, 4, 5, 6]):
    trial_name = f"cvs-training-job-{num_cv}-cv-{int(time.time())}"
    cvs_trial = Trial.create(trial_name=trial_name,
                             experiment_name=experiment_10m_datapoints.experiment_name,
                             sagemaker_boto_client=sm,)
    cvs_trial_name_map[num_cv] = trial_name
    cvs_trial.add_trial_component(preprocessing_trial_component)

    
    
    # all input configurations, parameters, and metrics in estimator are automatically tracked
    estimator = GradientBoostingClassifier(
                    role=role,
                    sagemaker_session=sagemaker.Session(sagemaker_client=sm),
                    instance_count=1,
                    instance_type="ml.c4.xlarge",
                    enable_sagemaker_metrics=True,
                    hyperparameters = {"epochs": 2,
                                       "backend": "gloo",
                                       "hidden_channels": num_hidden_channel,
                                       "dropout": 0.2,
                                       "kernel_size": 5,
                                       "optimizer": "sgd",},
                    metric_definitions=[{"Name": "train:loss", "Regex": "Train Loss: (.*?);"},
                                        {"Name": "test:loss", "Regex": "Test Average loss: (.*?),"},
                                        {"Name": "test:accuracy", "Regex": "Test Accuracy: (.*?)%;"},]))

    
    estimator = PyTorch(role=role,
                        sagemaker_session=sagemaker.Session(sagemaker_client=sm),
                        instance_count=1,
                        instance_type="ml.c4.xlarge",
                        enable_sagemaker_metrics=True,
                        framework_version="1.1.0", #
                        py_version="py3", #
                        entry_point="./mnist.py", #
                        hyperparameters = {"epochs": 2,
                                           "backend": "gloo",
                                           "hidden_channels": num_hidden_channel,
                                           "dropout": 0.2,
                                           "kernel_size": 5,
                                           "optimizer": "sgd",},
                        metric_definitions=[{"Name": "train:loss", "Regex": "Train Loss: (.*?);"},
                                            {"Name": "test:loss", "Regex": "Test Average loss: (.*?),"},
                                            {"Name": "test:accuracy", "Regex": "Test Accuracy: (.*?)%;"},])

    
    
    
    
    
    # Now associate the estimator with the Experiment and Trial
    estimator.fit(inputs = {"training": inputs},
                            job_name = "cnn-training-job-{}".format(int(time.time())),
                            wait=True,
                            experiment_config = {"TrialName": cnn_trial.trial_name,
                                                 "TrialComponentDisplayName": "Training",})
    time.sleep(2)

NameError: name 'time' is not defined

## Time test for running c_v_s() on GBC with untransformed data

In [14]:
%%time
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
scoring = "f1"

cv_results = cross_val_score(GradientBoostingClassifier(), X_03, y_03, cv=2, scoring=scoring)
print(round(cv_results.mean(), 3))

0.692
CPU times: user 1min 34s, sys: 44.4 ms, total: 1min 34s
Wall time: 1min 35s


## Testing running time for GridsearchCV() on GBC with untransformed data

In [11]:
model_gbc.get_params().keys()
GradientBoostingClassifier().get_params().keys()

dict_keys(['ccp_alpha', 'criterion', 'init', 'learning_rate', 'loss', 'max_depth', 'max_features', 'max_leaf_nodes', 'min_impurity_decrease', 'min_impurity_split', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'n_estimators', 'n_iter_no_change', 'presort', 'random_state', 'subsample', 'tol', 'validation_fraction', 'verbose', 'warm_start'])

In [18]:
np_X_03 = X_03.values
np_y_03 = y_03.values
display(np_X_03.type)

AttributeError: 'numpy.ndarray' object has no attribute 'type'

In [15]:
%%time
from sklearn.model_selection import GridSearchCV
model_gbc = GridSearchCV(estimator = GradientBoostingClassifier(), 
                         param_grid = {'max_depth': [4,6,7],
                                       'n_estimators': [300,600]},
                         cv=2,
                         scoring = scoring)

model_gbc.fit(np_X_03, np_y_03)
print(f'highest mean cv score: {model_gbc.best_score_}, with params: {model_gbc.best_params_}')

highest mean cv score: 0.9021680962218668, with params: {'max_depth': 7, 'n_estimators': 600}
CPU times: user 1h 34min 35s, sys: 356 ms, total: 1h 34min 35s
Wall time: 1h 34min 35s
