# Project: AutoGluon Prototyping

The code below is an example of using the SageMaker SKLearn container to train AutoGluon models for a dataset.

# Setup

In [None]:
import tarfile
import pickle

from sklearn.metrics import r2_score

from sagemaker import get_execution_role
from sagemaker import image_uris
from sagemaker.pytorch import PyTorch
import sagemaker, boto3
from sagemaker.sklearn import SKLearn
from autogluon.tabular import TabularPredictor

from util import DataUtil

# --- Notebook configuration -------------------------------------------------
# Placeholders in angle brackets must be replaced before running the notebook.

# S3 locations and dataset file names.
project_bucket = '<A S3 bucket>'
model_folder = 'model'
train_bucket = 'train'
train_file = 'train.csv'
test_file = 'test.csv'

# Name of the column to predict.
target_variable = '<Dataset target variable>'

# Training hardware; n_jobs is later passed to the training job as a
# hyperparameter.
# NOTE(review): n_jobs presumably mirrors the vCPU count of instance_type --
# confirm if the instance type is changed.
instance_type = 'ml.m5.12xlarge'
n_jobs = 48

# S3 URI under which training output is stored.
model_output = f's3://{project_bucket}/{model_folder}'
print('Model output bucket: {}'.format(model_output))

In [None]:
# Look up the URI of the SageMaker scikit-learn training image that the
# training job will run in.
# NOTE(review): the region is hard-coded to us-east-1 -- confirm it matches the
# region of the SageMaker session used for training.
sklearn_image_request = {
    'framework': 'sklearn',
    'region': 'us-east-1',
    'version': '1.2-1',
    'py_version': 'py3',
    'image_scope': 'training',
    'instance_type': instance_type,
}
image_uri = image_uris.retrieve(**sklearn_image_request)

# Bare expression so the notebook displays the resolved URI.
image_uri

# Train AutoGluon

In [None]:
%%time
aws_role = get_execution_role()
sagemaker_session = sagemaker.Session()

env = {'SAGEMAKER_REQUIREMENTS': 'requirements.txt'}

model = SKLearn(
    role=aws_role,
    sagemaker_session=sagemaker_session,
    output_path=model_output,
    code_location=model_output,
    entry_point="train.py",
    source_dir='./container_scripts',
    env=env,
    image_uri=image_uri,
    instance_count=1,
    instance_type=instance_type,
    hyperparameters={"n_jobs": n_jobs, 
                     'training_fraction': 0.20, 
                     'time_limit': 1800},
    use_spot_instances=True,
    max_run=2000, 
    max_wait=2000,
)

model.fit()

# Retrieve trained AutoGluon model

## Get archive from S3.

In [None]:
# Locate the most recent SKLearn-container model archive under the model
# prefix, download it to ./model.tar.gz and unpack it into the working
# directory.
s3_client = boto3.client('s3')
s3_resource = boto3.resource('s3')

# A single list_objects_v2 call returns at most 1000 keys, so walk every page
# of the prefix instead of looking only at the first response.
contents = []
paginator = s3_client.get_paginator('list_objects_v2')
for page in paginator.paginate(Bucket=project_bucket, Prefix=model_folder):
    contents.extend(page.get('Contents', []))

# Keep only the model artifacts written by scikit-learn container jobs.
model_archives = [
    content for content in contents
    if 'sagemaker-scikit-learn' in content['Key']
    and 'model.tar.gz' in content['Key']
]

# Fail with a clear message instead of passing None to download_file.
if not model_archives:
    raise FileNotFoundError(
        'No sagemaker-scikit-learn model.tar.gz found under '
        f's3://{project_bucket}/{model_folder}'
    )

# Pick the newest archive by upload time rather than by listing order, so
# "last" does not depend on how the keys happen to sort.
last_sklearn_model = max(
    model_archives, key=lambda content: content['LastModified']
)['Key']

print(last_sklearn_model)
s3_resource.meta.client.download_file(project_bucket,
                                      last_sklearn_model,
                                      './model.tar.gz')

# Use a context manager so the archive handle is always closed.
# NOTE(review): the archive comes from this project's own training job; if the
# source ever becomes untrusted, restrict extraction (e.g. tarfile's
# filter='data' on Python versions that support it).
with tarfile.open('./model.tar.gz', 'r:gz') as t:
    t.extractall()

## Load model

In [None]:
# Load the trained predictor from the local ./AutoGluon directory.
# NOTE(review): ./AutoGluon is presumably produced by extracting model.tar.gz
# -- confirm against train.py's output layout.
# Package and Python version checks are disabled, presumably because the
# notebook environment differs from the training container -- confirm.
predictor = TabularPredictor.load(
    './AutoGluon',
    require_py_version_match=False,
    check_packages=False,
)

## Review Model Results

In [None]:
# Show which model the predictor treats as its best one.
best_model_name = predictor.model_best
print(best_model_name)

# Show the names of all models held by the predictor.
all_model_names = predictor.model_names()
print(all_model_names)

In [31]:
# Display the predictor's leaderboard (one row per trained model); left as a
# bare expression so the notebook renders the returned table.
predictor.leaderboard()

Unnamed: 0,model,score_val,eval_metric,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L3,-0.019041,root_mean_squared_error,15.120476,187.099585,0.003596,0.246937,3,True,8
1,XGBoost_BAG_L2,-0.019057,root_mean_squared_error,14.131661,158.935625,0.479173,2.398523,2,True,7
2,CatBoost_BAG_L2,-0.019079,root_mean_squared_error,13.727046,177.533962,0.074558,20.99686,2,True,6
3,LightGBM_BAG_L2,-0.019103,root_mean_squared_error,14.563149,163.457265,0.910661,6.920163,2,True,5
4,CatBoost_BAG_L1,-0.019986,root_mean_squared_error,0.099253,113.046851,0.099253,113.046851,1,True,2
5,WeightedEnsemble_L2,-0.019986,root_mean_squared_error,0.102937,113.207689,0.003684,0.160838,2,True,4
6,XGBoost_BAG_L1,-0.020904,root_mean_squared_error,4.336347,21.253509,4.336347,21.253509,1,True,3
7,LightGBM_BAG_L1,-0.020917,root_mean_squared_error,9.216888,22.236743,9.216888,22.236743,1,True,1
