In [1]:
import pandas as pd
import sagemaker
from sagemaker import get_execution_role
import json
import boto3

In [2]:
# S3 location of this experiment: the bucket name and the key prefix under it.
# Both are combined into the experiment root URI further down the notebook.
bucket, prefix = "zillow-kaggle-awsml-2020", "regression"

In [8]:
# Execution role of this notebook; passed to the estimator and used as the
# "RoleArn" of the low-level training request further down.
role = sagemaker.get_execution_role()

In [3]:
# Resolve the Linear Learner training image for the region of the current
# boto3 session.
from sagemaker.amazon.amazon_estimator import get_image_uri

# Look the region up once and reuse it, instead of building a second Session
# just to read the same attribute again.
region_name = boto3.Session().region_name
container = get_image_uri(region_name, 'linear-learner')

In [4]:
# Root S3 URI for every input and output of this experiment.
# NOTE: the value already ends with '/', so file names must be appended
# without a leading slash (e.g. experiment_root + 'train1.csv').
experiment_root = 's3://%s/%s/' % (bucket, prefix)

In [51]:
# Input channels for the training job, one CSV object each under experiment_root.
# All three share the same settings. The test channel used to be declared as
# 'text/plain', which disagreed with the train/validation channels (and with the
# "test" channel of the low-level request later in the notebook); it is 'text/csv'
# like its siblings now.
_CSV_CHANNEL_OPTIONS = dict(
    distribution='FullyReplicated',
    content_type='text/csv',
    s3_data_type='S3Prefix',
)

train_data = sagemaker.session.s3_input(experiment_root + 'train1.csv', **_CSV_CHANNEL_OPTIONS)
val_data = sagemaker.session.s3_input(experiment_root + 'valid1.csv', **_CSV_CHANNEL_OPTIONS)
test_data = sagemaker.session.s3_input(experiment_root + 'test1.csv', **_CSV_CHANNEL_OPTIONS)

In [53]:


# SageMaker session handed to the estimator below.
sess = sagemaker.Session()

# Estimator around the training image held in `container`: a single
# ml.m5.2xlarge instance, with output written under experiment_root.
estimator_options = {
    'train_instance_count': 1,
    'train_instance_type': 'ml.m5.2xlarge',
    'output_path': experiment_root,
    'sagemaker_session': sess,
}
linear = sagemaker.estimator.Estimator(container, role, **estimator_options)

# Regression with an epsilon-insensitive absolute loss. loss_insensitivity
# overrides the algorithm default (0.01 according to the training log).
linear_hyperparameters = {
    'feature_dim': 93,
    'predictor_type': 'regressor',
    'loss': 'eps_insensitive_absolute_loss',
    'loss_insensitivity': .03,
    'mini_batch_size': 200,
}
linear.set_hyperparameters(**linear_hyperparameters)

# Launch the training job with the train and validation channels.
training_channels = {'train': train_data, 'validation': val_data}
linear.fit(training_channels)

2020-01-30 22:14:23 Starting - Starting the training job...
2020-01-30 22:14:24 Starting - Launching requested ML instances......
2020-01-30 22:15:30 Starting - Preparing the instances for training...
2020-01-30 22:16:17 Downloading - Downloading input data...
2020-01-30 22:16:53 Training - Downloading the training image..[34mDocker entrypoint called with argument(s): train[0m
[34m[01/30/2020 22:17:08 INFO 140130886924096] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-input.json: {u'loss_insensitivity': u'0.01', u'epochs': u'15', u'feature_dim': u'auto', u'init_bias': u'0.0', u'lr_scheduler_factor': u'auto', u'num_calibration_samples': u'10000000', u'accuracy_top_k': u'3', u'_num_kv_servers': u'auto', u'use_bias': u'true', u'num_point_for_scaler': u'10000', u'_log_level': u'info', u'quantile': u'0.5', u'bias_lr_mult': u'auto', u'lr_scheduler_step': u'auto', u'init_method': u'uniform', u'init_sigma': u'0.01', u'lr_scheduler_min

In [None]:
# Build the request dictionary for a low-level XGBoost regression training job.

# Cell-local import: strftime/gmtime are needed for the job name but are not
# imported in the notebook's import cell, so a fresh kernel raised NameError here.
from time import gmtime, strftime

job_name = 'DEMO-xgboost-regression-' + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
print("Training job", job_name)

# experiment_root is built with a trailing '/'. Strip it before appending
# '/<name>' so the URIs do not contain '//' (which is a different S3 key than
# the one the earlier cells read from).
s3_root = experiment_root.rstrip('/')

# `container` was resolved for 'linear-learner', but every hyperparameter below
# (max_depth, eta, gamma, num_round, ...) belongs to XGBoost, as do the job name
# and output folder. Resolve the XGBoost image for this job instead.
xgb_container = get_image_uri(boto3.Session().region_name, 'xgboost')


def _csv_channel(channel_name, file_name):
    """Return one InputDataConfig entry for an uncompressed CSV object under s3_root.

    The content type is declared only at channel level ("ContentType"); the
    S3DataSource structure carries just the data type, URI and distribution.
    """
    return {
        "ChannelName": channel_name,
        "DataSource": {
            "S3DataSource": {
                "S3DataType": "S3Prefix",
                "S3Uri": s3_root + '/' + file_name,
                "S3DataDistributionType": "FullyReplicated"
            }
        },
        "ContentType": "text/csv",
        "CompressionType": "None"
    }


#Ensure that the training and validation data folders generated above are reflected in the "InputDataConfig" parameter below.

create_training_params = \
{
    "AlgorithmSpecification": {
        "TrainingImage": xgb_container,
        "TrainingInputMode": "File"
    },
    "RoleArn": role,
    "OutputDataConfig": {
        "S3OutputPath": s3_root + "/single-xgboost"
    },
    "ResourceConfig": {
        "InstanceCount": 1,
        "InstanceType": "ml.m5.4xlarge",
        "VolumeSizeInGB": 15
    },
    "TrainingJobName": job_name,
    # Hyperparameter values are passed as strings in this request.
    "HyperParameters": {
        "max_depth":"7",
        "eta":"0.2",
        "gamma":"4",
        "min_child_weight":"6",
        "subsample":"1.0",
        "silent":"0",
        "objective":"reg:linear",
        "num_round":"50"
    },
    "StoppingCondition": {
        "MaxRuntimeInSeconds": 3600
    },
    "InputDataConfig": [
        _csv_channel("train", "train1.csv"),
        _csv_channel("validation", "valid1.csv"),
        # NOTE(review): confirm the XGBoost image accepts a "test" channel;
        # kept because the original request declared it.
        _csv_channel("test", "test1.csv"),
    ]
}