In [1]:
import os

import numpy as np
import pandas as pd

from pprint import pprint
import matplotlib.pyplot as plt
from time import gmtime, strftime

import sklearn.model_selection

## Setting up the notebook


In [2]:
import sagemaker
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.predictor import csv_serializer

# This is an object that represents the SageMaker session that we are currently operating in. This
# object contains some useful information that we will need to access later such as our region.
session = sagemaker.Session()

# This is an object that represents the IAM role that we are currently assigned. When we construct
# and launch the training job later we will need to tell it what IAM role it should have. Since our
# use case is relatively simple we will simply assign the training job the role we currently have.
role = get_execution_role()

## Preparing and splitting the data

This is the files that were saved during the preprocessing of the data set

In [3]:
train = pd.read_csv("data/train_data.csv",index_col=0)

In [60]:
train.head()

Unnamed: 0,segment_id,time_to_eruption,sensor_1_mean,sensor_2_mean,sensor_3_mean,sensor_4_mean,sensor_5_mean,sensor_6_mean,sensor_7_mean,sensor_8_mean,...,sensor_1_abs_1500_max,sensor_2_abs_1500_max,sensor_3_abs_1500_max,sensor_4_abs_1500_max,sensor_5_abs_1500_max,sensor_6_abs_1500_max,sensor_7_abs_1500_max,sensor_8_abs_1500_max,sensor_9_abs_1500_max,sensor_10_abs_1500_max
0,1136037770,12262005,-1.610323,4.613823,3.559724,2.019983,,-9.718855,7.060032,1.268362,...,,50607.0,,16413.0,,51787.0,5705.0,39659.0,43724.0,43845.0
1,1969647810,32739612,1.426126,2.484442,-2.79427,-1.700572,0.732104,-8.975067,5.870219,-1.086832,...,58311.0,15205.0,58552.0,17409.0,494.0,18153.0,18333.0,17989.0,17279.0,58131.0
2,1895879680,14965999,2.504592,5.449709,-1.58854,-1.150831,,-13.328361,-1.957634,-4.111615,...,,6908.0,,45035.0,,34602.0,50081.0,,28581.0,44479.0
3,2068207140,26469720,2.165797,-0.373377,-2.698988,-1.316861,0.308795,-0.836886,-0.843136,-0.175314,...,,53086.0,,,53292.0,57272.0,54587.0,19648.0,53769.0,54010.0
4,192955606,31072429,0.073815,,1.998883,0.245079,-0.441376,2.205647,-4.784354,-1.730321,...,,,,,,21552.0,,13668.0,58864.0,20384.0


In [61]:
train.shape

(4431, 622)

In [4]:
test = pd.read_csv("data/test_data.csv", index_col=0)

In [214]:
test.head()

Unnamed: 0,segment_id,sensor_1_mean,sensor_2_mean,sensor_3_mean,sensor_4_mean,sensor_5_mean,sensor_6_mean,sensor_7_mean,sensor_8_mean,sensor_9_mean,...,sensor_1_abs_1500_max,sensor_2_abs_1500_max,sensor_3_abs_1500_max,sensor_4_abs_1500_max,sensor_5_abs_1500_max,sensor_6_abs_1500_max,sensor_7_abs_1500_max,sensor_8_abs_1500_max,sensor_9_abs_1500_max,sensor_10_abs_1500_max
0,1000213997,4.462476,,3.58469,-3.677255,0.932501,-0.119815,-0.801103,-0.580557,-2.540874,...,,,19453.0,20332.0,,8178.0,59843.0,34814.0,38253.0,
1,100023368,0.838103,,1.009217,-0.757037,0.340894,1.764721,-8.099515,1.104198,2.450759,...,,,,,,59753.0,57627.0,58366.0,,14582.0
2,1000488999,0.794903,-4.740871,2.116098,-0.38061,,12.569107,1.533874,2.943134,-0.932868,...,21910.0,43407.0,,,,15508.0,,,,
3,1001028887,1.953717,,0.40436,-0.522891,-3.765535,-1.218796,-1.544341,1.841369,0.78352,...,,,,,,40007.0,24916.0,18768.0,,
4,1001857862,-3.984584,-3.306728,0.898585,-2.38991,-0.001767,2.661156,0.937434,1.873502,3.618723,...,,50019.0,,50572.0,,43022.0,32276.0,49597.0,,50793.0


Splitting the data in to training, validation and testing sets

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


In [6]:
# We split the dataset into training and testing sets.
X_train, X_test, Y_train, Y_test = train_test_split(train.drop(columns=["segment_id","time_to_eruption"]),
                                                    train[["time_to_eruption"]],
                                                    test_size=0.20)

# Then we split the training set further into training and validation sets.
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.20)


Features

In [7]:
fs = ["_mean","_std","_max","_min","_mad","_skew","_kurt","_nunique",
      "_quantile_05","_quantile_10","_quantile_30","_quantile_70","_quantile_90","_quantile_95",
      "_fft_power_mean","_fft_power_std","_fft_power_min","_fft_power_max",
      "_fft_power_sum_low","_fft_power_sum_middle","_fft_power_sum_high",
      "_fft_power_mad","_fft_power_skew","_fft_power_kurt","_fft_power_nunique",
      "_fft_power_quantile_05","_fft_power_quantile_10","_fft_power_quantile_30","_fft_power_quantile_70",
      "_fft_power_quantile_90","_fft_power_quantile_95",
      "_cross_0_count",
      "_roll_mean_min","_roll_mean_max","_roll_dist_min","_roll_dist_max","_roll_dist_diff_min","_roll_dist_diff_max"
     ]

cols = [f"sensor_{i}{_fs}" for i in range(1,11) for _fs in fs]

I also wanted to see the difference in model performance if I scale the data before training.

In [8]:
scale_X = StandardScaler()
scale = StandardScaler()

In [9]:
# Scale X
scaled_train_X = pd.DataFrame(scale_X.fit_transform(X_train[cols]),index=X_train.index,columns=cols)
scaled_val_X = pd.DataFrame(scale_X.fit_transform(X_val[cols]),index=X_val.index,columns=cols)
scaled_test_X  = pd.DataFrame(scale_X.transform(X_test[cols])     ,index=X_test.index ,columns=cols)
scaled_test    = pd.DataFrame(scale_X.transform(test[cols])       ,index=test.index   ,columns=cols)

# Scale y
scaled_train_y = scale.fit_transform(Y_train)[:,0]
scaled_val_y = scale.fit_transform(Y_val)[:,0]
scaled_test_y  = scale.transform(Y_test)[:,0]

In [10]:
scaled_test    = pd.DataFrame(scale_X.transform(test[cols])       ,index=test.index   ,columns=cols)

## Uploading the training and validation files to S3

In [11]:
# This is our local data directory. We need to make sure that it exists.
data_dir = '../data/volcano'
if not os.path.exists(data_dir):
    os.makedirs(data_dir)

In [12]:
prefix = 'volcano-eruption'

### Not Scaled Data

In [13]:
#not scaled
pd.concat([Y_val, X_val], axis=1).to_csv(os.path.join(data_dir, 'validation.csv'), header=False, index=False)
pd.concat([Y_train, X_train], axis=1).to_csv(os.path.join(data_dir, 'train.csv'), header=False, index=False)
pd.concat([pd.DataFrame(X_test)], 
          axis=1).to_csv(os.path.join(data_dir, 'test.csv'), header=False, index=False)

In [15]:
val_location = session.upload_data(os.path.join(data_dir, 'validation.csv'), key_prefix=prefix)
train_location = session.upload_data(os.path.join(data_dir, 'train.csv'), key_prefix=prefix)
test_location = session.upload_data(os.path.join(data_dir, 'test.csv'), key_prefix=prefix)

### Scaled data

In [14]:
#scaled
pd.concat([pd.DataFrame(scaled_val_y), pd.DataFrame(scaled_val_X)], 
          axis=1).to_csv(os.path.join(data_dir, 'validation-scaled.csv'), header=False, index=False)
pd.concat([pd.DataFrame(scaled_train_y), pd.DataFrame(scaled_train_X)], 
          axis=1).to_csv(os.path.join(data_dir, 'train-scaled.csv'), header=False, index=False)
pd.concat([pd.DataFrame(scaled_test_X)],
          axis=1).to_csv(os.path.join(data_dir, 'test-scaled.csv'), header=False, index=False)

In [16]:
val_sc_location = session.upload_data(os.path.join(data_dir, 'validation-scaled.csv'), key_prefix=prefix)
train_sc_location = session.upload_data(os.path.join(data_dir, 'train-scaled.csv'), key_prefix=prefix)
test_sc_location = session.upload_data(os.path.join(data_dir, 'test-scaled.csv'), key_prefix=prefix)

## Train the XGBoost model

#### First, let's try training a model with the unscaled data

In [17]:
xgb_container = get_image_uri(session.boto_region_name, 'xgboost',repo_version='1.2-1')

# Estimator object
xgb = sagemaker.estimator.Estimator(xgb_container, # The name of the training container
                                    role,      # The IAM role to use (our current role in this case)
                                    train_instance_count=1, # The number of instances to use for training
                                    train_instance_type='ml.m4.xlarge', # The type of instance ot use for training
                                    output_path='s3://{}/{}/output'.format(session.default_bucket(), prefix),
                                                                        # Where to save the output (the model artifacts)
                                    sagemaker_session=session) # The current SageMaker session

The method get_image_uri has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [18]:
xgb.set_hyperparameters(max_depth=5,
                        eta=0.2,
                        gamma=4,
                        min_child_weight=6,
                        subsample=0.8,
                        objective='reg:linear',
                        early_stopping_rounds=10,
                        num_round=200)

In [19]:
s3_input_train = sagemaker.inputs.TrainingInput(s3_data=train_location, content_type='text/csv')
s3_input_validation = sagemaker.inputs.TrainingInput(s3_data=val_location, content_type='text/csv')

Without hyperparameter tuning

In [20]:
xgb.fit({'train': s3_input_train, 'validation': s3_input_validation})

2020-12-15 20:54:46 Starting - Starting the training job...
2020-12-15 20:54:50 Starting - Launching requested ML instancesProfilerReport-1608065686: InProgress
......
2020-12-15 20:56:05 Starting - Preparing the instances for training......
2020-12-15 20:57:14 Downloading - Downloading input data
2020-12-15 20:57:14 Training - Downloading the training image......
2020-12-15 20:58:11 Uploading - Uploading generated training model[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value reg:linear to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CS

In [21]:
xgb_transformer = xgb.transformer(instance_count = 1, instance_type = 'ml.m4.xlarge')

In [22]:
xgb_transformer.transform(test_location, content_type='text/csv', split_type='Line')

................................[34m[2020-12-15:21:04:36:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2020-12-15:21:04:36:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2020-12-15:21:04:36:INFO] nginx config: [0m
[34mworker_processes auto;[0m
[34mdaemon off;[0m
[34mpid /tmp/nginx.pid;[0m
[34merror_log  /dev/stderr;
[0m
[34mworker_rlimit_nofile 4096;
[0m
[34mevents {
  worker_connections 2048;[0m
[34m}
[0m
[34mhttp {
  include /etc/nginx/mime.types;
  default_type application/octet-stream;
  access_log /dev/stdout combined;

  upstream gunicorn {
    server unix:/tmp/gunicorn.sock;
  }

  server {
    listen 8080 deferred;
    client_max_body_size 0;

    keepalive_timeout 3;

    location ~ ^/(ping|invocations|execution-parameters) {
      proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
      proxy_set_header Host $http_host;
      proxy_redirect off;
      proxy_read_timeout 60s;
      proxy_pass http://gunicorn;
    }

 

In [23]:
!aws s3 cp --recursive $xgb_transformer.output_path $data_dir

download: s3://sagemaker-us-west-1-428239323951/sagemaker-xgboost-2020-12-15-20-59-24-440/test.csv.out to ../data/volcano/test.csv.out


In [24]:
Y_pred = pd.read_csv(os.path.join(data_dir, 'test.csv.out'), header=None)

In [25]:
from sklearn.metrics import mean_absolute_error
mean_absolute_error(Y_test, Y_pred)

4564464.419884442

In [27]:
xgb_predictor = xgb.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')

---------------!

In [29]:
xgb_predictor.serializer = csv_serializer

In [30]:
def predict(data, rows=500):
    split_array = np.array_split(data, int(data.shape[0] / float(rows) + 1))
    predictions = ''
    for array in split_array:
        predictions = ','.join([predictions, xgb_predictor.predict(array).decode('utf-8')])

    return np.fromstring(predictions[1:], sep=',')

In [31]:
submit = pd.DataFrame(predict(test.drop(columns=["segment_id"]).values),
                      index=test["segment_id"],
                      columns=["time_to_eruption"],
                      dtype="int")
submit.clip(lower=0,inplace=True)

submit.to_csv("submission.csv")

The csv_serializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
The csv_serializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
The csv_serializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
The csv_serializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
The csv_serializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
The csv_serializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
The csv_serializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
The csv_serializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
The csv_serializer has b

Using the deployed model on the test submission data from the competition, I was able to score 7284365 MAE and rank in the top 67% of the participants.

Let's try with hyperparameter tuning

In [37]:
from sagemaker.tuner import IntegerParameter, ContinuousParameter, HyperparameterTuner

In [230]:
xgb_hyperparameter_tuner = HyperparameterTuner(estimator = xgb, # The estimator object to use as the basis for the training jobs.
                                               objective_metric_name = 'validation:mae', # The metric used to compare trained models.
                                               objective_type = 'Minimize', # Whether we wish to minimize or maximize the metric.
                                               max_jobs = 20, # The total number of models to train
                                               max_parallel_jobs = 3, # The number of models to train in parallel
                                               hyperparameter_ranges = {
                                                    'max_depth': IntegerParameter(3, 12),
                                                    'eta'      : ContinuousParameter(0.05, 0.5),
                                                    'min_child_weight': IntegerParameter(2, 8),
                                                    'subsample': ContinuousParameter(0.5, 0.9),
                                                    'gamma': ContinuousParameter(0, 10),
                                               })

In [231]:
xgb_hyperparameter_tuner.fit({'train': s3_input_train, 'validation': s3_input_validation})

.........................................................................................................................................................................................................................................................................................................................................................................................!


In [232]:
xgb_hyperparameter_tuner.best_training_job()

'sagemaker-xgboost-201215-0457-013-4b1bc726'

In [233]:
xgb_attached = sagemaker.estimator.Estimator.attach(xgb_hyperparameter_tuner.best_training_job())


2020-12-15 05:19:20 Starting - Preparing the instances for training
2020-12-15 05:19:20 Downloading - Downloading input data
2020-12-15 05:19:20 Training - Training image download completed. Training in progress.
2020-12-15 05:19:20 Uploading - Uploading generated training model
2020-12-15 05:19:20 Completed - Training job completed


In [234]:
xgb_transformer = xgb_attached.transformer(instance_count = 1, instance_type = 'ml.m4.xlarge')

In [235]:
xgb_transformer.transform(test_location, content_type='text/csv', split_type='Line')

................................[34m[2020-12-15:05:37:02:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2020-12-15:05:37:02:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2020-12-15:05:37:02:INFO] nginx config: [0m
[34mworker_processes auto;[0m
[34mdaemon off;[0m
[34mpid /tmp/nginx.pid;[0m
[34merror_log  /dev/stderr;
[0m
[34mworker_rlimit_nofile 4096;
[0m
[34mevents {
  worker_connections 2048;[0m
[34m}
[0m
[35m[2020-12-15:05:37:02:INFO] No GPUs detected (normal if no gpus installed)[0m
[35m[2020-12-15:05:37:02:INFO] No GPUs detected (normal if no gpus installed)[0m
[35m[2020-12-15:05:37:02:INFO] nginx config: [0m
[35mworker_processes auto;[0m
[35mdaemon off;[0m
[35mpid /tmp/nginx.pid;[0m
[35merror_log  /dev/stderr;
[0m
[35mworker_rlimit_nofile 4096;
[0m
[35mevents {
  worker_connections 2048;[0m
[35m}
[0m
[34mhttp {
  include /etc/nginx/mime.types;
  default_type application/octet-stream;
  access_log /dev/stdout com

In [240]:
#with tuning
xgb_predictor = xgb_attached.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')

---------------!

In [243]:
predictions.shape

(4520,)

In [245]:
submit = pd.DataFrame(predict(test.drop(columns=["segment_id"]).values),
                      index=test["segment_id"],
                      columns=["time_to_eruption"],
                      dtype="int")
submit.clip(lower=0,inplace=True)

display(submit)

submit.to_csv("submission.csv")

The csv_serializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
The csv_serializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
The csv_serializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
The csv_serializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
The csv_serializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
The csv_serializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
The csv_serializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
The csv_serializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
The csv_serializer has b

Unnamed: 0_level_0,time_to_eruption
segment_id,Unnamed: 1_level_1
1000213997,18518574
100023368,36786112
1000488999,24943958
1001028887,28612706
1001857862,15425858
...,...
996704281,25581620
997630809,19247926
998072137,19276954
998136924,35053460


Using the deployed model on the test submission data from the competition, I was able to score 5672930 MAE and rank in the top 35% of the participants which is a huge jump from the previous model without hyperparameter tuning.

In [236]:
!aws s3 cp --recursive $xgb_transformer.output_path $data_dir

download: s3://sagemaker-us-west-1-428239323951/sagemaker-xgboost-2020-12-15-05-31-51-572/test.csv.out to ../data/volcano/test.csv.out


In [237]:
Y_pred = pd.read_csv(os.path.join(data_dir, 'test.csv.out'), header=None)

In [238]:
from sklearn.metrics import mean_absolute_error
mean_absolute_error(Y_test, Y_pred)

3312358.1670307214

In [239]:
Y_test

Unnamed: 0,time_to_eruption
730,44382960
600,37982622
662,1520211
3023,41919713
1661,9631599
...,...
1093,34500541
3346,7642741
942,33464579
614,39475415


#### Next, let's try training a model with the scaled data

In [32]:
# Estimator object
xgb_scaled = sagemaker.estimator.Estimator(xgb_container, # The name of the training container
                                    role,      # The IAM role to use (our current role in this case)
                                    train_instance_count=1, # The number of instances to use for training
                                    train_instance_type='ml.m4.xlarge', # The type of instance ot use for training
                                    output_path='s3://{}/{}/output'.format(session.default_bucket(), prefix),
                                                                        # Where to save the output (the model artifacts)
                                    sagemaker_session=session) # The current SageMaker session

train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [33]:
xgb_scaled.set_hyperparameters(max_depth=5,
                        eta=0.2,
                        gamma=4,
                        min_child_weight=6,
                        subsample=0.8,
                        objective='reg:linear',
                        early_stopping_rounds=10,
                        num_round=200)

In [34]:
s3_input_trainsc = sagemaker.inputs.TrainingInput(s3_data=train_sc_location, content_type='text/csv')
s3_input_validationsc = sagemaker.inputs.TrainingInput(s3_data=val_sc_location, content_type='text/csv')

In [39]:
xgb_hyperparameter_tuner_sc = HyperparameterTuner(estimator = xgb_scaled, # The estimator object to use as the basis for the training jobs.
                                               objective_metric_name = 'validation:mae', # The metric used to compare trained models.
                                               objective_type = 'Minimize', # Whether we wish to minimize or maximize the metric.
                                               max_jobs = 20, # The total number of models to train
                                               max_parallel_jobs = 3, # The number of models to train in parallel
                                               hyperparameter_ranges = {
                                                    'max_depth': IntegerParameter(3, 12),
                                                    'eta'      : ContinuousParameter(0.05, 0.5),
                                                    'min_child_weight': IntegerParameter(2, 8),
                                                    'subsample': ContinuousParameter(0.5, 0.9),
                                                    'gamma': ContinuousParameter(0, 10),
                                               })

In [40]:
xgb_hyperparameter_tuner_sc.fit({'train': s3_input_trainsc, 'validation': s3_input_validationsc})

...................................................................................................................................................................................................................................................................................................................................................!


In [41]:
xgb_attached_sc = sagemaker.estimator.Estimator.attach(xgb_hyperparameter_tuner_sc.best_training_job())


2020-12-15 22:01:04 Starting - Preparing the instances for training
2020-12-15 22:01:04 Downloading - Downloading input data
2020-12-15 22:01:04 Training - Training image download completed. Training in progress.
2020-12-15 22:01:04 Uploading - Uploading generated training model
2020-12-15 22:01:04 Completed - Training job completed


In [42]:
xgb_transformer_sc = xgb_attached_sc.transformer(instance_count = 1, instance_type = 'ml.m4.xlarge')

In [43]:
xgb_transformer_sc.transform(test_sc_location, content_type='text/csv', split_type='Line')

.................................
.[32m2020-12-15T22:08:11.260:[sagemaker logs]: MaxConcurrentTransforms=4, MaxPayloadInMB=6, BatchStrategy=MULTI_RECORD[0m
[34m[2020-12-15:22:08:09:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2020-12-15:22:08:09:INFO] No GPUs detected (normal if no gpus installed)[0m
[35m[2020-12-15:22:08:09:INFO] No GPUs detected (normal if no gpus installed)[0m
[35m[2020-12-15:22:08:09:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2020-12-15:22:08:09:INFO] nginx config: [0m
[34mworker_processes auto;[0m
[34mdaemon off;[0m
[34mpid /tmp/nginx.pid;[0m
[34merror_log  /dev/stderr;
[0m
[34mworker_rlimit_nofile 4096;
[0m
[34mevents {
  worker_connections 2048;[0m
[34m}
[0m
[35m[2020-12-15:22:08:09:INFO] nginx config: [0m
[35mworker_processes auto;[0m
[35mdaemon off;[0m
[35mpid /tmp/nginx.pid;[0m
[35merror_log  /dev/stderr;
[0m
[35mworker_rlimit_nofile 4096;
[0m
[35mevents {
  worker_connections 2048;[0m


In [44]:
xgb_predictor_sc = xgb_attached_sc.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')

---------------!

In [45]:
xgb_predictor_sc.serializer = csv_serializer

In [46]:
def predict_sc(data, rows=500):
    split_array = np.array_split(data, int(data.shape[0] / float(rows) + 1))
    predictions = ''
    for array in split_array:
        predictions = ','.join([predictions, xgb_predictor_sc.predict(array).decode('utf-8')])

    return np.fromstring(predictions[1:], sep=',')

In [64]:
submit = pd.DataFrame(scale.inverse_transform(predict_sc(scaled_test.values)),
                      index=test["segment_id"],
                      columns=["time_to_eruption"],
                      dtype="int")
submit.clip(lower=0,inplace=True)

display(submit)

submit.to_csv("submission.csv")

# scale.inverse_transform(gbm.predict(scaled_test)

The csv_serializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
The csv_serializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
The csv_serializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
The csv_serializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
The csv_serializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
The csv_serializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
The csv_serializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
The csv_serializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
The csv_serializer has b

Unnamed: 0_level_0,time_to_eruption
segment_id,Unnamed: 1_level_1
1000213997,23613386
100023368,23843159
1000488999,24037344
1001028887,23031043
1001857862,23690650
...,...
996704281,23793905
997630809,22714653
998072137,23463363
998136924,23513998


This performed worse than the model using unscaled data. I scored 11313082 MAE on this one.