In [4]:
!pip install sagemaker pyexasol

Collecting pyexasol
  Downloading pyexasol-0.20.0-py3-none-any.whl (44 kB)
[K     |████████████████████████████████| 44 kB 1.6 MB/s  eta 0:00:01
Installing collected packages: pyexasol
Successfully installed pyexasol-0.20.0
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m


In [6]:
import os
import shutil
from getpass import getpass
from io import BytesIO
from urllib.request import urlopen
from zipfile import ZipFile

import boto3
import pandas as pd
import pyexasol
from sagemaker.automl.automl import AutoML, AutoMLInput


In [61]:
# Connection settings for the Exasol database; adjust them to your deployment.
EXASOL_HOST = "34.244.245.56" # change
EXASOL_PORT = "8563" # change if needed
EXASOL_USER = "sys" # change if needed
# The password must not be stored in the notebook: it is taken from the
# EXASOL_PASSWORD environment variable, and when that is unset or empty the
# user is prompted for it without echoing the input.
EXASOL_PASSWORD = os.environ.get("EXASOL_PASSWORD") or getpass("Exasol password: ")
EXASOL_SCHEMA = "IDA"
# Resource tags handed to the SageMaker Estimator when the training job is created.
tags = [{"Key":"exa:owner", "Value": "torsten.kilias@exasol.com"}]

In [46]:
# Open the pyexasol session that the following cells reuse; the DSN has the form "host:port".
EXASOL_CONNECTION = f"{EXASOL_HOST}:{EXASOL_PORT}"
exasol = pyexasol.connect(
    dsn=EXASOL_CONNECTION,
    user=EXASOL_USER,
    password=EXASOL_PASSWORD,
    compression=True,
)

In [9]:
# Location of the zipped data set and the two CSV members used from it.
DATA_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/00414/to_uci.zip"
TRAINING_FILE = "to_uci/aps_failure_training_set.csv"
TEST_FILE = "to_uci/aps_failure_test_set.csv"

# Data is preceded by a 20-line header (copyright & license)
NUM_SKIP_ROWS = 20
NA_VALUE = "na"

# Stream the archive to disk in chunks instead of buffering it in memory; the
# context managers close both the HTTP response and the local file even when
# the transfer fails midway.
with urlopen(DATA_URL) as resp, open('to_uci.zip', 'wb') as f:
    shutil.copyfileobj(resp, f)

print("data downloaded")

data downloaded


In [10]:
def _read_aps_csv(archive, member):
    """Parse one CSV member of an open zip archive into a DataFrame.

    The first NUM_SKIP_ROWS lines are skipped and NA_VALUE is read as missing.
    """
    with archive.open(member, "r") as handle:
        return pd.read_csv(handle, skiprows=NUM_SKIP_ROWS, na_values=NA_VALUE)


# Read the training and the test set straight out of the downloaded archive.
with ZipFile('to_uci.zip') as z:
    train_set = _read_aps_csv(z, TRAINING_FILE)
    test_set = _read_aps_csv(z, TEST_FILE)

In [48]:
# Make sure the target schema exists before any table is created in it.
create_schema_sql = "CREATE SCHEMA IF NOT EXISTS {schema!i}"
exasol.execute(query=create_schema_sql, query_params={"schema": EXASOL_SCHEMA})

<ExaStatement session_id=1707345019852750848 stmt_idx=2>

In [49]:
# Column layout: the first CSV column becomes VARCHAR(3), every other one DECIMAL(18,2)
column_names = list(train_set.columns)
column_types = ["VARCHAR(3)"] + ["DECIMAL(18,2)"] * (len(column_names) - 1)
column_desc = [f"{name} {sql_type}" for name, sql_type in zip(column_names, column_types)]

params = dict(schema=EXASOL_SCHEMA, column_names=column_names, column_desc=column_desc)

# (Re)create TRAIN from the derived column list, then TEST with the same structure
create_train_sql = "CREATE OR REPLACE TABLE {schema!i}.TRAIN(" + ", ".join(column_desc) + ")"
exasol.execute(query=create_train_sql, query_params=params)
exasol.execute(query="CREATE OR REPLACE TABLE {schema!i}.TEST LIKE {schema!i}.TRAIN", query_params=params)

# Load each DataFrame into its table and report how many rows were imported
for frame, table in ((train_set, "TRAIN"), (test_set, "TEST")):
    exasol.import_from_pandas(frame, (EXASOL_SCHEMA, table))
    print(f"Imported {exasol.last_statement().rowcount()} rows into {table}.")


Imported 60000 rows into TRAIN.
Imported 16000 rows into TEST.


In [50]:
# Show the first four rows of TRAIN as a quick check of the import.
preview_train_sql = "SELECT * FROM {schema!q}.TRAIN LIMIT 4"
exasol.export_to_pandas(preview_train_sql, {"schema": EXASOL_SCHEMA})

Unnamed: 0,CLASS,AA_000,AB_000,AC_000,AD_000,AE_000,AF_000,AG_000,AG_001,AG_002,...,EE_002,EE_003,EE_004,EE_005,EE_006,EE_007,EE_008,EE_009,EF_000,EG_000
0,neg,76698,,2130706438,280.0,0,0,0,0,0,...,1240520,493384,721044,469792,339156,157956,73224,0,0,0
1,neg,33058,,0,,0,0,0,0,0,...,421400,178064,293306,245416,133654,81140,97576,1500,0,0
2,neg,41040,,228,100.0,0,0,0,0,0,...,277378,159812,423992,409564,320746,158022,95128,514,0,0
3,neg,12,0.0,70,66.0,0,10,0,0,0,...,240,46,58,44,10,0,0,0,4,32


In [51]:
# Fetch a single row only to learn the column names of TRAIN.
all_columns = exasol.export_to_pandas("SELECT * FROM {schema!q}.TRAIN LIMIT 1;", {"schema": EXASOL_SCHEMA})
column_names = list(all_columns)
column_names.remove("CLASS")  # CLASS is replaced by the CLASS_POS expression in the query below

# TRAIN_PREPARED holds, per row of TRAIN, a RANDOM() value (SPLIT), the
# comparison (CLASS = 'pos') as CLASS_POS, and all remaining columns.
prepare_sql = """CREATE OR REPLACE TABLE {schema!q}.TRAIN_PREPARED AS (
               SELECT RANDOM() AS SPLIT,
               (CLASS = 'pos') as CLASS_POS, {all_columns_except_class!q} FROM {schema!q}.TRAIN)"""
prepare_params = {"all_columns_except_class": column_names, "schema": EXASOL_SCHEMA}
exasol.execute(prepare_sql, prepare_params)

<ExaStatement session_id=1707345019852750848 stmt_idx=9>

In [52]:
# Show the first four rows of TRAIN_PREPARED, including the new SPLIT and CLASS_POS columns.
preview_prepared_sql = "SELECT * FROM {schema!q}.TRAIN_PREPARED LIMIT 4"
exasol.export_to_pandas(preview_prepared_sql, {"schema": EXASOL_SCHEMA})

Unnamed: 0,SPLIT,CLASS_POS,AA_000,AB_000,AC_000,AD_000,AE_000,AF_000,AG_000,AG_001,...,EE_002,EE_003,EE_004,EE_005,EE_006,EE_007,EE_008,EE_009,EF_000,EG_000
0,0.794572,0,41116,,0,,0,0,0,0,...,199780,101244,272518,433912,372908,163418,195492,13496,0,0
1,0.793678,0,59726,,78,40.0,0,0,0,0,...,340830,201640,583574,708870,582644,227474,70738,14,0,0
2,0.648744,0,30066,,340,340.0,0,0,0,0,...,258204,129524,265784,294224,199650,91130,111416,3228,0,0
3,0.75258,0,125250,,0,,0,0,0,0,...,794026,510006,1348866,1101396,825410,489878,505230,5224,0,0


In [53]:
import sagemaker
from sagemaker import get_execution_role
from sagemaker import image_uris
import boto
# The training and download cells call boto3.Session() and boto3.client();
# with only the legacy `boto` package imported, that name was undefined on a
# fresh kernel.
import boto3

# Execution role and SageMaker session used by the jobs in this notebook; the
# session's default bucket is the S3 location the CSV exports are written to.
role = get_execution_role()
sess = sagemaker.Session()
bucket = sess.default_bucket()
bucket

'sagemaker-eu-west-1-216764142018'

In [82]:
# Column list for the export: every column of TRAIN_PREPARED except SPLIT.
columns = exasol.export_to_pandas("SELECT * FROM {schema!q}.TRAIN_PREPARED LIMIT 1;", {"schema": EXASOL_SCHEMA})
column_names = list(columns)
column_names.remove("SPLIT")  # SPLIT only selects the rows; it is not exported
parameters = {"schema": EXASOL_SCHEMA, "bucket": bucket, "all_columns_except_split": column_names}

# Rows with SPLIT <= 0.8 are written to train/train.csv, all other rows to
# validation/validation.csv; both files get a header row (WITH COLUMN NAMES).
train_export_sql = """EXPORT (SELECT {all_columns_except_split!q} FROM {schema!q}.TRAIN_PREPARED WHERE SPLIT <= 0.8)
                    INTO CSV AT 'https://{bucket!r}.s3.amazonaws.com'
                    USER '' IDENTIFIED BY '' FILE 'train/train.csv' WITH COLUMN NAMES;"""
validation_export_sql = """EXPORT (SELECT {all_columns_except_split!q} FROM {schema!q}.TRAIN_PREPARED WHERE SPLIT > 0.8)
                    INTO CSV AT 'https://{bucket!r}.s3.amazonaws.com'
                    USER '' IDENTIFIED BY '' FILE 'validation/validation.csv' WITH COLUMN NAMES;"""

exasol.execute(train_export_sql, parameters)
exasol.execute(validation_export_sql, parameters)


<ExaStatement session_id=1707345019852750848 stmt_idx=22>

In [62]:
from sagemaker import image_uris
from sagemaker.inputs import TrainingInput
from sagemaker.estimator import Estimator
from sagemaker.serializers import CSVSerializer

# Training and validation channels point at the CSV files exported to the bucket.
# NOTE(review): the export cell writes a header row (WITH COLUMN NAMES), while the
# SageMaker built-in XGBoost documentation describes CSV input as header-less -
# confirm that the training data is parsed as intended.
s3_input_train = TrainingInput(s3_data=f's3://{bucket}/train', content_type='csv')
s3_input_validation = TrainingInput(s3_data=f's3://{bucket}/validation/', content_type='csv')

# XGBoost container image, version 1.2-1, for the region of the current boto3 session.
container = image_uris.retrieve('xgboost', boto3.Session().region_name, '1.2-1')
xgb = Estimator(
    container,
    role,
    instance_count=1,
    instance_type='ml.m4.xlarge',
    output_path=f's3://{bucket}/output',
    tags=tags,
)

# Hyperparameters for a binary classifier trained for 100 boosting rounds.
xgb_hyperparameters = {
    'max_depth': 5,
    'eta': 0.2,
    'gamma': 4,
    'min_child_weight': 6,
    'subsample': 0.8,
    'objective': 'binary:logistic',
    'num_round': 100,
}
xgb.set_hyperparameters(**xgb_hyperparameters)

training_channels = {'train': s3_input_train, 'validation': s3_input_validation}
xgb.fit(training_channels)

2021-08-06 12:19:13 Starting - Starting the training job...
2021-08-06 12:19:36 Starting - Launching requested ML instancesProfilerReport-1628252353: InProgress
......
2021-08-06 12:20:37 Starting - Preparing the instances for training......
2021-08-06 12:21:44 Downloading - Downloading input data...
2021-08-06 12:21:58 Training - Downloading the training image.....[34m[2021-08-06 12:22:52.428 ip-10-0-100-78.eu-west-1.compute.internal:1 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined del

In [74]:
# Configure an AutoML job that predicts CLASS_POS from the exported training
# data and explores at most 10 candidates.
s3_train_path = f's3://{bucket}/train'
target_column = "CLASS_POS"
automl_job = AutoML(role=role, target_attribute_name=target_column, max_candidates=10)
# Rebinds s3_input_train: from here on it is an AutoMLInput, no longer the
# TrainingInput used by the XGBoost cell.
s3_input_train = AutoMLInput(inputs=s3_train_path, target_attribute_name=target_column)


In [83]:
# Copy the exported training file from the bucket into the working directory.
s3 = boto3.client('s3')
train_key, local_copy = "train/train.csv", "train.csv"
# NOTE(review): s3_object holds download_file's return value, not the file
# contents - presumably None; confirm before relying on it.
s3_object = s3.download_file(bucket, train_key, local_copy)

In [None]:
# Start the AutoML job on the AutoMLInput that points at the exported training data.
automl_job.fit(inputs=s3_input_train)

...................................................................................................................................................................................................................................................