In [None]:
!pip install pyarrow==2 awswrangler

In [None]:
import os
import numpy as np
import pandas as pd
import boto3
import sagemaker
import awswrangler as wr
from sklearn.model_selection import train_test_split
from sagemaker.pytorch import PyTorch
from sagemaker.tuner import (
    CategoricalParameter,
    HyperparameterTuner,
)

In [None]:
# --- S3 locations ------------------------------------------------------------
# Bucket used for both the input dataset and the model output.
bucket = "cgu-poc-sagemaker"
# Key prefix from which the dataset CSVs are read.
data_prefix = "datasets/data-wrangler-feedbacks-2022-06-14T03-02-28"
# Key prefix under which model artifacts are written.
model_prefix = "models/feedbacks"

# --- SageMaker handles -------------------------------------------------------
# Execution role that is passed to the estimator below.
role = sagemaker.get_execution_role()
# Session used below to upload the train/test files to S3.
sagemaker_session = sagemaker.Session()

In [None]:
# Load the source dataset: the CSV objects found under the dataset prefix are
# read into a single DataFrame.
#
# The train/test splits created further down are uploaded to
# f"{data_prefix}/train" and f"{data_prefix}/test", i.e. *inside* this same
# prefix. Without the ignore list, re-running the notebook would read those
# files back in, duplicating rows and leaking test rows into training.
df = wr.s3.read_csv(
    f"s3://{bucket}/{data_prefix}/",
    path_ignore_suffix=["feedbacks_train.csv", "feedbacks_test.csv"],
)
df.head()

In [None]:
# Split the dataset into train/test partitions and write them, together with
# the full dataset, to local CSV files.

# `to_csv` does not create missing directories, so make sure the target
# folder exists before writing (no-op when it is already there).
os.makedirs("./data", exist_ok=True)

# A fixed seed keeps the split identical across kernel restarts, so training
# and tuning results are comparable between runs.
train, test = train_test_split(df, random_state=42)

df.to_csv('./data/dataset.csv', index=False)
train.to_csv("./data/feedbacks_train.csv", index=False)
test.to_csv("./data/feedbacks_test.csv", index=False)

In [None]:
# Push the local train/test CSVs to S3. The returned values are kept and later
# passed as the "training" / "testing" inputs of the training and tuning jobs.
inputs_train = sagemaker_session.upload_data(
    path="./data/feedbacks_train.csv",
    bucket=bucket,
    key_prefix=f"{data_prefix}/train",
)
inputs_test = sagemaker_session.upload_data(
    path="./data/feedbacks_test.csv",
    bucket=bucket,
    key_prefix=f"{data_prefix}/test",
)

In [None]:
# S3 URI passed to the estimator as the output location for model artifacts.
output_path = f"s3://{bucket}/{model_prefix}"

estimator = PyTorch(
    # Training code: entry point `train.py` inside the `script` directory.
    source_dir="script",
    entry_point="train.py",
    # Framework / Python versions of the training container.
    framework_version="1.10.0",
    py_version="py38",
    # Compute. The training script only supports distributed training on GPU
    # instances. Swap instance_type for "local" to run in local mode instead.
    instance_type="ml.p3.8xlarge",
    instance_count=1,
    # Permissions and output location.
    role=role,
    output_path=output_path,
    # Values handed to the training script as hyperparameters.
    hyperparameters={
        "batch-size": 16,
        "epochs": 1,
        "num_labels": 2,
        "backend": "gloo",
    },
    # Turn off SageMaker Debugger profiling for this job.
    disable_profiler=True,
)

In [None]:
# Launch a single training run with the baseline hyperparameters. Each key is
# a channel name mapped to the S3 location returned by the upload step.
estimator.fit(
    inputs={
        "training": inputs_train,
        "testing": inputs_test,
    }
)

In [None]:
# Objective: maximise the "accuracy" metric.
objective_type = "Maximize"
objective_metric_name = "accuracy"

# How the metric is scraped from the training job logs: the regex's single
# capture group supplies the numeric value.
# NOTE(review): "#011" is presumably the tab character as it appears in the
# job logs - confirm against what train.py actually prints.
metric_definitions = [
    {
        "Name": "accuracy",
        "Regex": "=====>#011{'accuracy': ([0-9\\.]+)",
    }
]

# Search space: both hyperparameters are tuned over a discrete set of values.
hyperparameter_ranges = {
    "lr": CategoricalParameter([3e-4, 1e-4, 5e-5, 3e-5]),
    "batch-size": CategoricalParameter([4, 8, 16]),
}

In [None]:
# Build the tuning job around the estimator configured above.
# NOTE(review): the search space defined in this notebook has only
# 4 (lr) x 3 (batch-size) = 12 distinct combinations, while max_jobs is 200 -
# confirm that the repeated runs (and their cost) are intended.
tuner = HyperparameterTuner(
    estimator=estimator,
    objective_metric_name=objective_metric_name,
    hyperparameter_ranges=hyperparameter_ranges,
    metric_definitions=metric_definitions,
    objective_type=objective_type,
    max_jobs=200,  # total number of training jobs the tuner may launch
    max_parallel_jobs=10,  # upper bound on concurrently running jobs
)

In [None]:
# Start the hyperparameter tuning job on the same input channels that the
# single training run used.
tuner.fit(
    inputs={
        "training": inputs_train,
        "testing": inputs_test,
    }
)