In [None]:
import sagemaker.huggingface

In [None]:
import sagemaker

# Optional override: name of the S3 bucket used for uploaded data, models and
# logs. Leave as None to fall back to the session's default bucket, which
# SageMaker creates automatically if it does not exist yet.
sagemaker_session_bucket = None

# A first session is needed only to look up the default bucket name.
sess = sagemaker.Session()
if sess is not None and sagemaker_session_bucket is None:
    sagemaker_session_bucket = sess.default_bucket()

# Execution role ARN; it is handed to the estimator further down.
role = sagemaker.get_execution_role()

# Rebind `sess` to a session that is explicitly tied to the chosen bucket.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

# Echo the resolved configuration so it is visible in the cell output.
print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

In [None]:
import boto3
import os

# Key prefix under which this demo stores its files in the session bucket.
prefix = 'DEMO-huggingface-finetune-twitter'

# S3 URI of the folder holding both CSVs; passed to the estimator as the
# 'train' input in a later cell.
s3_input_train_validation = 's3://{}/{}/train'.format(sagemaker_session_bucket, prefix)

# Upload the train and validation CSVs from ./dataset into <prefix>/train/.
# S3 object keys always use '/' as the separator, so the key is assembled
# explicitly rather than with os.path.join, which would insert the local OS
# separator (a backslash on Windows) into the key.
s3_bucket = boto3.Session().resource('s3').Bucket(sagemaker_session_bucket)
for csv_name in ('sem_eval_2018_task_1_train.csv',
                 'sem_eval_2018_task_1_validation.csv'):
    s3_bucket.Object(f'{prefix}/train/{csv_name}').upload_file(f'./dataset/{csv_name}')

# Last expression of the cell: display the resulting S3 URI.
s3_input_train_validation

In [None]:
# Shell escape: print the training script with syntax highlighting (Pygments
# CLI) so it can be read inline. This is the file the estimator cell below
# references as its entry_point.
!pygmentize ./scripts/train_nlp_bert_sm_compatible.py 

In [None]:
from sagemaker.huggingface import HuggingFace
import time

# Settings handed to the training job as hyperparameters.
hyperparameters = {
    # number of training epochs
    'epochs': 20,
    # batch size for training
    'train_batch_size': 64,
    # learning rate used during training
    'learning_rate': 1e-5,
    # pre-trained model to fine-tune
    'model_id': "bert-base-multilingual-uncased",
}

In [None]:
# Prefix for the training job name. It is passed as `base_job_name`, and the
# SageMaker SDK appends its own timestamp to that prefix when fit() launches
# the job. No timestamp is added here: a pre-stamped prefix would be trimmed
# to fit the job-name length limit and then stamped a second time.
job_name = 'huggingface-finetune-twitter'

# Create the estimator that describes the fine-tuning job.
huggingface_estimator = HuggingFace(
    entry_point          = 'train_nlp_bert_sm_compatible.py',  # fine-tuning script run by the training job
    source_dir           = './scripts',       # directory where the fine-tuning script is stored
    instance_type        = 'ml.p3.2xlarge',   # instance type used for the training job
    #instance_type        = 'local_gpu',      # alternative instance type, left disabled
    instance_count       = 1,                 # number of instances used for training
    base_job_name        = job_name,          # prefix of the training job name (SDK adds a timestamp)
    role                 = role,              # IAM role used by the training job to access AWS resources, e.g. S3
    transformers_version = '4.6',             # transformers version used in the training job
    pytorch_version      = '1.7',             # PyTorch version used in the training job
    py_version           = 'py36',            # Python version used in the training job
    hyperparameters      = hyperparameters,   # hyperparameters passed to the training script
)

In [None]:
# Input dictionary for the training job: the 'train' entry points at the S3
# folder that received the uploaded CSVs.
# Disabled alternative kept from the original cell: 'train': "file://./dataset"
data = {'train': s3_input_train_validation}

# Start the training job on the uploaded datasets; wait=False is passed so the
# call does not wait for the job to complete.
huggingface_estimator.fit(inputs=data, wait=False)