In [1]:
!pip install datasets transformers



In [2]:
import boto3
import sagemaker

# Bucket the SageMaker session uses for uploaded data, models and logs.
# Leave it as None to fall back to the session's default bucket
# (SageMaker creates that bucket automatically if it does not exist yet).
sagemaker_session_bucket = None

sess = sagemaker.Session()
if sess is not None and sagemaker_session_bucket is None:
    # no bucket name given -> use the default one
    sagemaker_session_bucket = sess.default_bucket()

# Resolve the execution role; when get_execution_role() raises ValueError,
# look the role up by name through IAM instead.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']

# Re-create the session bound to the chosen bucket.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
sagemaker role arn: arn:aws:iam::661082688832:role/service-role/parastest-role-4l0z5x30
sagemaker bucket: sagemaker-eu-west-2-661082688832
sagemaker session region: eu-west-2


In [3]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Name of the pretrained tokenizer applied during preprocessing.
tokenizer_name = "distilbert-base-uncased"

# Name of the dataset to load.
dataset_name = "imdb"

# Key prefix inside the S3 bucket under which the processed data is stored.
s3_prefix = "samples/datasets/imdb"

In [4]:
# Fixed seed so the shuffled subsets are identical on every Restart & Run All.
SHUFFLE_SEED = 42
# Number of examples kept from each split (small on purpose to keep the sample fast).
N_TRAIN_SAMPLES = 1000
N_TEST_SAMPLES = 100

# Load the dataset once and reuse its splits below
# (it used to be loaded a second time with a hard-coded name).
dataset = load_dataset(dataset_name)

# download tokenizer
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)


def tokenize(batch):
    """Tokenize the ``text`` field of a batch with padding='max_length' and truncation enabled."""
    return tokenizer(batch['text'], padding='max_length', truncation=True)


# Draw a reproducible random subset of each split.
train_dataset = dataset['train'].shuffle(seed=SHUFFLE_SEED).select(range(N_TRAIN_SAMPLES))
test_dataset = dataset['test'].shuffle(seed=SHUFFLE_SEED).select(range(N_TEST_SAMPLES))

# tokenize dataset
train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

# Rename "label" -> "labels" and expose only the listed columns as torch tensors.
train_dataset = train_dataset.rename_column("label", "labels")
train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
test_dataset = test_dataset.rename_column("label", "labels")
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [5]:
# S3 destinations for the two splits, under the session bucket and the dataset prefix.
training_input_path = f's3://{sess.default_bucket()}/{s3_prefix}/train'
test_input_path = f's3://{sess.default_bucket()}/{s3_prefix}/test'

# Write each processed split to its S3 location.
train_dataset.save_to_disk(training_input_path)
test_dataset.save_to_disk(test_input_path)

severe performance issues, see also https://github.com/dask/dask/issues/10276

To fix, you should specify a lower version bound on s3fs, or
update the current installation.



Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]

In [6]:
# Print the training entry point (scripts/train.py) with syntax highlighting via pygmentize.
!pygmentize scripts/train.py

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34mfrom[39;49;00m [04m[36mtransformers[39;49;00m [34mimport[39;49;00m AutoModelForSequenceClassification, Trainer, TrainingArguments, AutoTokenizer[37m[39;49;00m
[34mfrom[39;49;00m [04m[36msklearn[39;49;00m[04m[36m.[39;49;00m[04m[36mmetrics[39;49;00m [34mimport[39;49;00m accuracy_score, precision_recall_fscore_support[37m[39;49;00m
[34mfrom[39;49;00m [04m[36mdatasets[39;49;00m [34mimport[39;49;00m load_from_disk[37m[39;49;00m
[34mimport[39;49;00m [04m[36mrandom[39;49;00m[37m[39;49;00m
[34mimport[39;49;00m [04m[36mlogging[39;49;00m[37m[39;49;00m
[34mimport[39;49;00m [04m[36msys[39;49;00m[37m[39;49;00m
[34mimport[39;49;00m [04m[36margparse[39;49;00m[37m[39;49;00m
[34mimport[39;49;00m [04m[36mos[39;49;00m[37m[39;49;00m
[34mimport[39;49;00m [04m[36mtorch[39;49;00m[37m[39;49;00m
[37m[39;49;00m
[34mif[39;49;00m [31m__name__[39;49;00m == [33m"[39;49;00m[33m__main__[39;49;00m[33m"[39;49;00m:[37m[39;49;00m

In [7]:
from sagemaker.huggingface import HuggingFace

# Hyperparameters handed to the training job.
hyperparameters = {
    "epochs": 1,
    "train_batch_size": 32,
    "model_name": "distilbert-base-uncased",
}

In [15]:
# Estimator describing the training job: entry script, instance type and framework versions.
# NOTE(review): the recorded job log shows pip replacing the image's transformers 4.26.0 /
# datasets 2.16.1 with newer releases before train.py fails with
# "TypeError: __init__() got an unexpected keyword argument 'use_seedable_sampler'".
# Presumably scripts/requirements.txt lists these packages unpinned -- confirm, and pin or
# remove them so the versions requested here are the ones that actually run.
huggingface_estimator = HuggingFace(entry_point='train.py',
                            source_dir='./scripts',
                            instance_type='ml.p3.2xlarge',
                            instance_count=1,
                            role=role,
                            # reuse the session configured above so a custom bucket is honoured
                            sagemaker_session=sess,
                            transformers_version='4.26',
                            pytorch_version='1.13',
                            py_version='py39',
                            hyperparameters = hyperparameters)

In [16]:
# Start the training job; the 'train' and 'test' channels point at the datasets saved to S3.
huggingface_estimator.fit({'train': training_input_path, 'test': test_input_path})

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: huggingface-pytorch-training-2024-04-04-21-23-45-337


2024-04-04 21:23:45 Starting - Starting the training job...
2024-04-04 21:24:02 Pending - Training job waiting for capacity...
2024-04-04 21:24:38 Pending - Preparing the instances for training......
2024-04-04 21:25:39 Downloading - Downloading input data...
2024-04-04 21:26:09 Downloading - Downloading the training image..................
2024-04-04 21:28:50 Training - Training image download completed. Training in progress.[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2024-04-04 21:29:08,847 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2024-04-04 21:29:08,868 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2024-04-04 21:29:08,881 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2024-04-04 21:29:08,884 sagemaker_pytorch_container.training INFO  

[34mInstalling collected packages: tokenizers, transformers, datasets[0m
[34mAttempting uninstall: tokenizers[0m
[34mFound existing installation: tokenizers 0.13.3[0m
[34mUninstalling tokenizers-0.13.3:[0m
[34mSuccessfully uninstalled tokenizers-0.13.3[0m
[34mAttempting uninstall: transformers[0m
[34mFound existing installation: transformers 4.26.0[0m
[34mUninstalling transformers-4.26.0:[0m
[34mSuccessfully uninstalled transformers-4.26.0[0m
[34mAttempting uninstall: datasets[0m
[34mFound existing installation: datasets 2.16.1[0m
[34mUninstalling datasets-2.16.1:[0m
[34mSuccessfully uninstalled datasets-2.16.1[0m
[34mSuccessfully installed datasets-2.18.0 tokenizers-0.15.2 transformers-4.39.3[0m
[34m[notice] A new release of pip is available: 23.2.1 -> 24.0[0m
[34m[notice] To update, run: pip install --upgrade pip[0m
[34m2024-04-04 21:29:22,921 sagemaker-training-toolkit INFO     Waiting for the process to finish and give a return code.[0m
[34m2024-0

[34mSome weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight'][0m
[34mYou should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.[0m
[34mSome weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight'][0m
[34mYou should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.[0m
[34mTraceback (most recent call last):
  File "/opt/ml/code/train.py", line 73, in <module>[0m
[34mtrainer = Trainer(
  File "/opt/conda/lib/python3.9/site-packages/transformers/trainer.py", line 373, in __init__[0m
[34mself.create_accelerator_and_postp

UnexpectedStatusException: Error for Training job huggingface-pytorch-training-2024-04-04-21-23-45-337: Failed. Reason: AlgorithmError: ExecuteUserScriptError:
ExitCode 1
ErrorMessage "TypeError: __init__() got an unexpected keyword argument 'use_seedable_sampler'"
Command "/opt/conda/bin/python3.9 train.py --epochs 1 --model_name distilbert-base-uncased --train_batch_size 32", exit code: 1