In [1]:
! pip install transformers datasets evaluate accelerate




In [2]:
import sagemaker
import boto3
# Bootstrap session, only needed to look up the account's default bucket.
sess = sagemaker.Session()

# Bucket used for uploading data, models and logs.
# Leave as None to fall back to the session's default bucket
# (SageMaker creates that bucket automatically if it does not exist).
sagemaker_session_bucket = None
if sess is not None and sagemaker_session_bucket is None:
    sagemaker_session_bucket = sess.default_bucket()

# Resolve the execution role; if the SDK lookup raises ValueError,
# fetch the ARN of the named IAM role instead.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role_description = iam.get_role(RoleName='sagemaker_execution_role')
    role = role_description['Role']['Arn']

# Final session, bound to the bucket chosen above.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

# Summary of the resolved configuration.
print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
sagemaker role arn: arn:aws:iam::661082688832:role/service-role/parastest-role-4l0z5x30
sagemaker bucket: sagemaker-eu-west-2-661082688832
sagemaker session region: eu-west-2


In [3]:

from datasets import load_dataset
# Load the 'train' and 'test' splits of the IMDB dataset in a single call.
train_dataset, test_dataset = load_dataset(
    path='imdb',
    split=['train', 'test'],
)

In [4]:
# Shuffle with a fixed seed so the same subset is drawn on every
# Restart & Run All, then keep a small sample to make the demo quick.
SUBSET_SEED = 42
train_dataset = train_dataset.shuffle(seed=SUBSET_SEED).select(range(1000))  # 1000 training examples
test_dataset = test_dataset.shuffle(seed=SUBSET_SEED).select(range(100))  # 100 test examples


In [5]:
from transformers import AutoTokenizer

# Tokenizer for the "distilbert-base-uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="distilbert-base-uncased",
)

In [6]:
def preprocess_function(examples):
    """Tokenize the ``"text"`` field of ``examples`` with the notebook-level ``tokenizer``.

    Truncation is enabled; no padding is requested here.

    Args:
        examples: Mapping with a ``"text"`` entry that is handed to the tokenizer.

    Returns:
        Whatever the ``tokenizer`` call returns for the given text.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True)

In [7]:
# Apply the tokenizer to both splits in batched mode (train first, then test).
tokenized_imdb_train, tokenized_imdb_test = (
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, test_dataset)
)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [8]:
# Destination URIs inside the session bucket for the tokenized splits.
s3_prefix = 'pytorch_training/datasets/imdb'
training_input_path = f's3://{sess.default_bucket()}/{s3_prefix}/train'
test_input_path = f's3://{sess.default_bucket()}/{s3_prefix}/test'

# Write the tokenized train split, then the tokenized test split, to S3.
tokenized_imdb_train.save_to_disk(training_input_path)
tokenized_imdb_test.save_to_disk(test_input_path)

severe performance issues, see also https://github.com/dask/dask/issues/10276

To fix, you should specify a lower version bound on s3fs, or
update the current installation.



Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]

In [9]:
# Print scripts/train.py with syntax highlighting; this is the file that the
# estimator cell below references as 'train.py' inside source_dir './scripts'.
!pygmentize scripts/train.py

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34mimport[39;49;00m [04m[36mlogging[39;49;00m[37m[39;49;00m
[34mimport[39;49;00m [04m[36msys[39;49;00m[37m[39;49;00m
[34mimport[39;49;00m [04m[36margparse[39;49;00m[37m[39;49;00m
[34mimport[39;49;00m [04m[36mos[39;49;00m[37m[39;49;00m
[34mimport[39;49;00m [04m[36mtorch[39;49;00m[37m[39;49;00m
[37m[39;49;00m
[34mfrom[39;49;00m [04m[36mtransformers[39;49;00m [34mimport[39;49;00m AutoModelForSequenceClassification, TrainingArguments, Trainer[37m[39;49;00m
[34mfrom[39;49;00m [04m[36mtransformers[39;49;00m [34mimport[39;49;00m AutoTokenizer[37m[39;49;00m
[34mimport[39;49;00m [04m[36mevaluate[39;49;00m[37m[39;49;00m
[34mimport[39;49;00m [04m[36mnumpy[39;49;00m [34mas[39;49;00m [04m[36mnp[39;49;00m[37m[39;49;00m
[34mfrom[39;49;00m [04m[36mdatasets[39;49;00m [34mimport[39;49;00m load_from_disk[37m[39;49;00m
[34mfrom[39;49;00m [04m[36mtransformers[39;49;00m [34mimport[39;49;00m DataCollatorWithPadding[37

In [10]:

# Hyperparameters forwarded to the training job.
hyperparameters = {
    'epochs': 1,
    'train_batch_size': 4,
    'eval_batch_size': 4,
    'model_name': 'distilbert-base-uncased',
}

In [11]:
from sagemaker.pytorch import PyTorch
# Estimator that runs scripts/train.py as a SageMaker training job.
# The session configured earlier is passed explicitly so the job uses the same
# bucket/region as the dataset upload instead of an implicitly created session.
pytorch_estimator = PyTorch(
    entry_point='train.py',
    source_dir='./scripts',
    instance_type='ml.p3.2xlarge',
    instance_count=1,
    role=role,
    framework_version='2.1.0',
    py_version='py310',
    hyperparameters=hyperparameters,
    sagemaker_session=sess,
)

In [12]:
# Launch the training job with the two S3 datasets as the 'train' and 'test' channels.
pytorch_estimator.fit(
    inputs={
        'train': training_input_path,
        'test': test_input_path,
    },
)

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: pytorch-training-2024-04-05-08-06-26-761


2024-04-05 08:06:27 Starting - Starting the training job...
2024-04-05 08:06:38 Pending - Training job waiting for capacity...
2024-04-05 08:07:12 Pending - Preparing the instances for training......
2024-04-05 08:08:26 Downloading - Downloading the training image..................
2024-04-05 08:11:22 Training - Training image download completed. Training in progress....[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2024-04-05 08:11:42,906 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2024-04-05 08:11:42,923 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2024-04-05 08:11:42,936 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2024-04-05 08:11:42,938 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2024-04-05 08:

In [13]:
import transformers
import datasets
import evaluate
import accelerate
import torch

In [14]:
# Report the installed version of each library, one per line, in import order.
print(
    *(lib.__version__ for lib in (transformers, datasets, evaluate, accelerate, torch)),
    sep="\n",
)



4.39.3
2.18.0
0.4.1
0.28.0
2.1.0
