# Model Training on SageMaker

* https://huggingface.co/docs/sagemaker/train
* https://github.com/huggingface/notebooks/blob/master/sagemaker/01_getting_started_pytorch/sagemaker-notebook.ipynb


## Prerequisites

In [14]:
# Hugging Face hub id of the pretrained checkpoint to fine-tune.
# Other candidates; assign one of them instead to switch models:
#   "deepset/gbert-base"
#   "deepset/gelectra-base"
checkpoint = "distilbert-base-german-cased"

# Project identifier derived from the checkpoint; every "/" of the hub id is
# turned into "_" so the name contains no slash.
project_name = "10kgnad_huggingface__" + "_".join(checkpoint.split("/"))

## SageMaker Environment

In [2]:
# Explicit `%pip` magic: installs into the environment of the running kernel
# and does not depend on IPython's automagic being enabled.
%pip install -q --upgrade sagemaker

Note: you may need to restart the kernel to use updated packages.


In [None]:
import sagemaker

# Name of the S3 bucket SageMaker uses for uploading data, models and logs.
# Keep it as None to use the default bucket; SageMaker creates that bucket
# automatically if it does not exist yet.
sagemaker_session_bucket = None

# A first session is only needed to look up the default bucket name.
sess = sagemaker.Session()
if sess is not None and sagemaker_session_bucket is None:
    sagemaker_session_bucket = sess.default_bucket()

# Execution role of the current environment; handed to the estimator later on.
role = sagemaker.get_execution_role()

# Recreate the session so that it is bound to the selected bucket.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

## Data Preparation

In [4]:
# Directory for the raw CSV files. It is exported as an environment variable so
# that the shell commands below and the Python code (via os.getenv) can read it.
%env DIR=data

# Create the data directory if it does not exist yet.
!mkdir -p $DIR
# Download the 10kGNAD train and test splits from GitHub into $DIR
# (-nc: do not overwrite an existing file, -nv: terse log output, -O: target file name).
!wget -nc https://github.com/tblock/10kGNAD/blob/master/train.csv?raw=true -nv -O $DIR/train.csv
!wget -nc https://github.com/tblock/10kGNAD/blob/master/test.csv?raw=true -nv -O $DIR/test.csv
# List the directory content; `cut` keeps only the space-separated fields from the fifth one onward.
!ls -lAh $DIR | cut -d " " -f 5-

env: DIR=data
2021-11-27 17:50:35 URL:https://raw.githubusercontent.com/tblock/10kGNAD/master/train.csv [24405789/24405789] -> "data/train.csv" [1]
2021-11-27 17:50:37 URL:https://raw.githubusercontent.com/tblock/10kGNAD/master/test.csv [2755020/2755020] -> "data/test.csv" [1]

2,7M Nov 27 17:50 test.csv
 24M Nov 27 17:50 train.csv


In [8]:
!pip install -q --upgrade datasets

import os
from datasets import load_dataset, DatasetDict

dataset = load_dataset("csv",
                       data_files={"train": os.getenv("DIR")+"/train.csv",
                                   "test": os.getenv("DIR")+"/test.csv"},
                       sep=";", quotechar="'", names=["labels", "text"]
                       )

print(dataset)
display(dataset['train'].to_pandas().head())

Using custom data configuration default-d3ae6c12d0953d9e


Downloading and preparing dataset csv/default to /home/goerlitz/.cache/huggingface/datasets/csv/default-d3ae6c12d0953d9e/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a...


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /home/goerlitz/.cache/huggingface/datasets/csv/default-d3ae6c12d0953d9e/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['labels', 'text'],
        num_rows: 9245
    })
    test: Dataset({
        features: ['labels', 'text'],
        num_rows: 1028
    })
})


Unnamed: 0,labels,text
0,Sport,21-Jähriger fällt wohl bis Saisonende aus. Wie...
1,Kultur,"Erfundene Bilder zu Filmen, die als verloren g..."
2,Web,Der frischgekürte CEO Sundar Pichai setzt auf ...
3,Wirtschaft,"Putin: ""Einigung, dass wir Menge auf Niveau vo..."
4,Inland,Estland sieht den künftigen österreichischen P...


In [9]:
from sklearn.preprocessing import LabelEncoder

def encode_labels(ds: DatasetDict):
    """Replace the string class labels of all splits in `ds` with integer ids.

    The encoder is fitted on the `train` split only and then applied to every
    split of `ds`.

    Args:
        ds: DatasetDict that has a `train` split and a `labels` column.

    Returns:
        A tuple `(encoded, le)` where `encoded` is the mapped DatasetDict and
        `le` is the fitted LabelEncoder (needed to translate ids back to names).
    """
    le = LabelEncoder()
    le.fit(ds['train']['labels'])

    def encode(batch):
        # Called with a batch of rows (batched=True); only `labels` is rewritten.
        return {'labels': le.transform(batch['labels'])}

    # Map the dataset that was passed in rather than a notebook-level global,
    # so the function works for any DatasetDict.
    return ds.map(encode, batched=True), le

encoded_ds, le = encode_labels(dataset)

  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [15]:
!pip install -q --upgrade transformers

from transformers import AdamW, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize(data):
    # return tokenizer(data['text'], truncation=True, padding=True)
    return tokenizer(data['text'], padding='max_length', truncation=True)


tokenized_ds = encoded_ds.map(tokenize, batched=True).remove_columns('text')

print(tokenized_ds)
display(tokenized_ds['train'].to_pandas().head())

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/464 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/234k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/468k [00:00<?, ?B/s]

  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'labels'],
        num_rows: 9245
    })
    test: Dataset({
        features: ['attention_mask', 'input_ids', 'labels'],
        num_rows: 1028
    })
})


Unnamed: 0,attention_mask,input_ids,labels
0,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[102, 1735, 232, 19231, 693, 5844, 2134, 378, ...",5
1,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[102, 11806, 646, 30881, 4195, 205, 13165, 818...",3
2,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[102, 351, 13236, 124, 7847, 123, 26074, 12309...",6
3,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[102, 16679, 853, 224, 12205, 818, 377, 268, 5...",7
4,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[102, 18600, 2671, 190, 13458, 13239, 30882, 5...",1


In [None]:
import botocore
from datasets.filesystems import S3FileSystem

s3 = S3FileSystem()

# Key prefix below which the tokenized splits are stored in the session bucket.
# NOTE(review): `s3_prefix` was referenced but never defined in this notebook
# (NameError); it is derived from `project_name` here - adjust if another
# location is wanted.
s3_prefix = project_name

base_path = f's3://{sess.default_bucket()}/{s3_prefix}'

for split in ["train", "test"]:
    # Save each split below its own sub-path on S3.
    input_path = f'{base_path}/{split}'
    tokenized_ds[split].save_to_disk(input_path, fs=s3)

## Model Setup

In [None]:
from sagemaker.huggingface import HuggingFace

# Hyperparameters handed over to the training job.
hyperparameters = {
    'epochs': 1,
    'per_device_train_batch_size': 32,
    'model_name_or_path': checkpoint,
}

# Estimator that runs `./scripts/train.py` on a single ml.p3.2xlarge instance.
huggingface_estimator = HuggingFace(
    role=role,
    entry_point='train.py',
    source_dir='./scripts',
    instance_type='ml.p3.2xlarge',
    instance_count=1,
    transformers_version='4.6',
    pytorch_version='1.7',
    py_version='py36',
    hyperparameters=hyperparameters,
)

# Start training with one input channel per dataset split stored under `base_path`.
huggingface_estimator.fit(
    {channel: f'{base_path}/{channel}' for channel in ('train', 'test')}
)

In [None]:
# Summary of the estimator's training job.
# Container image used for the training job.
print(f"container image used for training job: \n{huggingface_estimator.image_uri}\n")

# S3 URI where the trained model is located.
print(f"s3 uri where the trained model is located: \n{huggingface_estimator.model_data}\n")

# Name of the latest training job started by this estimator.
print(f"latest training job name for this estimator: \n{huggingface_estimator.latest_training_job.name}\n")

In [None]:
# Access the logs of the estimator's latest training job through the
# SageMaker session attached to the estimator.
huggingface_estimator.sagemaker_session.logs_for_job(huggingface_estimator.latest_training_job.name)