In [1]:
# !pip install "sagemaker>=2.48.0" "transformers==4.6.1" "datasets[s3]==1.6.2" --upgrade
# !pip install sagemaker transformers 
# !pip install torch

In [6]:
import sagemaker

# Optional bucket override used for uploading data, models and logs.
# Leave as None to fall back to the session's default bucket (per the original
# note, SageMaker creates that bucket automatically if it does not exist).
sagemaker_session_bucket = None

# Bootstrap session: only needed to resolve the default bucket name when no
# explicit bucket was given above.
sess = sagemaker.Session()
if sess is not None and sagemaker_session_bucket is None:
    sagemaker_session_bucket = sess.default_bucket()

# Execution role, plus the session (bound to the resolved bucket) that the
# later cells use.
role = sagemaker.get_execution_role()
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

# Report the resolved configuration.
print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

sagemaker role arn: arn:aws:iam::448807757624:role/service-role/AmazonSageMaker-ExecutionRole-20211202T101582
sagemaker bucket: sagemaker-us-east-2-448807757624
sagemaker session region: us-east-2


In [3]:
%%capture
# The %%capture magic above must stay on the first line of the cell; it captures
# this cell's output instead of displaying it.
import IPython
# Install ipywidgets from conda-forge non-interactively (-y).
# NOTE(review): presumably needed for notebook progress-bar widgets — confirm.
!conda install -c conda-forge ipywidgets -y
# Passing True asks the kernel to restart. All in-memory state is lost, so any
# cells executed before this one have to be run again afterwards.
IPython.Application.instance().kernel.do_shutdown(True) # has to restart kernel so changes are used

In [8]:
from datasets import load_dataset, load_from_disk
from transformers import AutoTokenizer

# Key prefix inside the SageMaker session bucket under which the processed
# dataset is uploaded later in the notebook.
s3_prefix = 'datasets/xsum_corrupted_entityonly_untokenized'

# Load the training split from a local directory (relative to the notebook's
# working directory).
xsum_corrupted_train = load_from_disk("data/xsum_corrupted_entityonly/train")
# NOTE(review): the disabled loaders below call `load_dataset` (not
# `load_from_disk`) and point at "data/xsum_corrupted/..." rather than
# "data/xsum_corrupted_entityonly/..." — confirm both before re-enabling.
# xsum_corrupted_val= load_dataset("data/xsum_corrupted/val")
# xsum_corrupted_test= load_dataset("data/xsum_corrupted/test")

In [4]:
"""
Download Data: this takes a while for some reason.
"""
# Nothing is downloaded in this cell: it only inspects the dataset already bound
# to `xsum_corrupted_train` and names the fields used during preprocessing.

# Raw column names, captured so they can be dropped once the split is tokenized.
column_names = xsum_corrupted_train.column_names

# Dataset fields holding the source article, the corrupted summary and the
# reference summary, respectively.
text_column = "document"
corrupted_column = "corrupted_summary"
target_column = "summary"

In [5]:
"""
Process Data
"""

# Checkpoint whose tokenizer is loaded below.
MODEL_NAME = "facebook/bart-large"

# Tokenization settings read by preprocess_function.
max_source_length = 1024  # length cap for the encoded document + corrupted-summary pair
max_target_length = 128  # length cap for the encoded reference summary
padding = False  # do not pad while preprocessing

tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    cache_dir=None,
    use_fast=True,
    revision="main",
    use_auth_token=False,
)

# Separator string; not referenced by the other visible cells.
sep_token = "</s>"

def preprocess_function(examples):
    """Tokenize a batch of (document, corrupted summary) pairs and their targets.

    Relies on the notebook-level names ``tokenizer``, ``text_column``,
    ``corrupted_column``, ``target_column``, ``max_source_length``,
    ``max_target_length`` and ``padding``.

    Args:
        examples: Batch mapping each column name to a list of values, as passed
            by ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the inputs, extended with a ``"labels"`` entry
        holding the token ids of the target summaries. When any padding
        strategy is active, pad ids inside the labels are replaced by -100 so
        that padding is ignored in the loss.
    """
    documents = examples[text_column]
    corruptions = examples[corrupted_column]
    targets = examples[target_column]

    # Encode document and corrupted summary as one sequence pair. With
    # truncation='only_first' only the document (first sequence) is shortened
    # to fit max_source_length.
    model_inputs = tokenizer(
        text=documents,
        text_pair=corruptions,
        max_length=max_source_length,
        padding=padding,
        truncation='only_first',
        add_special_tokens=True,
    )

    # Encode the reference summaries with the tokenizer in target mode.
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets, max_length=max_target_length, padding=padding, truncation=True
        )

    label_ids = labels["input_ids"]

    # Mask padding in the labels whenever the tokenizer padded at all. The
    # previous check only covered padding == "max_length", so padding=True or
    # "longest" left pad ids in the labels.
    if padding and padding != "do_not_pad":
        pad_id = tokenizer.pad_token_id
        label_ids = [
            [-100 if token_id == pad_id else token_id for token_id in sequence]
            for sequence in label_ids
        ]

    model_inputs["labels"] = label_ids
    return model_inputs

# Tokenize the training split batch by batch. The raw columns are dropped from
# the result, and the work is recomputed rather than read from the datasets
# cache (load_from_cache_file=False).
train_dataset = xsum_corrupted_train
train_dataset = train_dataset.map(
    function=preprocess_function,
    batched=True,
    remove_columns=column_names,
    load_from_cache_file=False,
    num_proc=None,
)

# val_dataset = xsum_filtered_val
# val_dataset = val_dataset.map(
#     preprocess_function,
#     batched=True,
#     num_proc=None,
#     remove_columns=column_names,
#     load_from_cache_file=False,
# )

  0%|          | 0/136 [00:00<?, ?ba/s]

In [19]:
# Switch the dataset's output format to torch, restricted to the three
# model-facing columns.
train_dataset.set_format(
    "torch",
    columns=["input_ids", "attention_mask", "labels"],
)
# val_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

In [12]:
import botocore
from datasets.filesystems import S3FileSystem

# Filesystem handle handed to save_to_disk below.
# NOTE(review): constructed without arguments — presumably relies on the
# ambient AWS credentials of the notebook environment; confirm.
s3 = S3FileSystem()  

# save train_dataset to s3
# Destination layout: s3://<session default bucket>/<s3_prefix>/train
training_input_path = f's3://{sess.default_bucket()}/{s3_prefix}/train'
train_dataset.save_to_disk(training_input_path,fs=s3)

# NOTE(review): the disabled lines below would write `val_dataset` to a path
# ending in "/test" — confirm the intended split name before re-enabling.
# # save test_dataset to s3
# test_input_path = f's3://{sess.default_bucket()}/{s3_prefix}/test'
# val_dataset.save_to_disk(test_input_path,fs=s3)