In [1]:
from datasets import load_dataset
from datasets import DatasetDict
import transformers 
import datasets
import logging
import sys
import os


  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [2]:
# Logging: send INFO-and-above records to stdout so they appear in the cell output
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler(sys.stdout)],
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Record which library versions produced this run
logger.info(f'[Using Transformers: {transformers.__version__}]')
logger.info(f'[Using Datasets: {datasets.__version__}]')

2023-08-31 15:05:20,894 - __main__ - INFO - [Using Transformers: 4.32.1]
2023-08-31 15:05:20,896 - __main__ - INFO - [Using Datasets: 2.14.4]


In [50]:
# Essentials
# LOCAL_INPUT_PATH is mapped to the S3 input location for the COVID news articles
LOCAL_INPUT_PATH = './data/'
# LOCAL_OUTPUT_PATH is mapped to the S3 output location where the processed input data (COVID articles) is saved
LOCAL_OUTPUT_PATH = './'
MAX_LENGTH = 128
CHUNK_SIZE = 128
N_GPUS = 0

# Load the custom tokenizer explicitly so this cell works after "Restart Kernel -> Run All".
# `tokenizer` was previously used here without being defined in any cell of the notebook.
# The vocab directory resolves to './data/vocab', the same location the `tokenize` helper loads from.
tokenizer = transformers.BertTokenizerFast.from_pretrained(
    os.path.join(LOCAL_INPUT_PATH, 'vocab'),
    model_max_length=MAX_LENGTH,
)

# Evaluate custom tokenizer
logger.info('Evaluating custom tokenizer')
test_sentence = 'amazon simple storage service (amazon s3) is an object storage service that offers industry-leading scalability, data availability, security, and performance.'
logger.info(f'Test sentence: {test_sentence}')
tokens = tokenizer.encode(test_sentence)
logger.info(f'Encoded sentence: {tokens}')
# NOTE(review): convert_ids_to_tokens maps an ID to its token string, so despite its name
# `token_id` holds the token for vocabulary ID 203 ('s3' in the recorded run). The ID is
# specific to this vocabulary; the log label below is kept as-is to match the recorded output.
token_id = tokenizer.convert_ids_to_tokens(203)
logger.info(f'Token ID for token (s3) = {token_id}')
vocab_size = tokenizer.vocab_size
logger.info(f'Vocabulary size = {vocab_size}')

# Read dataset and collate to create mini batches for Masked Language Model (MLM) training
logger.info('Reading and collating input data to create mini batches for Masked Language Model (MLM) training')
# os.path.join avoids the doubled separator ('./data//mlm_dataset.txt') that plain
# string formatting produced because LOCAL_INPUT_PATH already ends with '/'.
dataset = load_dataset('text', data_files=os.path.join(LOCAL_INPUT_PATH, 'mlm_dataset.txt'), split='train')
logger.info(f'Dataset: {dataset}')

# Split dataset into train and validation splits (fixed seed keeps the 90/10 split reproducible)
logger.info('Splitting dataset into train and validation splits')
train_test_splits = dataset.train_test_split(shuffle=True, seed=123, test_size=0.1)
data_splits = DatasetDict({'train': train_test_splits['train'],
                           'validation': train_test_splits['test']})
logger.info(f'Data splits: {data_splits}')
    

# Tokenize dataset
def tokenize(article, vocab_path="./data/vocab", max_length=128):
    """Tokenize a batch of articles with the custom BERT tokenizer.

    The tokenizer is loaded inside the function so the function does not rely on
    notebook globals when `datasets` runs it in worker processes (`num_proc > 1`).
    Articles are tokenized at full length: no truncation or padding is requested
    in the call, and fixed-length examples are produced later by `concat_and_chunk`.

    Args:
        article: Batch mapping with a 'text' entry holding the list of raw article strings.
        vocab_path: Directory (or identifier) to load the `BertTokenizerFast` from.
        max_length: Value stored as the tokenizer's `model_max_length`.

    Returns:
        The tokenizer output for the batch, with an added 'word_ids' entry holding
        one list of word indices per article.
    """
    # Local import keeps the function self-contained for worker processes.
    from transformers import BertTokenizerFast

    tokenizer = BertTokenizerFast.from_pretrained(vocab_path, model_max_length=max_length)
    encoded = tokenizer(article['text'])
    # BertTokenizerFast is a fast tokenizer, so per-sequence word ids are always available.
    encoded['word_ids'] = [encoded.word_ids(i) for i in range(len(encoded['input_ids']))]
    return encoded


logger.info('Tokenizing dataset splits')
# os.cpu_count() returns None when the CPU count cannot be determined; fall back to one process.
num_proc = os.cpu_count() or 1
# The raw 'text' column is dropped so only the tokenizer outputs remain.
tokenized_datasets = data_splits.map(tokenize, batched=True, num_proc=num_proc, remove_columns=['text'])
logger.info(f'Tokenized datasets: {tokenized_datasets}')


# Concat and chunk dataset
def concat_and_chunk(articles, chunk_size=128):
    """Concatenate every column of a batch and re-split it into fixed-size chunks.

    Each column (e.g. 'input_ids', 'attention_mask', 'word_ids') is flattened into
    one long sequence and cut into consecutive pieces of `chunk_size` items. A
    trailing remainder shorter than `chunk_size` is dropped. A 'labels' column is
    added as an independent copy of the chunked 'input_ids'.

    Args:
        articles: Batch mapping of column name -> list of per-article lists.
            Must contain an 'input_ids' column.
        chunk_size: Number of items per output chunk (defaults to 128).

    Returns:
        Dict with the same columns, each a list of `chunk_size`-long lists, plus 'labels'.

    Raises:
        ValueError: If `chunk_size` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError(f'chunk_size must be a positive integer, got {chunk_size}')

    # Flatten each column into a single sequence
    flattened = {key: [item for row in rows for item in row] for key, rows in articles.items()}

    # The length of the first column decides how many full chunks are produced
    first_key = list(flattened)[0]
    # Round down to a multiple of chunk_size: the short last chunk is dropped
    total_length = (len(flattened[first_key]) // chunk_size) * chunk_size

    chunked_articles = {
        key: [sequence[start:start + chunk_size] for start in range(0, total_length, chunk_size)]
        for key, sequence in flattened.items()
    }
    # Copy rather than alias, so later in-place edits to the labels cannot silently
    # change the inputs (or vice versa)
    chunked_articles['labels'] = [list(chunk) for chunk in chunked_articles['input_ids']]
    return chunked_articles
    
logger.info('Concatenating and chunking the datasets to a fixed length')
# concat_and_chunk drops the incomplete trailing chunk of every batch it receives,
# so the output has fewer rows than the tokenized input.
chunked_datasets = tokenized_datasets.map(concat_and_chunk, batched=True, num_proc=num_proc)
logger.info(f'Chunked datasets: {chunked_datasets}')

# Save chunked datasets to local disk (EBS volume)
logger.info(f'Saving chunked datasets to local disk {LOCAL_OUTPUT_PATH}')
chunked_datasets.save_to_disk(LOCAL_OUTPUT_PATH)

# Validate if datasets were saved correctly: reload and compare against what was written
logger.info('Validating if datasets were saved correctly')
reloaded_dataset = datasets.load_from_disk(LOCAL_OUTPUT_PATH)
logger.info(f'Reloaded dataset: {reloaded_dataset}')
# num_rows / column_names are per-split mappings, so this checks every split at once
if (reloaded_dataset.num_rows != chunked_datasets.num_rows
        or reloaded_dataset.column_names != chunked_datasets.column_names):
    raise RuntimeError(
        f'Reloaded dataset does not match the saved one: '
        f'rows {reloaded_dataset.num_rows} vs {chunked_datasets.num_rows}, '
        f'columns {reloaded_dataset.column_names} vs {chunked_datasets.column_names}'
    )

2023-08-31 17:44:08,260 - __main__ - INFO - Evaluating custom tokenizer
2023-08-31 17:44:08,261 - __main__ - INFO - Test sentence: amazon simple storage service (amazon s3) is an object storage service that offers industry-leading scalability, data availability, security, and performance.
2023-08-31 17:44:08,262 - __main__ - INFO - Encoded sentence: [2, 137, 929, 338, 294, 10, 137, 203, 11, 180, 122, 254, 338, 294, 191, 932, 1871, 14, 2639, 2292, 13, 258, 602, 13, 587, 13, 131, 634, 15, 3]
2023-08-31 17:44:08,263 - __main__ - INFO - Token ID for token (s3) = s3
2023-08-31 17:44:08,264 - __main__ - INFO - Vocabulary size = 3136
2023-08-31 17:44:08,264 - __main__ - INFO - Reading and collating input data to create mini batches for Masked Language Model (MLM) training
2023-08-31 17:44:08,890 - __main__ - INFO - Dataset: Dataset({
    features: ['text'],
    num_rows: 223
})
2023-08-31 17:44:08,891 - __main__ - INFO - Splitting dataset into train and validation splits
2023-08-31 17:44:08,8

Map (num_proc=12): 100%|██████████| 200/200 [00:04<00:00, 46.13 examples/s]
Map (num_proc=12): 100%|██████████| 23/23 [00:04<00:00,  5.41 examples/s]

2023-08-31 17:44:17,973 - __main__ - INFO - Tokenized datasets: DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids'],
        num_rows: 200
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids'],
        num_rows: 23
    })
})
2023-08-31 17:44:17,974 - __main__ - INFO - Concatenating and chunking the datasets to a fixed length



Map (num_proc=12): 100%|██████████| 200/200 [00:03<00:00, 56.86 examples/s]
Map (num_proc=12): 100%|██████████| 23/23 [00:03<00:00,  6.54 examples/s]

2023-08-31 17:44:25,464 - __main__ - INFO - Chunked datasets: DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 177
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 18
    })
})
2023-08-31 17:44:25,465 - __main__ - INFO - Saving chunked datasets to local disk ./



Saving the dataset (1/1 shards): 100%|██████████| 177/177 [00:00<00:00, 17697.91 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 18/18 [00:00<00:00, 2575.83 examples/s]


2023-08-31 17:44:25,501 - __main__ - INFO - Validating if datasets were saved correctly
2023-08-31 17:44:25,544 - __main__ - INFO - Reloaded dataset: DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 177
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 18
    })
})
