In [1]:
from datasets import load_from_disk, ClassLabel
from transformers import AutoTokenizer

In [2]:
# Load the translated dataset from disk, rename its "label" column to
# "labels", and encode that column as class labels.
base_dataset = (
    load_from_disk('./data/translated_dataset')
    .rename_column("label", "labels")
    .class_encode_column("labels")
)

Loading cached processed dataset at data/translated_dataset\cache-0c45efaa9837687b.arrow
Loading cached processed dataset at data/translated_dataset\cache-b290176a263dc347.arrow
Loading cached processed dataset at data/translated_dataset\cache-d7ff6d7f86dd767c.arrow


In [3]:
# Hold out 10% of the rows as a test split, stratified on 'labels'; the fixed
# seed keeps the split reproducible across runs.
# NOTE(review): this rebinds `base_dataset` to the split result, so re-running
# this cell without first re-running the load cell above may fail — confirm.
base_dataset = base_dataset.train_test_split(stratify_by_column='labels', seed=306, test_size=0.1)

Loading cached split indices for dataset at data/translated_dataset\cache-fc9934ac9315e141.arrow and data/translated_dataset\cache-c2ec7361a7f40877.arrow


In [4]:
def tokenize_sample(sample, tokenizer, text_col='text'):
    """Tokenize one column of `sample` and merge the tokenizer output into it.

    Args:
        sample: Mutable mapping that contains the column named by `text_col`
            (a single example or a batch). It is updated in place.
        tokenizer: Callable that takes the column value and returns a mapping
            of output field names to values.
        text_col: Key of the column to tokenize. Defaults to 'text'.

    Returns:
        The same `sample` object, with every key of the tokenizer output
        added (or overwritten).
    """
    # Honor `text_col`: the column name used to be hard-coded to 'text',
    # which silently ignored the parameter.
    tokenized = tokenizer(sample[text_col])
    for k in tokenized:
        sample[k] = tokenized[k]
    return sample

## Model 1: roberta-tagalog-base (base text)

In [5]:
# Load the tokenizer that ships with the jcblaise/roberta-tagalog-base checkpoint.
tl_tokenizer = AutoTokenizer.from_pretrained("jcblaise/roberta-tagalog-base")

In [6]:
def process_tl(sample):
    """Tokenize the 'text' column with `tl_tokenizer`.

    Only 'text' is copied from the incoming sample; the returned mapping holds
    that column plus whatever `tokenize_sample` adds to it.
    """
    text_only = dict(text=sample['text'])
    return tokenize_sample(text_only, tl_tokenizer)

In [7]:
# Tokenize in batches with the Tagalog tokenizer, then drop the raw text,
# translation, and language columns.
dataset_1 = (
    base_dataset
    .map(process_tl, batched=True)
    .remove_columns(['text', 'en_translation', 'lang', 'tl_translation'])
)

Loading cached processed dataset at data/translated_dataset\cache-d8b03184ac0a1641.arrow


  0%|          | 0/7 [00:00<?, ?ba/s]

In [8]:
# Write the tokenized Model 1 dataset to data/dataset_1.
dataset_1.save_to_disk('data/dataset_1')

## Model 2: XLM-RoBERTa-base

In [9]:
# Load the tokenizer that ships with the xlm-roberta-base checkpoint.
xlm_tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

In [10]:
# Inspection only: tokenize the literal string '<pad>' to see which ids it
# produces (the recorded output shows three ids: 0, 1, 2).
xlm_tokenizer('<pad>')

{'input_ids': [0, 1, 2], 'attention_mask': [1, 1, 1]}

In [11]:
def process_xlm(sample):
    """Run `tokenize_sample` on `sample` with `xlm_tokenizer` and return the result."""
    return tokenize_sample(sample, xlm_tokenizer)

In [None]:
# Tokenize in batches with the XLM-RoBERTa tokenizer, then drop the raw text,
# translation, and language columns.
dataset_2 = (
    base_dataset
    .map(process_xlm, batched=True)
    .remove_columns(['text', 'en_translation', 'lang', 'tl_translation'])
)

  0%|          | 0/55 [00:00<?, ?ba/s]

In [None]:
# Write the tokenized Model 2 dataset to data/dataset_2.
dataset_2.save_to_disk('data/dataset_2')

## Model 3: bert-base-uncased

In [None]:
# Load the tokenizer that ships with the bert-base-uncased checkpoint.
en_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [None]:
def process_en(sample):
    """Run `tokenize_sample` on `sample` with `en_tokenizer` and return the result."""
    return tokenize_sample(sample, en_tokenizer)

In [None]:
# Tokenize in batches with the BERT tokenizer, then drop the raw text,
# translation, and language columns.
dataset_3 = (
    base_dataset
    .map(process_en, batched=True)
    .remove_columns(['text', 'en_translation', 'lang', 'tl_translation'])
)

In [None]:
# Write the tokenized Model 3 dataset to data/dataset_3.
dataset_3.save_to_disk('data/dataset_3')

## Model 4: Bilingual, Non-Translated

In [None]:
def tokenize_bilingual(sample, tl_tokenizer=tl_tokenizer, en_tokenizer=en_tokenizer, tl_col='text', en_col='text'):
    """Tokenize a sample with two tokenizers and store both outputs under prefixes.

    Args:
        sample: Mutable mapping that contains `tl_col` and `en_col`. It is
            updated in place.
        tl_tokenizer: Callable applied to `sample[tl_col]`; its output fields
            are stored under keys prefixed with 'tl_'.
        en_tokenizer: Callable applied to `sample[en_col]`; its output fields
            are stored under keys prefixed with 'en_'.
        tl_col: Column fed to `tl_tokenizer`. Defaults to 'text'.
        en_col: Column fed to `en_tokenizer`. Defaults to 'text'.

    Returns:
        The same `sample` object with the prefixed tokenizer fields added.
    """
    # The 'tl' pass runs first, then 'en', so keys are inserted in that order.
    passes = (
        ('tl', tl_tokenizer, tl_col),
        ('en', en_tokenizer, en_col),
    )
    for prefix, tokenize, column in passes:
        encoded = tokenize(sample[column])
        for field in encoded:
            sample[f'{prefix}_{field}'] = encoded[field]
    return sample

In [None]:
# Tokenize the untranslated 'text' column with both tokenizers in batches,
# then drop the raw text, translation, and language columns.
dataset_4 = (
    base_dataset
    .map(tokenize_bilingual, batched=True)
    .remove_columns(['text', 'en_translation', 'lang', 'tl_translation'])
)

In [None]:
# Write the tokenized Model 4 dataset to data/dataset_4.
dataset_4.save_to_disk('data/dataset_4')

## Model 5: Bilingual, Half-Translated

In [None]:
def half_translate(sample):
    """Tokenize a single row bilingually, translating only where its language requires.

    A row whose 'lang' is 'fil' keeps 'text' for the Tagalog tokenizer and
    uses 'en_translation' for the English one. A row whose 'lang' is 'en'
    keeps 'text' for the English tokenizer and uses 'tl_translation' for the
    Tagalog one. Any other 'lang' value uses both translation columns.

    Returns whatever `tokenize_bilingual` returns for the chosen columns.
    """
    lang = sample['lang']
    column_overrides = {}
    # Anything that is not Filipino needs the Tagalog translation.
    if lang != 'fil':
        column_overrides['tl_col'] = 'tl_translation'
    # Anything that is not English needs the English translation.
    if lang != 'en':
        column_overrides['en_col'] = 'en_translation'
    return tokenize_bilingual(sample, **column_overrides)

In [None]:
# Tokenize row by row (half_translate branches on each row's 'lang'), then
# drop the raw text, translation, and language columns.
dataset_5 = (
    base_dataset
    .map(half_translate)
    .remove_columns(['text', 'en_translation', 'lang', 'tl_translation'])
)

In [None]:
# Write the tokenized Model 5 dataset to data/dataset_5.
dataset_5.save_to_disk('data/dataset_5')

## Model 6: Bilingual, Fully Translated

In [None]:
def full_translate(sample):
    """Tokenize bilingually using the translated columns for both languages.

    'tl_translation' is passed as the Tagalog column and 'en_translation' as
    the English column of `tokenize_bilingual`.
    """
    return tokenize_bilingual(sample, en_col='en_translation', tl_col='tl_translation')

In [None]:
# Tokenize both translated columns for every split. batched=True passes whole
# batches of rows to the tokenizers, as the dataset_4 map already does,
# instead of making one Python call per row. full_translate only selects
# columns and the tokenizers are called without padding, so each row's
# output is the same as in row-by-row mode.
dataset_6 = base_dataset.map(full_translate, batched=True)
# Drop the raw text, translation, and language columns.
dataset_6 = dataset_6.remove_columns(['text', 'en_translation', 'lang', 'tl_translation'])

In [None]:
# Write the tokenized Model 6 dataset to data/dataset_6.
dataset_6.save_to_disk('data/dataset_6')