In [1]:
from datasets import load_from_disk, ClassLabel
from transformers import AutoTokenizer

In [2]:
# Load the full translated dataset previously saved to disk.
base_dataset = load_from_disk('./data/full_translated_dataset')
# Rename the target column from "label" to "labels".
base_dataset = base_dataset.rename_column("label", "labels")
# Encode "labels" as a class-label feature; the stratified split in a later cell is keyed on this column.
base_dataset = base_dataset.class_encode_column("labels")



Stringifying the column:   0%|          | 0/64 [00:00<?, ?ba/s]

Casting to class labels:   0%|          | 0/64 [00:00<?, ?ba/s]

In [3]:
# Stratified split on "labels" with 10% held out for test and a fixed seed.
# NOTE: this rebinds base_dataset to the split result, so re-running this cell on its own
# operates on the already-split object; re-run the loading cell first.
base_dataset = base_dataset.train_test_split(stratify_by_column='labels', seed=306, test_size=0.1)

In [4]:
def tokenize_sample(sample, tokenizer, text_col='text'):
    """Tokenize one column of ``sample`` and append the tokenizer outputs to it.

    Args:
        sample: Mutable mapping (a single row or a batch) that contains ``text_col``.
        tokenizer: Callable applied to ``sample[text_col]``; it must return a
            mapping whose keys can be iterated and indexed.
        text_col: Name of the column to tokenize. Defaults to ``'text'``.

    Returns:
        The same ``sample`` object, updated in place with one entry per key
        returned by the tokenizer.
    """
    # Honour text_col: the column name used to be hard-coded to 'text', which
    # silently ignored the argument.
    tokenized = tokenizer(sample[text_col])
    for key in tokenized:
        sample[key] = tokenized[key]
    return sample

## Model 1: roberta-tagalog-base (base text)

In [5]:
# Tokenizer for the Tagalog RoBERTa checkpoint; also used as the default tl_tokenizer of tokenize_bilingual.
tl_tokenizer = AutoTokenizer.from_pretrained("jcblaise/roberta-tagalog-base")

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/472k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/272k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/804k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [6]:
def process_tl(sample):
    """Tokenize the 'text' column with the Tagalog tokenizer.

    A fresh dict holding only 'text' is built first, so the incoming
    ``sample`` object itself is never mutated; the returned dict contains
    'text' plus whatever keys the tokenizer produces.
    """
    text_only = {'text': sample['text']}
    return tokenize_sample(text_only, tl_tokenizer)

In [10]:
# Model 1 input: apply process_tl in batched mode (Tagalog tokenizer on the 'text' column).
dataset_1 = base_dataset.map(process_tl, batched=True)
# Column pruning is left disabled:
# dataset_1 = dataset_1.remove_columns(['text', 'en_translation', 'lang', 'tl_translation'])

Loading cached processed dataset at E:\Projects\Elections\data\full_translated_dataset\cache-a9a2e56f6b89246f.arrow
Loading cached processed dataset at E:\Projects\Elections\data\full_translated_dataset\cache-d38c05faebcbbcfd.arrow


In [11]:
# Persist the tokenized dataset for model 1.
dataset_1.save_to_disk('data/dataset_1')

Saving the dataset (0/1 shards):   0%|          | 0/57401 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/6378 [00:00<?, ? examples/s]

## Model 2: XLM-RoBERTa-base

In [12]:
# Tokenizer for the xlm-roberta-base checkpoint.
xlm_tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

Downloading (…)lve/main/config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [13]:
def process_xlm(sample):
    """Tokenize the sample with the XLM-RoBERTa tokenizer via tokenize_sample."""
    return tokenize_sample(sample, xlm_tokenizer)

In [14]:
# Model 2 input: apply process_xlm in batched mode (XLM-RoBERTa tokenizer).
dataset_2 = base_dataset.map(process_xlm, batched=True)
# Column pruning is left disabled:
# dataset_2 = dataset_2.remove_columns(['text', 'en_translation', 'lang', 'tl_translation'])

  0%|          | 0/58 [00:00<?, ?ba/s]

  0%|          | 0/7 [00:00<?, ?ba/s]

In [15]:
# Persist the tokenized dataset for model 2.
dataset_2.save_to_disk('data/dataset_2')

Saving the dataset (0/1 shards):   0%|          | 0/57401 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/6378 [00:00<?, ? examples/s]

## Model 3: bert-base-uncased

In [16]:
# Tokenizer for the bert-base-uncased checkpoint; also used as the default en_tokenizer of tokenize_bilingual.
en_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [17]:
def process_en(sample):
    """Tokenize the sample with the bert-base-uncased tokenizer via tokenize_sample."""
    return tokenize_sample(sample, en_tokenizer)

In [18]:
# Model 3 input: apply process_en in batched mode (bert-base-uncased tokenizer).
dataset_3 = base_dataset.map(process_en, batched=True)
# Column pruning is left disabled:
# dataset_3 = dataset_3.remove_columns(['text', 'en_translation', 'lang', 'tl_translation'])

  0%|          | 0/58 [00:00<?, ?ba/s]

  0%|          | 0/7 [00:00<?, ?ba/s]

In [19]:
# Persist the tokenized dataset for model 3.
dataset_3.save_to_disk('data/dataset_3')

Saving the dataset (0/1 shards):   0%|          | 0/57401 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/6378 [00:00<?, ? examples/s]

## Model 4: Bilingual, Non-Translated

In [20]:
def tokenize_bilingual(sample, tl_tokenizer=tl_tokenizer, en_tokenizer=en_tokenizer, tl_col='text', en_col='text'):
    """Tokenize a sample with two tokenizers and store both results under prefixed keys.

    Args:
        sample: Mutable mapping containing the ``tl_col`` and ``en_col`` columns.
        tl_tokenizer: Tokenizer applied to ``sample[tl_col]``; each output key
            is stored as ``'tl_<key>'``.
        en_tokenizer: Tokenizer applied to ``sample[en_col]``; each output key
            is stored as ``'en_<key>'``.
        tl_col: Column fed to ``tl_tokenizer``. Defaults to ``'text'``.
        en_col: Column fed to ``en_tokenizer``. Defaults to ``'text'``.

    Returns:
        The same ``sample`` object, updated in place with the prefixed entries.
    """
    # First pass: tl_tokenizer output goes under the 'tl_' prefix.
    for key, value in tl_tokenizer(sample[tl_col]).items():
        sample[f'tl_{key}'] = value
    # Second pass: en_tokenizer output goes under the 'en_' prefix.
    for key, value in en_tokenizer(sample[en_col]).items():
        sample[f'en_{key}'] = value
    return sample

In [21]:
# Model 4 input: tokenize_bilingual with its defaults, so both tokenizers read the untranslated 'text' column.
dataset_4 = base_dataset.map(tokenize_bilingual, batched=True)
# Column pruning is left disabled:
# dataset_4 = dataset_4.remove_columns(['text', 'en_translation', 'lang', 'tl_translation'])

  0%|          | 0/58 [00:00<?, ?ba/s]

  0%|          | 0/7 [00:00<?, ?ba/s]

In [22]:
# Persist the tokenized dataset for model 4.
dataset_4.save_to_disk('data/dataset_4')

Saving the dataset (0/1 shards):   0%|          | 0/57401 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/6378 [00:00<?, ? examples/s]

## Model 5: Bilingual, Half-Translated

In [23]:
def half_translate(sample):
    """Tokenize bilingually, substituting a translation only where needed.

    Column selection by ``sample['lang']``:
      * ``'fil'``: the English tokenizer reads ``'en_translation'``; the
        Tagalog tokenizer keeps its default column.
      * ``'en'``: the Tagalog tokenizer reads ``'tl_translation'``; the
        English tokenizer keeps its default column.
      * anything else: both tokenizers read their translation columns.

    Expects a single row (``lang`` is compared against a string), not a batch.
    """
    lang = sample['lang']
    overrides = {}
    # Use the English translation unless the text is already English.
    if lang != 'en':
        overrides['en_col'] = 'en_translation'
    # Use the Tagalog translation unless the text is already Filipino.
    if lang != 'fil':
        overrides['tl_col'] = 'tl_translation'
    return tokenize_bilingual(sample, **overrides)

In [24]:
# Model 5 input: mapped row by row (no batched=True) because half_translate branches on each sample's 'lang' value.
dataset_5 = base_dataset.map(half_translate)
# Column pruning is left disabled:
# dataset_5 = dataset_5.remove_columns(['text', 'en_translation', 'lang', 'tl_translation'])

  0%|          | 0/57401 [00:00<?, ?ex/s]

  0%|          | 0/6378 [00:00<?, ?ex/s]

In [25]:
# Persist the tokenized dataset for model 5.
dataset_5.save_to_disk('data/dataset_5')

Saving the dataset (0/1 shards):   0%|          | 0/57401 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/6378 [00:00<?, ? examples/s]

## Model 6: Bilingual, Fully Translated

In [26]:
def full_translate(sample):
    """Tokenize bilingually with both tokenizers reading their translation columns."""
    return tokenize_bilingual(sample, en_col='en_translation', tl_col='tl_translation')

In [27]:
# Model 6 input: mapped row by row; both tokenizers read the translation columns for every sample.
dataset_6 = base_dataset.map(full_translate)
# Column pruning is left disabled:
# dataset_6 = dataset_6.remove_columns(['text', 'en_translation', 'lang', 'tl_translation'])

  0%|          | 0/57401 [00:00<?, ?ex/s]

  0%|          | 0/6378 [00:00<?, ?ex/s]

In [28]:
# Persist the tokenized dataset for model 6.
dataset_6.save_to_disk('data/dataset_6')

Saving the dataset (0/1 shards):   0%|          | 0/57401 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/6378 [00:00<?, ? examples/s]