In [30]:
import numpy as np
import pandas as pd
import pickle as pkl

import string
import spacy
import sklearn
import tqdm
import math

from datasets import Dataset
import joblib
import tqdm
from tqdm.notebook import tqdm_notebook
import os

# from nltk.tokenize import TreebankWordTokenizer, RegexpTokenizer
# from nltk.tokenize.casual import casual_tokenize
# from nltk.util import ngrams

punctuations = (string.punctuation + '“' + '…')

In [2]:
# from spacy.lang.en.stop_words import STOP_WORDS as en_stop_words
# from spacy.lang.tl.stop_words import STOP_WORDS as tl_stop_words
# for w in (['ba','eh','kasi','lang','mo','naman','opo','po','si','talaga','yung']):
#     tl_stop_words.add(w)

# stop_words = en_stop_words.union(tl_stop_words)

# Baseline Preproccessing

I also tried versions with stop word removal, lemmatization, etc. No real benefits with BERT models.

In [39]:
dfs = []
for fname in list(os.walk('data/hashtags'))[0][2]:
    dfs.append(pd.read_csv(f'data/hashtags/{fname}'))

In [54]:
data = pd.concat(dfs).drop_duplicates(subset=['id']).reset_index()

In [55]:
data = data[['tweet', 'hashtags']].copy()
data['hashtags'] = data['hashtags'].apply(eval)

In [56]:
# Used to view the tags
tags = data['hashtags'].explode().dropna().str.lower()

In [60]:
data['hashtags_lower'] = data['hashtags'].apply(lambda l: [s.lower() for s in l])

In [61]:
# Manually filter through the tags to flag positive and negative tags
anti_tags = ['lenikiko2022', 'kulayrosasangbukas', 'leniforpresident2022', 'lenikikoalltheway', 'kaylenitayo']
pro_tags = ['bbmsarauniteam', 'bbmismypresident2022', 'bbmsara2022', 'bringbackmarcos']

In [62]:
data['anti'] = data['hashtags_lower'].map(lambda l: any([s in anti_tags for s in l]))
data['pro'] = data['hashtags_lower'].map(lambda l: any([s in pro_tags for s in l]))

In [65]:
# Mark all of the tweets with only one tag affiliation, and re-mark pro and anti tweets as such
data['labeled'] = data['anti'] ^ data['pro']
data['anti'] = data['anti'] & data['labeled']
data['pro'] = data['pro'] & data['labeled']

In [66]:
labeled_data = data[data['labeled']]
labeled_data.head()

Unnamed: 0,tweet,hashtags,hashtags_lower,anti,pro,labeled
0,Ang tatay at nanay ko nga hindi din pumila kas...,"[bbmsarauniteam, bbmismypresident2022]","[bbmsarauniteam, bbmismypresident2022]",False,True,True
1,Hindi pa tapos ang eleksyon may sumipsip na. #...,[bbmismypresident2022],[bbmismypresident2022],False,True,True
2,E pumila din naman ang presidente ko #Halalan...,"[halalan2022, bbmsarauniteam, bbmismypresident...","[halalan2022, bbmsarauniteam, bbmismypresident...",False,True,True
3,Titindi ang Labanan sa Pagitan nang ❤️💚 at 💛🌺 ...,"[votewisely2022, uniteam, bbmsarauniteam, bbmi...","[votewisely2022, uniteam, bbmsarauniteam, bbmi...",False,True,True
4,#Halalan2022 #BBMIsMyPresident2022,"[halalan2022, bbmismypresident2022]","[halalan2022, bbmismypresident2022]",False,True,True


In [67]:
labeled_data['processed'] = labeled_data['tweet'].str.replace('\s', ' ').str.replace(r'#\w*', '').str.replace(r'https?://\S+', "")
labeled_data['label'] = labeled_data['pro'].astype(int)

  labeled_data['processed'] = labeled_data['tweet'].str.replace('\s', ' ').str.replace(r'#\w*', '').str.replace(r'https?://\S+', "")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labeled_data['processed'] = labeled_data['tweet'].str.replace('\s', ' ').str.replace(r'#\w*', '').str.replace(r'https?://\S+', "")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labeled_data['label'] = labeled_data['pro'].astype(int)


In [83]:
labeled_data = labeled_data.drop_duplicates(subset='processed')

In [85]:
raw_dataset = Dataset.from_dict({
    'text': labeled_data['processed'],
    'label': labeled_data['label']
})

In [86]:
joblib.dump(raw_dataset, 'raw_dataset.pkl')

['raw_dataset.pkl']

# Language Detection and Dual Translation

In [97]:
import six
from google.cloud import translate_v2 as translate
from google.api_core.exceptions import ServiceUnavailable

In [98]:
import os
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'path/to/your/credentials'
os.path.isfile(os.environ['GOOGLE_APPLICATION_CREDENTIALS'])

True

In [99]:
raw_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 61025
})

In [100]:
def translate_text(target, text):
    """Translates text into the target language.

    Target must be an ISO 639-1 language code.
    See https://g.co/cloud/translate/v2/translate-reference#supported_languages
    """
    
    translate_client = translate.Client()

    if isinstance(text, six.binary_type):
        text = text.decode("utf-8")

    # Text can also be a sequence of strings, in which case this method
    # will return a sequence of results for each text.
    result = translate_client.translate(text, target_language=target)

#     print(u"Text: {}".format(result["input"]))
#     print(u"Translation: {}".format(result["translatedText"]))
#     print(u"Detected source language: {}".format(result["detectedSourceLanguage"]))
    return result

In [101]:
def add_en_translation_and_lang(sample):
    try:
        text = sample['text']
        res = translate_text('en', text)
        sample['en_translation'] = res['translatedText']
        sample['lang'] = res["detectedSourceLanguage"]
    except ServiceUnavailable:
        sample['en_translation'] = None
        sample['lang'] = None
    except Exception as e:
        sample['en_translation'] = e
        sample['lang'] = e
    return sample

In [102]:
en_translated_dataset = raw_dataset.map(add_en_translation_and_lang)

  0%|          | 0/61025 [00:00<?, ?ex/s]

In [133]:
def fix_en_translation(sample):
    if sample['lang'] is None:
        sample = add_en_translation_and_lang(sample)
    return sample

In [134]:
en_translated_dataset = en_translated_dataset.map(fix_en_translation)

  0%|          | 0/61025 [00:00<?, ?ex/s]

In [139]:
def add_tl_translation(sample):
    try:
        if sample['lang'] =='fil':
            sample['tl_translation'] = translate_text('tl', sample['en_translation'])['translatedText']
        else:
            sample['tl_translation'] = translate_text('tl', sample['text'])['translatedText']
    except ServiceUnavailable:
        sample['tl_translation'] = None
    except Exception as e:
        sample['tl_translation'] = 0
    return sample

In [140]:
dual_translated_dataset = en_translated_dataset.map(add_tl_translation)

  0%|          | 0/61025 [00:00<?, ?ex/s]

In [158]:
exceptions = [bool(not v) for v in dual_translated_dataset['tl_translation']]

In [161]:
sel_dataset = dual_translated_dataset.select([i for i in range(len(dual_translated_dataset)) if i not in np.nonzero(exceptions)[0]])

In [162]:
sel_dataset.save_to_disk("data/translated_dataset")

Flattening the indices:   0%|          | 0/62 [00:00<?, ?ba/s]

In [163]:
joblib.dump(sel_dataset, 'data/translated_dataset.pkl')

['data/translated_dataset.pkl']

## (Deprecated) Dual Translation with Translate API

In [None]:
len(''.join(raw_dataset['text']))

In [None]:
from langdetect import detect, detect_langs, LangDetectException
import langdetect

In [None]:
def sort_by_language(txts, labels):
    en = []
    tl = []
    na = []
    for txt, label in tqdm.notebook.tqdm_notebook(zip(txts, labels), total=len(txts)):
        timer = 10
        found = False
        while timer:
            try:
                langs = detect_langs(txt)
            except LangDetectException:
                break
            for l in langs:
                if l.lang == 'en':
                    en.append((txt, label))
                    found = True
                    break
                elif l.lang == 'tl':
                    tl.append((txt, label))
                    found = True
                    break
            if found:
                break
            timer -= 1
        if not found:
            na.append((txt, label))
    return {
        'en': tuple(zip(*en)), 'tl': tuple(zip(*tl)), 'na': tuple(zip(*na))
    }

In [None]:
data_sorted = sort_by_language(labeled_data['detagged'], labeled_data['label'])

In [None]:
for lang in data_sorted:
    data_sorted[lang] = {
        'text': data_sorted[lang][0],
        'labels': data_sorted[lang][1]
    }
    data_sorted[lang] = Dataset.from_dict(data_sorted[lang])

In [None]:
import copy
data = copy.deepcopy(data_sorted)

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
tl_tokenizer = AutoTokenizer.from_pretrained("jcblaise/roberta-tagalog-base")
en_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
tl_en_tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-tl-en")
tl_en_model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-tl-en")
en_tl_tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-tl")
en_tl_model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-tl")

In [None]:
from transformers import DataCollatorForSeq2Seq
en_tl_coll = DataCollatorForSeq2Seq(tokenizer=en_tl_tokenizer, model=en_tl_model)
tl_en_coll = DataCollatorForSeq2Seq(tokenizer=tl_en_tokenizer, model=tl_en_model)

In [None]:
def tokenize_function_en_tl(examples):
    return en_tl_tokenizer(examples['text'])
def tokenize_function_tl_en(examples):
    return tl_en_tokenizer(examples['text'])

In [None]:
def func_tl_en(data, batch_size=16):
    tokenized = data.map(tokenize_function_tl_en, remove_columns='text')
    tokenized = tokenized.remove_columns(['labels'])
    
    translated = []
    for start in tqdm_notebook(range(0, len(data), batch_size)):
        batch = [tokenized[i] for i in range(start, min(start+batch_size, len(data)))]
        collated = tl_en_coll(batch).to('cuda')
        out = tl_en_model.generate(**collated)
        translated.append(tl_en_tokenizer.batch_decode(out, skip_special_tokens=True))
    
    return translated

In [None]:
def func(data, tokenizer, coll, model, batch_size=16):
    def tok_func(examples):
        return tokenizer(examples['text'])
    
    tokenized = data.map(tok_func, remove_columns='text')
    tokenized = tokenized.remove_columns(['labels'])
    
    translated = []
    for start in tqdm_notebook(range(0, len(data), batch_size)):
        batch = [tokenized[i] for i in range(start, min(start+batch_size, len(data)))]
        collated = coll(batch).to('cuda')
        out = model.generate(**collated)
        translated.append(tokenizer.batch_decode(out, skip_special_tokens=True))
    
    return translated

In [None]:
en_tl_model.to('cpu')
na_en_translated = func(data['na'], tl_en_tokenizer, tl_en_coll, tl_en_model)
tl_en_model.to('cpu')
en_tl_model.to('cuda')
na_tl_translated = func(data['na'], en_tl_tokenizer, en_tl_coll, en_tl_model)

In [None]:
en_translated = sum(en_translated, [])

In [None]:
data['na'] = data['na']

In [None]:
en_translated = func_tl_en(data['tl'])

## Finishing

This is a pretty small dataset, so I'll do a 90/10 split

In [None]:
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    labeled_data['detagged'], labeled_data['label'], test_size=0.1, random_state=306)

In [None]:
import joblib
joblib.dump(X_train, './data/X_train.pkl')
joblib.dump(X_test, './data/X_test.pkl')
joblib.dump(y_train, './data/y_train.pkl')
joblib.dump(y_test, './data/y_test.pkl')

# Other Preprocessing Tricks

In [None]:
from spacy.lang.en.stop_words import STOP_WORDS as en_stop_words
from spacy.lang.tl.stop_words import STOP_WORDS as tl_stop_words
for w in (['ba','eh','kasi','lang','mo','naman','opo','po','si','talaga','yung']):
    tl_stop_words.add(w)


stop_words = en_stop_words.union(tl_stop_words)

In [None]:
text['sw_removed'] = text['tokenized'].apply(lambda l: [x for x in l if x not in stop_words])

In [None]:
text['sw_removed']