In [3]:
!mkdir -p fasttext
!curl -L -o fasttext/cc.vi.300.vec.gz https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.vi.300.vec.gz
!gunzip fasttext/cc.vi.300.vec.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1177M  100 1177M    0     0   277M      0  0:00:04  0:00:04 --:--:--  284M


In [4]:
from gensim.models.keyedvectors import KeyedVectors

model = KeyedVectors.load_word2vec_format('./fasttext/cc.vi.300.vec', binary=False)

In [5]:
model.most_similar('dốt', topn=10)

[('Dốt', 0.6049689054489136),
 ('ngu', 0.5371081233024597),
 ('Đếck', 0.5134807229042053),
 ('giỏi', 0.502494215965271),
 ('dốt', 0.4523274302482605),
 ('học', 0.44529086351394653),
 ('xuẫn', 0.430868536233902),
 ('ngờHỏi', 0.4276396334171295),
 ('ignorant', 0.421291321516037),
 ('đoảng', 0.419640451669693)]

In [2]:
import random
import re

import pandas as pd
from datasets import load_dataset

In [3]:
dataset = load_dataset('sonlam1102/vihsd')
dataset

DatasetDict({
    train: Dataset({
        features: ['free_text', 'label_id'],
        num_rows: 24048
    })
    validation: Dataset({
        features: ['free_text', 'label_id'],
        num_rows: 2672
    })
    test: Dataset({
        features: ['free_text', 'label_id'],
        num_rows: 6680
    })
})

In [4]:
train = pd.DataFrame(dataset['train'])

clean_data = train[train['label_id'] == 0]
offensive_data = train[train['label_id'] == 1]
hate_data = train[train['label_id'] == 2]

print(f"Number of clean samples: {len(clean_data)}")
print(f"Number of offensive samples: {len(offensive_data)}")
print(f"Number of hate samples: {len(hate_data)}")

Number of clean samples: 19886
Number of offensive samples: 1606
Number of hate samples: 2556


In [10]:
stop_words = []

diacritic_variants = {
    'a': ['á', 'à', 'ả', 'ã', 'ạ', 'ă', 'ắ', 'ằ', 'ẳ', 'ẵ', 'ặ', 'â', 'ấ', 'ầ', 'ẩ', 'ẫ', 'ậ'],
    'e': ['é', 'è', 'ẻ', 'ẽ', 'ẹ', 'ê', 'ế', 'ề', 'ể', 'ễ', 'ệ'],
    'i': ['í', 'ì', 'ỉ', 'ĩ', 'ị'],
    'o': ['ó', 'ò', 'ỏ', 'õ', 'ọ', 'ô', 'ố', 'ồ', 'ổ', 'ỗ', 'ộ', 'ơ', 'ớ', 'ờ', 'ở', 'ỡ', 'ợ'],
    'u': ['ú', 'ù', 'ủ', 'ũ', 'ụ', 'ư', 'ứ', 'ừ', 'ử', 'ữ', 'ự'],
    'y': ['ý', 'ỳ', 'ỷ', 'ỹ', 'ỵ'],
    'd': ['đ']
}

reverse_diacritic_variants = {v: k for k, variants in diacritic_variants.items() for v in variants }


def synonym_replacement(parts: list[str], n: int = 1) -> list[str]:
    new_parts = parts.copy()
    words = [
        (i, word)
        for i, word in enumerate(new_parts)
        if word.isalnum() and word not in stop_words
    ]
    random.shuffle(words)

    num_replaced = 0
    for i, word in words:
        synonyms = _get_synonyms(word)
        if len(synonyms) == 0:
            continue

        new_parts[i] = random.choice(synonyms)
        num_replaced += 1

        if num_replaced >= n:
            break

    return new_parts


def _get_synonyms(word: str) -> list[str]:
    similar_words = model.most_similar(word)
    return [word for word, score in similar_words if score > 0.5]


def random_insertion(parts: list[str], n: int = 1) -> list[str]:
    new_parts = parts.copy()
    for _ in range(n):
        new_parts = _add_random_synonym(new_parts)
    return get_parts(''.join(new_parts))


def _add_random_synonym(parts: list[str]) -> list[str]:
    words = [word for word in parts if word.isalnum()]

    synonyms = []
    counter = 0
    while len(synonyms) < 1 and len(words) > 0:
        random_word = random.choice(words)

        try:
            synonyms = _get_synonyms(random_word)
        except Exception as e:
            print(e)

        counter += 1
        if counter >= 10:
            return

    if len(words) > 0:
        random_synonym = synonyms[0]
        random_idx = random.randint(0, len(words) - 1)
        parts.insert(random_idx, ' ')
        parts.insert(random_idx, random_synonym)
        parts.insert(random_idx, ' ')
    
    return parts


def random_swap(parts: list[str], n: int = 1) -> list[str]:
    new_parts = parts.copy()
    ids = [i for i, token in enumerate(new_parts) if not token.isspace()]
    for _ in range(n):
        if len(ids) >= 2:
            id_1, id_2 = random.sample(ids, 2)
            new_parts[id_1], new_parts[id_2] = new_parts[id_2], new_parts[id_1]
    return new_parts


def random_deletion(parts: list[str], p: float = 0.1) -> list[str]:
    if len(parts) <= 1:
        return parts

    new_parts = []
    for part in parts:
        if random.uniform(0, 1) < p and not part.isspace():
            continue
        new_parts.append(part)

    if len(new_parts) == 0:
        rand_int = random.randint(0, len(parts) - 1)
        return parts[rand_int]

    return new_parts


def noise_replacement(parts: list[str], noise_level: float = 0.2) -> list[str]:
    new_parts = parts.copy()

    words = [(i, word) for i, word in enumerate(new_parts) if word.isalnum()]
    random.shuffle(words)

    num_words_to_change = int(len(words) * noise_level)

    for _ in range(num_words_to_change):
        idx, word = random.choice(words)
        char_list = list(word)

        for i, char in enumerate(char_list):
            if char.lower() in reverse_diacritic_variants:
                new_char = reverse_diacritic_variants[char.lower()]
                new_char = new_char.upper() if char.isupper() else new_char
                char_list[i] = new_char

        new_parts[idx] = ''.join(char_list)

    return new_parts


pattern = re.compile(r'\w+|\s+|[^\w\s]+')

def get_parts(text: str) -> list[str]:
    return pattern.findall(text)


def get_augment_data(
    data: pd.DataFrame,
    min_words: int = 4,
    num_aug: int = 4,
    alpha_sr: float = 0.1,
    alpha_ri: float = 0.1,
    alpha_rs: float = 0.1,
    p_rd: float = 0.1,
    p_nr: float = 0.2,
    num_methods: int = 3,
) -> list[str]:
    augmented_data = pd.DataFrame(columns=['free_text', 'label_id'])
    augmented_data = augmented_data.loc[:100]

    def get_augmented_texts(text: str) -> list[str]:
        parts = get_parts(str(text))

        num_words = len([part for part in parts if part.isalnum()])
        num_tokens = len([part for part in parts if not part.isspace()])
        
        if num_words < min_words:
            return []

        n_sr = max(1, int(alpha_sr * num_words))
        n_ri = max(1, int(alpha_ri * num_tokens))
        n_rs = max(1, int(alpha_rs * num_tokens))

        methods = [
            lambda x: synonym_replacement(x, n_sr),
            lambda x: random_insertion(x, n_ri),
            lambda x: random_swap(x, n_rs),
            lambda x: random_deletion(x, p_rd),
            lambda x: noise_replacement(x, p_nr),
        ]

        augmented_texts = []

        for _ in range(num_aug):
            new_parts = parts.copy()
            for method in random.choices(methods, k=num_methods):
                try:
                    new_parts = method(new_parts)
                except Exception as e:
                    print(e)
                    pass

            new_parts = [p for p in ''.join(new_parts).split(' ') if p.strip()]
            augmented_text = ' '.join(new_parts)
            augmented_texts.append(augmented_text)

        return augmented_texts

    for _, row in data.iterrows():
        text = row['free_text']
        label = row['label_id']
        augmented_texts = get_augmented_texts(text)
        for augmented_text in augmented_texts:
            augmented_data.loc[len(augmented_data)] = [augmented_text, label]

    return augmented_data

In [51]:
count = 0
for text in offensive_data['free_text'].values:
    parts = get_parts(str(text))
    words = [word for word in parts if word.isalnum()]
    if len(words) >= 4:
        count += 1

count

1269

In [52]:
!mkdir -p data

In [53]:
aug_offensive = get_augment_data(
    data=offensive_data,
    min_words=4,
    num_aug=4,
    alpha_sr=0.3,
    alpha_ri=0.3,
    alpha_rs=0.3,
    p_rd=0.3,
    p_nr=0.3,
    num_methods=3,
)
aug_offensive.to_csv('data/aug_offensive.csv', index=False, sep='|')

"Key 'clmn' not present in vocabulary"
"Key 'cmne' not present in vocabulary"
"Key 'pónk' not present in vocabulary"
"Key 'cljv' not present in vocabulary"
"Key 'cđb' not present in vocabulary"
"Key 'cđb' not present in vocabulary"
"Key 'ưiii' not present in vocabulary"
"Key 'đcđ' not present in vocabulary"
"Key 'vailon' not present in vocabulary"
"Key 'busss' not present in vocabulary"
"Key 'Đr' not present in vocabulary"
"Key 'Đr' not present in vocabulary"
"Key 'Đr' not present in vocabulary"
"Key 'lồnn' not present in vocabulary"
"Key 'lồnn' not present in vocabulary"
"Key 'vailon' not present in vocabulary"
"Key 'yz700' not present in vocabulary"
"Key 'yz700' not present in vocabulary"
"Key 'hóa' not present in vocabulary"
"Key 'Thúy' not present in vocabulary"
"Key 'vkllllllllll' not present in vocabulary"
"Key 'thuêtj' not present in vocabulary"
"Key 'khỏe' not present in vocabulary"
"Key 'llon' not present in vocabulary"
"Key 'lonnn' not present in vocabulary"
"Key 'Đeos' not p

In [54]:
count = 0
for text in hate_data['free_text'].values:
    parts = get_parts(str(text))
    words = [word for word in parts if word.isalnum()]
    if len(words) >= 10:
        count += 1

count

1722

In [58]:
aug_hate = get_augment_data(
    data=hate_data,
    min_words=10,
    num_aug=3,
    alpha_sr=0.3,
    alpha_ri=0.3,
    alpha_rs=0.3,
    p_rd=0.3,
    p_nr=0.3,
    num_methods=3,
)
aug_hate.to_csv('data/aug_hate.csv', index=False, sep='|')

"Key 'trụy' not present in vocabulary"
"Key 'NĢOC' not present in vocabulary"
"Key 'lozzz' not present in vocabulary"
"Key 'đmcs' not present in vocabulary"
"Key 'Tnao' not present in vocabulary"
"Key 'covid' not present in vocabulary"
"Key 'vayy' not present in vocabulary"
"Key 'NÍ' not present in vocabulary"
"Key 'NÍ' not present in vocabulary"
"Key 'hóa' not present in vocabulary"
"Key 'NguCo' not present in vocabulary"
"Key 'Dlv' not present in vocabulary"
"Key 'Dlv' not present in vocabulary"
"Key '10000usd' not present in vocabulary"
"Key 'Comcom' not present in vocabulary"
"Key 'coviD' not present in vocabulary"
"Key 'khóa' not present in vocabulary"
"Key 'Hyakuya' not present in vocabulary"
"Key 'lozzzz' not present in vocabulary"
"Key 'lozzzz' not present in vocabulary"
"Key 'khỏe' not present in vocabulary"
"Key 'Trungg' not present in vocabulary"
"Key 'Covid' not present in vocabulary"
"Key 'Khug' not present in vocabulary"
"Key 'dlike' not present in vocabulary"
"Key 'Ngụy'