In [1]:
import pandas as pd
from textattack.augmentation import WordNetAugmenter, EmbeddingAugmenter, CharSwapAugmenter, EasyDataAugmenter, CheckListAugmenter, CLAREAugmenter, BackTranslationAugmenter
import random
import re
import stanza

In [2]:
def load_data(input_file, text_column, label_column, sample_size=1000):
    """Load a CSV dataset and draw a reproducible random sample from it.

    Args:
        input_file: Path (or any source accepted by ``pd.read_csv``) of the CSV.
        text_column: Name of the column holding the texts.
        label_column: Name of the column holding the labels.
        sample_size: Maximum number of rows to sample. When the file has
            fewer rows than this, every row is returned (in shuffled order)
            instead of raising.

    Returns:
        A ``(data, texts, labels)`` tuple: the sampled DataFrame plus the
        text and label columns as plain Python lists in the same row order.

    Raises:
        KeyError: If ``text_column`` or ``label_column`` is not in the file.
    """
    data = pd.read_csv(input_file)
    # DataFrame.sample raises ValueError when asked for more rows than exist
    # (replace=False), so cap the request at the population size.
    rows_to_sample = min(sample_size, len(data))
    data = data.sample(rows_to_sample, random_state=42)
    texts = data[text_column].tolist()
    labels = data[label_column].tolist()
    return data, texts, labels


In [3]:
def format_text(text):
    """Normalise spacing in augmented text.

    A space is inserted wherever a letter or digit is immediately followed
    by an uppercase letter, then every run of whitespace is collapsed to a
    single space and leading/trailing whitespace is dropped.
    """
    with_breaks = re.sub(r'([a-zA-Z0-9])([A-Z])', r'\1 \2', text)
    tokens = with_breaks.split()  # split() with no args discards extra whitespace
    return " ".join(tokens)

In [4]:
def load_stanza_model(model_dir=r'C:\Users\edwin victor\stanza_resources\en'):
    """Load the English Stanza pipeline from a model directory.

    Args:
        model_dir: Directory passed to ``stanza.Pipeline`` as ``dir``. The
            default is the original author's local path; pass your own
            location when running on another machine.

    Returns:
        The constructed ``stanza.Pipeline``, or ``None`` if construction
        failed for any reason (the error is printed, not raised).
    """
    try:
        return stanza.Pipeline(lang='en', dir=model_dir)
    except Exception as e:
        # Best-effort by design: callers check for None instead of catching.
        print(f"Error loading Stanza model: {e}")
        return None

In [5]:
def augment_with_wordnet(texts, labels, num_rows):
    """Generate ``num_rows`` augmented rows with a fresh WordNetAugmenter.

    Delegates to ``augment_texts`` and returns its
    ``(augmented_texts, augmented_labels)`` result unchanged.
    """
    return augment_texts(texts, labels, WordNetAugmenter(), num_rows)

def augment_with_embedding(texts, labels, num_rows):
    """Generate ``num_rows`` augmented rows with a fresh EmbeddingAugmenter.

    Delegates to ``augment_texts`` and returns its
    ``(augmented_texts, augmented_labels)`` result unchanged.
    """
    return augment_texts(texts, labels, EmbeddingAugmenter(), num_rows)

def augment_with_charswap(texts, labels, num_rows):
    """Generate ``num_rows`` augmented rows with a fresh CharSwapAugmenter.

    Delegates to ``augment_texts`` and returns its
    ``(augmented_texts, augmented_labels)`` result unchanged.
    """
    return augment_texts(texts, labels, CharSwapAugmenter(), num_rows)

def augment_with_easydata(texts, labels, num_rows):
    """Generate ``num_rows`` augmented rows with a fresh EasyDataAugmenter.

    Delegates to ``augment_texts`` and returns its
    ``(augmented_texts, augmented_labels)`` result unchanged.
    """
    return augment_texts(texts, labels, EasyDataAugmenter(), num_rows)

def augment_with_checklist(texts, labels, num_rows):
    """Generate ``num_rows`` augmented rows with a fresh CheckListAugmenter.

    Delegates to ``augment_texts`` and returns its
    ``(augmented_texts, augmented_labels)`` result unchanged.
    """
    return augment_texts(texts, labels, CheckListAugmenter(), num_rows)

In [6]:
def augment_with_clare(texts, labels, num_rows):
    """Mark every noun in each text with an ``_augmented`` suffix.

    NOTE: despite the name, this function does not use ``CLAREAugmenter``;
    it runs the Stanza pipeline from ``load_stanza_model`` and rewrites the
    surface form of every word tagged ``NOUN``.

    Args:
        texts: Input texts.
        labels: Labels aligned with ``texts``.
        num_rows: Accepted for signature parity with the other
            ``augment_with_*`` helpers but not used: exactly one output row
            is produced per ``(text, label)`` pair.

    Returns:
        ``(augmented_texts, augmented_labels)``; both lists are empty when
        the Stanza model could not be loaded.
    """
    nlp = load_stanza_model()
    if nlp is None:
        return [], []

    augmented_texts = []
    augmented_labels = []

    for text, label in zip(texts, labels):
        doc = nlp(text)

        # Collect the distinct noun surface forms first. Calling str.replace
        # once per noun occurrence (as before) re-suffixed repeated nouns
        # ("cat_augmented_augmented") and also rewrote substrings of longer
        # words ("car" inside "carpet").
        nouns = {
            word.text
            for sentence in doc.sentences
            for word in sentence.words
            if word.upos == 'NOUN' and word.text
        }

        augmented_text = text
        if nouns:
            # Longest alternatives first so the regex prefers the full word;
            # the lookarounds restrict matches to whole words.
            ordered = sorted(nouns, key=lambda noun: (-len(noun), noun))
            alternation = "|".join(re.escape(noun) for noun in ordered)
            pattern = r"(?<!\w)(?:" + alternation + r")(?!\w)"
            # Single pass over the original text: each occurrence is
            # suffixed exactly once.
            augmented_text = re.sub(
                pattern, lambda match: match.group(0) + "_augmented", text
            )

        augmented_texts.append(augmented_text)
        augmented_labels.append(label)

    return augmented_texts, augmented_labels

In [7]:
def save_augmented_data(final_data, output_file):
    """Write ``final_data`` to ``output_file`` as CSV without the index.

    Failures are reported on stdout rather than raised, so the caller is
    never interrupted by a write error.
    """
    try:
        final_data.to_csv(output_file, index=False)
    except Exception as err:
        print(f"Error saving augmented data: {err}")
    else:
        print(f"Augmented data saved to {output_file}")

In [8]:
def augment_texts(texts, labels, augmenter, num_rows):
    """Apply ``augmenter`` to produce up to ``num_rows`` augmented rows.

    The inputs are cycled: row ``i`` is built from ``texts[i % len(texts)]``
    and ``labels[i % len(labels)]``, so ``num_rows`` may exceed the number
    of input texts.

    Args:
        texts: Source texts.
        labels: Labels aligned with ``texts``.
        augmenter: Object exposing ``augment(text)`` that returns either a
            string or a list of candidate strings.
        num_rows: Number of augmentation attempts to make.

    Returns:
        ``(augmented_texts, augmented_labels)``. Fewer than ``num_rows``
        rows are returned when an augmentation raises (the error is
        printed) or yields no usable candidate. Both lists are empty when
        ``texts`` or ``labels`` is empty.
    """
    augmented_texts = []
    augmented_labels = []

    # Nothing to cycle over; `i % 0` below would raise ZeroDivisionError.
    if not texts or not labels:
        return augmented_texts, augmented_labels

    for i in range(num_rows):
        text, label = texts[i % len(texts)], labels[i % len(labels)]
        try:
            result = augmenter.augment(text)
            if isinstance(result, (list, tuple)):
                # Some augmenters return several alternative rewrites of the
                # same input. Joining them would glue every variant into one
                # row, so keep a single variant per generated row.
                candidates = [c for c in result if isinstance(c, str) and c.strip()]
                if not candidates:
                    continue  # no usable output; don't emit an empty row
                augmented_text = candidates[0]
            else:
                augmented_text = result
            augmented_text = format_text(augmented_text)
            augmented_texts.append(augmented_text)
            augmented_labels.append(label)
        except Exception as e:
            # Best-effort: a failing sample is reported and skipped.
            print(f"Augmentation error: {e}")

    return augmented_texts, augmented_labels

In [9]:
def augment_dataset(input_file, text_column, label_column, output_file, augmenter_targets, sample_size=1000):
    """Sample a dataset, extend it with augmented rows, and save the result.

    Args:
        input_file: CSV file to read.
        text_column: Name of the text column (also used in the output).
        label_column: Name of the label column (also used in the output).
        output_file: Destination CSV path.
        augmenter_targets: Mapping of display name to
            ``(augment_function, num_rows)``; each function is called as
            ``augment_function(texts, labels, num_rows)`` and must return
            ``(augmented_texts, augmented_labels)``.
        sample_size: Number of rows to sample from ``input_file`` before
            augmenting; forwarded to ``load_data``. Defaults to 1000, the
            value that was previously fixed.

    The output holds the sampled original rows first, followed by each
    augmenter's rows in the mapping's iteration order.
    """
    # The sampled DataFrame itself is not needed, only the two columns.
    _, texts, labels = load_data(
        input_file, text_column, label_column, sample_size=sample_size
    )

    # Copy so the originals handed to each augmenter are never mutated.
    final_texts = list(texts)
    final_labels = list(labels)

    for augmenter_name, (augment_function, num_rows) in augmenter_targets.items():
        print(f"Applying {augmenter_name} to generate {num_rows} rows...")
        augmented_texts, augmented_labels = augment_function(texts, labels, num_rows)
        final_texts.extend(augmented_texts)
        final_labels.extend(augmented_labels)

    final_data = pd.DataFrame({text_column: final_texts, label_column: final_labels})

    save_augmented_data(final_data, output_file)

In [10]:
# Augmentation plan: display name -> (augmentation function, rows to generate).
# Entries are applied in the order listed here.
augmenter_targets = dict(
    WordNetAugmenter=(augment_with_wordnet, 2000),
    EmbeddingAugmenter=(augment_with_embedding, 2000),
    CharSwapAugmenter=(augment_with_charswap, 2000),
    EasyDataAugmenter=(augment_with_easydata, 1000),
    CheckListAugmenter=(augment_with_checklist, 2000),
)

In [12]:
# Run the full augmentation pipeline: read augmented_data_3.csv, apply every
# augmenter listed in `augmenter_targets`, and write augmented_data_4.csv.
# NOTE(review): both paths are absolute and specific to one Windows machine;
# adjust them before running this cell elsewhere.
augment_dataset(
    input_file=r"D:\epita class notes\semester - 3\action learnign\project repository\Hate_speech_detection_using_data_augmentation\Hate_speech_detection_using_data_augmentation\data\augmented_dataset\augmented_data_3.csv",
    text_column="corrected_tweet",
    label_column="class",
    output_file=r"D:\epita class notes\semester - 3\action learnign\project repository\Hate_speech_detection_using_data_augmentation\Hate_speech_detection_using_data_augmentation\data\augmented_dataset\augmented_data_4.csv",
    augmenter_targets=augmenter_targets
)

Applying WordNetAugmenter to generate 2000 rows...


[nltk_data] Downloading package omw-1.4 to C:\Users\edwin
[nltk_data]     victor\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Applying EmbeddingAugmenter to generate 2000 rows...
Applying CharSwapAugmenter to generate 2000 rows...
Applying EasyDataAugmenter to generate 1000 rows...


[nltk_data] Downloading package omw-1.4 to C:\Users\edwin
[nltk_data]     victor\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Applying CheckListAugmenter to generate 2000 rows...
2025-01-21 11:18:09,354 SequenceTagger predicts: Dictionary with 20 tags: <unk>, O, S-ORG, S-MISC, B-PER, E-PER, S-LOC, B-ORG, E-ORG, I-PER, S-PER, B-MISC, I-MISC, E-MISC, I-ORG, B-LOC, E-LOC, I-LOC, <START>, <STOP>
Augmented data saved to D:\epita class notes\semester - 3\action learnign\project repository\Hate_speech_detection_using_data_augmentation\Hate_speech_detection_using_data_augmentation\data\augmented_dataset\augmented_data_4.csv
