In [17]:
import pandas as pd
from textattack.augmentation import WordNetAugmenter, EmbeddingAugmenter, CharSwapAugmenter, EasyDataAugmenter, CheckListAugmenter, CLAREAugmenter, BackTranslationAugmenter
from textattack.transformations import WordS
import random
import re
import stanza

In [2]:
# Fetch Stanza's default English model package (requires network access).
# The recorded run below stored it under the user's `stanza_resources` folder.
stanza.download('en')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

Downloaded file to C:\Users\edwin victor\stanza_resources\resources.json
Downloading default packages for language: en (English) ...


Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.10.0/models/default.zip:   0%|          | …

Downloaded file to C:\Users\edwin victor\stanza_resources\en\default.zip
Finished downloading models and saved to C:\Users\edwin victor\stanza_resources


In [19]:
# Fetch Stanza's default French model package (requires network access).
# The recorded run below stored it under the user's `stanza_resources` folder.
stanza.download('fr')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

Downloaded file to C:\Users\edwin victor\stanza_resources\resources.json
Downloading default packages for language: fr (French) ...


Downloading https://huggingface.co/stanfordnlp/stanza-fr/resolve/v1.10.0/models/default.zip:   0%|          | …

Downloaded file to C:\Users\edwin victor\stanza_resources\fr\default.zip
Finished downloading models and saved to C:\Users\edwin victor\stanza_resources


In [3]:
def load_data(input_file, text_column, label_column, sample_size=1000):
    """Load a CSV file and draw a reproducible random sample from it.

    Args:
        input_file: Path (or anything ``pd.read_csv`` accepts) of the dataset.
        text_column: Name of the column holding the texts.
        label_column: Name of the column holding the labels.
        sample_size: Maximum number of rows to sample. When the file has
            fewer rows, every row is returned (in shuffled order) instead of
            failing.

    Returns:
        A tuple ``(data, texts, labels)``: the sampled DataFrame plus the
        text and label columns of that sample as plain lists, row-aligned.

    Raises:
        KeyError: If ``text_column`` or ``label_column`` is not in the file.
    """
    data = pd.read_csv(input_file)

    # DataFrame.sample raises ValueError when asked for more rows than exist
    # (sampling is without replacement), so cap the request at the row count.
    # None is passed through untouched to keep pandas' own default handling.
    if sample_size is not None:
        sample_size = min(sample_size, len(data))

    # Fixed seed so repeated runs pick the same rows.
    data = data.sample(sample_size, random_state=42)
    texts = data[text_column].tolist()
    labels = data[label_column].tolist()
    return data, texts, labels

In [4]:
def format_text(text):
    """Repair word spacing in augmented text.

    Two clean-ups are applied:

    1. A space is inserted where a lowercase letter or digit is directly
       followed by an uppercase letter (``"helloWorld"`` -> ``"hello World"``),
       which separates words that were glued together.
    2. Runs of whitespace are collapsed to a single space and leading/trailing
       whitespace is removed.

    Sequences of capitals such as ``"NASA"`` or ``"LOL"`` are left intact.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    # Zero-width lookarounds: only the boundary is matched, so no character is
    # consumed and all-caps words are not split between their letters.
    spaced = re.sub(r'(?<=[a-z0-9])(?=[A-Z])', ' ', text)
    # split()/join collapses any whitespace run and trims both ends.
    return " ".join(spaced.split())

In [5]:
def augment_with_wordnet(texts, labels, num_rows):
    """Run ``augment_texts`` with a default-configured ``WordNetAugmenter``.

    A fresh augmenter is built on every call; the result of
    ``augment_texts`` is returned unchanged.
    """
    return augment_texts(texts, labels, WordNetAugmenter(), num_rows)

In [6]:
def augment_with_embedding(texts, labels, num_rows):
    """Run ``augment_texts`` with a default-configured ``EmbeddingAugmenter``.

    A fresh augmenter is built on every call; the result of
    ``augment_texts`` is returned unchanged.
    """
    return augment_texts(texts, labels, EmbeddingAugmenter(), num_rows)


In [7]:
def augment_with_charswap(texts, labels, num_rows):
    """Run ``augment_texts`` with a default-configured ``CharSwapAugmenter``.

    A fresh augmenter is built on every call; the result of
    ``augment_texts`` is returned unchanged.
    """
    return augment_texts(texts, labels, CharSwapAugmenter(), num_rows)

In [8]:
def augment_with_easydata(texts, labels, num_rows):
    """Run ``augment_texts`` with a default-configured ``EasyDataAugmenter``.

    A fresh augmenter is built on every call; the result of
    ``augment_texts`` is returned unchanged.
    """
    return augment_texts(texts, labels, EasyDataAugmenter(), num_rows)

In [9]:
def augment_with_checklist(texts, labels, num_rows):
    """Run ``augment_texts`` with a default-configured ``CheckListAugmenter``.

    A fresh augmenter is built on every call; the result of
    ``augment_texts`` is returned unchanged.
    """
    return augment_texts(texts, labels, CheckListAugmenter(), num_rows)

In [10]:
def augment_with_clare(texts, labels, num_rows):
    """Run ``augment_texts`` with a default-configured ``CLAREAugmenter``.

    A fresh augmenter is built on every call; the result of
    ``augment_texts`` is returned unchanged.
    """
    return augment_texts(texts, labels, CLAREAugmenter(), num_rows)

In [18]:
def augment_with_back_translator(texts, labels, num_rows):
    """Augment texts by round-trip translation English -> French -> English.

    The language pair is configured on the ``BackTranslation`` transformation
    and wrapped in a plain ``Augmenter``; the result of ``augment_texts`` is
    returned unchanged.

    NOTE(review): the previous call ``BackTranslationAugmenter(from_lang='en',
    to_lang='fr')`` appears to fail with ``TypeError``: that recipe forwards
    its keyword arguments to ``Augmenter.__init__``, which has no
    ``from_lang``/``to_lang`` parameters. Confirm against the installed
    TextAttack version.
    """
    # Local imports keep this cell runnable without touching the import cell.
    from textattack.augmentation import Augmenter
    from textattack.transformations.sentence_transformations import BackTranslation

    # src_lang/target_lang select the pivot language for the round trip.
    transformation = BackTranslation(src_lang="en", target_lang="fr")
    augmenter = Augmenter(transformation=transformation)
    return augment_texts(texts, labels, augmenter, num_rows)

In [12]:
def augment_texts(texts, labels, augmenter, num_rows):
    """Generate up to ``num_rows`` augmented rows by cycling over ``texts``.

    NOTE: a later cell in this notebook defines ``augment_texts`` again; when
    the cells are run top to bottom that later definition is the one in effect.

    For each of ``num_rows`` iterations the next source text (wrapping around
    when the list is exhausted) is passed to ``augmenter.augment``. An
    augmenter may return several variants for one input; the first non-empty
    variant is kept as ONE row rather than gluing all variants into a single
    string. Duplicates are not filtered here.

    Args:
        texts: Source texts.
        labels: Labels for ``texts``, positionally aligned.
        augmenter: Any object exposing ``augment(text)`` that returns a string
            or a list/tuple of strings.
        num_rows: Number of augmentation attempts (one per loop iteration).

    Returns:
        ``(augmented_texts, augmented_labels)`` as two aligned lists. Fewer
        than ``num_rows`` rows are returned when an attempt fails or yields
        nothing usable.
    """
    augmented_texts = []
    augmented_labels = []

    # Nothing to cycle over: avoids ZeroDivisionError from ``i % 0`` below.
    if len(texts) == 0 or len(labels) == 0:
        return augmented_texts, augmented_labels

    for i in range(num_rows):
        text, label = texts[i % len(texts)], labels[i % len(labels)]
        try:
            variants = augmenter.augment(text)
            if not isinstance(variants, (list, tuple)):
                variants = [variants]
            for variant in variants:
                candidate = format_text(variant)
                if candidate:  # skip variants that are empty after clean-up
                    augmented_texts.append(candidate)
                    augmented_labels.append(label)
                    break
        except Exception as e:
            # Best effort: one failing sample must not abort the whole run.
            print(f"Augmentation error: {e}")

    return augmented_texts, augmented_labels



In [28]:
def augment_texts(texts, labels, augmenter, num_rows):
    """Generate up to ``num_rows`` unique augmented rows by cycling over ``texts``.

    For each of ``num_rows`` iterations the next source text (wrapping around
    when the list is exhausted) is passed to ``augmenter.augment``. An
    augmenter may return several variants for one input; each variant is
    treated as a separate candidate and the first one that is non-empty and
    has not been emitted before becomes ONE new row. Variants are never glued
    together into a single string.

    Args:
        texts: Source texts.
        labels: Labels for ``texts``, positionally aligned.
        augmenter: Any object exposing ``augment(text)`` that returns a string
            or a list/tuple of strings.
        num_rows: Number of augmentation attempts (one per loop iteration).

    Returns:
        ``(augmented_texts, augmented_labels)`` as two aligned lists. Fewer
        than ``num_rows`` rows are returned when an attempt fails or produces
        only duplicates.
    """
    augmented_texts = []
    augmented_labels = []

    # Nothing to cycle over: avoids ZeroDivisionError from ``i % 0`` below.
    if len(texts) == 0 or len(labels) == 0:
        return augmented_texts, augmented_labels

    # Set membership is O(1); scanning the output list on every row was O(n).
    seen = set()

    for i in range(num_rows):
        text, label = texts[i % len(texts)], labels[i % len(labels)]
        try:
            variants = augmenter.augment(text)
            if not isinstance(variants, (list, tuple)):
                variants = [variants]
            for variant in variants:
                candidate = format_text(variant)
                if candidate and candidate not in seen:  # Prevent repetition
                    seen.add(candidate)
                    augmented_texts.append(candidate)
                    augmented_labels.append(label)
                    break
        except Exception as e:
            # Best effort: one failing sample must not abort the whole run.
            print(f"Augmentation error: {e}")

    return augmented_texts, augmented_labels

In [26]:
def save_augmented_data(final_data, output_file):
    """Write the augmented dataset to ``output_file`` as CSV (no index column).

    When ``output_file`` is a filesystem path, its parent directory is created
    first so that a missing folder does not discard the augmentation results.
    Failures are reported with a printed message rather than raised.

    Args:
        final_data: DataFrame to save.
        output_file: Destination path, or any target ``DataFrame.to_csv``
            accepts.
    """
    import os  # local import: keeps this cell self-contained

    try:
        # Only real paths have a directory to create; buffers are passed through.
        if isinstance(output_file, (str, os.PathLike)):
            parent_dir = os.path.dirname(os.path.abspath(output_file))
            os.makedirs(parent_dir, exist_ok=True)
        final_data.to_csv(output_file, index=False)
        print(f"Augmented data saved to {output_file}")
    except Exception as e:
        print(f"Error saving augmented data: {e}")

In [20]:
def augment_dataset(input_file, text_column, label_column, output_file, augmenter_targets, sample_size=1000):
    """Sample a dataset, extend it with augmented rows and save the result.

    The sampled original rows come first in the output, followed by the rows
    produced by each augmenter in the iteration order of ``augmenter_targets``.

    Args:
        input_file: CSV file to read.
        text_column: Name of the text column (also used in the output).
        label_column: Name of the label column (also used in the output).
        output_file: Destination passed to ``save_augmented_data``.
        augmenter_targets: Mapping ``name -> (augment_function, num_rows)``.
            Each function is called as ``augment_function(texts, labels,
            num_rows)`` and must return ``(augmented_texts, augmented_labels)``.
        sample_size: Number of original rows to sample, forwarded to
            ``load_data``. Defaults to 1000, the value previously fixed by
            ``load_data``'s own default.
    """
    # Only the text/label lists are needed; the sampled DataFrame is not used.
    _, texts, labels = load_data(
        input_file, text_column, label_column, sample_size=sample_size
    )

    # Copies, so the source lists handed to every augmenter stay untouched.
    final_texts = list(texts)
    final_labels = list(labels)

    # Apply each augmenter and append its rows after the originals.
    for augmenter_name, (augment_function, num_rows) in augmenter_targets.items():
        print(f"Applying {augmenter_name} to generate {num_rows} rows...")
        augmented_texts, augmented_labels = augment_function(texts, labels, num_rows)
        final_texts.extend(augmented_texts)
        final_labels.extend(augmented_labels)

    final_data = pd.DataFrame({text_column: final_texts, label_column: final_labels})
    save_augmented_data(final_data, output_file)

In [24]:
# Define augmenters and number of rows for each.
# Each entry maps a display name (printed by augment_dataset) to a tuple of
# (augment function, number of rows to request from it). The five requests
# below add up to 1300 rows. Entries are processed in the order listed.
# The CLARE and back-translation helpers defined in earlier cells are not
# enabled here.
augmenter_targets = {
    "WordNetAugmenter": (augment_with_wordnet, 300),
    "EmbeddingAugmenter": (augment_with_embedding, 300),
    "CharSwapAugmenter": (augment_with_charswap, 200),
    "EasyDataAugmenter": (augment_with_easydata, 300),
    "CheckListAugmenter": (augment_with_checklist, 200),

}


In [30]:
# Run the augmentation pipeline on the cleaned tweet dataset.
# NOTE(review): both paths are absolute and machine-specific; they must be
# adjusted before this cell can run on another machine.
input_csv = r"D:\epita class notes\semester - 3\action learnign\project repository\Hate_speech_detection_using_data_augmentation\Hate_speech_detection_using_data_augmentation\data\cleaned_dataset\labeled_data_cleaned.csv"
output_csv = r"D:\epita class notes\semester - 3\action learnign\project repository\Hate_speech_detection_using_data_augmentation\Hate_speech_detection_using_data_augmentation\data\augmented_dataset\augmented_data_1.1.csv"

augment_dataset(
    input_file=input_csv,
    text_column="corrected_tweet",
    label_column="class",
    output_file=output_csv,
    augmenter_targets=augmenter_targets,
)


Applying WordNetAugmenter to generate 300 rows...


[nltk_data] Downloading package omw-1.4 to C:\Users\edwin
[nltk_data]     victor\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Applying EmbeddingAugmenter to generate 300 rows...
Applying CharSwapAugmenter to generate 200 rows...
Applying EasyDataAugmenter to generate 300 rows...


[nltk_data] Downloading package omw-1.4 to C:\Users\edwin
[nltk_data]     victor\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Applying CheckListAugmenter to generate 200 rows...
Augmented data saved to D:\epita class notes\semester - 3\action learnign\project repository\Hate_speech_detection_using_data_augmentation\Hate_speech_detection_using_data_augmentation\data\augmented_dataset\augmented_data_1.1.csv


In [16]:
# Smoke test for the Stanza installation: download the English models, build
# an English pipeline, run it on one sentence and print every word together
# with the `upos` tag Stanza assigned to it.
# NOTE(review): `stanza` is already imported in the first cell, so this
# repeated import is redundant when the notebook is run top to bottom.
import stanza
stanza.download('en')  # Ensure the model is downloaded

# NOTE(review): the recorded output shows eight processors being loaded,
# although only `upos` is read below; passing an explicit `processors=` list
# would presumably shorten start-up. Confirm against the Stanza docs.
nlp = stanza.Pipeline('en')
doc = nlp("This is a test sentence.")
for sentence in doc.sentences:
    for word in sentence.words:
        print(f"word: {word.text}, upos: {word.upos}")

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

Downloaded file to C:\Users\edwin victor\stanza_resources\resources.json
Downloading default packages for language: en (English) ...
File exists: C:\Users\edwin victor\stanza_resources\en\default.zip
Finished downloading models and saved to C:\Users\edwin victor\stanza_resources
Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

Downloaded file to C:\Users\edwin victor\stanza_resources\resources.json
Loading these models for language: en (English):
| Processor    | Package                   |
--------------------------------------------
| tokenize     | combined                  |
| mwt          | combined                  |
| pos          | combined_charlm           |
| lemma        | combined_nocharlm         |
| constituency | ptb3-revised_charlm       |
| depparse     | combined_charlm           |
| sentiment    | sstplus_charlm            |
| ner          | ontonotes-ww-multi_charlm |

Using device: cpu
Loading: tokenize
Loading: mwt
Loading: pos
Loading: lemma
Loading: constituency
Loading: depparse
Loading: sentiment
Loading: ner
Done loading processors!


word: This, upos: PRON
word: is, upos: AUX
word: a, upos: DET
word: test, upos: NOUN
word: sentence, upos: NOUN
word: ., upos: PUNCT


In [14]:
!pip install stanza

Collecting stanza
  Downloading stanza-1.10.1-py3-none-any.whl.metadata (13 kB)
Collecting emoji (from stanza)
  Downloading emoji-2.14.0-py3-none-any.whl.metadata (5.7 kB)
Downloading stanza-1.10.1-py3-none-any.whl (1.1 MB)
   ---------------------------------------- 0.0/1.1 MB ? eta -:--:--
   --------- ------------------------------ 0.3/1.1 MB ? eta -:--:--
   ------------------ --------------------- 0.5/1.1 MB 1.7 MB/s eta 0:00:01
   ---------------------------------------- 1.1/1.1 MB 1.8 MB/s eta 0:00:00
Downloading emoji-2.14.0-py3-none-any.whl (586 kB)
   ---------------------------------------- 0.0/586.9 kB ? eta -:--:--
   ---------------------------------------- 586.9/586.9 kB 3.0 MB/s eta 0:00:00
Installing collected packages: emoji, stanza
Successfully installed emoji-2.14.0 stanza-1.10.1
