In [65]:
import pandas as pd
from textattack.augmentation import WordNetAugmenter
import random
import re

In [51]:
def load_data(input_file, text_column, label_column, sample_size=1000, random_state=42):
    """Load a CSV file and draw a reproducible random sample of its rows.

    Args:
        input_file: Path of the CSV file, passed to ``pandas.read_csv``.
        text_column: Name of the column that holds the texts.
        label_column: Name of the column that holds the labels.
        sample_size: Maximum number of rows to sample (default 1000). A file
            with fewer rows is used in full (in shuffled order). ``None``
            disables sampling and keeps every row in its original order.
        random_state: Seed forwarded to ``DataFrame.sample`` so the same rows
            are drawn on every run (default 42).

    Returns:
        A tuple ``(data, texts, labels)``: the sampled DataFrame, and the text
        and label columns of that sample as plain lists in matching order.

    Raises:
        KeyError: If ``text_column`` or ``label_column`` is not in the file.
    """
    data = pd.read_csv(input_file)
    if sample_size is not None:
        # DataFrame.sample (without replacement) raises ValueError when asked
        # for more rows than exist, so cap the request at the row count.
        data = data.sample(min(sample_size, len(data)), random_state=random_state)
    texts = data[text_column].tolist()
    labels = data[label_column].tolist()
    return data, texts, labels

In [46]:
def calculate_augmentation_details(original_size, target_size):
    """Work out how many augmented rows are needed to reach ``target_size``.

    Args:
        original_size: Number of rows already available.
        target_size: Desired total number of rows after augmentation.

    Returns:
        A tuple ``(augmentation_factor, remaining_rows)``:
        ``augmentation_factor`` is the number of full passes in which every
        original row is augmented once, and ``remaining_rows`` is how many
        additional rows must be augmented once more to hit the target exactly.
        ``(0, 0)`` is returned when nothing can or needs to be augmented
        (empty dataset, or the target does not exceed the original size).
    """
    rows_needed = target_size - original_size
    # Guard against division by zero and against negative floor division,
    # which would otherwise yield a negative factor with a positive remainder.
    if original_size <= 0 or rows_needed <= 0:
        return 0, 0
    return divmod(rows_needed, original_size)

In [62]:
def format_text(text):
    """Normalize spacing in a piece of (augmented) text.

    Inserts a space at camelCase boundaries (a lowercase letter or digit
    immediately followed by an uppercase letter) and collapses every run of
    whitespace into a single space, trimming both ends.

    Args:
        text: The string to clean up.

    Returns:
        The cleaned string.
    """
    # Only split where a lowercase letter/digit meets an uppercase letter, so
    # all-caps words such as acronyms are left intact ("NASA" stays "NASA").
    formatted_text = re.sub(r'([a-z0-9])([A-Z])', r'\1 \2', text)
    # split() with no argument splits on any whitespace run and drops empties.
    formatted_text = " ".join(formatted_text.split())
    return formatted_text

In [63]:
def _augment_one(text, augmenter):
    """Return one formatted augmentation of ``text``, or ``None`` if none was produced."""
    result = augmenter.augment(text)
    if isinstance(result, list):
        # The list holds alternative augmented versions of the whole text, not
        # individual words, so joining them would concatenate several variants
        # into one sample. Keep a single variant instead.
        if not result:
            return None
        result = result[0]
    return format_text(result)  # Ensure proper spacing


def augment_texts(texts, labels, augmentation_factor, remaining_rows, augmenter):
    """Create augmented copies of ``texts`` together with their labels.

    Every text is augmented ``augmentation_factor`` times; afterwards the first
    ``remaining_rows`` texts are augmented once more. A text whose augmentation
    raises is reported on stdout and skipped, and a text for which the
    augmenter returns no result is skipped, so the output may contain fewer
    rows than requested.

    Args:
        texts: List of input texts.
        labels: List of labels, aligned with ``texts``.
        augmentation_factor: Number of full augmentation passes over all texts.
        remaining_rows: Number of leading texts to augment one extra time.
            Values larger than ``len(texts)`` are capped at ``len(texts)``.
        augmenter: Object with an ``augment(text)`` method returning either a
            string or a list of augmented strings.

    Returns:
        A tuple ``(augmented_texts, augmented_labels)`` of two aligned lists.
    """
    augmented_texts = []
    augmented_labels = []

    def _try_append(text, label):
        # Best-effort: one failing row must not abort the whole run.
        try:
            augmented_text = _augment_one(text, augmenter)
        except Exception as e:
            print(f"Augmentation error: {e}")
            return
        if augmented_text is None:
            return
        augmented_texts.append(augmented_text)
        augmented_labels.append(label)

    # Augment each row a fixed number of times
    for text, label in zip(texts, labels):
        for _ in range(augmentation_factor):
            _try_append(text, label)

    # Handle the remaining rows (slicing caps the count at the available rows;
    # max() keeps a negative count from slicing from the end).
    extra = max(remaining_rows, 0)
    for text, label in zip(texts[:extra], labels[:extra]):
        _try_append(text, label)

    return augmented_texts, augmented_labels

In [53]:
def save_augmented_data(final_data, output_file):
    """Save the augmented data to a CSV file.

    Args:
        final_data: DataFrame to write (the index is not written).
        output_file: Destination path. When it is a string or path-like
            object, missing parent directories are created first. Any other
            target is passed to ``DataFrame.to_csv`` untouched.

    Side effects:
        Writes the CSV file and prints a confirmation with the row count.
    """
    import os  # local import so this cell does not depend on the import cell

    if isinstance(output_file, (str, os.PathLike)):
        # to_csv does not create directories; a missing output folder would
        # otherwise raise OSError after the (slow) augmentation has finished.
        parent_dir = os.path.dirname(os.path.abspath(output_file))
        os.makedirs(parent_dir, exist_ok=True)

    final_data.to_csv(output_file, index=False)
    print(f"Augmented dataset saved to '{output_file}'. Total rows: {len(final_data)}")

In [59]:
def augment_dataset(input_file, text_column, label_column, output_file, target_size=1500):
    """Run the full pipeline: load, augment with WordNet, combine, and save.

    Args:
        input_file: CSV file handed to ``load_data``.
        text_column: Name of the column whose texts are augmented.
        label_column: Name of the column holding the labels.
        output_file: Destination handed to ``save_augmented_data``.
        target_size: Desired number of rows in the saved dataset (default 1500).

    The saved frame contains only ``text_column`` and ``label_column``: the
    loaded rows first, followed by the augmented rows.
    """
    # Only the text/label lists are needed; the loaded DataFrame is discarded.
    _, source_texts, source_labels = load_data(input_file, text_column, label_column)

    # How many full passes, plus how many extra rows, reach the target size.
    rounds, leftover = calculate_augmentation_details(len(source_texts), target_size)

    # Generate the new rows with a WordNet-based augmenter.
    new_texts, new_labels = augment_texts(
        source_texts, source_labels, rounds, leftover, WordNetAugmenter()
    )

    # Originals first, augmented rows appended after them.
    combined = pd.DataFrame(
        {
            text_column: source_texts + new_texts,
            label_column: source_labels + new_labels,
        }
    )

    save_augmented_data(combined, output_file)

In [67]:
# Run the augmentation pipeline on the cleaned dataset and write the result to CSV.
augment_dataset(
    target_size=1500,  # Total number of rows wanted in the output
    label_column="class",  # Column holding the labels
    text_column="corrected_tweet",  # Column whose texts are augmented
    output_file=r"D:\epita class notes\semester - 3\action learnign\project repository\Hate_speech_detection_using_data_augmentation\Hate_speech_detection_using_data_augmentation\data\augmented_dataset\augmented_data.csv",  # Where the augmented dataset is saved
    input_file=r"D:\epita class notes\semester - 3\action learnign\project repository\Hate_speech_detection_using_data_augmentation\Hate_speech_detection_using_data_augmentation\data\cleaned_dataset\labeled_data_cleaned.csv",  # Source dataset
)

[nltk_data] Downloading package omw-1.4 to C:\Users\edwin
[nltk_data]     victor\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Augmented dataset saved to 'D:\epita class notes\semester - 3\action learnign\project repository\Hate_speech_detection_using_data_augmentation\Hate_speech_detection_using_data_augmentation\data\augmented_dataset\augmented_data.csv'. Total rows: 1500


In [8]:
!pip install tensorflow_hub

Collecting tensorflow_hub
  Downloading tensorflow_hub-0.16.1-py2.py3-none-any.whl.metadata (1.3 kB)
Downloading tensorflow_hub-0.16.1-py2.py3-none-any.whl (30 kB)
Installing collected packages: tensorflow_hub
Successfully installed tensorflow_hub-0.16.1
