In [1]:
!pip install transformers



In [11]:
import pandas as pd
import random
import nltk
from nltk.corpus import wordnet

# Fetch the NLTK corpora this notebook reads: the WordNet database itself
# and the 'omw-1.4' package.
for nltk_package in ('wordnet', 'omw-1.4'):
    nltk.download(nltk_package)

# Function to get synonyms of a word
def get_synonyms(word):
    """Return the WordNet synonyms of ``word`` as a sorted list of unique strings.

    Every lemma of every synset returned by ``wordnet.synsets(word)`` is
    collected, with underscores turned into spaces and the text lowercased.

    Args:
        word: The word to look up.

    Returns:
        A list of distinct lowercase synonyms in ascending order. The word
        itself is never included, regardless of how it was capitalised.
        The list is empty when no synset yields a different lemma.
    """
    # Candidates are lowercased below, so the input must be lowercased as well
    # before it is excluded; otherwise "Good" would get "good" back as a synonym.
    target = word.lower()

    unique_synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonym = lemma.name().replace("_", " ").lower()
            if synonym != target:
                unique_synonyms.add(synonym)

    # Sorting gives a stable order; iterating a set of strings directly can
    # differ between interpreter runs, which would change what a seeded
    # random.choice picks from the result.
    return sorted(unique_synonyms)

# Function to replace words with their synonyms
def synonym_replacement(sentence, num_replacements=2):
    """Replace up to ``num_replacements`` randomly chosen words with synonyms.

    The sentence is tokenised on whitespace. Token *positions* are sampled at
    random, and each sampled token for which ``get_synonyms`` returns at least
    one candidate is swapped for a randomly chosen candidate. Only the sampled
    tokens are touched; all other text, including the original whitespace
    between tokens, is kept as-is.

    Args:
        sentence: The input text.
        num_replacements: Maximum number of tokens to try to replace. Values
            larger than the token count are capped; values below zero are
            treated as zero.

    Returns:
        The sentence with the chosen tokens replaced. The input is returned
        unchanged when it has no tokens or when no sampled token has a synonym.
    """
    words = sentence.split()
    if len(words) == 0:
        return sentence

    # Sample positions rather than word strings: a string-based
    # sentence.replace(word, ...) hits the first *substring* match, which may
    # sit inside another word ("is" inside "This") or inside a synonym that
    # was inserted by an earlier replacement.
    sample_size = max(0, min(num_replacements, len(words)))
    chosen_positions = random.sample(range(len(words)), sample_size)

    replacements = {}
    for position in chosen_positions:
        synonyms = get_synonyms(words[position])
        if synonyms:
            replacements[position] = random.choice(synonyms)

    if not replacements:
        return sentence

    # Rebuild the sentence token by token. Tokens from split() occur in order
    # and contain no whitespace, so searching from the end of the previous
    # token always lands on the start of the current one.
    pieces = []
    cursor = 0
    for position, word in enumerate(words):
        start = sentence.index(word, cursor)
        pieces.append(sentence[cursor:start])  # whitespace before the token
        pieces.append(replacements.get(position, word))
        cursor = start + len(word)
    pieces.append(sentence[cursor:])  # trailing whitespace, if any

    return "".join(pieces)

# Function to augment the entire dataset
def augment_dataset_with_synonyms(data, num_augmentations=5, num_replacements=2,
                                  text_col='corrected_tweet', label_col='class'):
    """Build an augmented dataset by adding synonym-replaced copies of each row.

    For every row of ``data`` the original text/label pair is emitted first,
    followed by ``num_augmentations`` variants produced by
    ``synonym_replacement``; each variant carries the label of its source row.

    Args:
        data: DataFrame holding at least ``text_col`` and ``label_col``.
        num_augmentations: Number of augmented variants generated per row.
        num_replacements: Passed through to ``synonym_replacement``.
        text_col: Name of the column containing the text to augment.
        label_col: Name of the column containing the label to copy.

    Returns:
        A new DataFrame with exactly the columns ``[text_col, label_col]`` and
        ``len(data) * (1 + num_augmentations)`` rows. An empty input yields an
        empty frame that still has both columns.
    """
    records = []

    # Walk the two columns in lockstep instead of materialising a Series per
    # row with iterrows().
    for sentence, label in zip(data[text_col], data[label_col]):
        records.append({text_col: sentence, label_col: label})

        # Generate augmented samples
        for _ in range(num_augmentations):
            augmented_sentence = synonym_replacement(sentence, num_replacements)
            records.append({text_col: augmented_sentence, label_col: label})

    # Naming the columns explicitly keeps the schema stable when there are no
    # records to infer it from.
    return pd.DataFrame(records, columns=[text_col, label_col])

# Configuration: input/output locations and the RNG seed for this run
dataset_path = '/content/labeled_data_cleaned_whole.csv'
augmented_dataset_path = '/content/augmented_dataset_whole_synonyms.csv'
SEED = 42

# Put the global `random` generator (used by random.sample / random.choice
# during augmentation) into a known state before any draws are made.
random.seed(SEED)

# Load the dataset
data = pd.read_csv(dataset_path)

# Handle missing values: empty string for missing tweets, and force every
# entry to str so that .split() in the augmentation code always works.
data['corrected_tweet'] = data['corrected_tweet'].fillna('').astype(str)

# Apply synonym replacement augmentation
augmented_data = augment_dataset_with_synonyms(data, num_augmentations=3, num_replacements=2)

# Save the augmented dataset
augmented_data.to_csv(augmented_dataset_path, index=False)
print(f"Augmented dataset saved to {augmented_dataset_path}")


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


Augmented dataset saved to /content/augmented_dataset_whole_synonyms.csv
