In [1]:
!pip install transformers



In [15]:
import random
import pandas as pd

# Function to randomly delete words from a sentence
def random_deletion(sentence, p=0.1):
    """Drop each whitespace-separated word of ``sentence`` with probability ``p``.

    A sentence consisting of exactly one word is returned untouched. If every
    word ends up being dropped (or there were no words to begin with), the
    original ``sentence`` is returned as-is. Otherwise the surviving words
    are re-joined with single spaces.
    """
    tokens = sentence.split()
    if len(tokens) == 1:
        return sentence

    survivors = []
    for token in tokens:
        # A word survives when its uniform draw lands above the threshold p.
        if random.uniform(0, 1) > p:
            survivors.append(token)

    if not survivors:
        return sentence
    return ' '.join(survivors)

# Function to randomly swap words in a sentence
def random_swap(sentence, n=2):
    """Exchange the positions of two randomly chosen words, ``n`` times over.

    Sentences with fewer than two whitespace-separated words are returned
    unchanged. Otherwise the words are re-joined with single spaces after
    the swaps have been applied.
    """
    tokens = sentence.split()
    token_count = len(tokens)
    if token_count >= 2:
        # Swapping never changes the word count, so the index range is fixed.
        positions = range(token_count)
        for _ in range(n):
            first, second = random.sample(positions, 2)
            held = tokens[first]
            tokens[first] = tokens[second]
            tokens[second] = held
        return ' '.join(tokens)
    return sentence

# Function to randomly add a word to the sentence
def random_insertion(sentence, n=1):
    """Insert ``n`` duplicated words at random positions in ``sentence``.

    Each inserted word is picked at random from the words currently in the
    sentence (including earlier insertions) and placed at a random index
    between the start and the end inclusive. A sentence with no words is
    returned unchanged; otherwise the words are re-joined with single spaces.
    """
    tokens = sentence.split()
    if len(tokens) == 0:
        return sentence

    for _ in range(n):
        # Pick the word first, then its slot, so the draw order stays fixed.
        duplicate = random.choice(tokens)
        slot = random.randint(0, len(tokens))
        tokens[slot:slot] = [duplicate]
    return ' '.join(tokens)

# Function to apply noise injection augmentation
def noise_injection(sentence, deletion_prob=0.1, num_swaps=2, num_insertions=1):
    """Produce a noisy variant of ``sentence``.

    The sentence is passed through ``random_deletion``, then ``random_swap``,
    then ``random_insertion``, in that order. An empty or whitespace-only
    sentence is returned unchanged without applying any of the steps.
    """
    if sentence.strip():
        thinned = random_deletion(sentence, p=deletion_prob)
        shuffled = random_swap(thinned, n=num_swaps)
        return random_insertion(shuffled, n=num_insertions)
    return sentence

# Function to augment the entire dataset with noise injection
def augment_dataset_with_noise(data, num_augmentations=5, deletion_prob=0.1, num_swaps=2, num_insertions=1):
    """Return a new DataFrame with every original row plus noisy copies of it.

    For each row of ``data`` the original ``corrected_tweet``/``class`` pair
    is emitted first, followed by ``num_augmentations`` variants produced by
    ``noise_injection``. Every variant keeps the label of the row it was
    derived from.

    Args:
        data: DataFrame providing ``corrected_tweet`` and ``class`` columns.
        num_augmentations: Number of noisy variants generated per row.
        deletion_prob: Forwarded to ``noise_injection`` as ``deletion_prob``.
        num_swaps: Forwarded to ``noise_injection`` as ``num_swaps``.
        num_insertions: Forwarded to ``noise_injection`` as ``num_insertions``.

    Returns:
        A DataFrame with the columns ``corrected_tweet`` and ``class``. The
        columns are present even when ``data`` has no rows.

    Raises:
        KeyError: If ``data`` lacks either required column.
    """
    output_columns = ['corrected_tweet', 'class']
    augmented_texts = []

    # Walk the two needed columns directly rather than via iterrows(): this
    # avoids building a Series per row and the dtype upcasting that comes
    # with packing a whole row into one Series.
    for sentence, label in zip(data['corrected_tweet'], data['class']):
        augmented_texts.append({'corrected_tweet': sentence, 'class': label})

        # Generate augmented samples
        for _ in range(num_augmentations):
            augmented_sentence = noise_injection(
                sentence,
                deletion_prob=deletion_prob,
                num_swaps=num_swaps,
                num_insertions=num_insertions,
            )
            augmented_texts.append({'corrected_tweet': augmented_sentence, 'class': label})

    # Naming the columns explicitly preserves the schema for empty input;
    # an empty list of records would otherwise yield a column-less frame.
    return pd.DataFrame(augmented_texts, columns=output_columns)

# Read the cleaned, labelled tweets
dataset_path = '/content/labeled_data_cleaned_whole.csv'
data = pd.read_csv(dataset_path)

# Missing tweets become empty strings and every entry is coerced to str
data['corrected_tweet'] = data['corrected_tweet'].fillna('').astype(str)

# Build the augmented dataset: three noisy variants per original row
augmented_data = augment_dataset_with_noise(
    data,
    num_augmentations=3,
    deletion_prob=0.1,
    num_swaps=2,
    num_insertions=1,
)

# Write the result to disk without the index column
augmented_dataset_path = '/content/augmented_dataset_noise_injection.csv'
augmented_data.to_csv(augmented_dataset_path, index=False)
print(f"Augmented dataset saved to {augmented_dataset_path}")


Augmented dataset saved to /content/augmented_dataset_noise_injection.csv
