In [7]:
import pandas as pd
import random
from nltk.corpus import wordnet
import re
import nltk

# Fetch the WordNet corpus that replace_synonyms queries.
nltk.download('wordnet')

# Seed Python's global RNG so the random-based augmentations below
# (random.choice / random.shuffle) give the same output on every full re-run.
random.seed(42)

# Load the dataset and drop unnecessary columns in one step, so re-running
# this cell always starts again from the file on disk.
file_path = '/content/labeled_data.csv'
dataset = pd.read_csv(file_path).drop(columns=['Unnamed: 0', 'count'])

# Function to clean tweet text
def clean_text(text):
    """Normalize a raw tweet into lowercase, letters-only, single-spaced text.

    Steps, in order: strip HTML character entities, URLs, @mentions and
    #hashtags; drop every character that is not an ASCII letter or
    whitespace; lowercase; collapse whitespace runs and trim the ends.

    Args:
        text: The raw tweet. Any non-string value (for example the float NaN
            that pandas uses for an empty CSV cell) is treated as an empty
            tweet instead of raising inside ``re.sub``.

    Returns:
        The cleaned string, or '' when nothing survives cleaning.
    """
    if not isinstance(text, str):
        return ''
    # Remove whole entities such as "&amp;" or "&#8220;" up front; otherwise
    # only the "&" and ";" are stripped later and the entity name ("amp",
    # "lt", "gt") is left behind as a bogus word.
    text = re.sub(r'&(?:#\d+|#x[0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*);', '', text)
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)    # Remove mentions
    text = re.sub(r'#\w+', '', text)    # Remove hashtags
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespace
    return text

# Overwrite the raw tweets with their cleaned form, one element at a time
dataset['tweet'] = dataset['tweet'].map(clean_text)

# Data augmentation: Synonym Replacement
def replace_synonyms(text):
    """Replace each word that WordNet knows with a randomly chosen synonym.

    For every whitespace-separated word, one candidate is taken per WordNet
    sense (that sense's first lemma). Candidates equal to the word itself are
    discarded, so a word is only left unchanged when WordNet offers no
    different candidate.

    Args:
        text: Text to augment.

    Returns:
        The augmented text, words joined by single spaces. Multi-word lemmas
        appear as several words.
    """
    augmented = []
    for word in text.split():
        # Lemma names use "_" between the parts of a multi-word expression and
        # may be capitalised; normalise them to plain lowercase words.
        candidates = [
            synset.lemmas()[0].name().replace('_', ' ').lower()
            for synset in wordnet.synsets(word)
        ]
        # Drop candidates that would "replace" the word with itself.
        candidates = [name for name in candidates if name != word.lower()]
        # NOTE(review): WordNet appears to have noun senses for some short
        # function words, so those may be swapped for unrelated terms --
        # consider skipping stopwords before looking up synonyms.
        augmented.append(random.choice(candidates) if candidates else word)
    return ' '.join(augmented)

dataset['tweet_synonym'] = dataset['tweet'].map(replace_synonyms)  # synonym-swapped copy of each cleaned tweet

# Data augmentation: Paraphrased Text (Simplified)
def paraphrase_text(text):
    """Return ``text`` with its whitespace-separated words in reverse order.

    This is only a stand-in for paraphrasing: the words are reversed, not
    reworded, and the result is joined with single spaces.
    """
    return ' '.join(reversed(text.split()))

dataset['tweet_paraphrased'] = dataset['tweet'].map(paraphrase_text)  # word-reversed copy of each cleaned tweet

# Data augmentation: Shuffled Words
def shuffle_words(text):
    """Return ``text`` with its whitespace-separated words in random order.

    The permutation is drawn from Python's global ``random`` state, and the
    result is joined with single spaces.
    """
    tokens = text.split()
    random.shuffle(tokens)  # permutes the list in place
    shuffled = ' '.join(tokens)
    return shuffled

dataset['tweet_shuffled'] = dataset['tweet'].map(shuffle_words)

# Write the full frame (cleaned tweets plus the augmented columns) to CSV,
# leaving out the DataFrame index.
cleaned_augmented_file_path = '/content/cleaned_augmented_labeled_data.csv'
dataset.to_csv(cleaned_augmented_file_path, index=False)

print(f"Cleaned and augmented dataset saved to: {cleaned_augmented_file_path}")


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Cleaned and augmented dataset saved to: /content/cleaned_augmented_labeled_data.csv


In [8]:
# Colab-only step: ask the browser to download the CSV saved by the previous cell.
from google.colab import files
files.download("/content/cleaned_augmented_labeled_data.csv")  # keep in sync with cleaned_augmented_file_path

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>