# Load data

In [None]:
import pandas as pd

# Read the train and test splits.
train_df = pd.read_csv("/content/train.csv")
test_df = pd.read_csv("/content/test.csv")

# Preview the first rows of each split ...
for split_df in (train_df, test_df):
    print(split_df.head())

# ... then report each split's (rows, columns) size.
for split_df in (train_df, test_df):
    print(split_df.shape)

                                              tweets       class
0  Be aware  dirty step to get money  #staylight ...  figurative
1  #sarcasm for #people who don't understand #diy...  figurative
2  @IminworkJeremy @medsingle #DailyMail readers ...  figurative
3  @wilw Why do I get the feeling you like games?...  figurative
4  -@TeacherArthurG @rweingarten You probably jus...  figurative
                                              tweets       class
0  no one ever predicted this was going to happen...  figurative
1  @Stooshie its as closely related as Andrews or...  figurative
2  I find it ironic when Vegans say they love foo...  figurative
3  Quick rt that throwing money vine I've not see...  figurative
4  yep, keep adding me to your #devops lists.... ...  figurative
(81408, 2)
(8128, 2)


# Handle null data

In [None]:
# Per-column null counts for both splits before any rows are removed.
for split_df in (train_df, test_df):
    print(split_df.isnull().sum())

# Discard test rows that have a null in any column, then re-check the counts.
test_df = test_df.dropna()

print(test_df.isnull().sum())

tweets    0
class     0
dtype: int64
tweets    2
class     9
dtype: int64
tweets    0
class     0
dtype: int64


# Combine and check for class imbalance

In [None]:
# Concatenate the train and test rows into a single frame.
# ignore_index=True gives the result a fresh 0..n-1 index; without it the test
# rows keep their own labels, which collide with the train labels and make any
# label-based lookup on the combined frame ambiguous.
df = pd.concat([train_df, test_df], axis=0, ignore_index=True)

In [None]:
import numpy as np
# Sorted distinct labels present in the combined frame.
class_labels = np.unique(df["class"].tolist())
print(class_labels)

# Row count per label (displayed as the cell's result).
df['class'].value_counts()

['figurative' 'irony' 'regular' 'sarcasm']


figurative    23282
irony         23005
sarcasm       22786
regular       20454
Name: class, dtype: int64

The classes are slightly imbalanced. We will augment the data to balance them, but first we clean it.

In [None]:
import numpy as np
def _print_class_summary(frame):
    """Print the sorted distinct labels in ``frame['class']`` and their row counts."""
    print(np.unique(list(frame["class"])))
    print(frame['class'].value_counts())


# Class distribution and frame size before de-duplication.
_print_class_summary(df)
print(df.shape)

# Keep only the first row for each distinct tweet text.
df = df.drop_duplicates(subset=['tweets'])

# Class distribution after de-duplication.
_print_class_summary(df)

['figurative' 'irony' 'regular' 'sarcasm']
figurative    23282
irony         23005
sarcasm       22786
regular       20454
Name: class, dtype: int64
(89527, 2)
['figurative' 'irony' 'regular' 'sarcasm']
figurative    22001
regular       20427
sarcasm       16955
irony         14008
Name: class, dtype: int64


# Clean the data with regex substitutions and stop-word removal

In [None]:
import nltk
# Fetch NLTK's stop-word lists; clean() reads the English list from this corpus.
nltk.download('stopwords')
import functools
import re
import string

# Special-character clean-up rules, applied top to bottom as (regex, replacement).
# "\x89Ûª" is rewritten to a plain apostrophe rather than deleted: the original
# code mapped "China\x89Ûªs" to "China's", so the sequence stands for an
# apostrophe, and deleting it up front meant no "...\x89Ûª..." contraction could
# ever be expanded. As an apostrophe it is handled by the ASCII contraction rules
# below and, if no rule matches, removed later with the rest of the punctuation.
_SPECIAL_CHAR_RULES = (
    (r"\x89Û_", ""),
    (r"\x89ÛÒ", ""),
    (r"\x89ÛÓ", ""),
    (r"\x89ÛÏWhen", "When"),
    (r"\x89ÛÏ", ""),
    (r"\x89Ûª", "'"),
    (r"\x89Û÷", ""),
    (r"\x89Û\x9d", ""),
    (r"å_", ""),
    # The longer sequence must run before its own prefix "\x89Û¢".
    (r"\x89Û¢åÊ", ""),
    (r"\x89Û¢", ""),
    (r"fromåÊwounds", "from wounds"),
    (r"åÊ", ""),
    (r"åÈ", ""),
    (r"JapÌ_n", "Japan"),
    (r"Ì©", "e"),
    (r"å¨", ""),
    (r"SuruÌ¤", "Suruc"),
    (r"åÇ", ""),
    (r"å£3million", "3 million"),
    (r"åÀ", ""),
)

# NOTE(review): the last range (U+24C2 to U+1F251) spans far more than emoji,
# including the CJK blocks -- confirm that stripping all of it is intended.
_EMOJI_PATTERN = re.compile(
    '['
    u'\U0001F600-\U0001F64F'  # emoticons
    u'\U0001F300-\U0001F5FF'  # symbols & pictographs
    u'\U0001F680-\U0001F6FF'  # transport & map symbols
    u'\U0001F1E0-\U0001F1FF'  # flags
    u'\U00002702-\U000027B0'
    u'\U000024C2-\U0001F251'
    ']+',
    flags=re.UNICODE)

# User mentions such as "@abc_123". The underscore is included so that a handle
# containing one is removed whole instead of leaving its tail in the text.
_MENTION_PATTERN = re.compile(r"@[A-Za-z0-9_]+")

# Contraction expansions, applied top to bottom as (regex, replacement).
# Matching is case-sensitive and not anchored to word boundaries, so the order
# of the rules is significant and is kept as in the original implementation.
_CONTRACTION_RULES = (
    (r"he's", "he is"),
    (r"there's", "there is"),
    (r"We're", "We are"),
    (r"That's", "That is"),
    (r"won't", "will not"),
    (r"they're", "they are"),
    (r"Can't", "Cannot"),
    (r"wasn't", "was not"),
    (r"aren't", "are not"),
    (r"isn't", "is not"),
    (r"What's", "What is"),
    (r"haven't", "have not"),
    (r"hasn't", "has not"),
    (r"There's", "There is"),
    (r"He's", "He is"),
    (r"It's", "It is"),
    (r"You're", "You are"),
    (r"I'M", "I am"),
    (r"shouldn't", "should not"),
    (r"wouldn't", "would not"),
    (r"i'm", "I am"),
    (r"I'm", "I am"),
    (r"Isn't", "is not"),
    (r"Here's", "Here is"),
    (r"you've", "you have"),
    (r"we're", "we are"),
    (r"what's", "what is"),
    (r"couldn't", "could not"),
    (r"we've", "we have"),
    (r"who's", "who is"),
    (r"y'all", "you all"),
    (r"would've", "would have"),
    (r"it'll", "it will"),
    (r"we'll", "we will"),
    (r"We've", "We have"),
    (r"he'll", "he will"),
    (r"Y'all", "You all"),
    (r"Weren't", "Were not"),
    (r"Didn't", "Did not"),
    (r"they'll", "they will"),
    (r"they'd", "they would"),
    (r"DON'T", "DO NOT"),
    (r"they've", "they have"),
    (r"i'd", "I would"),
    (r"should've", "should have"),
    (r"where's", "where is"),
    (r"we'd", "we would"),
    (r"i'll", "I will"),
    (r"weren't", "were not"),
    (r"They're", "They are"),
    (r"let's", "let us"),
    (r"it's", "it is"),
    (r"can't", "cannot"),
    (r"don't", "do not"),
    (r"you're", "you are"),
    (r"i've", "I have"),
    (r"that's", "that is"),
    (r"doesn't", "does not"),
    (r"didn't", "did not"),
    (r"ain't", "am not"),
    (r"you'll", "you will"),
    (r"I've", "I have"),
    (r"Don't", "do not"),
    (r"I'll", "I will"),
    (r"I'd", "I would"),
    (r"Let's", "Let us"),
    (r"you'd", "You would"),
    (r"Ain't", "am not"),
    (r"Haven't", "Have not"),
    (r"Could've", "Could have"),
    (r"youve", "you have"),
    (r"donå«t", "do not"),
)

# HTML tags and character entity references.
_HTML_PATTERN = re.compile(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')

# Every ASCII punctuation character listed in string.punctuation.
_PUNCTUATION_PATTERN = re.compile('[%s]' % re.escape(string.punctuation))


@functools.lru_cache(maxsize=1)
def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loaded once and cached."""
    import nltk

    return frozenset(nltk.corpus.stopwords.words('english'))


def _apply_rules(text, rules):
    """Apply each (regex, replacement) pair in ``rules`` to ``text``, in order."""
    for pattern, replacement in rules:
        text = re.sub(pattern, replacement, text)
    return text


def clean(tweet):
    """Normalise one raw tweet into lower-case, stop-word-free text.

    Steps, in order: special-character clean-up, emoji removal, @mention
    removal, contraction expansion, HTML tag/entity removal, URL removal,
    ASCII punctuation removal, lower-casing, and English stop-word removal.

    Args:
        tweet: The raw tweet text (a ``str``).

    Returns:
        The cleaned tweet: the remaining tokens joined by single spaces. The
        result is an empty string when no token survives.

    Note:
        The NLTK ``stopwords`` corpus must already be downloaded. The stop-word
        list is read once and cached instead of being rebuilt for every token.
    """
    tweet = _apply_rules(tweet, _SPECIAL_CHAR_RULES)
    tweet = _EMOJI_PATTERN.sub(r'', tweet)
    tweet = _MENTION_PATTERN.sub(r'', tweet)
    tweet = _apply_rules(tweet, _CONTRACTION_RULES)

    # Character entity references
    tweet = re.sub(r"&amp;", "&", tweet)

    # html tags
    tweet = _HTML_PATTERN.sub('', tweet)

    # Urls
    tweet = re.sub(r"https?:\/\/t.co\/[A-Za-z0-9]+", "", tweet)
    tweet = re.sub(r'https?://\S+|www\.\S+', '', tweet)

    # Punctuations and special characters
    tweet = _PUNCTUATION_PATTERN.sub('', tweet)

    tweet = tweet.lower()

    stop_words = _english_stopwords()
    return ' '.join(word for word in tweet.split() if word not in stop_words)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Normalise every tweet with clean(); the function is passed directly rather
# than through a pass-through lambda.
df['tweets'] = df['tweets'].apply(clean)
print("Cleaned")

Cleaned


# Encode the data to 0,1,2,3 for easier processing later down the line

In [None]:
from sklearn.preprocessing import LabelEncoder

# The label column is expected to be named 'class'.
# Fit the encoder on the labels and overwrite the column with their integer codes.
le = LabelEncoder()
encoded_labels = le.fit_transform(df['class'])
df['class'] = encoded_labels


In [None]:
# Call head() -- printing the bare attribute shows the bound-method repr
# instead of the first rows.
print(df.head())

<bound method NDFrame.head of                                                  tweets  class
0     aware dirty step get money staylight staywhite...      0
1               sarcasm people understand diy artattack      0
2     dailymail readers sensible always shocker sarc...      0
3                        get feeling like games sarcasm      0
4                        probably missed text sarcastic      0
...                                                 ...    ...
8123  yes totally submit photos shitty online magazi...      3
8124  test saturday thank uni sarcasm griffith unive...      3
8125             listening misery disconcerting sarcasm      3
8126                       go kind sarcasm standup4kids      3
8127       shocked refs tcu vs minn game big 12 sarcasm      3

[73391 rows x 2 columns]>


# Synonym augmentation

In [None]:
import nltk
from nltk.corpus import wordnet

# Fetch the NLTK resources requested by this cell: the WordNet corpus and the
# punkt tokenizer data.
for resource in ('wordnet', 'punkt'):
    nltk.download(resource)
def get_synonyms(word):
    """Return the distinct WordNet synonyms of ``word`` as normalised strings.

    Each lemma name is lower-cased, has ``_`` and ``-`` turned into spaces, and
    is stripped of every character other than a-z and space. Surrounding and
    repeated spaces left by that filtering are collapsed.

    Args:
        word: The word to look up.

    Returns:
        A list of synonyms in the order WordNet yields them (first occurrence
        wins), excluding ``word`` itself and any lemma that is empty after
        normalisation. The list is empty when nothing qualifies.
    """
    allowed_chars = set(' qwertyuiopasdfghjklzxcvbnm')
    # A dict is used as an ordered set: it de-duplicates while keeping the
    # lookup order, so callers that take the first synonym get a stable result
    # instead of whatever order a set happens to iterate in.
    ordered = {}
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            candidate = lemma.name().replace("_", " ").replace("-", " ").lower()
            candidate = "".join(ch for ch in candidate if ch in allowed_chars)
            candidate = " ".join(candidate.split())
            # Filtering can strip a lemma down to nothing (e.g. one made only
            # of digits); an empty "synonym" would erase the word it replaces.
            if candidate and candidate != word:
                ordered[candidate] = None
    return list(ordered)

def replace_with_synonyms(text):
    """Return ``text`` with each token swapped for its first synonym, if any.

    The text is tokenised with ``nltk.word_tokenize``. A token for which
    ``get_synonyms`` returns nothing is kept unchanged. The resulting tokens
    are joined with single spaces.
    """
    swapped = []
    for token in nltk.word_tokenize(text):
        candidates = get_synonyms(token)
        swapped.append(candidates[0] if candidates else token)
    return " ".join(swapped)

# Build a synonym-substituted copy of every tweet, paired with its label.
# The two columns are iterated directly instead of using DataFrame.iterrows(),
# which constructs a Series object for every row.
# NOTE(review): the balancing cell further down re-initialises `augmented_data`
# before reading it, so this full pass over the corpus looks unused -- confirm
# and consider removing it.
augmented_data = [
    (replace_with_synonyms(text), label)
    for text, label in zip(df["tweets"], df["class"])
]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
import nltk
from nltk.corpus import wordnet
import pandas as pd
import random

nltk.download('wordnet')
nltk.download('punkt')

# Fixed seed so that both the choice of tweets to augment and the final shuffle
# give the same result on every run.
SEED = 42

# Step 1: Find the majority class
class_counts = df['class'].value_counts()
majority_class = class_counts.idxmax()
num_samples = class_counts[majority_class]

# Step 2: Find the minority classes
minority_classes = class_counts.index[class_counts < num_samples]

# Step 3: Calculate how many additional samples are needed for each minority class
additional_samples = {
    c: int(num_samples - class_counts[c]) for c in minority_classes
}

# Step 4: Apply the synonym augmentation to the texts in the minority classes
augmented_data = []
for c in minority_classes:
    class_tweets = df.loc[df['class'] == c, 'tweets']
    num_augmented = additional_samples[c]
    # replace_with_synonyms() contains no randomness, so drawing the same tweet
    # twice would add two identical rows. Sample without replacement whenever
    # the class has enough tweets, and fall back to replacement only when more
    # rows are needed than the class contains.
    sampled_tweets = class_tweets.sample(
        n=num_augmented,
        replace=num_augmented > len(class_tweets),
        random_state=SEED,
    )
    augmented_data.extend(
        (replace_with_synonyms(text), c) for text in sampled_tweets
    )

# Step 5: Combine the original dataset with the augmented samples and shuffle the rows
augmented_df = pd.DataFrame(augmented_data, columns=['tweets', 'class'])
balanced_df = pd.concat([df, augmented_df], axis=0)
balanced_df = balanced_df.sample(frac=1, random_state=SEED).reset_index(drop=True)

# Every class should now have exactly as many rows as the majority class.
assert balanced_df['class'].value_counts().nunique() == 1, "classes are not balanced"



[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Check for imbalance

In [None]:
# Distinct labels and per-label row counts after balancing.
print(np.unique(list(balanced_df["class"])))
print(balanced_df['class'].value_counts())
# Call head() -- printing the bare attribute shows the bound-method repr
# instead of the first rows.
print(balanced_df.head())

[0 1 2 3]
0    22001
2    22001
3    22001
1    22001
Name: class, dtype: int64
<bound method NDFrame.head of                                                   tweets  class
0          deep insight clockmaking arrest story sarcasm      0
1      office move looming already made start plan ah...      0
2      subway created 5 dollar ft long anthem 5 years...      0
3      hello please sign share petition get mentalhea...      2
4      aust republican movement cofounder swears bear...      0
...                                                  ...    ...
87999  finally realize friends w drug addict rmr dadv...      2
88000                    oh love made huge bitch sarcasm      3
88001  rundown gop candidates 2015 september debate l...      2
88002  glad fairandbalanced debate hosted megynkelly ...      3
88003  ‘consumers fraudsters’ magically banks pasa an...      3

[88004 rows x 2 columns]>


# Done

In [None]:
# Write the balanced frame to CSV without the index column.
output_path = "augmented_data_BALANCED.csv"
balanced_df.to_csv(output_path, index=False)
print("done")

done
