# Notebook used to remove duplicate data, balance classes, and create train/validation splits

# Imports

In [1]:
import pandas as pd

# Load file

In [2]:
# Read the raw training CSV into a DataFrame. The absolute '/content/...' path is
# hard-coded, so the file must already exist at that location before this cell runs
# (NOTE(review): looks like a Colab upload path — adjust when running elsewhere).
data = pd.read_csv('/content/train.En.csv')

# Cleanup: remove NaN tweet values and duplicate rows

In [3]:
# preview the frame as loaded, before any cleanup
data

Unnamed: 0.1,Unnamed: 0,tweet,sarcastic,rephrase,sarcasm,irony,satire,understatement,overstatement,rhetorical_question
0,0,The only thing I got from college is a caffein...,1,"College is really difficult, expensive, tiring...",0.0,1.0,0.0,0.0,0.0,0.0
1,1,I love it when professors draw a big question ...,1,I do not like when professors don’t write out ...,1.0,0.0,0.0,0.0,0.0,0.0
2,2,Remember the hundred emails from companies whe...,1,"I, at the bare minimum, wish companies actuall...",0.0,1.0,0.0,0.0,0.0,0.0
3,3,Today my pop-pop told me I was not “forced” to...,1,"Today my pop-pop told me I was not ""forced"" to...",1.0,0.0,0.0,0.0,0.0,0.0
4,4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1,I would say Ted Cruz is an asshole and doesn’t...,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
3463,3463,The population spike in Chicago in 9 months is...,0,,,,,,,
3464,3464,You'd think in the second to last English clas...,0,,,,,,,
3465,3465,I’m finally surfacing after a holiday to Scotl...,0,,,,,,,
3466,3466,Couldn't be prouder today. Well done to every ...,0,,,,,,,


In [4]:
# number of rows before cleanup
len(data)

3468

In [5]:
# keep only rows that actually carry tweet text (discard rows whose 'tweet' is NaN)
data = data.dropna(subset=['tweet'])

In [6]:
# row count after dropping NaN tweets: 1 row was removed (3468 -> 3467)
len(data)

3467

# Some tweets appear more than once, in some cases with opposite labels (noise in the data)

In [8]:
# flag every row whose tweet text occurs more than once
# (keep=False marks all copies, not only the later repeats)
is_repeated_tweet = data.duplicated(subset=['tweet'], keep=False)
duplicate_data = data[is_repeated_tweet]
duplicate_data

Unnamed: 0.1,Unnamed: 0,tweet,sarcastic,rephrase,sarcasm,irony,satire,understatement,overstatement,rhetorical_question
17,17,Whoever’s toddler ass sprayed the entire toile...,1,I hope whoever keeps peeing all over the bathr...,1.0,0.0,0.0,0.0,0.0,0.0
24,24,@AsdaServiceTeam imagine your delivery being 2...,1,It's not acceptable for you to just refund my ...,1.0,0.0,0.0,0.0,0.0,0.0
47,47,@AlexShawESPN @robwishart @GNev2 Does he order...,1,I don’t think he drank 11 beers every hour,1.0,0.0,0.0,0.0,0.0,1.0
338,338,Replace Pelosi #Nancy,1,He should not have been elected president and ...,1.0,0.0,0.0,0.0,0.0,0.0
648,648,if i saw a capybara in person id probably thro...,1,I think capybaras are scary,0.0,1.0,0.0,0.0,1.0,0.0
918,918,Whoever’s toddler ass sprayed the entire toile...,0,,,,,,,
927,927,@kevinabstract ye,0,,,,,,,
928,928,@kevinabstract ye,0,,,,,,,
939,939,@AsdaServiceTeam imagine your delivery being 2...,0,,,,,,,
940,940,@AsdaServiceTeam imagine your delivery being 2...,0,,,,,,,


In [9]:
# Collapse repeated tweets down to their first occurrence. In the duplicate listing
# the sarcastic (1) copy of a conflicting pair has the lower index, so keeping the
# first row keeps the positive label — this relies on positives preceding negatives.
data = data.loc[~data.duplicated(subset=['tweet'], keep='first')]

# sanity check: this displays an empty frame when no repeated tweet text remains
data[data.duplicated(subset=['tweet'], keep=False)]

Unnamed: 0.1,Unnamed: 0,tweet,sarcastic,rephrase,sarcasm,irony,satire,understatement,overstatement,rhetorical_question


In [10]:
# positive class: every remaining row labelled sarcastic == 1
positive = data.loc[data['sarcastic'].eq(1)]
len(positive)

867

In [11]:
# undersample the negative class (sarcastic == 0) down to the size of the positive
# class; the fixed random_state makes the draw repeatable
negative = data.loc[data['sarcastic'].eq(0)].sample(n=len(positive), random_state=10)
len(negative)

867

In [12]:
# concatenate positive and negative to get balanced data.
# Rows keep their original index labels (no ignore_index), which createSplit
# uses to tell validation rows apart from training rows.
balanced_data = pd.concat([positive, negative])

# Create five training and validation splits
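Each validation set holds roughly 1/10 of the balanced data (86 examples per class, 172 of 1,734); the remaining 1,562 examples form the training set.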

In [13]:
# Per-class validation quota: one tenth of the positive-class count, rounded down
# by integer division. createSplit draws this many rows from each class, so a
# validation set contains 2 * half_val_size rows.
half_val_size = len(positive) // 10
print("Target num of validation examples for each class:", half_val_size)

Target num of validation examples for each class: 86


In [14]:
# one train/validation split is generated (and saved) per seed in this list
seed_list = [10, 21, 33, 48, 51]

In [15]:
def createSplit(seedvalue: int):
  """Build one shuffled, class-balanced train/validation split.

  The function takes no data arguments: it reads the notebook-level names
  `positive`, `negative`, `balanced_data` and `half_val_size`, so those cells
  must have been run (and be in sync with each other) before calling it.

  Args:
    seedvalue: random_state passed to every sampling and shuffling step, so
      the same seed always reproduces the same split.

  Returns:
    A `(training, validation)` tuple of DataFrames. `validation` contains
    `half_val_size` rows sampled from `positive` and `half_val_size` rows
    sampled from `negative`; `training` contains every row of `balanced_data`
    whose index label is not in `validation`. Both frames are shuffled.

  Raises:
    AssertionError: if, after combining the two splits and discarding every
      tweet text that occurs more than once, the row count differs from
      `len(balanced_data)` — i.e. a tweet is repeated within/across the
      splits or the global frames are out of sync.
  """
  # sample the target number of positive and negative validation examples
  val_positive = positive.sample(n=half_val_size, random_state=seedvalue)
  val_negative = negative.sample(n=half_val_size, random_state=seedvalue)

  # concatenate positive and negative validation examples
  validation = pd.concat([val_positive, val_negative])

  # shuffle validation data
  validation = validation.sample(frac=1, random_state=seedvalue)

  # training data are the examples whose index label does not appear in the
  # validation set (vectorised mask; row order of balanced_data is preserved,
  # so the seeded shuffle below is unchanged)
  in_validation = balanced_data.index.isin(validation.index)
  training = balanced_data.loc[~in_validation]

  # shuffle training data
  training = training.sample(frac=1, random_state=seedvalue)

  # check for overlap between train and validation: keep=False drops every copy
  # of a repeated tweet, so any repetition shrinks the count below the expected size
  result = pd.concat([training, validation]).drop_duplicates(subset=['tweet'], keep=False)
  print('Length of concatenated train and validation:', len(result))
  print('Length of balanced data:', len(balanced_data))

  # enforce the check instead of relying on someone reading the printed numbers
  assert len(result) == len(balanced_data), (
      'train/validation split is inconsistent with balanced_data: '
      'expected %d unique tweets, found %d' % (len(balanced_data), len(result)))

  return training, validation

In [16]:
# call function for each seed value and write each split to its own pair of CSV
# files in the current working directory; the seed is embedded in the file name
for seed_value in seed_list:
  training, validation = createSplit(seed_value)
  # NOTE(review): to_csv is called without index=False, so the DataFrame index is
  # written as an extra unnamed leading column — confirm downstream readers expect that
  training.to_csv('balanced_train_En_seed' + str(seed_value) +'.csv')
  validation.to_csv('balanced_validation_En_seed' + str(seed_value) + '.csv')


Length of concatenated train and validation: 1734
Length of balanced data: 1734
Length of concatenated train and validation: 1734
Length of balanced data: 1734
Length of concatenated train and validation: 1734
Length of balanced data: 1734
Length of concatenated train and validation: 1734
Length of balanced data: 1734
Length of concatenated train and validation: 1734
Length of balanced data: 1734
