### Mount Google drive

*  Mount Google drive in the directory '/content/drive'
*  Drive contains dataset files

In [None]:
# Make the Google Drive folder that holds the dataset files available to this runtime.
from google.colab import drive

MOUNT_POINT = '/content/drive'
drive.mount(MOUNT_POINT)

### Install TextAttack

*  Install `textattack[tensorflow]` to use the library's data augmentation capabilities
*  Upgrade numpy to avoid an exception raised by TextAttack

In [None]:
# Install TextAttack (with its TensorFlow extras); it provides the text augmentation used in this notebook
!pip3 install textattack[tensorflow]

In [None]:
# Upgrade numpy to avoid an exception raised inside the TextAttack library
!pip install --upgrade numpy

In [None]:
# NOTE(review): conllu is pinned to 4.4.1, presumably for compatibility with TextAttack's dependencies — confirm
!pip install conllu==4.4.1

### Imports

In [None]:
import pandas as pd
import csv
from collections import Counter
import re

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

# import transformations, constraints, and the Augmenter
from textattack.transformations import WordSwapRandomCharacterDeletion
from textattack.transformations import WordSwapQWERTY
from textattack.transformations import CompositeTransformation
from textattack.transformations import WordSwapChangeLocation
from textattack.transformations import WordSwapChangeName
from textattack.transformations import WordSwapChangeNumber
from textattack.transformations import WordSwapContract
from textattack.transformations import WordSwapWordNet
from textattack.transformations import WordSwapRandomCharacterSubstitution

from textattack.constraints.pre_transformation import RepeatModification
from textattack.constraints.pre_transformation import StopwordModification

from textattack.augmentation import Augmenter

### Pre-processing

* Read training dataset files
* Join datasets in a single DataFrame
* Lowercase text (optional)
* Replace "\&amp;" with "&"
* Augment dataset
* Save augmented dataset

In [None]:
# Locations of the label file and the tweet file of the training set
class_file = "/content/drive/MyDrive/Dissertacao/Subtask_1a/training_data/train_class.tsv"
tweet_file = "/content/drive/MyDrive/Dissertacao/Subtask_1a/training_data/train_tweets.tsv"

# Set to True to lowercase every tweet
lowercase = False

# Read files (neither has a header row; the tweet file is read without quote handling)
class_df = pd.read_csv(class_file, sep='\t', header=None)
tweet_df = pd.read_csv(tweet_file, sep='\t', quoting=csv.QUOTE_NONE, header=None)

# Join both files in a single DataFrame.
# NOTE(review): rows are paired by position, which assumes both files list the tweets in the same order — confirm.
df = pd.DataFrame(
    data={"tweet_id": class_df[0], "label": class_df[1], "text": tweet_df[1]}
)

# Decode the HTML-escaped ampersand on the whole column at once. The previous
# per-row loop stopped at len(df)-1, so it never processed the last tweet, and
# it wrote through chained indexing (df["text"][i] = ...), which pandas does
# not guarantee to update the DataFrame.
df["text"] = df["text"].str.replace("&amp;", "&", regex=False)
if lowercase:
    df["text"] = df["text"].str.lower()


In [None]:
# Features: the tweet texts
X = df["text"]
X

In [None]:
# Targets: the class label of each tweet
y = df["label"]
y

In [None]:
# Report the class balance of the training set.
# The negative label is matched case-insensitively: the previous check compared
# against "noADE" while the message itself spells the class "NoADE", so an
# exact match could silently report zero negatives.
is_positive = df["label"] == "ADE"
is_negative = df["label"].astype(str).str.lower() == "noade"

print("There are ", int(is_positive.sum()), "positive examples (ADE) in this dataset.")
print("There are ", int(is_negative.sum()), "negative examples (NoADE) in this dataset.")

#### Augmentation

* Use following transformations:
  * Random character substitution
  * Character swap by adjacent QWERTY keyboard characters
  * Perform contractions (For example: "I am"->"I'm")
  * Swap words by Word Net synonyms 

In [None]:
# Set up transformation using CompositeTransformation()
# WordSwapRandomCharacterSubstitution - Transforms an input by replacing one character in a word with a random new character.
# WordSwapQWERTY - Swaps characters with QWERTY adjacent keys
# WordSwapWordNet - Transforms an input by replacing its words with synonyms provided by WordNet
# WordSwapContract - Transforms an input by performing contraction on recognized combinations
transformation = CompositeTransformation([WordSwapRandomCharacterSubstitution(), WordSwapQWERTY(), WordSwapWordNet(), WordSwapContract()])
# Set up constraints
constraints = [RepeatModification(), StopwordModification()]
# Create augmenter with specified parameters
augmenter = Augmenter(transformation=transformation, constraints=constraints, pct_words_to_swap=0.5, transformations_per_example=5)

# User handles are collapsed to a bare "@" before augmentation and expanded to
# the "@USER____" placeholder afterwards, so the augmenter does not rewrite them.
handle_pattern = re.compile(r'@\w+')

augmented_texts = []
n_augmented = 0
n_failed = 0
# Only the positive (ADE) tweets are augmented.
for tweet in df.loc[df["label"] == "ADE", "text"]:
    text = handle_pattern.sub('@', tweet).replace("'", "")
    try:
        variants = augmenter.augment(text)
    except IndexError:
        # The augmenter could not transform this tweet; count it and move on.
        n_failed += 1
        continue
    augmented_texts.extend(variant.replace("@", "@USER____") for variant in variants)
    n_augmented += 1

# Build X and y once, from the original columns plus the generated variants:
# * a single concat avoids re-copying the whole Series on every iteration;
# * starting from df keeps the cell re-runnable without stacking variants twice;
# * labels are sized from the variants actually produced, so X and y stay
#   aligned even if the augmenter returns fewer than 5 variants for a tweet
#   (TODO confirm against the TextAttack Augmenter documentation).
X = pd.concat([df["text"], pd.Series(augmented_texts, dtype=object)], ignore_index=True)
y = pd.concat([df["label"], pd.Series(["ADE"] * len(augmented_texts), dtype=object)], ignore_index=True)

print(n_augmented, "tweets augmented.")
print("Could not augment ", n_failed, "ADE tweets.")

In [None]:
# Label counts after augmentation; the author noted roughly 27% positive (ADE) labels
Counter(y)

In [None]:
# Save augmented dataset
# Fail before touching the output file if texts and labels are misaligned.
if len(X) != len(y):
    raise ValueError("X and y are misaligned: {} texts vs {} labels".format(len(X), len(y)))

# newline='' lets the csv module control line endings (as its documentation
# requires); UTF-8 is set explicitly so non-ASCII tweet text is written the
# same way on every platform.
with open("/content/drive/MyDrive/Dissertacao/Subtask_1a/augmented_training_data/augmented_training.tsv", 'wt', newline='', encoding='utf-8') as out_file:
    tsv_writer = csv.writer(out_file, delimiter='\t')
    tsv_writer.writerow(['text', 'label'])
    tsv_writer.writerows(zip(X, y))



#### Random Oversampling

In [None]:
# Random Oversampler

oversampler = RandomOverSampler(sampling_strategy=0.3)

# Resample the original tweets instead of X/y: the augmentation cell extends
# X and y, while this section's output file is the oversampling-only dataset.
# fit_resample expects a 2-D feature array, hence the single-column reshape.
X_oversampled, y_oversampled = oversampler.fit_resample(df["text"].to_numpy().reshape(-1, 1), df["label"])

print(len(X_oversampled))
Counter(y_oversampled)

In [None]:
# Save oversampled dataset
# Fail before touching the output file if texts and labels are misaligned.
if len(X_oversampled) != len(y_oversampled):
    raise ValueError("Resampled texts and labels are misaligned: {} vs {}".format(len(X_oversampled), len(y_oversampled)))

# newline='' lets the csv module control line endings; UTF-8 is set explicitly
# so non-ASCII tweet text is written the same way on every platform.
with open("/content/drive/MyDrive/Dissertacao/Subtask_1a/oversampled_training_data/oversampled_training.tsv", 'wt', newline='', encoding='utf-8') as out_file:
    tsv_writer = csv.writer(out_file, delimiter='\t')
    tsv_writer.writerow(['text', 'label'])
    # X_oversampled is a single-column 2-D array; column 0 holds the texts.
    tsv_writer.writerows(zip(X_oversampled[:, 0], y_oversampled))

#### Random Undersampling

In [None]:
# Random Undersampler

# (Previously this object was bound to a variable named "oversampler".)
undersampler = RandomUnderSampler(sampling_strategy=0.1)

# Resample the original tweets instead of X/y: the augmentation cell extends
# X and y with extra positive examples, while this section's output file is
# the undersampling-only dataset.
# fit_resample expects a 2-D feature array, hence the single-column reshape.
X_undersampled, y_undersampled = undersampler.fit_resample(df["text"].to_numpy().reshape(-1, 1), df["label"])

print(len(X_undersampled))
Counter(y_undersampled)

In [None]:
# Save undersampled dataset
# Fail before touching the output file if texts and labels are misaligned.
if len(X_undersampled) != len(y_undersampled):
    raise ValueError("Resampled texts and labels are misaligned: {} vs {}".format(len(X_undersampled), len(y_undersampled)))

# newline='' lets the csv module control line endings; UTF-8 is set explicitly
# so non-ASCII tweet text is written the same way on every platform.
with open("/content/drive/MyDrive/Dissertacao/Subtask_1a/undersampled_training_data/undersampled_training.tsv", 'wt', newline='', encoding='utf-8') as out_file:
    tsv_writer = csv.writer(out_file, delimiter='\t')
    tsv_writer.writerow(['text', 'label'])
    # X_undersampled is a single-column 2-D array; column 0 holds the texts.
    tsv_writer.writerows(zip(X_undersampled[:, 0], y_undersampled))

#### Augmentation and oversampling sequentially

* Use following transformations:
  * Random character substitution
  * Character swap by adjacent QWERTY keyboard characters
  * Perform contractions (For example: "I am"->"I'm")
  * Swap words by Word Net synonyms
* Random oversampling (strategy: 0.4)

In [None]:
# Set up transformation using CompositeTransformation():
# random character substitution, QWERTY-adjacent character swaps,
# WordNet synonym swaps and contractions.
transformation = CompositeTransformation([WordSwapRandomCharacterSubstitution(), WordSwapQWERTY(), WordSwapWordNet(), WordSwapContract()])
# Set up constraints
constraints = [RepeatModification(), StopwordModification()]
# Create augmenter with specified parameters
augmenter = Augmenter(transformation=transformation, constraints=constraints, pct_words_to_swap=0.5, transformations_per_example=5)

# User handles are collapsed to a bare "@" before augmentation and expanded to
# the "@USER____" placeholder afterwards, so the augmenter does not rewrite them.
handle_pattern = re.compile(r'@\w+')

augmented_texts = []
n_augmented = 0
n_failed = 0
# Only the positive (ADE) tweets are augmented.
for tweet in df.loc[df["label"] == "ADE", "text"]:
    text = handle_pattern.sub('@', tweet).replace("'", "")
    try:
        variants = augmenter.augment(text)
    except IndexError:
        # The augmenter could not transform this tweet; count it and move on.
        n_failed += 1
        continue
    augmented_texts.extend(variant.replace("@", "@USER____") for variant in variants)
    n_augmented += 1

# Start again from the original tweets: X and y may already hold the variants
# appended by the earlier augmentation cell, and appending to them here would
# add a second set of variants on top.
# Labels are sized from the variants actually produced, so X and y stay
# aligned even if the augmenter returns fewer than 5 variants for a tweet
# (TODO confirm against the TextAttack Augmenter documentation).
X = pd.concat([df["text"], pd.Series(augmented_texts, dtype=object)], ignore_index=True)
y = pd.concat([df["label"], pd.Series(["ADE"] * len(augmented_texts), dtype=object)], ignore_index=True)

print(n_augmented, "tweets augmented.")
print("Could not augment ", n_failed, "ADE tweets.")

In [None]:
# Label counts after augmentation; the author noted roughly 27% positive (ADE) labels
Counter(y)

In [None]:
# Randomly oversample the current X / y with a sampling strategy of 0.4

oversampler = RandomOverSampler(sampling_strategy=0.4)

# fit_resample is given the texts as a single-column 2-D array
features = X.to_numpy().reshape(-1, 1)
X_oversampled, y_oversampled = oversampler.fit_resample(features, y)

print(len(X_oversampled))
Counter(y_oversampled)

In [None]:
# Save augmented + oversampled dataset
# Fail before touching the output file if texts and labels are misaligned.
if len(X_oversampled) != len(y_oversampled):
    raise ValueError("Resampled texts and labels are misaligned: {} vs {}".format(len(X_oversampled), len(y_oversampled)))

# newline='' lets the csv module control line endings; UTF-8 is set explicitly
# so non-ASCII tweet text is written the same way on every platform.
with open("/content/drive/MyDrive/Dissertacao/Subtask_1a/augmented_oversampled_training_data/augmented_oversampled_training.tsv", 'wt', newline='', encoding='utf-8') as out_file:
    tsv_writer = csv.writer(out_file, delimiter='\t')
    tsv_writer.writerow(['text', 'label'])
    # X_oversampled is a single-column 2-D array; column 0 holds the texts.
    tsv_writer.writerows(zip(X_oversampled[:, 0], y_oversampled))

### Merging dataset to single file

In [None]:
# Number of examples and label distribution currently held in X / y
print(len(X))
Counter(y)

In [None]:
# Save merged dataset (the texts and labels currently held in X and y)
# Fail before touching the output file if texts and labels are misaligned.
if len(X) != len(y):
    raise ValueError("X and y are misaligned: {} texts vs {} labels".format(len(X), len(y)))

# newline='' lets the csv module control line endings; UTF-8 is set explicitly
# so non-ASCII tweet text is written the same way on every platform.
with open("/content/drive/MyDrive/Dissertacao/Subtask_1a/training_data/merged_training_dataset.tsv", 'wt', newline='', encoding='utf-8') as out_file:
    tsv_writer = csv.writer(out_file, delimiter='\t')
    # NOTE(review): the header declares six columns, but every data row below
    # only carries 'text' and 'label'; 'start', 'end', 'span' and 'med_id' are
    # never written. Confirm whether those columns should be filled or dropped.
    tsv_writer.writerow(['text', 'label', 'start', 'end', 'span', 'med_id'])
    tsv_writer.writerows(zip(X, y))