In [13]:
# standard library
import os
import time

# libraries for data processing
import numpy as np
import pandas as pd

# libraries for stratified splitting the datasets
from sklearn.model_selection import train_test_split

# libraries for back translation
import textblob
from textblob.translate import NotTranslated
from tqdm import tqdm

In [2]:
# home folder for the dataset; set the MMHS_DATA_HOME environment variable to
# point the notebook at a different location without editing this cell
DATA_HOME = os.environ.get('MMHS_DATA_HOME', '/dgxhome/cra5302/MMHS')

# destination files for data, relative to DATA_HOME
TRAINFILE = 'Train/train.csv'
TESTFILE = 'Test/test.csv'
UNLABELED = 'Train/unlabeled.csv'

In [3]:
# read the original training data from the dataset folder
train_df = pd.read_csv(f"{DATA_HOME}/Train/trainAD_lbl.csv")

In [4]:
# number of samples (rows) in the training data
len(train_df)

125503

Use the back-translation technique to obtain paraphrased samples of the original text.

The samples are translated as follows:
> English -> French -> English

In [5]:
def back_translate(text, pivot="fr", target="en", delay=1.0):
    """Paraphrase ``text`` by translating it to a pivot language and back.

    With the defaults the text goes English -> French -> English through
    TextBlob's ``translate`` method.

    NOTE(review): ``TextBlob.translate`` is reported as deprecated in
    textblob 0.16 and removed in 0.18 -- confirm the installed version
    still provides it.

    Parameters
    ----------
    text : str
        Sentence to paraphrase.
    pivot : str, optional
        Language code of the intermediate language (default ``"fr"``).
    target : str, optional
        Language code the pivot text is translated back into
        (default ``"en"``).
    delay : float, optional
        Seconds to sleep before the translation requests are sent
        (default ``1.0``). Values <= 0 disable the pause.

    Returns
    -------
    str
        The back-translated text. ``text`` itself is returned unchanged when
        it is not a string, is blank, or when textblob raises
        ``NotTranslated``.
    """
    # Missing values (e.g. NaN cells) and blank strings have nothing to
    # translate: hand them back without spending a request or the pause.
    if not isinstance(text, str) or not text.strip():
        return text

    # without this pause, textblob blocks further requests after some time
    if delay > 0:
        time.sleep(delay)

    try:
        forward_blob = textblob.blob.TextBlob(text)
        pivot_text = str(forward_blob.translate(to=pivot))

        backward_blob = textblob.blob.TextBlob(pivot_text)
        return str(backward_blob.translate(to=target))

    except NotTranslated:
        # fall back to the original text when textblob reports NotTranslated
        return text

In [7]:
# raw tweets to paraphrase
original_text = train_df["Text"].values

# NOTE(review): back_translate pauses about a second before every request, so
# expect this cell to take a long time on the full training set.
backtranslated_text = list(map(back_translate, tqdm(original_text)))


100%|██████████| 125503/125503 [00:00<00:00, 2111105.15it/s]


In [8]:
# attach the paraphrased text next to the original text
train_df = train_df.assign(bt_Text=backtranslated_text)

In [9]:
# preview the first rows, including the new bt_Text column
train_df.head(5)

Unnamed: 0.1,Unnamed: 0,tweet_id,Label,Text,bt_Text
0,0,1023940590382268417,0,@Eamon0303 @CNN Better than the retard in the ...,@Eamon0303 @CNN Better than the retard in the ...
1,1,1023940826882293760,5,@meloIigya When you’re slowly becoming retarde...,@meloIigya When you’re slowly becoming retarde...
2,2,1023940897346658307,0,Some of y’all just be throwing darts at SJW wo...,Some of y’all just be throwing darts at SJW wo...
3,3,1023942343202881536,0,Via: https://t.co/ikFHsOOttG Gracious latina M...,Via: https://t.co/ikFHsOOttG Gracious latina M...
4,4,1023943177319919616,5,Retard joke. :D https://t.co/gzz0viCBrd,Retard joke. :D https://t.co/gzz0viCBrd


> Split the samples into training and unlabeled samples

In [10]:
# single-column frame holding the class labels (used for stratification)
y_train_df = train_df[["Label"]]

In [11]:
# Stratified split on Label: 10% of the rows stay as training data
# (X_train / y_train); the remaining 90% (X_test / y_test) become the
# unlabeled pool.
# random_state is fixed so that re-running the notebook reproduces the same
# partition, and therefore the same CSV files.
X_train, X_test, y_train, y_test = train_test_split(
    train_df,
    y_train_df,
    stratify=y_train_df,
    test_size=0.9,
    random_state=42,
)

In [14]:
# distinct labels in the training split and how often each occurs
values, counts = np.unique(y_train.to_numpy(), return_counts=True)
(values, counts)

(array([0, 1, 2, 3, 4, 5]), array([10534,   950,   279,   309,    13,   465]))

In [16]:
# Write the training subset and the unlabeled pool to the destination files
# defined in the configuration cell instead of repeating the paths here.
# NOTE(review): to_csv also writes the index as an unnamed column; reading the
# file back yields an extra "Unnamed: 0" column -- confirm consumers expect it.
X_train.to_csv(f"{DATA_HOME}/{TRAINFILE}")
X_test.to_csv(f"{DATA_HOME}/{UNLABELED}")

In [21]:
# preview the first rows of the training split
X_train.head(5)

Unnamed: 0.1,Unnamed: 0,tweet_id,Label,Text,bt_Text
102351,102351,1114500110514360322,0,"Yeah thought so, what a fucking dumbass. Retar...","Yeah thought so, what a fucking dumbass. Retar..."
65768,65768,1108265761296900096,3,fuck this fat faggot bitch https://t.co/9lCWnX...,fuck this fat faggot bitch https://t.co/9lCWnX...
76198,76198,1109267226471862272,0,@ZacheryMcwhert3 @ballistic108 @alpharadtv Are...,@ZacheryMcwhert3 @ballistic108 @alpharadtv Are...
3756,3756,1036316277441286144,0,"The ""future"" is here and it looks retarded. ht...","The ""future"" is here and it looks retarded. ht..."
31469,31469,1061330486805585921,0,@byronf1 R u retarded or what? https://t.co/N4...,@byronf1 R u retarded or what? https://t.co/N4...


In [18]:
# (rows, columns) of the unlabeled pool
np.shape(X_test)

(112953, 5)

In [20]:
# original test data
test_df = pd.read_csv(f"{DATA_HOME}/Test/testAD_lbl.csv")
original_text = test_df["Text"].values

# Back-translation is not applied to the test split: bt_Text is an unmodified
# copy of Text. Map back_translate over the texts here to enable it.
backtranslated_text = list(tqdm(original_text))
test_df["bt_Text"] = backtranslated_text

# write to the destination file defined in the configuration cell
test_df.to_csv(f"{DATA_HOME}/{TESTFILE}")


100%|██████████| 8411/8411 [00:00<00:00, 1501715.09it/s]
