## Downloading a dataset

In [2]:
import requests
import zipfile
import io
import os

In [3]:
# Filtered ParaNMT archive from the skoltech-nlp/detox GitHub release (tag emnlp2021).
link = "https://github.com/skoltech-nlp/detox/releases/download/emnlp2021/filtered_paranmt.zip"
# Bound the wait with a timeout and fail fast on an HTTP error, so a failed
# download is reported here instead of surfacing later as a BadZipFile error
# when the response body is opened as a zip archive.
r = requests.get(link, timeout=60)
r.raise_for_status()

In [4]:
# Open the downloaded bytes as an in-memory zip archive and extract everything
# into the current working directory; the context manager closes the archive
# as soon as extraction finishes.
with zipfile.ZipFile(io.BytesIO(r.content)) as z:
    z.extractall()

In [5]:
# List the working directory to confirm which files the archive extracted.
os.listdir()

['.ipynb_checkpoints',
 '1.0-initial-data-exploration.ipynb',
 '2.0-data-preprocessing.ipynb',
 'filtered.tsv']

## Reading the dataset

In [6]:
import pandas as pd

In [7]:
# Load the extracted tab-separated file into a DataFrame.
data = pd.read_csv('filtered.tsv', sep='\t')

In [8]:
# The file carries an extra, unnamed first column that is not needed; drop it.
# Reassigning (rather than inplace=True) together with errors="ignore" keeps
# the cell re-runnable: a second execution no longer raises KeyError.
data = data.drop(columns=["Unnamed: 0"], errors="ignore")

|Column name     |   Description |
| --- | --------- |
| reference|           original text|
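|translation|         paraphrased text (meant to be the less toxic version; verify with `ref_tox`/`trn_tox`, since in the sample rows shown below `trn_tox` is the higher score)|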
|similarity|          cosine similarity between the two texts (how similar they are)|
|lenght_diff|         relative length difference($\frac{\text{translation}-\text{ref}}{\text{ref}}$)|
| ref_tox|toxicity of reference|
|trn_tox|toxicity of translation|

## Preprocessing the dataset

In [9]:
# Preview the first rows of the loaded table.
data.head()

Unnamed: 0,reference,translation,similarity,lenght_diff,ref_tox,trn_tox
0,"If Alkar is flooding her with psychic waste, t...","if Alkar floods her with her mental waste, it ...",0.785171,0.010309,0.014195,0.981983
1,Now you're getting nasty.,you're becoming disgusting.,0.749687,0.071429,0.065473,0.999039
2,"Well, we could spare your life, for one.","well, we can spare your life.",0.919051,0.268293,0.213313,0.985068
3,"Ah! Monkey, you've got to snap out of it.","monkey, you have to wake up.",0.664333,0.309524,0.053362,0.994215
4,I've got orders to put her down.,I have orders to kill her.,0.726639,0.181818,0.009402,0.999348


### Text Cleaning

In [10]:
import re

def lower_text(text):
    """Return a lowercase copy of ``text``."""
    lowered = text.lower()
    return lowered

def remove_numbers(text):
    """Replace every run of consecutive digits in ``text`` with a single space."""
    return re.sub(r'\d+', ' ', text)

def remove_punc(text):
    """Replace every character that is not a lowercase ASCII letter or whitespace with a space.

    Expects text that has already been lowercased: uppercase letters do not
    match ``a-z`` and are therefore replaced as well.
    """
    # No '|' inside the class: within [...] it is a literal pipe, not
    # alternation, so the previous pattern let '|' characters through untouched.
    return re.sub(r'[^a-z\s]', ' ', text)

def remove_multi_spaces(text):
    """Collapse each run of whitespace into one space and strip both ends."""
    # Raw string: '\s' inside a plain string literal is an invalid escape
    # sequence, which current Python versions warn about.
    return re.sub(r'\s+', ' ', text).strip()

In [16]:
# Demonstrate the cleaning helpers on one reference sentence (row 43).
sample_text = data["reference"][43]
print(f"Original text: \'{sample_text}\'")
# Apply the steps in order: lowercase -> digits -> punctuation -> whitespace.
clean_text = lower_text(sample_text)
clean_text = remove_numbers(clean_text)
clean_text = remove_punc(clean_text)
clean_text = remove_multi_spaces(clean_text)
print(f"Clean text: \'{clean_text}\'")

Original text: 'I swear to God, the best thing I ever did in my life was save that little son of a bitch'
Clean text: 'i swear to god the best thing i ever did in my life was save that little son of a bitch'


### Tokenization

In [17]:
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import nltk
# word_tokenize needs the Punkt tokenizer models in addition to the stop-word
# list; fetch both so the notebook also runs where no NLTK data is installed.
# NOTE(review): recent NLTK releases look for 'punkt_tab' instead of 'punkt' —
# confirm against the installed version.
nltk.download('stopwords')
nltk.download('punkt')
stop_words = stopwords.words('english')

def tokenize_text(text):
    """Split ``text`` into tokens with NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens

def remove_stop_words(tokens):
    """Return, in order, the tokens that are not in the module-level ``stop_words``."""
    kept = []
    for token in tokens:
        if token in stop_words:
            continue
        kept.append(token)
    return kept

def stem_words(tokens):
    """Return a list holding the Porter stem of each token."""
    stem = PorterStemmer().stem
    return list(map(stem, tokens))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Vlad\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
# Tokenize the cleaned sample, drop stop words, then stem what remains.
tokenized_text = stem_words(remove_stop_words(tokenize_text(clean_text)))

In [19]:
# Show the resulting tokens for the sample sentence.
tokenized_text

['swear',
 'god',
 'best',
 'thing',
 'ever',
 'life',
 'save',
 'littl',
 'son',
 'bitch']

In [20]:
def preprocess(text):
    """Run the full cleaning and tokenization pipeline on one string.

    Steps, in order: lowercase, replace digits, replace punctuation, collapse
    whitespace, tokenize, drop stop words, stem. Returns the stemmed tokens.
    """
    pipeline = (
        lower_text,
        remove_numbers,
        remove_punc,
        remove_multi_spaces,
        tokenize_text,
        remove_stop_words,
        stem_words,
    )
    result = text
    for step in pipeline:
        # Each step consumes the previous step's output.
        result = step(result)
    return result

In [22]:
# Replace both text columns with their preprocessed token lists.
# Note: this overwrites the raw strings, so re-running the cell on the already
# tokenized columns fails (preprocess calls .lower() on its input).
for column in ('reference', 'translation'):
    data[column] = data[column].apply(preprocess)