## Downloading a dataset

In [1]:
import requests
import zipfile
import io
import os

In [2]:
# Release archive that contains the filtered ParaNMT TSV file
link = "https://github.com/skoltech-nlp/detox/releases/download/emnlp2021/filtered_paranmt.zip"
# The timeout keeps the cell from hanging forever on a stalled connection
r = requests.get(link, timeout=60)
# Fail here on an HTTP error instead of later, when the body is parsed as a zip
r.raise_for_status()

In [3]:
# Unpack the downloaded archive into the current working directory.
# The context manager closes the archive handle once extraction is done.
with zipfile.ZipFile(io.BytesIO(r.content)) as z:
    z.extractall()

In [4]:
# List the working directory to confirm what the archive extraction produced
os.listdir()

['.ipynb_checkpoints',
 '1.0-initial-data-exploration.ipynb',
 '2.0-data-preprocessing.ipynb',
 '3.0-final-preparation-of-datasets.ipynb',
 'filtered.tsv']

## Reading the dataset

In [5]:
import pandas as pd

In [6]:
# The file is tab-separated, so the separator is passed explicitly
data = pd.read_csv('filtered.tsv', sep='\t')

In [7]:
# The file has an extra first column ("Unnamed: 0") that we need to remove.
# Reassigning with errors="ignore" (instead of inplace=True) keeps this cell
# safe to re-run: once the column is gone, a second run is a no-op.
data = data.drop(columns=["Unnamed: 0"], errors="ignore")

|Column name     |   Description |
| --- | --------- |
| reference|           original text|
|translation|         modified text(less toxic)|
|similarity|          cosine similarity of text(how similar they are)|
|lenght_diff|         relative length difference($\frac{\text{translation}-\text{ref}}{\text{ref}}$)|
| ref_tox|toxicity of reference|
|trn_tox|toxicity of translation|

## Preprocessing the dataset

### Text Cleaning

In [8]:
import re

def lower_text(text):
    """Return a lowercase copy of `text`."""
    lowered = text.lower()
    return lowered

def remove_numbers(text):
    """Replace every run of consecutive digits in `text` with a single space."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub(' ', text)

def remove_punc(text):
    """Replace every character that is not a-z or whitespace with a space.

    Only lowercase ASCII letters are kept, so uppercase letters are replaced
    as well; lowercase the text first if they should survive.

    Args:
        text: The string to clean.

    Returns:
        A string of the same length with each disallowed character replaced
        by a single space.
    """
    # Inside a character class `|` is a literal pipe, not alternation, so the
    # previous pattern `[^a-z|\s]` let pipe characters through untouched.
    text_nopunc = re.sub(r'[^a-z\s]', ' ', text)
    return text_nopunc

def remove_multi_spaces(text):
    """Collapse each run of whitespace into one space and trim both ends.

    Args:
        text: The string to normalize.

    Returns:
        The text with every whitespace run (spaces, tabs, newlines) replaced
        by a single space and no leading or trailing whitespace.
    """
    # Raw string: '\s' in a normal string literal is an invalid escape
    # sequence, which newer Python versions warn about.
    text_no_doublespaces = re.sub(r'\s+', ' ', text).strip()
    return text_no_doublespaces

### Tokenization

In [9]:
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
# word_tokenize relies on the Punkt sentence tokenizer models, which are not
# bundled with NLTK; without them a fresh environment raises LookupError.
# NOTE(review): newer NLTK releases look for 'punkt_tab' and older ones for
# 'punkt' - both are requested here; confirm against the installed version.
nltk.download('punkt')
nltk.download('punkt_tab')
stop_words = stopwords.words('english')

def tokenize_text(text):
    """Split `text` into tokens with NLTK's `word_tokenize`."""
    tokens = word_tokenize(text)
    return tokens

def remove_stop_words(tokens):
    """Return the tokens that are not in the module-level `stop_words`, in order."""
    kept = []
    for token in tokens:
        if token in stop_words:
            continue
        kept.append(token)
    return kept

def stem_words(tokens):
    """Return a list holding the Porter stem of every token, in order."""
    stem = PorterStemmer().stem
    return list(map(stem, tokens))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Vlad\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
def preprocess(text):
    """Turn one raw string into a list of cleaned, stemmed tokens.

    Steps, in order: lowercase, strip digits, strip punctuation, collapse
    whitespace, tokenize, drop stop words, stem.
    """
    cleaned = text
    # String-level cleaning steps, each taking and returning a string
    for clean_step in (lower_text, remove_numbers, remove_punc, remove_multi_spaces):
        cleaned = clean_step(cleaned)

    tokens = tokenize_text(cleaned)
    # Token-level steps, each taking and returning a list of tokens
    for token_step in (remove_stop_words, stem_words):
        tokens = token_step(tokens)

    return tokens

In [11]:
# Replace both text columns with their token lists produced by `preprocess`.
# NOTE: this overwrites the raw strings in place, so running this cell a second
# time would pass lists to `preprocess`, whose first step calls `.lower()` on
# its input - reload `data` before re-running.
data['reference'] = data['reference'].apply(preprocess)
data['translation'] = data['translation'].apply(preprocess)

### Final Data Preparation

In [12]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the full dataset as the test split; the seed fixes the shuffle
test_ratio = 0.1
train_val, test = train_test_split(data, random_state=42, test_size=test_ratio)

In [13]:
# Split the remainder again: 20% of `train_val` (not of the full dataset)
# becomes the validation split, the rest is used for training
val_ratio = 0.2
train, val = train_test_split(train_val, random_state=42, test_size=val_ratio)

#### Creating DataLoaders

In [14]:
from itertools import chain
def yield_tokens(df):
    """Yield, for each row, the tokens of its first two columns as one flat list.

    The columns are selected by position (the first two), not by name; each
    cell is expected to be an iterable of tokens.

    Args:
        df: DataFrame whose first two columns hold token sequences.

    Yields:
        list: The concatenated tokens of the row's first two cells.
    """
    # itertuples over just the two needed columns avoids building a pandas
    # Series for every row, which is what made iterrows() slow here.
    for row in df.iloc[:, :2].itertuples(index=False, name=None):
        yield list(chain.from_iterable(row))

In [15]:
!pip install torchtext


Usage:   
  pip install [options] <requirement specifier> [package-index-options] ...
  pip install [options] -r <requirements file> [package-index-options] ...
  pip install [options] [-e] <vcs project url> ...
  pip install [options] [-e] <local project path> ...
  pip install [options] <archive url/path> ...

no such option: -m


In [16]:
from torchtext.vocab import build_vocab_from_iterator

# Define special symbols and indices
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3
# Make sure the tokens are in order of their indices to properly insert them in vocab
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']

# The vocabulary is built from the training split only
vocab = build_vocab_from_iterator(yield_tokens(train), specials=special_symbols)
# Map tokens that never occurred in `train` to <unk>. Without a default index,
# looking up an unseen token (e.g. from the validation or test split) raises
# an error instead of returning an index.
vocab.set_default_index(UNK_IDX)

In [17]:
# Pick one preprocessed training example; 356049 is matched against the index
# labels carried over from `data`, not against the row position in `train`
sample = train['reference'][356049]
print(sample)

['brain', 'noth']


In [18]:
# Calling the vocabulary on a list of tokens returns their integer indices
encoded = vocab(sample)
print(encoded)

[131, 128]


In [19]:
import torch
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
# Seed PyTorch's random number generator
torch.manual_seed(42)

# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [20]:
class CustomTextDataset(Dataset):
    """Dataset of (reference, translation) pairs encoded as index tensors.

    Args:
        data: DataFrame with 'reference' and 'translation' columns, each cell
            holding a list of tokens.
        vocab: Callable that maps a list of tokens to a list of integer
            indices.
    """

    def __init__(self, data, vocab):
        self.data = data
        self.vocab = vocab

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the encoded (reference, translation) pair at position `idx`.

        Both items are 1-D `torch.long` tensors of token indices.
        """
        # Positional lookup: the DataFrame index labels are not consecutive
        # after the train/val/test split.
        sample = self.data.iloc[idx]
        # Encode with the vocabulary given to the constructor rather than a
        # global one. The explicit dtype keeps empty token lists integer-typed
        # (torch.tensor([]) would otherwise default to float32).
        ref = torch.tensor(self.vocab(sample['reference']), dtype=torch.long)
        trn = torch.tensor(self.vocab(sample['translation']), dtype=torch.long)
        return ref, trn

In [21]:
def collate_batch(batch, padding_value=None):
    """Pad a batch of encoded (reference, translation) pairs to equal length.

    Args:
        batch: Iterable of (ref, trn) pairs, each item a 1-D tensor (or list)
            of token indices.
        padding_value: Index used to fill the padded positions. Defaults to
            the notebook-level PAD_IDX when not given.

    Returns:
        tuple: (padded_refs, padded_trns), two `torch.long` tensors of shape
        (batch_size, longest_sequence_in_batch).
    """
    if padding_value is None:
        # Pad with the <pad> index; padding with 0 would fill with <unk>
        padding_value = PAD_IDX
    ref_list, trn_list = [], []
    for ref, trn in batch:
        # Items are already encoded, so they are only converted, not looked
        # up in the vocabulary again. The dtype is forced because an empty
        # sequence may otherwise arrive as a float tensor.
        ref_list.append(torch.as_tensor(ref, dtype=torch.long))
        trn_list.append(torch.as_tensor(trn, dtype=torch.long))
    padded_refs = pad_sequence(ref_list, batch_first=True, padding_value=padding_value)
    padded_trns = pad_sequence(trn_list, batch_first=True, padding_value=padding_value)
    return padded_refs, padded_trns

In [22]:
# Wrap each split in a dataset; all three share the same vocabulary
train_dataset = CustomTextDataset(train, vocab)
val_dataset = CustomTextDataset(val, vocab)
test_dataset = CustomTextDataset(test, vocab)

In [23]:
# Number of samples per batch, shared by all three data loaders
batch_size = 32

In [25]:
# Only the training loader shuffles; validation and test keep the dataset order
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
val_loader = DataLoader(val_dataset, batch_size=batch_size, collate_fn=collate_batch)
test_loader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=collate_batch)


<h4>Now that the dataloaders are ready, we can proceed to developing the model.</h4>