## Downloading a dataset

In [1]:
import requests
import zipfile
import io
import os

In [2]:
link = "https://github.com/skoltech-nlp/detox/releases/download/emnlp2021/filtered_paranmt.zip"
r = requests.get(link)

In [3]:
z = zipfile.ZipFile(io.BytesIO(r.content))
z.extractall()

In [4]:
os.listdir()

['.ipynb_checkpoints',
 '1.0-initial-data-exploration.ipynb',
 '2.0-data-preprocessing.ipynb',
 '3.0-final-preparation-of-datasets.ipynb',
 'filtered.tsv']

## Reading the dataset

In [5]:
import pandas as pd

In [6]:
data = pd.read_csv('filtered.tsv', sep='\t')

In [7]:
# we can see that it has extra column (the first one) that we need to remove
data.drop(columns=["Unnamed: 0"], inplace=True)

|Column name     |   Description |
| --- | --------- |
| reference|           original text|
|translation|         modified text(less toxic)|
|similarity|          cosine similarity of text(how similar they are)|
|lenght_diff|         relative length difference($\frac{\text{translation}-\text{ref}}{\text{ref}}$)|
| ref_tox|toxicity of reference|
|trn_tox|toxicifiy of translation|

## Preprocessing the dataset

### Text Cleaning

In [8]:
import re

def lower_text(text):
    return text.lower()

def remove_numbers(text):
    text_nonum = re.sub(r'\d+', ' ', text)
    return text_nonum

def remove_punc(text):
    text_nopunc = re.sub(r'[^a-z|\s]', ' ', text)
    return text_nopunc

def remove_multi_spaces(text):
    text_no_doublespaces = re.sub('\s+', ' ', text).strip()
    return text_no_doublespaces

### Tokenization

In [9]:
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
stop_words = stopwords.words('english')

def tokenize_text(text):
    return word_tokenize(text)

def remove_stop_words(tokens):
    return [w for w in tokens if w not in stop_words]

def stem_words(tokens):
    stemmer = PorterStemmer()
    return [stemmer.stem(word) for word in tokens]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Vlad\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
def preprocess(text):
    _lowered = lower_text(text)
    _without_numbers = remove_numbers(_lowered)
    _without_punct = remove_punc(_without_numbers)
    _single_spaced = remove_multi_spaces(_without_punct)
    _tokenized = tokenize_text(_single_spaced)
    _without_sw = remove_stop_words(_tokenized)
    _stemmed = stem_words(_without_sw)
    
    return _stemmed

In [11]:
data['reference'] = data['reference'].apply(preprocess)
data['translation'] = data['translation'].apply(preprocess)

### Final Data Preporation

In [13]:
from sklearn.model_selection import train_test_split
test_ratio = 0.1
train_val, test = train_test_split(
    data, test_size=test_ratio, random_state=42)

In [14]:
val_ratio = 0.2
train, val = train_test_split(
    train_val, test_size=val_ratio, random_state=42
)

#### Creating DataLoaders

In [21]:
for _,row in train.iterrows():
    print(row.to_list()[:2])

[['crap'], ['talk']]
[['dalia', 'vacat', 'boyfriend', 'yacht', 'turk', 'caico', 'sound', 'excit', 'destin', 'class'], ['dali', 'vacat', 'boyfriend', 'yacht', 'turk', 'caico', 'sound', 'great', 'place', 'suck']]
[['shooter', 'collater', 'take'], ['shooter', 'line', 'get']]
[['pathet', 'part', 'gon', 'na', 'work'], ['sad', 'part', 'go', 'work']]
[['consid', 'possibl', 'reason', 'laid', 'last', 'decad', 'someth', 'routin', 'use', 'word', 'trim'], ['consid', 'possibl', 'reason', 'sulli', 'last', 'ten', 'year', 'fact', 'say', 'often', 'word', 'pussi']]
[['yeah', 'right', 'place', 'freak', 'holiday'], ['crazi', 'vacat', 'spot']]
[['beat', 'us', 'kill', 'us', 'care', 'send', 'us'], ['care', 'us', 'send', 'us']]
[['come', 'long', 'line', 'navi', 'men', 'got', 'real', 'good', 'memori'], ['marin', 'famili', 'damn', 'good', 'memori']]
[['damn', 'colonel', 'corps', 'hang', 'never', 'even', 'rescu'], ['colonel', 'hang', 'around', 'save']]
[['alway', 'bleed', 'wash'], ['still', 'fuck', 'wash']]
[['r

[['ambush', 'us', 'still', 'take'], ['ambush', 'us', 'still', 'shoot']]
[['stupid', 'think', 'driven', 'way', 'back', 'wasp', 'hill', 'road', 'probabl', 'wear', 'noth', 'jockey'], ['silli', 'think', 'drove', 'back', 'wasp', 'hill', 'road', 'probabl', 'pant']]
[['capabl', 'kill', 'die', 'equal', 'find', 'way', 'equal', 'life'], ['abl', 'kill', 'die', 'equal', 'find', 'way', 'equal', 'life']]
[['could', 'kill', 'us', 'barri'], ['could', 'kill', 'us', 'barri']]
[['must', 'great', 'poverti', 'part', 'peopl', 'risk', 'butt', 'dollar'], ['must', 'great', 'miseri', 'area', 'peopl', 'will', 'risk', 'ass']]
[['christ', 'get', 'rid', 'blackguard'], ['oh', 'god', 'sake', 'get', 'rid', 'bastard']]
[['oda', 'nobunaga', 'first', 'shinobi', 'thief'], ['former', 'ninja', 'odi', 'nobunaga', 'thief']]
[['fuck', 'told', 'ill'], ['told', 'sick']]
[['take', 'bloodi', 'money', 'even', 'introduc', 'friend'], ['live', 'money', 'introduc', 'friend']]
[['hush'], ['shut']]
[['kick', 'butt', 'last', 'two', 'day']

[['hell', 'dwell', 'ask'], ['hell', 'ask']]
[['asshol'], ['swine']]
[['look', 'asshol', 'felix', 'threat', 'entir', 'mission'], ['look', 'guy', 'felix', 'threat', 'mission']]
[['eye', 'call', 'liar', 'work'], ['eye', 'give', 'liar', 'work']]
[['one', 'shot', 'take', 'core'], ['wow', 'knock', 'first', 'shot']]
[['tell', 'still', 'put', 'chinamen', 'jail', 'spit', 'laundri'], ['hey', 'tell', 'still', 'lock', 'chines', 'spit', 'underwear']]
[['see', 'soon', 'lord', 'war'], ['see', 'soon', 'lord', 'terror']]
[['case', 'fatal', 'shoot', 'man', 'name', 'roberto', 'flore', 'love', 'husband', 'father'], ['case', 'kill', 'man', 'name', 'roberto', 'flore', 'love', 'husband', 'father']]
[['tell', 'snart', 'hell', 'pyro', 'friend', 'gon', 'na', 'without', 'fanci', 'weapon'], ['tell', 'snart', 'proud', 'gun', 'without', 'fanci', 'gun']]
[['take', 'underwear', 'head'], ['take', 'slug', 'head']]
[['hid', 'basement', 'robert', 'tyson', 'rip', 'head'], ['hid', 'basement', 'robert', 'tyson', 'chase']]
[

[['continu', 'serv', 'even', 'though', 'master'], ['continu', 'serv', 'even', 'though', 'taller', 'vampir']]
[['never', 'trust', 'weirdo'], ['never', 'trust', 'stranger']]
[['tomorrow', 'walk', 'like', 'proud', 'faggot'], ['tomorrow', 'march', 'around', 'citi', 'proudli', 'like', 'real', 'man']]
[['thor', 'name', 'go'], ['hell', 'go']]
[['bitch', 'week'], ['hold', 'week']]
[['think', 'amaz'], ['think', 'hook']]
[['think', 'littl', 'semen', 'scream', 'eeeeee'], ['think', 'sperm', 'scream', 'eeeeee']]
[['schopenhau', 'got', 'noth', 'shit'], ['got', 'noth', 'schopenhau']]
[['fuck', 'dad', 'shop'], ['get', 'father', 'busi']]
[['take', 'silli', 'test', 'fill', 'form'], ['test', 'fill', 'form']]
[['whether', 'fire', 'quit', 'die'], ['bet', 'fire', 'gone', 'die']]
[['either', 'burn', 'hat', 'fire', 'use', 'blowtorch'], ['either', 'throw', 'cap', 'fire', 'use', 'burner']]
[['usual', 'like', 'warn', 'violat', 'demon', 'tongu'], ['usual', 'like', 'warn', 'get', 'rape', 'demon', 'tongu']]
[['hook

[['next', 'one', 'die'], ['die']]
[['come', 'back', 'pussi'], ['come', 'come', 'back', 'kitti']]
[['find', 'guilti', 'counselor', 'guilti', 'betray', 'fellow', 'man', 'guilti', 'betray', 'countri', 'guilti', 'abrog', 'oath', 'guilti', 'ofjudg', 'sell'], ['look', 'advocaci', 'guilti', 'betray', 'flimsi', 'guilti', 'betray', 'countri', 'guilti', 'violat', 'case', 'guilti', 'kill', 'sell']]
[['gay', 'put', 'us', 'across', 'road', 'uncl', 'jock', 'place'], ['gay', 'man', 'put', 'us', 'path', 'uncl', 'hous']]
[['know', 'mind', 'everyth', 'told', 'us', 'far', 'true'], ['crazi', 'know', 'crazi', 'told', 'us', 'true']]
[['let', 'robin', 'live', 'disgust', 'thing', 'think'], ['let', 'robin', 'live', 'heinou', 'thing', 'think']]
[['hypnosi', 'absurd', 'obsess'], ['hypnosi', 'absurd', 'obsess']]
[['shit', 'got', 'key'], ['got', 'key']]
[['viciou', 'boxer'], ['boxer', 'worst']]
[['saw', 'guy', 'class', 'get', 'two', 'inch', 'needl', 'stuck', 'arm', 'instructor', 'hypnosi', 'horseshit'], ['boy', 'c

[['said', 'liar'], ['lie']]
[['dear', 'suicid', 'club'], ['dear', 'dear', 'suicid', 'club']]
[['proceed', 'cours', 'farthest', 'remov', 'incom', 'regul', 'maintain', 'secreci', 'much', 'possibl'], ['start', 'whore', 'far', 'possibl', 'away', 'incom', 'regul', 'tri', 'keep', 'secret']]
[['anyth', 'foolish', 'henri'], ['make', 'play', 'henri']]
[['come', 'come'], ['come', 'damn']]
[['yeon', 'du', 'parent', 'yuri', 'min', 'su', 'parent', 'mental', 'disabl'], ['yeon', 'du', 'parent', 'yuri', 'min', 'su', 'parent', 'mental', 'ill']]
[['mind'], ['fool']]
[['caus'], ['dead']]
[['shit', 'take', 'money', 'go'], ['take', 'money', 'get']]
[['even', 'play', 'littl', 'goddamn'], ['play', 'littl', 'bit']]
[['shoot', 'first', 'think', 'later'], ['first', 'shoot', 'think']]
[['alreadi', 'kill', 'first', 'five', 'worri'], ['neutral', 'first', 'five', 'worri']]
[['bodi', 'fsh', 'regul', 'reproduct', 'function', 'women', 'stimul', 'product', 'egg', 'men', 'stimul', 'product', 'sperm', 'testicl'], ['fsh',

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[['sex'], ['sleep']]
[['held', 'ring', 'flower', 'gem', 'band', 'time', 'blood', 'stain', 'stone'], ['put', 'ring', 'gemston', 'ring', 'time', 'bloodi', 'stone']]
[['ask', 'play', 'alongsid', 'luca', 'uh', 'ass', 'crack', 'father', 'assist', 'coach'], ['ask', 'play', 'alongsid', 'luca', 'surviv', 'assist']]
[['wotan', 'butcher', 'hous', 'merove', 'recognis', 'death', 'assur'], ['wotan', 'murder', 'hous', 'merovinu', 'recogn', 'death', 'would', 'seal']]
[['damn', 'herb', 'mani', 'time', 'told', 'use', 'guest', 'towel'], ['mani', 'time', 'told', 'use', 'guest', 'towel']]
[['follow', 'damn', 'car'], ['follow', 'car']]
[['gay', 'lad', 'ira'], ['fag', 'ira']]
[['ball', 'recept', 'banquet', 'sort', 'thing'], ['ball', 'know', 'receptionist', 'banquet', 'like']]
[['take', 'love', 'hold', 'ador', 'breast'], ['take', 'love', 'take', 'love', 'bosom']]
[['went', 'hispan', 'guy'], ['crazi', 'hispan', 'boy']]
[['mayb', 'throw', 'littl', 'ceremoni', 'shag', 'bit', 'bit', 'confetti', 'nice', 'littl', 

KeyboardInterrupt: 

In [27]:
from itertools import chain
def yield_tokens(df):
    for _, sample in df.iterrows():
        yield list(chain.from_iterable(sample.to_list()[:2]))

In [24]:
!pip install torchtext

Collecting torchtext
  Obtaining dependency information for torchtext from https://files.pythonhosted.org/packages/fe/12/8e13cda45a66c4c06e110bf91372085feab8d89f12c367930063d320a11d/torchtext-0.16.0-cp310-cp310-win_amd64.whl.metadata
  Downloading torchtext-0.16.0-cp310-cp310-win_amd64.whl.metadata (7.5 kB)
Collecting torch==2.1.0 (from torchtext)
  Obtaining dependency information for torch==2.1.0 from https://files.pythonhosted.org/packages/fa/47/1a7daf04f40715fc1cdc6f1cc3200228a556d06c843e6ceb58883b745e1b/torch-2.1.0-cp310-cp310-win_amd64.whl.metadata
  Downloading torch-2.1.0-cp310-cp310-win_amd64.whl.metadata (24 kB)
Collecting torchdata==0.7.0 (from torchtext)
  Obtaining dependency information for torchdata==0.7.0 from https://files.pythonhosted.org/packages/00/6a/ba927e61bf25991f352a7e0b943ed1d2461f97d8f86531567c47dbe0d99f/torchdata-0.7.0-cp310-cp310-win_amd64.whl.metadata
  Downloading torchdata-0.7.0-cp310-cp310-win_amd64.whl.metadata (13 kB)
Downloading torchtext-0.16.0-cp31

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchaudio 2.0.0 requires torch==2.0.0, but you have torch 2.1.0 which is incompatible.
torchvision 0.15.0 requires torch==2.0.0, but you have torch 2.1.0 which is incompatible.


In [28]:
from torchtext.vocab import build_vocab_from_iterator

# Define special symbols and indices
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3
# Make sure the tokens are in order of their indices to properly insert them in vocab
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']


vocab = build_vocab_from_iterator(yield_tokens(train), specials=special_symbols)

In [40]:
sample = train['reference'][356049]
print(sample)

['brain', 'noth']


In [42]:
encoded = vocab(sample)
print(encoded)

[131, 128]


In [43]:
import torch
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
torch.manual_seed(42)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [44]:
train.head()

Unnamed: 0,reference,translation,similarity,lenght_diff,ref_tox,trn_tox
365268,[crap],[talk],0.65793,0.176471,0.999375,5.2e-05
263691,"[dalia, vacat, boyfriend, yacht, turk, caico, ...","[dali, vacat, boyfriend, yacht, turk, caico, s...",0.814529,0.172414,0.000288,0.992684
93104,"[shooter, collater, take]","[shooter, line, get]",0.679317,0.02439,0.886835,0.040617
155348,"[pathet, part, gon, na, work]","[sad, part, go, work]",0.778638,0.040816,0.979934,4.1e-05
102660,"[consid, possibl, reason, laid, last, decad, s...","[consid, possibl, reason, sulli, last, ten, ye...",0.619683,0.074074,5.4e-05,0.992573


In [None]:
def collate_batch(batch):
    ref_list, trn_list, sim_list, len_diff_list, ref_tox_list, trn_tox_list =
    [], [], [], [], [], []
    for ref, trn, sim, len_diff, ref_tox, trn_tox in batch:
        ref_list.append(torch.tensor(vocab(ref)))
        trn_list.append(torch.tensor(vocab(trn)))
        sim_list.append(sim)
        len_diff.append(len_diff)
        ref_tox_list.append(ref_tox)
        trn_tox_list.append(trn_tox)
    padded_refs = pad_sequence(ref_list, batch_first=True, padding_value=0)
    padded_trns = pad_sequence(trn_list, batch_first=True, padding_value=0)
    return torch.Tensor()