## Splitting the data into train, validation, and test sets

In [2]:
import pandas as pd
import numpy as np
import warnings
import torch

# One shared seed for every random number generator used in this notebook,
# so that re-running it reproduces the same shuffle.
SEED = 420
torch.manual_seed(SEED)
np.random.seed(SEED)

# Suppress every warning raised from here on (applies notebook-wide).
warnings.filterwarnings('ignore')

In [3]:
# Load the raw tab-separated corpus.
data = pd.read_csv("../data/raw/filtered.tsv", sep="\t")

In [4]:
# Preview the first rows of the raw frame.
data.head()

Unnamed: 0.1,Unnamed: 0,reference,translation,similarity,lenght_diff,ref_tox,trn_tox
0,0,"If Alkar is flooding her with psychic waste, t...","if Alkar floods her with her mental waste, it ...",0.785171,0.010309,0.014195,0.981983
1,1,Now you're getting nasty.,you're becoming disgusting.,0.749687,0.071429,0.065473,0.999039
2,2,"Well, we could spare your life, for one.","well, we can spare your life.",0.919051,0.268293,0.213313,0.985068
3,3,"Ah! Monkey, you've got to snap out of it.","monkey, you have to wake up.",0.664333,0.309524,0.053362,0.994215
4,4,I've got orders to put her down.,I have orders to kill her.,0.726639,0.181818,0.009402,0.999348


In [13]:
# Positional cut-offs applied to the frame once it is ordered by `ref_tox`
# (ascending): rows before LOW_TOX_END are kept and swapped, rows from
# HIGH_TOX_START onwards are kept as they are, the band in between is dropped.
LOW_TOX_END = 200_000
HIGH_TOX_START = 300_000

sorted_by_ref_data = data.sort_values('ref_tox')

# Drop the middle band of sentences, which the author considers to have no
# properly determined toxicity.
part1 = sorted_by_ref_data.iloc[:LOW_TOX_END]
part2 = sorted_by_ref_data.iloc[HIGH_TOX_START:]

# In the low-`ref_tox` part, exchange reference <-> translation together with
# their toxicity scores. `rename` returns a new frame, so nothing is assigned
# onto a slice of `sorted_by_ref_data` (the original code did that and relied
# on warnings being silenced). Re-selecting the columns restores the original
# column order, which the swap of names would otherwise change.
swapped_names = {
    'reference': 'translation',
    'translation': 'reference',
    'ref_tox': 'trn_tox',
    'trn_tox': 'ref_tox',
}
part1 = part1.rename(columns=swapped_names)[sorted_by_ref_data.columns]

new_sorted = pd.concat([part1, part2], axis=0)

# Keep only the rows whose translation toxicity is below 0.3.
new_sorted = new_sorted[new_sorted.trn_tox < 0.3]

# The index is written on purpose: the next cell reads it back as the
# 'Unnamed: 0' column and renames it to 'id'.
new_sorted.to_csv('../data/internal/data.csv')

In [14]:
# Read the filtered pairs back from disk; the saved index comes back as the
# 'Unnamed: 0' column, which is exposed under the name 'id'.
data = (
    pd.read_csv('../data/internal/data.csv')
    .rename(columns={'Unnamed: 0': 'id'})
)

In [15]:
def train_validate_test_split(df, train_percent=.6, validate_percent=.2):
    """Randomly partition ``df`` into train, validation and test frames.

    Row positions are shuffled with NumPy's global random state (call
    ``np.random.seed`` beforehand for a reproducible split) and cut into
    three consecutive, non-overlapping pieces.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split. Its index may hold any labels; rows are selected
        by position only.
    train_percent : float, default 0.6
        Fraction of the rows (between 0 and 1) placed in the train frame.
    validate_percent : float, default 0.2
        Fraction of the rows (between 0 and 1) placed in the validation
        frame.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, validate, test)``. The test frame receives every row not
        assigned to the other two. Each frame keeps the index labels of
        ``df``.

    Raises
    ------
    ValueError
        If a fraction is negative or the two fractions sum to more than 1.
    """
    if train_percent < 0 or validate_percent < 0:
        raise ValueError("train_percent and validate_percent must be non-negative")
    # Small tolerance so a float sum that rounds just above 1 is not rejected.
    if train_percent + validate_percent > 1 + 1e-9:
        raise ValueError("train_percent + validate_percent must not exceed 1")

    m = len(df)
    # Shuffle row *positions* rather than index labels: the result is fed to
    # `.iloc`, so permuting `df.index` only worked for a default 0..m-1 index.
    perm = np.random.permutation(m)
    train_end = int(train_percent * m)
    validate_end = int(validate_percent * m) + train_end
    train = df.iloc[perm[:train_end]]
    validate = df.iloc[perm[train_end:validate_end]]
    test = df.iloc[perm[validate_end:]]
    return train, validate, test

# 60 % train, 20 % validation, remaining 20 % test.
train, val, test = train_validate_test_split(
    data,
    train_percent=0.6,
    validate_percent=0.2,
)

In [16]:
# Training split: print (rows, columns), then preview the first rows.
print(train.shape)
train.head()

(105262, 9)


Unnamed: 0,id,Unnamed: 0.1,id.1,reference,translation,similarity,lenght_diff,ref_tox,trn_tox
118383,53475,391785,391785,"God damn it, man, do you have a time machine?","Wow! Bummer, dude. You have a time machine?",0.607161,0.043478,0.999267,0.00033
95742,426615,317846,317846,Haven't you finished that damn thing yet?,you're not done with this yet?,0.686981,0.261905,0.998955,3.7e-05
152285,463407,181540,181540,"You bastard, that was my daddy's desk!","you rascal, that was my father's desk.",0.749138,0.0,0.999565,0.041223
13737,22096,329514,329514,"they're filming the movie, God damn it.","All right, they're making a movie.",0.628424,0.125,0.996077,7.3e-05
69114,409027,135813,135813,"Damn it, you saw him coming.","hell, you saw him coming.",0.941843,0.103448,0.998398,0.010818


In [17]:
# Validation split: print (rows, columns), then preview the first rows.
print(val.shape)
val.head()

(35087, 9)


Unnamed: 0,id,Unnamed: 0.1,id.1,reference,translation,similarity,lenght_diff,ref_tox,trn_tox
126540,446484,352381,352381,You know what problem I have with your fucking...,you know what my problem is with your little p...,0.826222,0.265957,0.999347,0.070026
144722,458520,135663,135663,"You're a smart motherfucker, huh?","you're smart, aren't you?",0.76701,0.235294,0.99951,0.000146
16668,373962,142468,142468,Two fucking days and nights.,two days and two nights.,0.930261,0.137931,0.996257,4.4e-05
128696,40792,121118,121118,"okay, okay, just shut up!","OK, OK, just pipe down",0.628026,0.115385,0.999369,0.000168
75579,413355,208694,208694,"You go, and every second you're down there, I ...","you're going to go, and every second you spend...",0.66161,0.022901,0.998561,0.00269


In [18]:
# Test split: print (rows, columns), then preview the first rows.
print(test.shape)
test.head()

(35088, 9)


Unnamed: 0,id,Unnamed: 0.1,id.1,reference,translation,similarity,lenght_diff,ref_tox,trn_tox
80847,416895,541102,541102,Do you think I've eaten that dick?,you think I ate him?,0.676477,0.4,0.99868,0.000167
56092,69038,145931,145931,I feel like I'm going to fuck with my foot in ...,I just really feel like I'm about to rattle th...,0.637634,0.148649,0.998026,0.000765
81136,417094,37525,37525,We wouldn't have to go back to stupidity forev...,"would we go back to ignorance forever? """,0.724614,0.196078,0.998686,0.002686
104924,176437,63850,63850,there's a fool in Yellowstone who's been right...,"I met this guy in Yellowstone, this crazy guy....",0.671008,0.300971,0.999093,0.02576
150881,164801,348737,348737,and then they send a fool like you to tell me ...,"And they send some yo-yo like you in here, to ...",0.662654,0.321429,0.999555,0.019775


In [19]:
# Write each split to its own CSV file (the index is written as well).
split_targets = [
    (train, "../data/internal/train.csv"),
    (val, "../data/internal/validation.csv"),
    (test, "../data/internal/test.csv"),
]
for split_frame, csv_path in split_targets:
    split_frame.to_csv(csv_path)