### Analyze Evaluated Backtranslation Dataset

In [13]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score

In [9]:
# ruen dataset
ruen_train = pd.read_csv('../../data/backtranslation/ruen_train_eval.txt', sep='\t')
ruen_dev = pd.read_csv('../../data/backtranslation/ruen_valid_eval.txt', sep='\t')
ruen_test = pd.read_csv('../../data/backtranslation/ruen_test_eval.txt', sep='\t')
ruen = pd.concat([ruen_train, ruen_dev, ruen_test])

In [10]:
# fren dataset
fren_train = pd.read_csv('../../data/backtranslation/fren_train_eval.txt', sep='\t')
fren_dev = pd.read_csv('../../data/backtranslation/fren_valid_eval.txt', sep='\t')
fren_test = pd.read_csv('../../data/backtranslation/fren_test_eval.txt', sep='\t')
fren = pd.concat([fren_train, fren_dev, fren_test])

In [11]:
# esen dataset
esen_train = pd.read_csv('../../data/backtranslation/esen_train_eval.txt', sep='\t')
esen_dev = pd.read_csv('../../data/backtranslation/esen_valid_eval.txt', sep='\t')
esen_test = pd.read_csv('../../data/backtranslation/esen_test_eval.txt', sep='\t')
esen = pd.concat([esen_train, esen_dev, esen_test])

##### Calculate Similarity Metrics

In [19]:
fren.sem_similarity.mean()

0.8579955021719157

In [18]:
ruen.sem_similarity.mean()

0.7984542970358184

In [20]:
esen.sem_similarity.mean()

0.8713989429817864

##### Calculate STA Metrics

In [22]:
ground_truth = np.zeros(len(ruen), dtype=int)
ground_truth

array([0, 0, 0, ..., 0, 0, 0])

In [27]:
accuracy_score(ground_truth, fren.preds.to_numpy()), len(fren[fren.preds == 0])

(0.06864073455320381, 10436)

In [28]:
accuracy_score(ground_truth, ruen.preds.to_numpy()), len(ruen[ruen.preds == 0])

(0.06809481840066299, 10353)

In [29]:
accuracy_score(ground_truth, esen.preds.to_numpy()), len(esen[esen.preds == 0])

(0.03660926873544772, 5566)

### Infer on generated dataset

In [None]:
model = load_checkpoint('../model/ft-robertoxic-classifier.pth', model=model)

Load checkpoint


In [None]:
train_bt = pd.read_csv('../data/backtranslation/ruen_train.txt', sep='\t', index_col=0)
dev_bt = pd.read_csv('../data/backtranslation/ruen_valid.txt', sep='\t', index_col=0)
test_bt = pd.read_csv('../data/backtranslation/ruen_test.txt', sep='\t', index_col=0)

In [None]:
train_bt.rename(columns={'translated': 'backtranslate', 'original': 'translate'}, inplace=True)
train_bt = train_bt[['source', 'translate', 'backtranslate']]

dev_bt.rename(columns={'translated': 'backtranslate', 'original': 'translate'}, inplace=True)
dev_bt = dev_bt[['source', 'translate', 'backtranslate']]

test_bt.rename(columns={'translated': 'backtranslate', 'original': 'translate'}, inplace=True)
test_bt = test_bt[['source', 'translate', 'backtranslate']]

In [None]:
bt_toxics = DatasetDict({
    'train': Dataset.from_pandas(train_bt[['backtranslate']]),
    'validation': Dataset.from_pandas(dev_bt[['backtranslate']]),
    'test': Dataset.from_pandas(test_bt[['backtranslate']])
})

In [None]:
def tokenize_data(batch):
    comment = batch['backtranslate']
    
    tokenized = tokenizer(
        comment,
        truncation=True,
        max_length=128,
        padding='max_length'
    )

    return tokenized

In [None]:
bt_toxics_tokenized = bt_toxics.map(tokenize_data, batched=True)
bt_toxics_tokenized

  0%|          | 0/50 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['backtranslate', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 50000
    })
    validation: Dataset({
        features: ['backtranslate', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 2455
    })
    test: Dataset({
        features: ['backtranslate', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 3693
    })
})

In [None]:
bt_toxics_tokenized.set_format('torch')

In [None]:
trainbtloader = DataLoader(bt_toxics_tokenized['train'], shuffle=False, batch_size=BATCH_SIZE)
devbtloader = DataLoader(bt_toxics_tokenized['validation'], shuffle=False, batch_size=BATCH_SIZE)
testbtloader = DataLoader(bt_toxics_tokenized['test'], shuffle=False, batch_size=BATCH_SIZE)

In [None]:
trainpreds = []

for batch in tqdm(trainbtloader):
    batch = {k: v.to(DEVICE) for k, v in batch.items() if k in ['input_ids', 'attention_mask']}
    out = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])
    
    logits = out.logits.squeeze().detach().cpu()
    probs = torch.sigmoid(logits)
    preds = torch.argmax(probs, dim=1).numpy()
    
    
    trainpreds.extend(preds.tolist())

  0%|          | 0/782 [00:00<?, ?it/s]

In [None]:
devpreds = []

for batch in tqdm(devbtloader):
    batch = {k: v.to(DEVICE) for k, v in batch.items() if k in ['input_ids', 'attention_mask']}
    out = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])
    
    logits = out.logits.squeeze().detach().cpu()
    probs = torch.sigmoid(logits)
    preds = torch.argmax(probs, dim=1).numpy()
    
    devpreds.extend(preds.tolist())

  0%|          | 0/39 [00:00<?, ?it/s]

In [None]:
testpreds = []

for batch in tqdm(testbtloader):
    batch = {k: v.to(DEVICE) for k, v in batch.items() if k in ['input_ids', 'attention_mask']}
    out = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])
    
    logits = out.logits.squeeze().detach().cpu()
    probs = torch.sigmoid(logits)
    preds = torch.argmax(probs, dim=1).numpy()
    
    testpreds.extend(preds.tolist())

  0%|          | 0/58 [00:00<?, ?it/s]

In [None]:
train_bt['bt_preds'] = trainpreds
dev_bt['bt_preds'] = devpreds
test_bt['bt_preds'] = testpreds

In [None]:
train_bt.head()

Unnamed: 0,source,translate,backtranslate,bt_preds
0,take your admin powers and stick them up your ...,Бери свои административные способности и засун...,Take your administrative skills and shove them...,1
1,you just made a fool of yourself .,Ты только что выставил себя дураком.,You just made a fool of yourself.,1
2,this animal killer is paying his share .,Этот зверь-убийца платит свою долю.,This killer animal is paying its share.,1
3,so you voted for the loser ?,"Значит, вы проголосовали за неудачника?",So you voted for the loser?,1
4,", but we would not expect a bias lib troll lik...",", но мы бы не ожидали, что такой пристрастный ...",but we wouldn't expect a biased troll like you...,1


In [None]:
def cosine_similarity(x, y):
    return np.dot(x, y) / (np.sqrt(np.dot(x, x)) * np.sqrt(np.dot(y, y)))

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(vocabulary=tokenizer.vocab)
train_trans_vec = tfidf.fit_transform(train_bt['backtranslate']).toarray()
train_source_vec = tfidf.fit_transform(train_bt['source']).toarray()

dev_trans_vec = tfidf.fit_transform(dev_bt['backtranslate']).toarray()
dev_source_vec = tfidf.fit_transform(dev_bt['source']).toarray()

test_trans_vec = tfidf.fit_transform(test_bt['backtranslate']).toarray()
test_source_vec = tfidf.fit_transform(test_bt['source']).toarray()

  "Upper case characters found in"
  "Upper case characters found in"
  "Upper case characters found in"
  "Upper case characters found in"
  "Upper case characters found in"
  "Upper case characters found in"


In [None]:
train_sim = []
dev_sim = []
test_sim = []

for trans, src in tqdm(zip(train_trans_vec, train_source_vec)):
    train_sim.append(cosine_similarity(trans, src))
    
for trans, src in tqdm(zip(dev_trans_vec, dev_source_vec)):
    dev_sim.append(cosine_similarity(trans, src))
    
for trans, src in tqdm(zip(test_trans_vec, test_source_vec)):
    test_sim.append(cosine_similarity(trans, src))

0it [00:00, ?it/s]

  


0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [None]:
train_bt['similarity'] = train_sim
dev_bt['similarity'] = dev_sim
test_bt['similarity'] = test_sim

In [None]:
train_bt_selected = train_bt[(train_bt.bt_preds == 0) & (train_bt.similarity <= .725)]
dev_bt_selected = dev_bt[(dev_bt.bt_preds == 0) & (dev_bt.similarity <= .725)]
test_bt_selected = test_bt[(test_bt.bt_preds == 0) & (test_bt.similarity <= .725)]

In [None]:
df_paraphrase_ref = pd.concat([train_bt_selected, dev_bt_selected, test_bt_selected])
df_paraphrase_ref.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2081 entries, 40 to 3624
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   source         2081 non-null   object 
 1   translate      2081 non-null   object 
 2   backtranslate  2081 non-null   object 
 3   bt_preds       2081 non-null   int64  
 4   similarity     2081 non-null   float64
dtypes: float64(1), int64(1), object(3)
memory usage: 97.5+ KB


In [None]:
df_paraphrase_ref.to_csv('../data/paraphrase/paraphrase_ref.csv', sep='\t', index=False, header=True)

In [None]:
train_bt.to_csv('../data/paraphrase/train_prep_paraphrase.txt', sep='\t', index=False, header=True)
dev_bt.to_csv('../data/paraphrase/valid_prep_paraphrase.txt', sep='\t', index=False, header=True)
test_bt.to_csv('../data/paraphrase/test_prep_paraphrase.txt', sep='\t', index=False, header=True)

##### Calculate semantic similarity

In [None]:
stmodel = SentenceTransformer('all-MiniLM-L6-v2')
stmodel = stmodel.to(DEVICE)

In [None]:
df_paraphrase_ref = pd.read_csv('../data/paraphrase/paraphrase_ref.csv', sep='\t')
train_bt = pd.read_csv('../data/paraphrase/train_prep_paraphrase.txt', sep='\t')
dev_bt = pd.read_csv('../data/paraphrase/valid_prep_paraphrase.txt', sep='\t')
test_bt = pd.read_csv('../data/paraphrase/test_prep_paraphrase.txt', sep='\t')

In [None]:
class SemanticDataset(Dataset):
    def __init__(self, path, base_dir='../data/paraphrase/'):
        
        self.data_list = []
        
        self.path = os.path.join(base_dir, path)
        
        data = pd.read_csv(self.path, sep='\t')
        for row in tqdm(data.iterrows()):
            self.data_list.append({
                'source': row[1].source,
                'backtrans': row[1].backtranslate
            })
            
    def __len__(self):
        return len(self.data_list)
    
    def __getitem__(self, item):
        return self.data_list[item]

In [None]:
trainbt_dataset = SemanticDataset('train_prep_paraphrase.txt')
devbt_dataset = SemanticDataset('valid_prep_paraphrase.txt')
testbt_dataset = SemanticDataset('test_prep_paraphrase.txt')

trainbt_loader = DataLoader(trainbt_dataset, batch_size=BATCH_SIZE, shuffle=False)
devbt_loader = DataLoader(devbt_dataset, batch_size=BATCH_SIZE, shuffle=False)
testbt_loader = DataLoader(testbt_dataset, batch_size=BATCH_SIZE, shuffle=False)

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [None]:
train_stsim = []

for batch in tqdm(trainbt_loader):
    source = batch['source']
    backtrans = batch['backtrans']
    src_embed = stmodel.encode(source, convert_to_tensor=True)
    bts_embed = stmodel.encode(backtrans, convert_to_tensor=True)
    scores = util.cos_sim(src_embed, bts_embed)
    
    for i in range(len(source)):
        train_stsim.append(scores[i][i].item())
        
    torch.cuda.empty_cache()
    del src_embed
    del bts_embed

  0%|          | 0/782 [00:00<?, ?it/s]

In [None]:
dev_stsim = []

for batch in tqdm(devbt_loader):
    source = batch['source']
    backtrans = batch['backtrans']
    src_embed = stmodel.encode(source, convert_to_tensor=True)
    bts_embed = stmodel.encode(backtrans, convert_to_tensor=True)
    scores = util.cos_sim(src_embed, bts_embed)
    
    for i in range(len(source)):
        dev_stsim.append(scores[i][i].item())
        
    torch.cuda.empty_cache()
    del src_embed
    del bts_embed

  0%|          | 0/39 [00:00<?, ?it/s]

In [None]:
test_stsim = []

for batch in tqdm(testbt_loader):
    source = batch['source']
    backtrans = batch['backtrans']
    src_embed = stmodel.encode(source, convert_to_tensor=True)
    bts_embed = stmodel.encode(backtrans, convert_to_tensor=True)
    scores = util.cos_sim(src_embed, bts_embed)
    
    for i in range(len(source)):
        test_stsim.append(scores[i][i].item())
        
    torch.cuda.empty_cache()
    del src_embed
    del bts_embed

  0%|          | 0/58 [00:00<?, ?it/s]

In [None]:
train_bt['sem_similarity'] = train_stsim
dev_bt['sem_similarity'] = dev_stsim
test_bt['sem_similarity'] = test_stsim

In [None]:
train_bt.head()

Unnamed: 0,source,translate,backtranslate,bt_preds,similarity,sem_similarity
0,take your admin powers and stick them up your ...,Бери свои административные способности и засун...,Take your administrative skills and shove them...,1,0.654044,0.797628
1,you just made a fool of yourself .,Ты только что выставил себя дураком.,You just made a fool of yourself.,1,0.999817,1.0
2,this animal killer is paying his share .,Этот зверь-убийца платит свою долю.,This killer animal is paying its share.,1,0.9032,0.950419
3,so you voted for the loser ?,"Значит, вы проголосовали за неудачника?",So you voted for the loser?,1,0.999822,1.0
4,", but we would not expect a bias lib troll lik...",", но мы бы не ожидали, что такой пристрастный ...",but we wouldn't expect a biased troll like you...,1,0.425269,0.828127


In [None]:
train_bt_selected = train_bt[(train_bt.bt_preds == 0) & (train_bt.sem_similarity < 0.48)]
dev_bt_selected = dev_bt[(dev_bt.bt_preds == 0)]
test_bt_selected = test_bt[(test_bt.bt_preds == 0)]

In [None]:
df_paraphrase_ref = pd.concat([train_bt_selected, dev_bt_selected, test_bt_selected])
df_paraphrase_ref.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3233 entries, 6 to 3624
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   source          3233 non-null   object 
 1   translate       3233 non-null   object 
 2   backtranslate   3233 non-null   object 
 3   bt_preds        3233 non-null   int64  
 4   similarity      3206 non-null   float64
 5   sem_similarity  3233 non-null   float64
dtypes: float64(2), int64(1), object(3)
memory usage: 176.8+ KB


In [None]:
df_paraphrase_ref[df_paraphrase_ref['similarity'].isnull()]

Unnamed: 0,source,translate,backtranslate,bt_preds,similarity,sem_similarity
633,obama has defecated on israel repeatedly .,Обама неоднократно обезвоживала Израэля.,Obama dehydrated Israel repeatedly.,0,,0.71621
830,bla bla bla you can spin yourself stupid all y...,"Бла-бла-бла, ты можешь крутить себя глупо, ско...","Blah, blah, blah, blah, blah, blah, blah, blah...",0,,0.244442
1216,waah screams the butthurt commie loser !,"Ваа кричит задница, музыка-неудачник!","Whoa, whoa, whoa, whoa, whoa, whoa, whoa, whoa...",0,,0.228463
1744,ive had it up to here with idiots who want poo...,.................................................,.................................................,0,,0.138859
6501,"paul martin , stephen harper , brian mulroney ...","Пол Мартин, Стефен Харпер, Брайан Мулрони (он ...","Paul Martin, Stephen Harper, Brian Mulroni.",0,,0.813203
6745,muslim ban rejected by DIGIT ororfederal court...,)................................................,(i.e. ),0,,0.066378
8084,"recycled , retread , useless lifers .","Рециркуляция, восстановление протектора, беспо...","Recirculation, retreading, useless life-saving...",0,,0.617553
10520,slang a man who is unpleasant or not to be tru...,:................................................,:...............................,0,,0.097186
11426,"ulysses simpson grant , the drunkard !","Улисс Симпсон Грант, пьяница!","Ulysses Simpson Grant, drunk!",0,,0.933305
11873,i dont refute unhinged blather .,Я не опровергаю бессердечный блейтер.,I'm not denying a heartless blather.,0,,0.646244
