In [1]:
import numpy as np
from nltk.tokenize import word_tokenize
import pandas as pd
import pickle
from tqdm import tqdm_notebook as tqdm
import nltk.translate.bleu_score as bleu
from difflib import SequenceMatcher

This notebook was used to explore BLEU scoring for the results of the OpenNMT models (all three of them).

# import data

In [2]:
all_df = pd.read_pickle('data/opennmt/rule_based_corrected_df.pkl')
all_df = all_df[all_df['Dataset'] == 'test'].drop("Dataset", axis=1).reset_index()
results_df = pd.read_pickle('data/nmt_results.pkl')[['Source', 'Prediction', 'Copy Prediction', 'No Entity Copy Prediction']]
df = pd.concat([all_df, results_df], axis=1, join_axes=[results_df.index]).drop("Source", axis=1)
df = df.drop("Category", axis=1)
df.head()

Unnamed: 0,index,Original,Target 0,Target 1,Target 2,Target 3,Prediction,Copy Prediction,No Entity Copy Prediction
0,0,I mean that you have to really be her friend.,And I mean Really be her friend.,Just be her BFF 4 real.,you have to be her friend.,"You have to actually be her friend, for real.",I mean you have to really be her friend .,I mean you have to really be her friend .,I mean you have to be her friend .
1,1,Are you posing a rhetorical question?,Sounds like a rhetorical question :),Do you really want an answer?,That sounds more like a rhetorical question th...,Are you asking me a rhetorical question?,What kind of question is that ?,What are you asking a question ?,What kind of question is that ?
2,2,Men pretend to love in order to have intercour...,"Men play at love to get sex, women play at sex...","Men fake love to get laid, women fake orgasms ...","Guys PRETEND to love so they can get laid, wom...",Dudes just act like they love a chick to get b...,"Men play love to have sex , women play for sex...","Men pretend to love in order to have sex , wom...","Men play to love in order to have sex , women ..."
3,3,I do not intend to be mean.,I don't want to be mean.,I wasn't trying to be a jerk.,I'm not tryin to be mean...,I didn't want to be mean,I do n't mean to be mean .,I do n't mean to be mean .,I do n't mean to be mean .
4,4,I would estimate an average of 45% initially b...,On average I'd say about 45% at first but than...,"It's a little less than 50/50 at the start, bu...",Prolly 45% at the start but when you get to no...,"I guess it'd be around 45% to start with, but ...","I would say a 15 % of 45 % , then once you get...","45 % of them , but once you get to know the pe...",45 % of 45 % and then after you know the perso...


In [46]:
for idx, row in df.sample(1).iterrows():
    print('Source:\t\t' + row['Original'])
    print('Target:\t\t' + row['Target 0'])
    print('Pred:\t\t' + str(row['Prediction']))
    print('Copy Pred:\t' + row['Copy Prediction'])
    print('No ENT Pred:\t' + row['No Entity Copy Prediction'])

Source:		Most importantly, be genuine.
Target:		Lol most important... just be yourself.
Pred:		But most of all be real .
Copy Pred:	But most of all , be real !
No ENT Pred:	Most of all , be true , be true !


In [49]:
for idx, row in df.iterrows():
    if 'Putin' in row['Prediction']:
        print('Source:\t\t' + row['Original'])
        print('Target:\t\t' + row['Target 0'])
        print('Pred:\t\t' + str(row['Prediction']))
        print('Copy Pred:\t' + row['Copy Prediction'])
        print('No ENT Pred:\t' + row['No Entity Copy Prediction'])
        print()

Source:		Determination is one of the gifts we can give ourselves.
Target:		Its one of the gifts we can always give ourselves.
Pred:		Putin is one of the gifts we can give us selves .
Copy Pred:	Determination and mabey one of the gifts we can give ourselves
No ENT Pred:	Like one of the gifts we can give ourselves .

Source:		When you are wither her, just be yourself.
Target:		Just be yourself when you are with her.
Pred:		Putin , just be your self
Copy Pred:	When you are typin ' her , just be yourself .
No ENT Pred:	When you are in her mind .

Source:		Continuously attempt to impress her.
Target:		Never stop trying to impress her .
Pred:		Putin ... try to impress her .
Copy Pred:	Or try to impress her .
No ENT Pred:	Continuously try to impress her .

Source:		Marraige is about trust and communication.
Target:		Marriage is really about trusting & communicating
Pred:		Putin is about trust and communication .
Copy Pred:	Is it about trust and communication
No ENT Pred:	Marraige and about tr

In [None]:
df.to_excel('data/opennmt/results.xlsx', index = None, engine='xlsxwriter')

# BLEU

Here, I confirmed that BLEU scoring is not an appropriate evaluation metric for this task. You can see below that BLEU scores become significantly better when all 4 target options are included, because more n-grams are covered between the four options. The crux of the problem is that as the data is all within the same language, the sentences are too similar to begin with for BLEU to meaningfully capture the similarities that matter.

In [21]:
test_tgt0 = list(all_df['Target 0'])
test_tgt1 = list(all_df['Target 1'])
test_tgt2 = list(all_df['Target 2'])
test_tgt3 = list(all_df['Target 3'])
assert(len(test_tgt0) == len(test_tgt1) == len(test_tgt2) == len(test_tgt3))

ref = []
for idx in range(len(test_tgt0)):
    ref.append([test_tgt0[idx], test_tgt1[idx], test_tgt2[idx], test_tgt3[idx]])
    
base_hyp = list(results_df['Prediction'])
copy_hyp = list(results_df['Copy Prediction'])
copynoent_hyp = list(results_df['No Entity Copy Prediction'])

In [24]:
print('BLEU scores using all 4 targets.')
#print('Base:\t' + str(bleu.corpus_bleu(ref, base_hyp)))
print('Copy:\t' + str(bleu.corpus_bleu(ref, copy_hyp)))
print('CopyNoEnt:\t' + str(bleu.corpus_bleu(ref, copynoent_hyp)))

BLEU scores using all 4 targets.
Copy:	0.7493827175758517
CopyNoEnt:	0.7524875434593201


In [9]:
print('BLEU scores using only one of 4 targets.')
print(bleu.corpus_bleu([[line] for line in test_tgt0], hyp))
print(bleu.corpus_bleu([[line] for line in test_tgt1], hyp))
print(bleu.corpus_bleu([[line] for line in test_tgt2], hyp))
print(bleu.corpus_bleu([[line] for line in test_tgt3], hyp))

BLEU scores using only one of 4 targets.
0.5238716701257187
0.4483937207801788
0.44976733715218586
0.45597632782422576


In [10]:
# how close are the predicted to the original?
test_orig = list(results_df['Source'])
print('BLEU score of source data to prediction')
print(bleu.corpus_bleu([[line] for line in test_orig], hyp))

BLEU score of source data to prediction
0.6206195843828992


In [11]:
print(bleu.corpus_bleu(ref, [line for line in test_orig]))

0.6995504029544203


# Similarity Scores 

In [12]:
def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()

In [14]:
similar("I mean that you have to really be her friend.", "And I mean Really be her friend.")

0.7012987012987013