In [None]:
import csv

import numpy as np
import pandas as pd

## Process the predicted post-edit text output by T5

In [None]:
# Paths for the merge step: predictions and line ids are read line by line and
# paired by position; the paired rows are written to `outfile`.
t5_generated = "data/new_pol/ampol_generator_output_test.txt" # post-edit text predictions generated by generator model or e2e model
line_id_file = 'data/new_pol/ampol_test_LIDS.txt'   # file to keep track of line_ids, output by Make_jsonl.ipynb
outfile = 'data/new_pol/ampol_bleu_input.txt'  # temporary file used below

In [None]:
# Pair every predicted sentence with its line id (matched by line position) and
# write the pairs as a two-column, tab-separated file with a header row.
with open(t5_generated, 'r', encoding='utf-8') as t5f:
    decoded_sentences = [line.strip() for line in t5f]

with open(line_id_file, 'r', encoding='utf-8') as lidf:
    line_ids = [line.strip() for line in lidf]

# Validate BEFORE merging: a length mismatch means predictions and ids are
# misaligned, and pairing them anyway would attach text to the wrong line_id.
lcount = len(decoded_sentences)
rcount = len(line_ids)
assert lcount == rcount, (
    'prediction/line-id count mismatch: {} lines in {} vs {} lines in {}'.format(
        lcount, t5_generated, rcount, line_id_file))

# csv.writer quotes any field containing a tab, quote or newline, so the file
# parses back into exactly two columns when it is loaded with pd.read_csv.
# newline='' is what the csv module expects of the file object it writes to.
with open(outfile, 'w', newline='', encoding='utf-8') as outf:
    pair_writer = csv.writer(outf, delimiter='\t', lineterminator='\n')
    pair_writer.writerow(['decoded_sentence', 'line_id'])
    pair_writer.writerows(zip(decoded_sentences, line_ids))

## Compute metrics

In [None]:
# Paths for the metric step. `decoded_filename` is the same path the merge step
# writes to; `outfilename` receives one row of scores per line_id.
input_filename = "data/new_pol/ampol_test.tsv"
decoded_filename = 'data/new_pol/ampol_bleu_input.txt' # input is taken as output file from above
outfilename = 'data/new_pol/ampol_bleu.txt' # output file

In [None]:
import numpy as np
import random
# from https://stackoverflow.com/questions/70623867/can-every-solution-of-the-edit-distance-problem-be-shown-on-the-matrix-obtained
def get_min_edit_count(text1, text2, get_edited_words=False):
    """Word-level edit distance from ``text1`` to ``text2``.

    Both texts are split on whitespace and compared word by word. Inserting,
    deleting and replacing a word each cost 1; keeping a word costs 0.

    Args:
        text1: Source text (the words that get deleted / replaced / kept).
        text2: Target text (the words that get inserted).
        get_edited_words: When true, also return one minimum-cost edit path.

    Returns:
        The edit count as an ``int`` when ``get_edited_words`` is false.
        Otherwise a two-item list ``[edit_count, path]`` where ``path`` is a
        list, in reading order, of ``('INSERT', word)``, ``('DELETE', word)``,
        ``('SKIP', word)`` and ``('REPLACE', (old_word, new_word))`` tuples.

    Note:
        When several minimum-cost paths exist, one is picked with
        ``random.choice``, so the returned path (and anything derived from its
        words) can differ between runs unless ``random.seed`` is called first.
    """
    # Index 0 is a sentinel standing for "no words consumed yet".
    word1 = ['BEGIN'] + text1.strip().split()
    word2 = ['BEGIN'] + text2.strip().split()

    len_w1 = len(word1)
    len_w2 = len(word2)

    # cost[i][j]: minimum edits turning the first j words of text1 into the
    # first i words of text2.
    cost = [[0] * len_w1 for _ in range(len_w2)]
    # step[i][j]: (previous_i, previous_j, action) chosen for that cell.
    # Storing one back-pointer per cell instead of a full copied path keeps
    # the table quadratic rather than cubic in the sentence lengths.
    step = [[None] * len_w1 for _ in range(len_w2)]

    # First row: reach an empty target by deleting every source word.
    for j in range(1, len_w1):
        cost[0][j] = j
        step[0][j] = (0, j - 1, ('DELETE', word1[j]))

    # First column: build the target from an empty source by inserting.
    for i in range(1, len_w2):
        cost[i][0] = i
        step[i][0] = (i - 1, 0, ('INSERT', word2[i]))

    for i in range(1, len_w2):
        for j in range(1, len_w1):
            insert_cost = cost[i - 1][j] + 1
            delete_cost = cost[i][j - 1] + 1

            # Diagonal move: free when the words match, a replacement otherwise.
            if word1[j] != word2[i]:
                diagonal_cost = cost[i - 1][j - 1] + 1
                diagonal_action = ('REPLACE', (word1[j], word2[i]))
            else:
                diagonal_cost = cost[i - 1][j - 1]
                diagonal_action = ('SKIP', word1[j])

            edit_count = min(insert_cost, delete_cost, diagonal_cost)

            # Candidates are listed in the fixed order insert, delete,
            # diagonal so the random tie-break sees them in a stable order.
            candidates = [
                (insert_cost, (i - 1, j, ('INSERT', word2[i]))),
                (delete_cost, (i, j - 1, ('DELETE', word1[j]))),
                (diagonal_cost, (i - 1, j - 1, diagonal_action)),
            ]
            step_options = [move for move_cost, move in candidates
                            if move_cost == edit_count]

            cost[i][j] = edit_count
            step[i][j] = random.choice(step_options)

    if not get_edited_words:
        return cost[-1][-1]

    # Walk the back-pointers from the final cell to the origin, then reverse
    # so the actions read left to right.
    path = []
    i, j = len_w2 - 1, len_w1 - 1
    while step[i][j] is not None:
        i, j, action = step[i][j]
        path.append(action)
    path.reverse()

    return [cost[-1][-1], path]
    

In [None]:
# Load the source rows and the decoded predictions; both files are tab-separated.
df_inputs, df_decoded = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in (input_filename, decoded_filename)
)

In [None]:
# Inspect the decoded predictions as loaded (line_id is still a regular column here).
df_decoded

In [None]:
# Index both frames by line_id so they can be joined on it.
# This rebinds the same names: re-running the cell without reloading the frames
# fails, because 'line_id' is then the index and no longer a column.
df_inputs = df_inputs.set_index('line_id')
df_decoded = df_decoded.set_index('line_id')

In [None]:
# Preview the source rows after indexing by line_id.
df_inputs.head()

In [None]:
# Preview the decoded predictions after indexing by line_id.
df_decoded.head()

In [None]:
# Attach the source columns to each decoded row by matching on the shared
# line_id index. Column names present in both frames get 'b_' appended on the
# df_decoded side.
df = df_decoded.join(df_inputs,lsuffix='b_')
# Replace missing values with empty strings so missing text does not surface as NaN below.
df = df.fillna('')

In [None]:
import numpy as np
from nltk.translate.bleu_score import sentence_bleu

# Placeholder column on `df`; the per-row BLEU values computed below are written
# to `outfilename`, not stored in this column.
scores = np.zeros(df.shape[0])
df['Score'] = scores

bleu_header = ['line_id', 'article_title', 'user', 'revision_number', 'decoded',
               'revision_text', 'pre-edit', 'bleu_score', 'pre_len', 'pred_len', 'post_len']
bleu_failures = 0

# The handle is named `bleu_out` (not `outfile`) so it does not overwrite the
# `outfile` path string that the merge cell at the top of the notebook opens.
# csv.writer quotes fields containing tabs, quotes or newlines, so the file
# parses back into the same columns when it is reloaded with pd.read_csv.
with open(outfilename, 'w', newline='', encoding='utf-8') as bleu_out:
    bleu_writer = csv.writer(bleu_out, delimiter='\t', lineterminator='\n')
    bleu_writer.writerow(bleu_header)

    for index, row in df.iterrows():
        # str() guards against pandas having parsed a text cell as a number.
        decoded_text = str(row['decoded_sentence'])
        revision_text = str(row['revision text'])
        parent_text = str(row['parent text'])

        candidate = decoded_text.strip().split()
        reference = revision_text.strip().split()
        pre_edit = parent_text.strip().split()

        try:
            score = sentence_bleu([reference], candidate, (0.25, 0.25, 0.25, 0.25))
        except Exception as bleu_error:
            # Best effort: keep going and record NaN for this row, but do not
            # hide the failure (a bare `except:` would also swallow Ctrl-C).
            score = np.nan
            bleu_failures += 1
            if bleu_failures == 1:
                print('sentence_bleu failed for line_id ' + str(index) + ': ' + repr(bleu_error))

        bleu_writer.writerow([
            str(index),
            str(row['article_title']),
            str(row['user']),
            str(row['revision_number']),
            decoded_text,
            revision_text,
            parent_text,
            str(score),
            str(len(pre_edit)),
            str(len(candidate)),
            str(len(reference)),
        ])

if bleu_failures:
    print(str(bleu_failures) + ' of ' + str(df.shape[0]) + ' rows have a NaN bleu_score')

In [None]:
# average bleu scores

In [None]:
infilename = outfilename # read file we just outputted as dataframe
df_bleus = pd.read_csv(infilename, sep='\t')
df_bleus = df_bleus.set_index('line_id')
# Blank out missing values everywhere except bleu_score. Filling that column
# with '' would turn it into a string/float mix, and the mean() taken on it
# in the following cell would then fail; left as NaN, mean() just skips it.
non_score_columns = [column for column in df_bleus.columns if column != 'bleu_score']
df_bleus = df_bleus.fillna({column: '' for column in non_score_columns})

### Note: `bleu_score` is 4-gram BLEU computed across all words. It is NOT the metric from the paper and is NOT an appropriate metric for our dataset. See further below for the metric used in our paper.

In [None]:
# NOTE: this BLEU score is not a good metric for this dataset
df_bleus['bleu_score'].mean()

In [None]:
# Unigram-only BLEU weights: only single-word overlap counts.
beb_bleu_weights=(1.0, 0.0, 0.0, 0.0)


def _edited_words(edit_path):
    """Split an edit path into ``(added_words, removed_words)``.

    ``edit_path`` is the action list returned by ``get_min_edit_count`` with
    ``get_edited_words=True``. SKIP actions are ignored.
    """
    added = [payload for kind, payload in edit_path if kind == 'INSERT']
    removed = [payload for kind, payload in edit_path if kind == 'DELETE']
    # A REPLACE counts twice: the old word is removed and the new word is added.
    replaced = [payload for kind, payload in edit_path if kind == 'REPLACE']
    added += [new_word for _old_word, new_word in replaced]
    removed += [old_word for old_word, _new_word in replaced]
    return added, removed


def compute_CUBE(in_text, out_text, pred_text):
    """Score a predicted edit against the true edit.

    The true edit is the word-level edit path from ``in_text`` to ``out_text``;
    the predicted edit is the path from ``in_text`` to ``pred_text``. Unigram
    BLEU is computed separately over the added words and the removed words,
    and the two are averaged, weighted by how many words the true edit added
    and removed.

    Args:
        in_text: Pre-edit text.
        out_text: Reference post-edit text.
        pred_text: Predicted post-edit text.

    Returns:
        ``(score, addscore, remscore, n_true_added, n_true_removed)``.
        ``score`` is NaN when the true edit adds and removes no words, because
        the weighted average is undefined in that case.
    """
    trueED = get_min_edit_count(in_text, out_text, get_edited_words=True)
    predED = get_min_edit_count(in_text, pred_text, get_edited_words=True)

    trueWordsAdded, trueWordsRemoved = _edited_words(trueED[1])
    predWordsAdded, predWordsRemoved = _edited_words(predED[1])

    addscore = sentence_bleu([trueWordsAdded], predWordsAdded, weights=beb_bleu_weights, auto_reweigh=True)
    remscore = sentence_bleu([trueWordsRemoved], predWordsRemoved, weights=beb_bleu_weights, auto_reweigh=True)

    n_added = len(trueWordsAdded)
    n_removed = len(trueWordsRemoved)
    n_edited = n_added + n_removed
    if n_edited == 0:
        # No true edit to score against: avoid dividing by zero and mark the
        # row as undefined so a later mean() can skip it.
        score = float('nan')
    else:
        score = (addscore * n_added + remscore * n_removed) / float(n_edited)
    return score, addscore, remscore, n_added, n_removed

In [None]:
# Compute the CUBE metrics for every row, then attach them as float columns.
# Column order matches the tuple returned by compute_CUBE.
cube_columns = ['CUBE', 'CUBE_ins', 'CUBE_del', 'words_added', 'words_removed']
cube_results = []
total_rows = len(df_bleus.index)
counter = 0
for ridx, row in df_bleus.iterrows():
    counter += 1
    if counter % 100 == 0:
        print(str(counter) + ' of ' + str(total_rows))
    # Results are collected and read back from the same list, so a stale
    # variable left over from an earlier kernel session cannot leak in.
    cube_results.append(compute_CUBE(row['pre-edit'], row['revision_text'], row['decoded']))

# Assign by position rather than by index label so duplicate line_ids cannot
# overwrite each other's values.
for position, column in enumerate(cube_columns):
    df_bleus[column] = [float(result[position]) for result in cube_results]

In [None]:
# Mean combined CUBE score over all rows.
print(df_bleus['CUBE'].mean())

In [None]:
# Mean insertion score, restricted to rows whose true edit added at least one word.
print(df_bleus['CUBE_ins'][df_bleus['words_added'] > 0].mean())

In [None]:
# Mean deletion score, restricted to rows whose true edit removed at least one word.
print(df_bleus['CUBE_del'][df_bleus['words_removed'] > 0].mean())

In [None]:
# Write the frame, now including the CUBE columns, back over the BLEU file it was loaded from.
newbleuout = infilename
df_bleus.to_csv(newbleuout, sep='\t') # update same tsv file with CUBE metrics