In [2]:
import spacy
# Load the small English spaCy pipeline, leaving out every component named in `exclude`.
# NOTE(review): presumably only tokenization is needed (get_tokens reads just token text
# and trailing whitespace) — confirm before re-enabling any component.
nlp = spacy.load("en_core_web_sm", exclude=['tagger', 'parser', 'ner', 'lemmatizer', 'textcat', 'custom'])

import numpy as np
from tqdm.auto import tqdm

from difflib import SequenceMatcher
from collections import Counter

def get_tokens(doc):
    """Flatten a tokenized document into a list of text pieces.

    Each token contributes its ``text``; when its ``whitespace_`` is non-empty,
    that whitespace follows as a separate element. Joining the returned list
    with ``""`` therefore concatenates token texts and trailing whitespace in
    document order.
    """
    pieces = []
    for tok in doc:
        word, trailing = tok.text, tok.whitespace_
        pieces.extend((word, trailing) if len(trailing) else (word,))
    return pieces

def make_changes(nlp, source_sentence, target_sentences=[], min_count=2, debug=False):
    """Merge several rewrites of one sentence by voting on token-level edits.

    Every target sentence is diffed against the source on the token lists
    produced by ``get_tokens`` (token texts and trailing whitespace as separate
    items). Each non-equal diff opcode becomes an action tuple:

    * ``('replace', source_start, source_end, replacement_text)``
    * ``('delete', source_start, source_end)``
    * ``('insert', source_start, inserted_text)``

    Actions proposed identically by at least ``min_count`` targets are applied
    to the source in order of their start token.

    Args:
        nlp: Callable turning a string into an iterable accepted by
            ``get_tokens``.
        source_sentence: Original sentence; converted with ``str()``.
        target_sentences: Iterable of candidate rewrites, each converted with
            ``str()``. The default list is only iterated, never mutated.
        min_count: Minimum number of targets that must propose the same action.
        debug: When true, print all collected actions, the accepted actions and
            any accepted action that had to be skipped because of an overlap.

    Returns:
        str: The source text with the accepted actions applied. When no action
        is accepted, the source tokens joined back together.
    """
    source_tokens = get_tokens(nlp(str(source_sentence)))
    target_docs_tokens = [get_tokens(nlp(str(sent))) for sent in target_sentences]

    all_actions = []
    for target_tokens in target_docs_tokens:
        matcher = SequenceMatcher(None, source_tokens, target_tokens)
        for tag, src_start, src_end, tgt_start, tgt_end in matcher.get_opcodes():
            if tag == 'replace':
                all_actions.append(
                    ('replace', src_start, src_end, "".join(target_tokens[tgt_start:tgt_end]))
                )
            elif tag == 'delete':
                all_actions.append(('delete', src_start, src_end))
            elif tag == 'insert':
                all_actions.append(
                    ('insert', src_start, "".join(target_tokens[tgt_start:tgt_end]))
                )

    # Keep the actions enough targets agree on, ordered by source start token.
    # The sort is stable, so ties keep the order in which they were first seen.
    good_actions = [k for k, v in Counter(all_actions).items() if v >= min_count]
    good_actions.sort(key=lambda x: x[1])

    if debug:
        print("All actions", all_actions)
        print("Good actions", good_actions)

    pieces = []
    previous_end = 0  # index of the first source token not yet consumed
    for action in good_actions:
        kind, current_start = action[0], action[1]

        # Opcodes from one diff never overlap, but actions accepted from
        # different targets can (e.g. min_count=1 or many targets). Applying an
        # action that starts inside an already consumed span would emit both
        # edits and could re-emit source tokens, so it is dropped.
        if current_start < previous_end:
            if debug:
                print("Skipping overlapping action", action)
            continue

        # Copy the untouched source tokens that precede this action.
        pieces.append("".join(source_tokens[previous_end:current_start]))
        if kind == 'replace':
            pieces.append(action[3])
            previous_end = action[2]
        elif kind == 'delete':
            previous_end = action[2]
        elif kind == 'insert':
            pieces.append(action[2])
            previous_end = current_start  # an insert consumes no source tokens

    pieces.append("".join(source_tokens[previous_end:]))
    return "".join(pieces)

In [3]:
import os
import glob
def read_lines(fn):
    """Read a UTF-8 text file and return its lines without newline characters.

    A missing file yields an empty list. The text is split on ``"\\n"``; the
    single empty string produced by a trailing newline (or by an empty file)
    is dropped, while any other empty lines are kept.
    """
    if os.path.exists(fn):
        with open(fn, 'r', encoding='utf-8') as handle:
            rows = handle.read().split("\n")
        return rows[:-1] if rows[-1] == '' else rows
    return []

def write_lines(fn, lines, mode='w'):
    """Write ``lines`` to ``fn`` as UTF-8 text, separated by ``"\\n"``.

    No newline is added after the last line. ``mode`` is passed straight to
    ``open`` (e.g. ``'w'`` to overwrite, ``'a'`` to append).
    """
    # Build the full text first so a failing join never touches the file.
    joined = "\n".join(lines)
    with open(fn, mode=mode, encoding='utf-8') as out:
        out.write(joined)

In [4]:
# File with the uncorrected source sentences (the path suggests the W&I+LOCNESS dev split — confirm).
source_path = "../data_parallel/wi+locness/dev_src"
# read_lines returns an empty list when the file is missing, so a wrong path fails silently here.
source_sentences = read_lines(source_path)

In [5]:
# Directory with the per-model prediction files; merged outputs are written here too.
# The trailing slash matters: file names are appended by plain string concatenation.
folder_path = "../predicts/"
#sorted(os.listdir(path))

In [96]:
#sorted(os.listdir(folder_path))

In [6]:
# Earlier ensembles, kept for reference:
# predicts = [
#     'Exp_037_roberta_large_st3_epoch_1.txt',
#     'Exp_038_deberta_large_st3_epoch_0.txt',
#     'Exp_039_xlnet_large_st3_epoch_2.txt'
# ]

# predicts = [
#     'Exp_043_roberta_large_ac_0.05_mep_0.6.txt',
#     'Exp_044_deberta_large_ac_0.4_mep_0.55.txt',
#     'Exp_045_xlnet_large_ac_0.3_mep_0.6.txt'
# ]

# Prediction files of the three models whose outputs are merged below.
predicts = [
    'Exp_043_roberta_large_ac_0.2_mep_0.6.txt',
    'Exp_044_deberta_large_ac_0.2_mep_0.6.txt',
    'Exp_045_xlnet_large_ac_0.3_mep_0.55.txt'
]

# One list of predicted sentences per model, in the same order as `predicts`.
pred_texts = [read_lines(folder_path + file_name) for file_name in predicts]

In [7]:
# Stack the per-model predictions so that column i holds every model's version of sentence i.
pred_texts = np.array(pred_texts)

# Merge sentence by sentence: an edit is kept when at least two models propose it.
sent_after_merge = []
for i in tqdm(range(len(source_sentences))):
    sent_after_merge.append(
        make_changes(
            nlp,
            source_sentences[i],
            target_sentences=pred_texts[:, i],
            min_count=2,
            debug=False,
        )
    )

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4384.0), HTML(value='')))




In [8]:
# Plain Python strings of the merged sentences, ready to be written to disk.
pred_texts = list(map(str, sent_after_merge))

In [9]:
# Number of merged sentences about to be written.
len(pred_texts)

4384

In [10]:
# Save the merged sentences (newline-separated) into the predictions directory.
write_lines(folder_path+"pred_roberta_deberta_xlnet_after_tweeks_ac_023_mep_0655.txt", pred_texts, mode='w')

In [11]:
# Base name shared by the merged prediction file and its ERRANT .m2 file.
output_part = "pred_roberta_deberta_xlnet_after_tweeks_ac_023_mep_0655"

# Print the two ERRANT commands to run by hand; their paths are written without
# the "../" prefix used elsewhere in this notebook.
print(f"errant_parallel -orig data_parallel/wi+locness/dev_src -cor predicts/{output_part}.txt -out evaluation/{output_part}.m2")
print()
print(f"errant_compare -hyp evaluation/{output_part}.m2 -ref data_m2/wi+locness/ABCN.dev.gold.bea19.m2")

errant_parallel -orig data_parallel/wi+locness/dev_src -cor predicts/pred_roberta_deberta_xlnet_after_tweeks_ac_023_mep_0655.txt -out evaluation/pred_roberta_deberta_xlnet_after_tweeks_ac_023_mep_0655.m2

errant_compare -hyp evaluation/pred_roberta_deberta_xlnet_after_tweeks_ac_023_mep_0655.m2 -ref data_m2/wi+locness/ABCN.dev.gold.bea19.m2


In [104]:
def generate_merge(predicts):
    """Merge three prediction files into one file by majority vote.

    ``predicts`` holds four file names relative to ``folder_path``: the first
    three are read as model predictions and the fourth is the output file name.
    Uses the module-level ``folder_path``, ``source_sentences`` and ``nlp``.
    """
    # Rows are models, columns are sentence positions.
    candidates = np.array([read_lines(folder_path + name) for name in predicts[:3]])

    merged_lines = []
    for row in tqdm(range(len(source_sentences))):
        merged_sentence = make_changes(
            nlp,
            source_sentences[row],
            target_sentences=candidates[:, row],
            min_count=2,
            debug=False,
        )
        merged_lines.append(str(merged_sentence))

    write_lines(folder_path + predicts[3], merged_lines, mode='w')

In [106]:
def process_batch(combos):
    """Run ``generate_merge`` on every file-name combination in ``combos``, in order."""
    for file_names in combos:
        generate_merge(file_names)

In [105]:
from multiprocessing import Pool, cpu_count

In [110]:
# Split the list of file-name combinations into 10 chunks, one per worker batch.
# NOTE: all_comb is built in a cell that appears further down in this notebook; run that cell first.
chunks = np.array_split(all_comb, 10)

In [None]:
# pool = Pool(10)
# result_map = pool.map(process_batch, chunks)
# pool.close()
# pool.join()

### Try to find the best combination after tweaks

In [42]:
# Names of all files in the predictions directory (the same location as folder_path).
all_preds = os.listdir("../predicts/")

In [49]:
# Group the prediction files by experiment id. Going by the file names used in this
# notebook, Exp_043 is RoBERTa, Exp_044 is DeBERTa and Exp_045 is XLNet.
xlnet_preds = sorted(name for name in all_preds if "Exp_045" in name)
roberta_preds = sorted(name for name in all_preds if "Exp_043" in name)
deberta_preds = sorted(name for name in all_preds if "Exp_044" in name)

In [99]:
# Grid of "ac" values used in the prediction file names: 0.15 to 0.5 in steps of 0.05.
# Rounding removes floating-point noise so str() of each value is short.
additional_confidences = [
    round(0.05 * step, 3)
    for step in range(3, 11)
]
additional_confidences

[0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5]

In [100]:
# Grid of "mep" values used in the prediction file names: 0.3 to 0.75 in steps of 0.05.
# Rounding removes floating-point noise so str() of each value is short.
min_error_probabilities = [
    round(0.05 * step, 3)
    for step in range(6, 16)
]
min_error_probabilities

[0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75]

In [101]:
# Number of (ac, mep) pairs in the full grid.
len(additional_confidences)*len(min_error_probabilities)

80

In [103]:
# One entry per (ac, mep) pair: the three model prediction files that share those
# settings, followed by the name of the merged output file.
all_comb = [
    [
        f'Exp_043_roberta_large_ac_{ac!s}_mep_{mep!s}.txt',
        f'Exp_044_deberta_large_ac_{ac!s}_mep_{mep!s}.txt',
        f'Exp_045_xlnet_large_ac_{ac!s}_mep_{mep!s}.txt',
        f'Exp_merge_large_ac_{ac!s}_mep_{mep!s}.txt',
    ]
    for ac in additional_confidences
    for mep in min_error_probabilities
]

In [97]:
#all_comb