In [261]:
# Root of the working tree. Leave empty for the current directory; otherwise it
# must end with '/' because the paths below are built by plain concatenation.
BASE_DIR = '' # Working dir
DATA_DIR = f'{BASE_DIR}data/'
MODELS_DIR = f'{BASE_DIR}models/'
# Trailing slash is required: consumers build f'{RESULTS_DIR}{MODEL_NAME}/...'.
RESULTS_DIR = f'{BASE_DIR}results/'

MODEL_NAME = 'l1-aware-t5'
AGG_FOLDER = f'{RESULTS_DIR}{MODEL_NAME}/aggregated/'
CACHE_DIR = f'{BASE_DIR}cache/'
# CACHE_DIR already ends with '/', so no extra separator is added here.
ALIGNMENT_FOLDER = f'{CACHE_DIR}alignments/'

In [3]:
import json
from tqdm import tqdm 
from collections import defaultdict

In [4]:
from transformers import T5ForConditionalGeneration
from transformers import T5Tokenizer
import torch

# NOTE(review): `device` and `tokenizer` are read as globals by generate() and
# generate_batch(). `tokenizer` is rebound to a RoBERTa tokenizer in the
# Grammaticality section further down, so cell execution order matters.
device = torch.device('cuda:0')
tokenizer = T5Tokenizer.from_pretrained('t5-small')

# Model for processing 

In [5]:
import math

def compare_improve(sentence,num_outputs=1):
    """Print a sentence next to the model's rewrites of it and return the rewrites.

    Reads the notebook-level names `model` and `IMPROVE_TOKEN` (neither is
    defined in this cell) and decodes through `generate`.

    Args:
        sentence: input text; `IMPROVE_TOKEN` is prepended before generation.
        num_outputs: number of candidates requested from `generate`.

    Returns:
        The list of generated strings exactly as returned by `generate`
        (the whitespace stripping is applied to the printed copy only).
    """
    print("BEFORE:", sentence)

    prompt = IMPROVE_TOKEN + sentence
    candidates = generate(model, prompt, num_outputs)

    bullet_list = '\n- '.join(text.strip() for text in candidates)
    print("\nAFTER :\n-", bullet_list)

    return candidates

def generate_batch(model, sentences, num_outputs=1, length=100):
    """Beam-search decode a batch of input strings.

    Uses the notebook-level `tokenizer` and `device`.

    Args:
        model: object exposing a Hugging Face style ``generate`` method.
        sentences: list of input strings; they are padded to a common length.
        num_outputs: value passed as ``num_return_sequences``.
        length: value passed as ``max_length``.

    Returns:
        One decoded string (special tokens skipped) per sequence returned by
        ``model.generate``.
    """
    encodings = tokenizer(sentences, return_tensors='pt', padding=True).to(device=device)
    with torch.no_grad():
        outputs = model.generate(encodings['input_ids'],
                                 # padding=True inserts pad tokens; pass the mask
                                 # explicitly instead of leaving generate() to
                                 # reconstruct it from the pad token id.
                                 attention_mask=encodings['attention_mask'],
                                 max_length=length, num_beams=5,
                                 num_return_sequences=num_outputs,
                                 early_stopping=False)
    return [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    
def generate(model, start, num_outputs=1, length=100):
    """Beam-search decode a single input string.

    Delegates to `generate_batch` with a one-element batch so both entry points
    share a single set of decoding options instead of two copies of the same
    ``model.generate`` call.

    Args:
        model: object exposing a Hugging Face style ``generate`` method.
        start: the input text.
        num_outputs: number of sequences to return for the input.
        length: maximum generated length.

    Returns:
        List of decoded strings for `start`.
    """
    return generate_batch(model, [start], num_outputs, length)
            
def process_sentences(model, sentences, batch_size=8):
    """Run the model over `sentences` in batches and collect one output per input.

    `IMPROVE_TOKEN` (a notebook-level name not defined in this cell) is prepended
    to every sentence before it is passed to `generate_batch`.

    Args:
        model: forwarded to `generate_batch`.
        sentences: sliceable sequence of input strings.
        batch_size: number of sentences per `generate_batch` call.

    Returns:
        List with the generated strings of all batches, in input order.
    """
    # Imported locally on purpose: this notebook binds the name `tqdm` both to
    # the progress-bar class (`from tqdm import tqdm`) and to the module
    # (`import tqdm`), so the global may not be callable when this runs.
    from tqdm import tqdm as progress_bar

    parsed_sentences = []

    # range() has a length, so the progress bar knows the number of batches.
    for start in progress_bar(range(0, len(sentences), batch_size)):
        batch = sentences[start:start + batch_size]
        prompts = [IMPROVE_TOKEN + item for item in batch]
        parsed_sentences.extend(generate_batch(model, prompts, 1))

    return parsed_sentences

In [6]:
import os
from collections import defaultdict
from nltk import word_tokenize

def save_split_files(data, base_output, tokenize=True):
    """Write one text file per dictionary key, one line per record.

    Args:
        data: iterable of dicts mapping a field name to a string.
        base_output: path prefix; each field goes to ``{base_output}-{field}.txt``.
        tokenize: when true, every value is run through ``word_tokenize`` and
            re-joined with single spaces before being stored.

    Files that already exist are left untouched.
    """
    columns = defaultdict(list)
    for record in data:
        for field, text in record.items():
            columns[field].append(' '.join(word_tokenize(text)) if tokenize else text)

    for field, lines in columns.items():
        target = f'{base_output}-{field}.txt'
        if not os.path.exists(target):
            with open(target, 'w', encoding='utf8') as handle:
                handle.write('\n'.join(lines))

In [7]:
def get_multi_tokens(files, languages=['es', 'de', 'fr', 'pt']):
    """Derive the per-language token file names for a list of base files.

    For every language (outer order) and every file (inner order) the text
    before the first '.' of the file name is kept and
    ``-token-<language>.json`` is appended.

    Args:
        files: file names such as ``'name.json'``.
        languages: language codes to expand each file into.

    Returns:
        List of ``'<base>-token-<language>.json'`` names, grouped by language.
    """
    stems = [name.split('.')[0] for name in files]
    return [f'{stem}-token-{code}.json' for code in languages for stem in stems]

In [8]:
import os 
from os import listdir

def save_multi_files(files, base_folder):
    """Split each JSON result file into per-field text files.

    Every file is expected to hold a list of records with an ``'orig'`` string
    and an ``'improved'`` sequence; only the first improved candidate is kept.
    Output is written by `save_split_files` next to the input, using the file
    name up to its first '.' as prefix.

    Args:
        files: JSON file names relative to `base_folder`.
        base_folder: directory containing the files.
    """
    for file in files:
        base_file = file.split('.')[0]
        # Read as UTF-8 explicitly (the split files are written as utf8 too) so
        # non-ASCII sentences do not depend on the platform default encoding.
        with open(f'{base_folder}/{file}', 'r', encoding='utf-8') as f:
            data_sentences = json.load(f)

        pairs = [
            {
                'orig': item['orig'],
                'improved': item['improved'][0]
            }
            for item in data_sentences
        ]
        save_split_files(pairs, f'{base_folder}/{base_file}', tokenize=False)
    
def split_models_files(models, files):
    """Run `save_multi_files` on the same file list for every model version.

    The folder of a model version is
    ``{RESULTS_DIR}{MODEL_NAME}/<model_version>/processed``.
    """
    processed_dirs = (f'{RESULTS_DIR}{MODEL_NAME}/{name}/processed' for name in models)
    for folder in processed_dirs:
        save_multi_files(files, folder)
        
def find_files_and_split(models):
    """Split every JSON file found in each model's ``processed`` folder.

    Args:
        models: model version names; each folder is
            ``{RESULTS_DIR}{MODEL_NAME}/<model_version>/processed``.
    """
    for model_version in models:
        base_folder = f'{RESULTS_DIR}{MODEL_NAME}/{model_version}/processed'
        # Match on the extension: a substring test would also select names such
        # as 'x.json.bak' or 'x.jsonl', which are not meant to be parsed here.
        files = [file for file in listdir(base_folder) if file.endswith('.json')]
        split_models_files([model_version], files)

In [9]:
# Model versions whose processed JSON outputs are split into per-field text files.
# NOTE(review): `models_list` is assigned again in the evaluation section below.
models_list = [ 
    't5sm-l1aware-multi-s260-v1',
    't5lg-l1aware-multi-s260-v1',
]

find_files_and_split(models_list)

# Grammaticality

In [225]:
from transformers import RobertaForSequenceClassification, RobertaTokenizerFast

In [226]:
# NOTE(review): this rebinds the notebook-level `tokenizer` (the T5 tokenizer
# until here), which generate() and generate_batch() read as a global.
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')
robert_cola = RobertaForSequenceClassification.from_pretrained(MODELS_DIR+'roberta-cola-v1')

In [227]:
import pytorch_lightning as pl
import torch.nn.functional as F
from pytorch_lightning.metrics import functional as FM
from pytorch_lightning import loggers as pl_loggers

class LMClassifierInference(pl.LightningModule):
    """Inference wrapper that scores one text with a sequence classifier.

    `forward` returns the softmax probability of the second entry of `labels`.
    """

    def __init__(self, model, tokenizer, labels):
        """
        Args:
            model: classifier whose output exposes ``.logits``.
            tokenizer: tokenizer providing ``encode(text, return_tensors='pt')``.
            labels: class names ordered by logit index.
        """
        super().__init__()
        self.model = model
        self.tokenizer = tokenizer
        self.labels = labels

    def forward(self, x):
        """Return the probability assigned to ``self.labels[1]`` for text `x`.

        The model and the encoded input are moved to CUDA, so a GPU is required.
        """
        self.model.eval()
        self.model.cuda()

        # Use the tokenizer given to the constructor, not whatever the
        # notebook-level name `tokenizer` happens to be bound to at call time.
        input_ids = self.tokenizer.encode(x, return_tensors='pt').to('cuda')
        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            outputs = self.model(input_ids)
        prob = F.softmax(outputs.logits, dim=1).cpu().numpy()[0].tolist()

        return dict(zip(self.labels, prob))[self.labels[1]]

In [228]:
# Label order matters: forward() reports the probability of labels[1] ('correct').
cola_model = LMClassifierInference(robert_cola, tokenizer, ['wrong', 'correct'])

In [234]:
def process_grammaticality(file, limit=None, detokenize=False, output='grammar', input_dir=None, output_dir=None, metric_version=None):
    """Score every line of `file` with `cola_model` via `process_file_metric`.

    Args:
        file: name of the text file inside `input_dir`.
        limit: optional cap on the number of lines scored.
        detokenize: forwarded to `process_file_metric`.
        output: metric name used as prefix of the output file.
        input_dir: folder holding `file`; defaults to the notebook-level
            ``MODEL_VERSION_RESULTS_DIR_PROCESSED``.
        output_dir: folder for the scores; defaults to the notebook-level
            ``MODEL_VERSION_RESULTS_DIR_METRICS``.
        metric_version: optional tag inserted into the output file name.
    """
    # The default folders are looked up at call time, not in the signature, so
    # this cell can be executed even when those globals are not defined yet.
    if input_dir is None:
        input_dir = MODEL_VERSION_RESULTS_DIR_PROCESSED
    if output_dir is None:
        output_dir = MODEL_VERSION_RESULTS_DIR_METRICS
    process_file_metric(file, cola_model, output, limit=limit, detokenize=detokenize, input_dir=input_dir, output_dir=output_dir, metric_version=metric_version)

# SLOR

In [20]:
import math
from nltk import word_tokenize, sent_tokenize
import pickle

In [21]:
from tqdm import tqdm
from functools import reduce

class UnigramModel():
    """Sub-word unigram frequency model.

    `build` counts lower-cased sub-word tokens over a set of files; `prob_token`
    and `prob` turn the counts into smoothed probabilities.
    """

    # Sentences longer than this many characters are skipped while building.
    MAX_SENTENCE_CHARS = 1024

    def __init__(self, tokenizer):
        """
        Args:
            tokenizer: object with a ``tokenize(text)`` method returning
                sub-word strings.
        """
        self.unigram_model = defaultdict(int)
        self.word_count = 0
        self.vocab_count = 0
        self.tokenizer = tokenizer

    def tokenize_subwords(self, sent):
        """Tokenize `sent` and remove the 'Ġ' marker from every piece."""
        return [item.replace('Ġ', '') for item in self.tokenizer.tokenize(sent)]

    def build(self, files, limit=None):
        """Accumulate token counts from `files` (at most `limit` of them).

        Each file is read with `read_file` (defined outside this cell) and split
        into sentences with ``sent_tokenize``.
        """
        # Imported locally: the notebook rebinds the global name `tqdm` to the
        # module in a later cell, which would make a bare `tqdm(...)` call fail.
        from tqdm import tqdm as progress_bar

        for file in progress_bar(files[:limit]):
            content = read_file(file)
            for sent in sent_tokenize(content):
                if len(sent) > self.MAX_SENTENCE_CHARS:
                    continue
                for word in self.tokenize_subwords(sent):
                    self.unigram_model[word.lower()] += 1

        self.word_count = sum(self.unigram_model.values())
        self.vocab_count = len(self.unigram_model)

    def prob(self, words):
        """Product of the token probabilities of the sub-words of `words`.

        Returns 1 (the empty product) when tokenization yields no sub-words.
        """
        probs = [self.prob_token(sub) for sub in self.tokenize_subwords(words)]
        return reduce((lambda x, y: x * y), probs, 1)

    def prob_token(self, token):
        """Smoothed probability ``(count + 1) / (word_count + 1)`` of one token."""
        # .get() rather than indexing: indexing a defaultdict would insert every
        # unseen token into the model as a side effect of scoring.
        return (self.unigram_model.get(token, 0)+1)/(self.word_count+1)
    

# Load a previously pickled model. Unpickling can execute arbitrary code, so
# only load files from a trusted source.
# Presumably the file holds a UnigramModel instance (confirm); if so, the class
# above has to be defined before this statement runs.
with open(f'{MODELS_DIR}/unigram-expace/model-v1.pkl','rb') as f:
    uni_model = pickle.load(f)

In [22]:
from lm_scorer.models.auto import GPT2LMScorer

# Language-model scorer used by slor(); the checkpoint lives under MODELS_DIR.
# NOTE(review): gpt2_tokenizer_id is not referenced by any visible cell.
gpt2_tokenizer_id = 'gpt2'
gpt2_model_name = 'expace-v1'
gpt2_model_id = f'{MODELS_DIR}gpt2-{gpt2_model_name}'

scorer = GPT2LMScorer(gpt2_model_id, device='cuda:0', batch_size=32)

In [23]:
def slor(sent):
    """Length-normalised gap between the LM score and the unigram score of `sent`.

    Computes ``(lm_log_score - unigram_log_score) / n`` where ``n`` is the
    number of whitespace-separated words, the LM score comes from the
    notebook-level `scorer` and the unigram score from `calculate_uni_prob`.
    """
    lm_log_score = scorer.sentence_score(sent, log=True)
    unigram_log_score = calculate_uni_prob(sent)
    gap = lm_log_score - unigram_log_score

    word_count = len(sent.split())
    return gap/word_count


def calculate_uni_prob(sentence):
    """Sum the log unigram probabilities of the word tokens of `sentence`.

    The sentence is lower-cased, split with ``word_tokenize`` and each token is
    scored with the notebook-level `uni_model`.
    """
    log_total = 0
    for token in word_tokenize(sentence.lower()):
        log_total += math.log(uni_model.prob(token))
    return log_total

def process_slor(file, limit=None, detokenize=False, input_dir=None, output_dir=None, metric_version=None):
    """Score every line of `file` with `slor` via `process_file_metric`.

    Args:
        file: name of the text file inside `input_dir`.
        limit: optional cap on the number of lines scored.
        detokenize: forwarded to `process_file_metric`.
        input_dir: folder holding `file`; defaults to the notebook-level
            ``MODEL_VERSION_RESULTS_DIR_PROCESSED``.
        output_dir: folder for the scores; defaults to the notebook-level
            ``MODEL_VERSION_RESULTS_DIR_METRICS``.
        metric_version: optional tag inserted into the output file name.
    """
    # The default folders are looked up at call time, not in the signature, so
    # this cell can be executed even when those globals are not defined yet.
    if input_dir is None:
        input_dir = MODEL_VERSION_RESULTS_DIR_PROCESSED
    if output_dir is None:
        output_dir = MODEL_VERSION_RESULTS_DIR_METRICS
    process_file_metric(file, slor, 'slor', limit=limit, detokenize=detokenize, input_dir=input_dir, output_dir=output_dir, metric_version=metric_version)

# Linguistic diversity

In [None]:
from simalign import SentenceAligner

# Word aligner shared by align_sentence().
# NOTE(review): presumably matching_methods="i" selects the 'itermax' output that
# align_sentence() reads by default — confirm against the simalign docs.
sent_aligner = SentenceAligner(model="bert", token_type="bpe", matching_methods="i", device="cuda")

In [264]:
import tqdm
import hashlib
import pickle
import os
from nltk import word_tokenize

def align_sentence(sent1, sent2, method='itermax'):
    """Word-align two sentences, caching the result on disk.

    Args:
        sent1: source sentence (raw text, tokenized with ``word_tokenize``).
        sent2: target sentence (raw text, tokenized with ``word_tokenize``).
        method: key of the aligner output to keep.

    Returns:
        The alignment stored under `method` in the output of
        ``sent_aligner.get_word_aligns``, either freshly computed or unpickled
        from ``ALIGNMENT_FOLDER``.
    """
    # The default method keeps the historical key so existing cache files stay
    # valid. Any other method gets its own entry; otherwise it would read back
    # an alignment that was cached for a different method.
    cache_key = sent1+'-'+sent2
    if method != 'itermax':
        cache_key = f'{method}|{cache_key}'
    file_name = hashlib.md5(cache_key.encode('utf-8')).hexdigest()
    file_path = f'{ALIGNMENT_FOLDER}/{file_name}.pkl'

    if os.path.exists(file_path):
        # Cache files are written by this function; do not point
        # ALIGNMENT_FOLDER at untrusted pickles.
        with open(file_path, 'rb') as f:
            return pickle.load(f)

    src_sentence = word_tokenize(sent1)
    trg_sentence = word_tokenize(sent2)
    alignments = sent_aligner.get_word_aligns(src_sentence, trg_sentence)[method]

    # Create the cache folder on first use instead of failing on open().
    os.makedirs(ALIGNMENT_FOLDER, exist_ok=True)
    with open(file_path, 'wb') as f:
        pickle.dump(alignments, f)

    return alignments

def get_sentences_same_alignment(pairs):
    """Keep the pairs whose alignment links each position to the same position.

    A pair is a dict with an ``'orig'`` sentence and an ``'improved'`` sequence
    whose first element is aligned against it. A pair is kept when every
    alignment item has equal first and second components.
    """
    def keeps_word_order(pair):
        links = align_sentence(pair['orig'], pair['improved'][0])
        return all([link[0] == link[1] for link in links])

    return [pair for pair in tqdm.tqdm(pairs) if keeps_word_order(pair)]

def process_alignment(pairs):
    """Align the original sentence of every pair with its first improved version.

    Returns the alignments as a list, in the order of `pairs`, showing a
    progress bar while iterating.
    """
    return [
        align_sentence(pair['orig'], pair['improved'][0])
        for pair in tqdm.tqdm(pairs)
    ]

def get_aligned_sentences(sentences):
    """Pair every entry of `sentences` with its alignment from `process_alignment`.

    Returns a list of ``(entry, alignment)`` tuples.
    """
    aligned = process_alignment(sentences)
    return [(entry, links) for entry, links in zip(sentences, aligned)]

In [265]:
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
# Shared by lexical_diversity(): the lemmatizer normalises aligned words and the
# set of WordNet lemma names is used for membership tests.
wordnet_lemmatizer = WordNetLemmatizer()
wn_lemmas = set(wordnet.all_lemma_names())

In [589]:
def syntactical_diversity(sent1, sent2, version=None):
    """Average positional displacement between aligned words of two sentences.

    Sums ``abs(i - j)`` over the alignment items ``(i, j)`` returned by
    `align_sentence` and normalises the sum.

    Args:
        sent1: source sentence.
        sent2: target sentence.
        version: ``'v1'`` divides by the square root of the number of alignment
            items; any other value divides by the number of items.

    Returns:
        The normalised displacement, or 0 when there are no alignment items.
    """
    alignment = align_sentence(sent1, sent2)
    # Nothing aligned (for example an empty sentence): report 0 instead of
    # dividing by zero, matching lexical_diversity's score when nothing differs.
    if len(alignment) == 0:
        return 0

    sum_diff = sum(abs(item[0]-item[1]) for item in alignment)
    if version == 'v1':
        return sum_diff/math.sqrt(len(alignment))
    return sum_diff/len(alignment)

def lexical_diversity(sent1, sent2, return_diffs=False, version=None):
    """Share of aligned word pairs whose lemma changed between two sentences.

    Aligned pairs with identical surface forms are ignored. A remaining pair
    counts as a lexical change when both lemmas are in `wn_lemmas` and the
    lemmas differ.

    Args:
        sent1: source sentence.
        sent2: target sentence.
        return_diffs: also return the ``[source_lemma, target_lemma]`` pairs.
        version: ``'v1'`` divides the change count by the number of target
            tokens, ``'v2'`` by its square root, anything else by the number of
            aligned pairs whose surface forms differ.

    Returns:
        The score (0 when no aligned pair differs), or ``(score, diffs)`` when
        `return_diffs` is true.
    """
    links = align_sentence(sent1, sent2)
    source_tokens = word_tokenize(sent1)
    target_tokens = word_tokenize(sent2)

    changed_words = 0   # aligned pairs whose surface forms differ
    lemma_changes = []  # lemma pairs that count as lexical changes

    for link in links:
        source_word = source_tokens[link[0]]
        target_word = target_tokens[link[1]]

        if source_word != target_word:
            changed_words += 1

            source_lemma = wordnet_lemmatizer.lemmatize(source_word)
            target_lemma = wordnet_lemmatizer.lemmatize(target_word)

            both_known = source_lemma in wn_lemmas and target_lemma in wn_lemmas
            if both_known and source_lemma != target_lemma:
                lemma_changes.append([source_lemma, target_lemma])

    change_count = len(lemma_changes)
    if changed_words == 0:
        score = 0
    elif version == 'v1':
        score = (change_count)/len(target_tokens)
    elif version == 'v2':
        score = (change_count)/math.sqrt(len(target_tokens))
    else:
        score = (change_count)/changed_words

    return (score, lemma_changes) if return_diffs else score

def process_syntax(file1, file2, limit=None, detokenize=False, output='syntax', input_dir=None, output_dir=None, metric_version=None):
    """Score line pairs of two files with `syntactical_diversity`.

    Args:
        file1: file with the original sentences, inside `input_dir`.
        file2: file with the improved sentences, inside `input_dir`.
        limit: optional cap on the number of line pairs scored.
        detokenize: forwarded to `process_metric_paired`.
        output: metric name used as prefix of the output file.
        input_dir: defaults to the notebook-level
            ``MODEL_VERSION_RESULTS_DIR_PROCESSED``.
        output_dir: defaults to the notebook-level
            ``MODEL_VERSION_RESULTS_DIR_METRICS``.
        metric_version: output file tag, also passed to the metric as `version`.
    """
    # The default folders are looked up at call time, not in the signature, so
    # this cell can be executed even when those globals are not defined yet.
    if input_dir is None:
        input_dir = MODEL_VERSION_RESULTS_DIR_PROCESSED
    if output_dir is None:
        output_dir = MODEL_VERSION_RESULTS_DIR_METRICS
    process_metric_paired(file1, file2, lambda x1, x2: syntactical_diversity(x1, x2, version=metric_version), output, limit=limit, detokenize=detokenize, input_dir=input_dir, output_dir=output_dir, metric_version=metric_version)
    
def process_lexical(file1, file2, limit=None, detokenize=False, output='lexical', input_dir=None, output_dir=None, metric_version=None):
    """Score line pairs of two files with `lexical_diversity`.

    Args:
        file1: file with the original sentences, inside `input_dir`.
        file2: file with the improved sentences, inside `input_dir`.
        limit: optional cap on the number of line pairs scored.
        detokenize: forwarded to `process_metric_paired`.
        output: metric name used as prefix of the output file.
        input_dir: defaults to the notebook-level
            ``MODEL_VERSION_RESULTS_DIR_PROCESSED``.
        output_dir: defaults to the notebook-level
            ``MODEL_VERSION_RESULTS_DIR_METRICS``.
        metric_version: output file tag, also passed to the metric as `version`.
    """
    # The default folders are looked up at call time, not in the signature, so
    # this cell can be executed even when those globals are not defined yet.
    if input_dir is None:
        input_dir = MODEL_VERSION_RESULTS_DIR_PROCESSED
    if output_dir is None:
        output_dir = MODEL_VERSION_RESULTS_DIR_METRICS
    process_metric_paired(file1, file2, lambda x1, x2: lexical_diversity(x1, x2, version=metric_version), output, limit=limit, detokenize=detokenize, input_dir=input_dir, output_dir=output_dir, metric_version=metric_version)


## Evaluate files

In [None]:
from nltk import word_tokenize 
import torch
from nltk.tokenize.treebank import TreebankWordDetokenizer

import os

def process_file_metric(file, metric, metric_name, limit=None, detokenize=False, input_dir=None, output_dir=None, metric_version=None):
    """Apply a single-sentence metric to every line of a file and store the scores.

    The scores are written, one per line, to
    ``{output_dir}/{metric_name}-{file}`` (or
    ``{output_dir}/{metric_name}-{metric_version}-{file}`` when a version is
    given). Nothing is computed when that file already exists or when the input
    file is missing.

    Args:
        file: name of the text file inside `input_dir`.
        metric: callable taking one stripped line and returning a score.
        metric_name: prefix of the output file name.
        limit: optional cap on the number of lines scored.
        detokenize: when true, each line is word-tokenized and detokenized with
            ``TreebankWordDetokenizer`` before scoring.
        input_dir: defaults to the notebook-level
            ``MODEL_VERSION_RESULTS_DIR_PROCESSED``.
        output_dir: defaults to the notebook-level
            ``MODEL_VERSION_RESULTS_DIR_METRICS``.
        metric_version: optional tag inserted into the output file name.
    """
    # Imported locally: the notebook binds the name `tqdm` both to the class and
    # to the module in different cells, so a bare `tqdm(...)` may not be callable.
    from tqdm import tqdm as progress_bar

    # Default folders are resolved at call time so that defining this function
    # does not require those globals to exist.
    if input_dir is None:
        input_dir = MODEL_VERSION_RESULTS_DIR_PROCESSED
    if output_dir is None:
        output_dir = MODEL_VERSION_RESULTS_DIR_METRICS

    if metric_version:
        output_file = f'{output_dir}/{metric_name}-{metric_version}-{file}'
    else:
        output_file = f'{output_dir}/{metric_name}-{file}'
    input_file = f'{input_dir}/{file}'

    if os.path.exists(output_file):
        print ('=> Already processed', f'{file}')
        return
    if not os.path.exists(input_file):
        print ('=> Skipping', f'{file}')
        return

    with open(input_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    # Build the detokenizer once instead of once per line.
    detokenizer = TreebankWordDetokenizer() if detokenize else None

    scores = []
    for line in progress_bar(lines[:limit]):
        line = line.strip()
        if detokenizer is not None:
            line = detokenizer.detokenize(word_tokenize(line))
        scores.append(metric(line))

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write('\n'.join(map(str, scores)))
        
def process_metric_paired(file1, file2, metric, metric_name, limit=None, detokenize=False, input_dir=None, output_dir=None, metric_version=None):
    """Apply a two-sentence metric to the aligned lines of two files.

    Line ``i`` of `file1` is paired with line ``i`` of `file2`. The scores are
    written, one per line, to ``{output_dir}/{metric_name}-{name}`` (with
    ``-{metric_version}`` inserted when given), where ``name`` is `file2`
    without ``'-improved'``. Nothing is computed when that file already exists
    or when either input is missing.

    Args:
        file1: first input file, inside `input_dir`.
        file2: second input file, inside `input_dir`.
        metric: callable taking two stripped lines and returning a score.
        metric_name: prefix of the output file name.
        limit: optional cap on the number of line pairs scored.
        detokenize: accepted for signature parity with `process_file_metric`;
            it is not used here.
        input_dir: defaults to the notebook-level
            ``MODEL_VERSION_RESULTS_DIR_PROCESSED``.
        output_dir: defaults to the notebook-level
            ``MODEL_VERSION_RESULTS_DIR_METRICS``.
        metric_version: optional tag inserted into the output file name.
    """
    # Imported locally: the notebook binds the name `tqdm` both to the class and
    # to the module in different cells, so `tqdm.tqdm` is not reliable either.
    from tqdm import tqdm as progress_bar

    # Default folders are resolved at call time so that defining this function
    # does not require those globals to exist.
    if input_dir is None:
        input_dir = MODEL_VERSION_RESULTS_DIR_PROCESSED
    if output_dir is None:
        output_dir = MODEL_VERSION_RESULTS_DIR_METRICS

    selected_file = file2.replace('-improved','')
    if metric_version:
        output_file = f'{output_dir}/{metric_name}-{metric_version}-{selected_file}'
    else:
        output_file = f'{output_dir}/{metric_name}-{selected_file}'

    if os.path.exists(output_file):
        print ('=> Already processed', f'{metric_name}', f'{selected_file}')
        return

    # Check file1 first, then file2, and report the first one that is missing.
    for required in (file1, file2):
        if not os.path.exists(f'{input_dir}/{required}'):
            print ('=> Skipping', f'{required}')
            return

    with open(f'{input_dir}/{file1}', encoding='utf8') as f1, open(f'{input_dir}/{file2}', encoding='utf8') as f2:
        lines1 = f1.readlines()[:limit]
        lines2 = f2.readlines()[:limit]

    # zip() stops at the shorter file; make a mismatch visible instead of
    # silently dropping the unpaired lines.
    if len(lines1) != len(lines2):
        print ('=> Warning: line count differs', f'{file1}={len(lines1)}', f'{file2}={len(lines2)}')

    metrics = []
    for sent1, sent2 in progress_bar(list(zip(lines1, lines2))):
        metrics.append(metric(sent1.strip(), sent2.strip()))

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write('\n'.join(map(str, metrics)))
        

In [None]:
import pathlib

# Language codes expanded into per-language token files by get_multi_tokens().
LANGUAGES_MULTI = ['pt', 'es']

# Base result files evaluated for the models in `models_list`.
multi_lingual_files = [   
    'brace-v1-20000.json',      
    'lace-v1-20000.json',     
]

# '<base>-token-<language>.json' names, evaluated for `models_multi_list`.
multi_lingual_token_files = get_multi_tokens(multi_lingual_files, LANGUAGES_MULTI)

# NOTE(review): rebinds `models_list`, which an earlier cell used for the T5 model versions.
models_list = [
    'gector-v5',
]

models_multi_list = [
    't5lg-l1aware-multi-s260-v1',
    't5sm-l1aware-multi-s260-v1',    
]

def process_multimodel(models, files, types=['orig', 'improved'], metric='perplexity', metric_version=None, limit=None):
    """Compute one metric for every (model version, file) combination.

    Inputs are read from ``{RESULTS_DIR}{MODEL_NAME}/<model>/processed/`` and
    scores are written to ``{RESULTS_DIR}{MODEL_NAME}/<model>/metrics/`` (the
    latter is created when missing).

    Args:
        models: model version names.
        files: result file names; the part before the first '.' is the base name.
        types: suffixes scored separately by the single-file metrics
            (``<base>-<type>.txt``). Not modified here.
        metric: ``'perplexity'``, ``'slor'`` or ``'grammar'`` (one score per
            line of each file) or ``'lexical'`` / ``'syntax'`` (one score per
            orig/improved line pair). ``'perplexity'`` relies on a
            `process_perplexity` function that must be defined elsewhere in the
            notebook.
        metric_version: optional tag forwarded to the metric processors.
        limit: optional cap on the number of lines scored per file.
    """
    single_file_metrics = ('perplexity', 'slor', 'grammar')
    paired_metrics = ('lexical', 'syntax')

    # An unrecognised name used to fall through every branch without a trace.
    if metric not in single_file_metrics + paired_metrics:
        print ('=> Unknown metric', metric, '- expected one of', single_file_metrics + paired_metrics)
        return

    for model_version in models:
        print ('=> Model', model_version)
        input_folder = f'{RESULTS_DIR}{MODEL_NAME}/{model_version}/processed/'
        output_folder = f'{RESULTS_DIR}{MODEL_NAME}/{model_version}/metrics/'
        pathlib.Path(output_folder).mkdir(parents=True, exist_ok=True)

        common = dict(input_dir=input_folder, output_dir=output_folder, metric_version=metric_version, limit=limit)

        for file in files:
            print ('====> Current', file)
            base_file = file.split('.')[0]

            if metric in paired_metrics:
                process_pair = process_lexical if metric == 'lexical' else process_syntax
                process_pair(base_file+f'-orig.txt', base_file+f'-improved.txt', **common)
                continue

            for kind in types:
                target = base_file+f'-{kind}.txt'
                if metric == 'perplexity':
                    process_perplexity(target, **common)
                elif metric == 'slor':
                    process_slor(target, **common)
                else:
                    process_grammaticality(target, **common)
        print()
            

metric_version = None
PROCESS_LIMIT = None

# Metric names must match the branches of process_multimodel: the syntactic
# metric is registered as 'syntax' ('syntactical' matches nothing and was
# silently skipped). Each metric is listed once, because a repeated entry only
# hits the "already processed" guard.
for metric in ['lexical', 'grammar', 'syntax']:
    process_multimodel(models_list, multi_lingual_files, metric=metric, limit=PROCESS_LIMIT, metric_version=metric_version)
    process_multimodel(models_multi_list, multi_lingual_token_files, metric=metric,limit=PROCESS_LIMIT, metric_version=metric_version)