In [1]:
# imports généraux
from __future__ import division
import math
import json
import numpy as np
import pandas as pd
import re
import sys
from operator import itemgetter
# imports pour le modèle de langue
import kenlm
# imports pour ibm1
import nltk
from nltk.translate import AlignedSent, IBMModel1
from nltk.tokenize import word_tokenize
# imports pour bleu et meteor
from nltk.translate.bleu_score import SmoothingFunction
from operator import attrgetter
#imports pour camemBERT
from fairseq.models.roberta import CamembertModel

In [2]:
def read_data(ressource_file, maxnbsent):
    """Load evaluation records from a JSON file and tokenise their text fields.

    The file must contain a list of objects with the keys ``src``, ``hyp``,
    ``ref`` (strings) and ``score`` (number).

    Args:
        ressource_file: path of the JSON file to read.
        maxnbsent: maximum number of records to load; a value <= 0 loads
            every record of the file.

    Returns:
        A list of dicts, one per loaded record, holding the record position
        (``idx``), the tokenised ``src`` / ``hyp`` / ``ref``, the ``score``
        rounded to two decimals, and the metric slots ``model``, ``ibm1``,
        ``bleu``, ``meteor`` and ``cos`` initialised to ``None``.
    """
    # The context manager closes the handle (a bare open() leaked it) and the
    # explicit encoding keeps accented text independent of the OS locale.
    with open(ressource_file, encoding="utf-8") as handle:
        corpus = json.load(handle)  # list of dicts (src, hyp, ref, score)

    data = []
    for i, sentence in enumerate(corpus):
        data.append({
            "idx": i,
            "src": nltk.word_tokenize(sentence['src']),
            "hyp": nltk.word_tokenize(sentence['hyp']),
            "ref": nltk.word_tokenize(sentence['ref']),
            "score": round(sentence["score"], 2),
            # Metric slots, filled in by later processing.
            "model": None,
            "ibm1": None,
            "bleu": None,
            "meteor": None,
            "cos": None,
        })

        # Stop early once the requested number of records is reached; the
        # message is only printed in that case.
        if maxnbsent > 0 and len(data) >= maxnbsent:
            print( str(len(data))+' phrases chargees')
            return data

    return data

In [3]:
def ibm1_score_sentence(e, f, ibm_model=None):
    """Score a sentence pair with the lexical table of an IBM Model 1.

    For every word ``wf`` of ``f`` the entries ``translation_table[we][wf]``
    are summed over all words ``we`` of ``e``; the per-word sums are
    multiplied together and the product is scaled by
    ``1 / (len(e) + 1) ** len(f)``.

    Args:
        e: list of tokens used as first index of the translation table.
        f: list of tokens used as second index of the translation table.
        ibm_model: object exposing ``translation_table``; when omitted, the
            notebook-level ``ibm1`` model is used (original behaviour).

    Returns:
        The scaled product as a float.
    """
    if ibm_model is None:
        # Fall back to the model trained at notebook level.
        ibm_model = ibm1
    table = ibm_model.translation_table

    # NOTE(review): the normaliser counts len(e) + 1 positions but the sum
    # below never includes a NULL token -- confirm this is intended.
    proba = 1
    for wf in f:
        proba = proba * sum(table[we][wf] for we in e)
    return (1 / ((len(e) + 1) ** len(f))) * proba

In [4]:
def embeddings(sentence):
    """Return a mean-pooled embedding of ``sentence`` from the global ``camembert`` model.

    The sentence is encoded with ``camembert.encode``, every hidden state is
    requested from ``camembert.extract_features`` and the second-to-last one
    is averaged over dimension 1.
    """
    hidden_states = camembert.extract_features(
        camembert.encode(sentence), return_all_hiddens=True
    )
    # Second-to-last hidden state (another one could be chosen), averaged so
    # that the result does not depend on the sentence length.
    return hidden_states[-2].mean(1)

def score_camembert(sentA,sentB):
    """Return the cosine similarity between the embeddings of two sentences."""
    # Each embedding is detached, converted to NumPy and squeezed before use.
    vec_a, vec_b = [
        embeddings(text).detach().numpy().squeeze() for text in (sentA, sentB)
    ]
    dot_product = np.dot(vec_a, vec_b)
    norm_product = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    return dot_product / norm_product

In [5]:
# Load at most 999 evaluation records (tokenised src / hyp / ref plus score).
data = read_data("../ressources/en2fr_manual_evaluation.json",999)

# KenLM language model read from a binary model file.
model = kenlm.LanguageModel('../ressources/europarl.binary')
# IBM Model 1 built from the (src, hyp) token lists of the loaded records;
# the second argument (20) is the number of training iterations.
ibm1 = IBMModel1([AlignedSent(d['src'],d['hyp']) for d in data], 20)
# Provider of the smoothing methods passed to sentence-level BLEU.
bleu_sf = SmoothingFunction()
# CamemBERT loaded from a local directory.
camembert = CamembertModel.from_pretrained('../camembert.v0')
# eval() presumably switches the model to inference mode -- confirm; as the
# last expression of the cell, its return value is displayed as cell output.
camembert.eval()

999 phrases chargees
loading archive file ../camembert.v0
| dictionary: 32004 types


RobertaHubInterface(
  (model): RobertaModel(
    (decoder): RobertaEncoder(
      (sentence_encoder): TransformerSentenceEncoder(
        (embed_tokens): Embedding(32005, 768, padding_idx=1)
        (embed_positions): LearnedPositionalEmbedding(514, 768, padding_idx=1)
        (layers): ModuleList(
          (0): TransformerSentenceEncoderLayer(
            (self_attn): MultiheadAttention(
              (k_proj): Linear(in_features=768, out_features=768, bias=True)
              (v_proj): Linear(in_features=768, out_features=768, bias=True)
              (q_proj): Linear(in_features=768, out_features=768, bias=True)
              (out_proj): Linear(in_features=768, out_features=768, bias=True)
            )
            (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Linear(in_features=3072, out_features=768, bias=True)
            (final_layer_norm): LayerNorm((768

In [10]:
# Compute every automatic metric for each record and collect one export row
# per record, keyed by the record index.
final_dict = {}
for d in data :
    # Detokenised strings, shared by the string-based scorers and the export row.
    src_text = " ".join(d["src"])
    hyp_text = " ".join(d["hyp"])
    ref_text = " ".join(d["ref"])

    # 10 ** score: presumably model.score returns a log10 probability -- confirm.
    d['model'] = math.pow(10,model.score(hyp_text))
    d['ibm1'] = ibm1_score_sentence(d['src'],d['hyp'])
    # sentence_bleu expects a LIST of references, each one a list of tokens.
    # Passing the token list directly made every token a separate reference.
    d['bleu'] = nltk.translate.bleu_score.sentence_bleu([d["ref"]],d["hyp"],smoothing_function= bleu_sf.method2)
    # meteor_score iterates over its first argument as a collection of
    # references, so a bare string was consumed character by character.
    # NOTE(review): recent NLTK releases expect pre-tokenised input
    # ([d["ref"]], d["hyp"]) instead of strings -- confirm the installed version.
    d['meteor'] = nltk.translate.meteor_score.meteor_score([ref_text],hyp_text)
    d['cos'] = score_camembert(ref_text,hyp_text)
    final_dict[d['idx']] = [src_text,hyp_text,ref_text,d['score'],d['model'],d['ibm1'],d['bleu'],d['meteor'],d['cos']]

In [11]:
# Build a table with one row per entry of final_dict (dict keys become the index).
df=pd.DataFrame.from_dict(final_dict,orient='index')
# Column labels, in the same order as the values stored in each final_dict row.
df.columns=["src","hyp","ref","score_humain","model","ibm1","bleu","meteor","cos"]
# Write the table (index included) to a CSV file.
df.to_csv("../ressources/metrics.csv")