In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from keybert import KeyBERT
import os

# `CUDA_LAUNCH_BLOCKING=1` is shell syntax; written as a Python statement it only
# creates a local variable and never reaches the CUDA runtime. Export it through
# the process environment instead so kernel launches really become synchronous
# (useful for getting accurate tracebacks while debugging GPU errors).
# NOTE(review): this must run before the first CUDA call to take effect, and it
# slows GPU execution — remove it once debugging is done.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
CUDA_LAUNCH_BLOCKING = 1  # original notebook-level name, kept for compatibility

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the two Excel exports used throughout the notebook.
set_vf_df = pd.read_excel('./sample_data/SET-VF20210211.xlsx')

eat_vf_df = pd.read_excel('./sample_data/ingénieriesEAT-VF-2021030.xlsx')
# The EAT sheet has one column whose header is not a field name; map it to
# 'fr_title_s' so it lines up with the other *_s columns.
# NOTE(review): presumably the header cell was overwritten by an article title
# in the source spreadsheet — confirm against the file.
eat_vf_df = eat_vf_df.rename(
    columns={
        'Gestion et traitement des déjections animales en Italie = ': 'fr_title_s',
    }
)

In [3]:
# Preview the first three EAT rows to check the loaded columns.
eat_vf_df.head(3)

Unnamed: 0,docType_s,halId_s,fr_title_s,en_title_s,fr_abstract_s,en_abstract_s4,fr_keyword_s,en_keyword_s,producedDate_s,volume_s,issue_s,uri_s
0,ART,hal-02582981,Aménagement du Rhône et débit réservé,Development of the Rhône and reserved flow,Les opérations d'augmentation de débits réserv...,Operations to increase the reserve flows and t...,"Aménagement de cours d'eau, débit réservé, éne...","River management, instream flow, hydroelectric...",2004.0,,38 supplément,https://hal.inrae.fr/hal-02582981
1,ART,hal-02580960,"Aménité : qualité des relations sociales, qual...","Amenity: quality in the social relationships, ...","Dans un contexte où le mot ""aménités"" circule ...",In a context where the word 'amenities' is use...,"Aménité, sciences humaines et sociales, dévelo...","Amenity, human and social sciences, rural deve...",2002.0,,spécial Aménités rurales : une nouvelle lectur...,https://hal.inrae.fr/hal-02580960
2,ART,hal-02580702,"Aménités, fonctions non marchandes et aménagem...","Amenities, non-market functions and forest man...",La prise en compte des fonctions non marchande...,Taking into account non-market functions in fo...,"Multifonctionnalité de la forêt, valeur non ma...","Forest multifunctionality, non-market value, a...",2002.0,,spécial Aménagement forestier,https://hal.inrae.fr/hal-02580702


In [4]:
# Preview the first three SET rows to check the loaded columns.
set_vf_df.head(3)

Unnamed: 0,Titre FR,Titre GB,RésuméFR,Résumé GB,Mots-clés FR,Mots clés GB
0,Une approche innovante de modélisation du risq...,An innovative approach to modelling forest fir...,Une méthode de cartographie des interfaces hab...,A method to characterize and to map wildland-u...,"Interface habitat-forêt, risque d’incendie, dé...","Wildland-urban interface, fire risk, fire star..."
1,Mise en œuvre de deux mesures agrienvironnemen...,Implementation of two result-based agrienviron...,Afin de répondre aux exigences de la société e...,In order to meet society's demands for a more ...,"Mesure agroenvironnementale, prairie fleurie, ...","Agrienvironmental measure, flowering meadow, p..."
2,Les enjeux de l’équivalence écologique pour la...,The issue of ecological equivalence in designi...,L’évolution du contexte réglementaire a renfor...,Changes in the regulatory context have reinfor...,"Compensation écologique, impact sur la biodive...","Ecological compensation, impact on biodiversit..."


In [5]:
def create_gold_standard(df, col):
    """Collect the distinct keywords found in one comma-separated column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the keyword column.
    col : str
        Name of the column whose cells contain comma-separated keywords.

    Returns
    -------
    set of str
        Every keyword, stripped of surrounding whitespace and lower-cased.
        Cells that are not strings (e.g. NaN for a blank spreadsheet cell) are
        skipped instead of raising ``AttributeError``. Empty entries produced
        by trailing or doubled commas are kept as ``''`` — callers are expected
        to drop them if unwanted.
    """
    gold_standard = set()
    for cell in df[col]:
        if not isinstance(cell, str):
            # Missing / non-text cell: nothing to split.
            continue
        gold_standard.update(part.strip().lower() for part in cell.split(','))
    return gold_standard

# Build one gold-standard keyword set per corpus and language.
gold_standard_eat_en = create_gold_standard(eat_vf_df, 'en_keyword_s')
gold_standard_eat_fr = create_gold_standard(eat_vf_df, 'fr_keyword_s')
gold_standard_set_en = create_gold_standard(set_vf_df, 'Mots clés GB')
gold_standard_set_fr = create_gold_standard(set_vf_df, 'Mots-clés FR')

# Drop the empty keyword left behind by trailing/doubled commas. `discard`
# (unlike `remove`) does not raise KeyError when no empty entry exists, and the
# SET gold standards are cleaned too so all four sets are treated alike.
gold_standard_eat_en.discard('')
gold_standard_eat_fr.discard('')
gold_standard_set_en.discard('')
gold_standard_set_fr.discard('')

In [6]:
# KeyBERT instances keyed by model name, so the underlying transformer is loaded
# once per model instead of once per document.
_KEYBERT_MODELS = {}


def extract_keywords(text, model, thres=0.6,
                     ngram_ranges=((1, 1), (2, 2), (3, 3), (4, 4), (5, 5))):
    """Extract keyphrases from ``text`` with KeyBERT, one pass per n-gram size.

    Parameters
    ----------
    text : str
        Document to extract keyphrases from. Non-string values (e.g. NaN) and
        blank strings yield an empty list.
    model : str or object
        Passed to ``KeyBERT(model=...)``. When it is a string, the resulting
        KeyBERT instance is cached and reused on later calls.
    thres : float, default 0.6
        Only keyphrases whose score is strictly greater than this are kept.
    ngram_ranges : iterable of (int, int), default 1-grams to 5-grams
        Each entry is used as ``keyphrase_ngram_range`` for one extraction pass.

    Returns
    -------
    list of str
        Keyphrases that passed the threshold, in extraction order (all results
        of the first range, then the second, ...). Duplicates are not removed.
    """
    if not isinstance(text, str) or not text.strip():
        return []

    if isinstance(model, str):
        kw_model = _KEYBERT_MODELS.get(model)
        if kw_model is None:
            kw_model = _KEYBERT_MODELS[model] = KeyBERT(model=model)
    else:
        # Model objects may be unhashable; build a wrapper for this call only.
        kw_model = KeyBERT(model=model)

    keywords = []
    for ngram_range in ngram_ranges:
        keywords.extend(
            kw_model.extract_keywords(text, keyphrase_ngram_range=ngram_range)
        )
    return [phrase for phrase, score in keywords if score > thres]

In [7]:
# Join title and abstract into one document per row. A space separator keeps
# the last word of the title from fusing with the first word of the abstract,
# and `fillna('')` keeps a row usable when only one of the two fields is
# present (plain `+` would turn the whole row into NaN).
eat_vf_df['en_title_abstract'] = (
    eat_vf_df['en_title_s'].fillna('') + ' ' + eat_vf_df['en_abstract_s4'].fillna('')
).str.strip()
eat_vf_df['fr_title_abstract'] = (
    eat_vf_df['fr_title_s'].fillna('') + ' ' + eat_vf_df['fr_abstract_s'].fillna('')
).str.strip()

In [None]:
# English EAT documents: one list of predicted keyphrases per row.
eat_vf_df['en_preds'] = eat_vf_df['en_title_abstract'].map(
    lambda document: extract_keywords(document, 'all-MiniLM-L6-v2')
)

In [None]:
# French EAT documents: same extraction, using the multilingual model.
eat_vf_df['fr_preds'] = eat_vf_df['fr_title_abstract'].map(
    lambda document: extract_keywords(document, 'paraphrase-multilingual-MiniLM-L12-v2')
)

In [None]:
# Flatten the per-document English predictions into a single list.
en_preds = [
    keyword
    for document_keywords in eat_vf_df['en_preds']
    for keyword in document_keywords
]

In [None]:
# Flatten the per-document French predictions into a single list.
fr_preds = [
    keyword
    for document_keywords in eat_vf_df['fr_preds']
    for keyword in document_keywords
]

In [None]:
def computeTermEvalMetrics(extracted_terms, gold_df):
    """Print and return precision / recall / F1 of extracted terms vs. a gold set.

    Parameters
    ----------
    extracted_terms : iterable of str
        Predicted terms; they are lower-cased and de-duplicated before scoring.
    gold_df : iterable of str
        Gold-standard terms; de-duplicated but otherwise used as given, so the
        caller is expected to pass them already lower-cased.

    Returns
    -------
    tuple
        ``(n_extracted, n_gold, n_true_pos, precision, recall, fscore)`` where
        the three scores are percentages rounded to two decimals. A score whose
        denominator is empty is reported as 0 instead of raising
        ``ZeroDivisionError``.
    """
    # Lower-case because the gold standard is lower case.
    extracted_terms = {item.lower() for item in extracted_terms}
    gold_set = set(gold_df)
    true_pos = extracted_terms & gold_set

    # Guard on emptiness of the sets themselves (comparing a set to 0 is always
    # "not equal", which never protected the divisions).
    raw_recall = len(true_pos) * 100 / len(gold_set) if gold_set else 0.0
    raw_precision = len(true_pos) * 100 / len(extracted_terms) if extracted_terms else 0.0
    total = raw_precision + raw_recall
    # F1 is derived from the unrounded scores so rounding error is not compounded.
    raw_fscore = 2 * raw_precision * raw_recall / total if total else 0.0

    recall = round(raw_recall, 2)
    precision = round(raw_precision, 2)
    fscore = round(raw_fscore, 2)

    print("Extracted",len(extracted_terms))
    print("Gold",len(gold_set))
    print("Intersection",len(true_pos))
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1:", fscore)

    # One-line summary convenient for pasting into a results table.
    print(str(len(extracted_terms))+ ' | ' + str(len(gold_set)) +' | ' + str(len(true_pos)) +' | ' + str(precision)+' & ' +  str(recall)+' & ' +  str(fscore))
    return len(extracted_terms), len(gold_set), len(true_pos), precision, recall, fscore

In [None]:
# EAT / English: predictions scored against the full author-keyword set.
computeTermEvalMetrics(extracted_terms=en_preds, gold_df=gold_standard_eat_en)

In [None]:
# EAT / French: predictions scored against the full author-keyword set.
computeTermEvalMetrics(extracted_terms=fr_preds, gold_df=gold_standard_eat_fr)

In [None]:
# List the SET column names; they are needed verbatim (accents and spacing
# included) to build the title + abstract columns in the next cell.
set_vf_df.columns

In [None]:
# Join title and abstract into one document per row. A space separator keeps
# the last word of the title from fusing with the first word of the abstract,
# and `fillna('')` keeps a row usable when only one of the two fields is
# present (plain `+` would turn the whole row into NaN).
set_vf_df['en_title_abstract'] = (
    set_vf_df['Titre GB'].fillna('') + ' ' + set_vf_df['Résumé GB'].fillna('')
).str.strip()
set_vf_df['fr_title_abstract'] = (
    set_vf_df['Titre FR'].fillna('') + ' ' + set_vf_df['RésuméFR'].fillna('')
).str.strip()

In [None]:
# SET corpus: one list of predicted keyphrases per row, for each language.
set_vf_df['en_preds'] = set_vf_df['en_title_abstract'].map(
    lambda document: extract_keywords(document, 'all-MiniLM-L6-v2')
)
set_vf_df['fr_preds'] = set_vf_df['fr_title_abstract'].map(
    lambda document: extract_keywords(document, 'paraphrase-multilingual-MiniLM-L12-v2')
)

In [None]:
# SET / English: flatten the per-document predictions, then score them against
# the full author-keyword set.
en_preds_ = [
    keyword
    for document_keywords in set_vf_df['en_preds']
    for keyword in document_keywords
]
computeTermEvalMetrics(extracted_terms=en_preds_, gold_df=gold_standard_set_en)

In [None]:
# SET / French: flatten the per-document predictions, then score them against
# the full author-keyword set.
fr_preds_ = [
    keyword
    for document_keywords in set_vf_df['fr_preds']
    for keyword in document_keywords
]
computeTermEvalMetrics(extracted_terms=fr_preds_, gold_df=gold_standard_set_fr)

In [None]:
def get_true_kw(df, keywords, corpus, groundtruth):
    """Add a column listing the keywords that literally occur in each row's text.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to annotate; it is modified in place.
    keywords : str
        Name of the column holding comma-separated keywords.
    corpus : str
        Name of the column holding the text to search.
    groundtruth : str
        Name of the column to create (or overwrite) with the result.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``df[groundtruth]`` holding one list per
        row. A keyword is kept when its stripped, lower-cased form is a
        substring of the lower-cased text. Kept items are returned exactly as
        split (not stripped); blank items always match, so callers should
        filter them out. Rows whose keyword or text cell is not a string
        (e.g. NaN) get an empty list.
    """
    known = []
    for keyword_cell, text_cell in zip(df[keywords], df[corpus]):
        if not isinstance(keyword_cell, str) or not isinstance(text_cell, str):
            known.append([])
            continue
        haystack = text_cell.lower()
        known.append(
            [kw for kw in keyword_cell.split(',') if kw.strip().lower() in haystack]
        )
    # Assign the whole column at once: element-wise `df[col].iloc[i] = ...` is
    # chained assignment, which warns and writes nothing under Copy-on-Write.
    df[groundtruth] = pd.Series(known, index=df.index, dtype='object')
    return df

In [None]:
# Re-check the SET column names before annotating; the frame now also carries
# the title + abstract and prediction columns added in the cells above.
set_vf_df.columns

In [None]:
# For each corpus and language, record which author keywords actually occur in
# the title + abstract text.
eat_vf_df = (
    eat_vf_df
    .pipe(get_true_kw, 'en_keyword_s', 'en_title_abstract', 'en_kw_known')
    .pipe(get_true_kw, 'fr_keyword_s', 'fr_title_abstract', 'fr_kw_known')
)
set_vf_df = (
    set_vf_df
    .pipe(get_true_kw, 'Mots clés GB', 'en_title_abstract', 'en_kw_known')
    .pipe(get_true_kw, 'Mots-clés FR', 'fr_title_abstract', 'fr_kw_known')
)

In [None]:
# EAT / English: score predictions against only those author keywords that
# occur in the text. Blank entries are filtered out entirely — `list.remove('')`
# dropped just the first one (leaving '' in the gold set) and raised ValueError
# when there was none.
eat_en_gt = [
    keyword.strip().lower()
    for known_keywords in eat_vf_df['en_kw_known']
    for keyword in known_keywords
    if keyword.strip()
]
computeTermEvalMetrics(en_preds, eat_en_gt)

In [None]:
# EAT / French: score predictions against only those author keywords that
# occur in the text. Blank entries are filtered out entirely — `list.remove('')`
# dropped just the first one (leaving '' in the gold set) and raised ValueError
# when there was none.
eat_fr_gt = [
    keyword.strip().lower()
    for known_keywords in eat_vf_df['fr_kw_known']
    for keyword in known_keywords
    if keyword.strip()
]
computeTermEvalMetrics(fr_preds, eat_fr_gt)

In [None]:
# SET / English: score predictions against only those author keywords that
# occur in the text. Blank entries are filtered out entirely — `list.remove('')`
# dropped just the first one (leaving '' in the gold set) and raised ValueError
# when there was none.
set_en_gt = [
    keyword.strip().lower()
    for known_keywords in set_vf_df['en_kw_known']
    for keyword in known_keywords
    if keyword.strip()
]
computeTermEvalMetrics(en_preds_, set_en_gt)

In [None]:
# SET / French: score predictions against only those author keywords that
# occur in the text. Blank entries are filtered out entirely — `list.remove('')`
# dropped just the first one (leaving '' in the gold set) and raised ValueError
# when there was none.
set_fr_gt = [
    keyword.strip().lower()
    for known_keywords in set_vf_df['fr_kw_known']
    for keyword in known_keywords
    if keyword.strip()
]
computeTermEvalMetrics(fr_preds_, set_fr_gt)