In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

eat_vf_df = pd.read_excel('./sample_data/ingénieriesEAT-VF-2021030.xlsx').rename(columns={'Gestion et traitement des déjections animales en Italie = ':'fr_title_s'})
set_vf_df = pd.read_excel('./sample_data/SET-VF20210211.xlsx')

def create_gold_standard(df, col):
    gold_standard = []
    for x in df[col]:
        gold_standard.extend(x.split(',')) 
    gold_standard = set([x.strip().lower() for x in gold_standard])
    return gold_standard

gold_standard_eat_en = create_gold_standard(eat_vf_df, 'en_keyword_s')
gold_standard_eat_fr = create_gold_standard(eat_vf_df, 'fr_keyword_s')
gold_standard_set_en = create_gold_standard(set_vf_df, 'Mots clés GB')
gold_standard_set_fr = create_gold_standard(set_vf_df, 'Mots-clés FR')
gold_standard_eat_en.remove('')
gold_standard_eat_fr.remove('')

def create_gold_standard_(df, col):
    gold_standard = []
    for x in df[col]:
        gold_standard.extend(x) 
    gold_standard = set([x.strip().lower() for x in gold_standard])
    return gold_standard

def get_true_kw(df, keywords, corpus, groundtruth):
    df[groundtruth] = pd.Series(dtype='object')
    for i in range(len(df)):
        df[groundtruth].iloc[i] = [x for x in df[keywords].iloc[i].split(',') if x.strip().lower() in df[corpus].iloc[i].lower()]
    return df

eat_vf_df['en_title_abstract'] = eat_vf_df['en_title_s'] + eat_vf_df['en_abstract_s4']
eat_vf_df['fr_title_abstract'] = eat_vf_df['fr_title_s'] + eat_vf_df['fr_abstract_s']
set_vf_df['en_title_abstract'] = set_vf_df['Titre GB'] + set_vf_df['Résumé GB']
set_vf_df['fr_title_abstract'] = set_vf_df['Titre FR'] + set_vf_df['RésuméFR']

eat_vf_df = get_true_kw(eat_vf_df, 'en_keyword_s', 'en_title_abstract', 'en_kw_known')
eat_vf_df = get_true_kw(eat_vf_df, 'fr_keyword_s', 'fr_title_abstract', 'fr_kw_known')
set_vf_df = get_true_kw(set_vf_df,  'Mots clés GB', 'en_title_abstract', 'en_kw_known')
set_vf_df = get_true_kw(set_vf_df, 'Mots-clés FR', 'fr_title_abstract', 'fr_kw_known')


gold_standard_eat_en_ = create_gold_standard_(eat_vf_df, 'en_kw_known')
gold_standard_eat_fr_ = create_gold_standard_(eat_vf_df, 'fr_kw_known')
gold_standard_set_en_ = create_gold_standard_(set_vf_df, 'en_kw_known')
gold_standard_set_fr_ = create_gold_standard_(set_vf_df, 'fr_kw_known')
gold_standard_eat_en_.remove('')
gold_standard_eat_fr_.remove('')

In [2]:
def computeTermEvalMetrics(extracted_terms, gold_df):
    #make lower case cause gold standard is lower case
    extracted_terms = set([item.lower() for item in extracted_terms])
    gold_set=set(gold_df)
    true_pos=extracted_terms.intersection(gold_set)
    recall=round(len(true_pos)*100/len(gold_set),2) if gold_set != 0 else 0
    precision=round(len(true_pos)*100/len(extracted_terms),2) if extracted_terms != 0 else 0
    fscore = round(2*(precision*recall)/(precision+recall),2) if precision + recall != 0 else 0
    print(str(len(extracted_terms))+ ' , ' + str(len(gold_set)) +' , ' + str(len(true_pos)) +' , ' + str(precision)+' , ' +  str(recall)+' , ' +  str(fscore))
    return len(extracted_terms), len(gold_set), len(true_pos), precision, recall, fscore

In [3]:
from rakun2 import RakunKeyphraseDetector

keyword_detector = RakunKeyphraseDetector({"num_keywords": 5,
                                           "merge_threshold": 1.1,
                                           "alpha": 0.3,
                                           "token_prune_len": 3})


In [4]:
def extract_keywords(text, model):
    return [k[0] for k in model.find_keywords(text, input_type="string")]

In [5]:
eat_vf_df['en_preds'] = [extract_keywords(x, keyword_detector) for x in eat_vf_df['en_title_abstract']]
eat_vf_df['fr_preds'] = [extract_keywords(x, keyword_detector) for x in eat_vf_df['fr_title_abstract']]

set_vf_df['en_preds'] = [extract_keywords(x, keyword_detector) for x in set_vf_df['en_title_abstract']]
set_vf_df['fr_preds'] = [extract_keywords(x, keyword_detector) for x in set_vf_df['fr_title_abstract']]

In [7]:
eat_en = []
for li in eat_vf_df['en_preds']:
    eat_en.extend(li)
eat_en = [x.strip().lower() for x in eat_en]  
computeTermEvalMetrics(eat_en, gold_standard_eat_en)
computeTermEvalMetrics(eat_en, gold_standard_eat_en_)

3051 , 1361 , 154 , 5.05 , 11.32 , 6.98
3051 , 595 , 117 , 3.83 , 19.66 , 6.41


(3051, 595, 117, 3.83, 19.66, 6.41)

In [8]:
eat_fr = []
for li in eat_vf_df['fr_preds']:
    eat_fr.extend(li)
eat_fr = [x.strip().lower() for x in eat_fr]  
computeTermEvalMetrics(eat_fr, gold_standard_eat_fr)
computeTermEvalMetrics(eat_fr, gold_standard_eat_fr_)

2935 , 1339 , 146 , 4.97 , 10.9 , 6.83
2935 , 565 , 119 , 4.05 , 21.06 , 6.79


(2935, 565, 119, 4.05, 21.06, 6.79)

In [9]:
set_en = []
for li in set_vf_df['en_preds']:
    set_en.extend(li)
set_en = [x.strip().lower() for x in set_en]  
computeTermEvalMetrics(set_en, gold_standard_set_en)
computeTermEvalMetrics(set_en, gold_standard_set_en_)

1258 , 650 , 68 , 5.41 , 10.46 , 7.13
1258 , 314 , 57 , 4.53 , 18.15 , 7.25


(1258, 314, 57, 4.53, 18.15, 7.25)

In [10]:
set_fr = []
for li in set_vf_df['fr_preds']:
    set_fr.extend(li)
set_fr = [x.strip().lower() for x in set_fr]  
computeTermEvalMetrics(set_fr, gold_standard_set_fr)
computeTermEvalMetrics(set_fr, gold_standard_set_fr_)

1259 , 638 , 56 , 4.45 , 8.78 , 5.91
1259 , 277 , 51 , 4.05 , 18.41 , 6.64


(1259, 277, 51, 4.05, 18.41, 6.64)