In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

eat_vf_df = pd.read_excel('./sample_data/ingénieriesEAT-VF-2021030.xlsx').rename(columns={'Gestion et traitement des déjections animales en Italie = ':'fr_title_s'})
set_vf_df = pd.read_excel('./sample_data/SET-VF20210211.xlsx')

def create_gold_standard(df, col):
    gold_standard = []
    for x in df[col]:
        gold_standard.extend(x.split(',')) 
    gold_standard = set([x.strip().lower() for x in gold_standard])
    return gold_standard

gold_standard_eat_en = create_gold_standard(eat_vf_df, 'en_keyword_s')
gold_standard_eat_fr = create_gold_standard(eat_vf_df, 'fr_keyword_s')
gold_standard_set_en = create_gold_standard(set_vf_df, 'Mots clés GB')
gold_standard_set_fr = create_gold_standard(set_vf_df, 'Mots-clés FR')
gold_standard_eat_en.remove('')
gold_standard_eat_fr.remove('')

def create_gold_standard_(df, col):
    gold_standard = []
    for x in df[col]:
        gold_standard.extend(x) 
    gold_standard = set([x.strip().lower() for x in gold_standard])
    return gold_standard

def get_true_kw(df, keywords, corpus, groundtruth):
    df[groundtruth] = pd.Series(dtype='object')
    for i in range(len(df)):
        df[groundtruth].iloc[i] = [x for x in df[keywords].iloc[i].split(',') if x.strip().lower() in df[corpus].iloc[i].lower()]
    return df

eat_vf_df['en_title_abstract'] = eat_vf_df['en_title_s'] + eat_vf_df['en_abstract_s4']
eat_vf_df['fr_title_abstract'] = eat_vf_df['fr_title_s'] + eat_vf_df['fr_abstract_s']
set_vf_df['en_title_abstract'] = set_vf_df['Titre GB'] + set_vf_df['Résumé GB']
set_vf_df['fr_title_abstract'] = set_vf_df['Titre FR'] + set_vf_df['RésuméFR']

eat_vf_df = get_true_kw(eat_vf_df, 'en_keyword_s', 'en_title_abstract', 'en_kw_known')
eat_vf_df = get_true_kw(eat_vf_df, 'fr_keyword_s', 'fr_title_abstract', 'fr_kw_known')
set_vf_df = get_true_kw(set_vf_df,  'Mots clés GB', 'en_title_abstract', 'en_kw_known')
set_vf_df = get_true_kw(set_vf_df, 'Mots-clés FR', 'fr_title_abstract', 'fr_kw_known')


gold_standard_eat_en_ = create_gold_standard_(eat_vf_df, 'en_kw_known')
gold_standard_eat_fr_ = create_gold_standard_(eat_vf_df, 'fr_kw_known')
gold_standard_set_en_ = create_gold_standard_(set_vf_df, 'en_kw_known')
gold_standard_set_fr_ = create_gold_standard_(set_vf_df, 'fr_kw_known')
gold_standard_eat_en_.remove('')
gold_standard_eat_fr_.remove('')

def computeTermEvalMetrics(extracted_terms, gold_df):
    #make lower case cause gold standard is lower case
    extracted_terms = set([item.lower() for item in extracted_terms])
    gold_set=set(gold_df)
    true_pos=extracted_terms.intersection(gold_set)
    recall=round(len(true_pos)*100/len(gold_set),2) if gold_set != 0 else 0
    precision=round(len(true_pos)*100/len(extracted_terms),2) if extracted_terms != 0 else 0
    fscore = round(2*(precision*recall)/(precision+recall),2) if precision + recall != 0 else 0
    print(str(len(extracted_terms))+ ' , ' + str(len(gold_set)) +' , ' + str(len(true_pos)) +' , ' + str(precision)+' , ' +  str(recall)+' , ' +  str(fscore))
    return len(extracted_terms), len(gold_set), len(true_pos), precision, recall, fscore

In [2]:
set_vf_df.columns

Index(['Titre FR', 'Titre GB', 'RésuméFR', 'Résumé GB', 'Mots-clés FR',
       'Mots clés GB', 'en_title_abstract', 'fr_title_abstract', 'en_kw_known',
       'fr_kw_known'],
      dtype='object')

In [3]:
en_eat = eat_vf_df[['en_title_s','en_abstract_s4','en_keyword_s']].rename(columns={'en_title_s':'title',
                                                                                  'en_abstract_s4':'abstract',
                                                                                   'en_keyword_s':'keyword'})
fr_eat = eat_vf_df[['fr_title_s','fr_abstract_s','fr_keyword_s']].rename(columns={'fr_title_s':'title',
                                                                                  'fr_abstract_s':'abstract',
                                                                                  'fr_keyword_s':'keyword'})

en_set = set_vf_df[['Titre GB','Résumé GB', 'Mots clés GB']].rename(columns={'Titre GB':'title',
                                                                             'Résumé GB':'abstract',
                                                                             'Mots clés GB':'keyword'})
fr_set = set_vf_df[['Titre FR', 'RésuméFR', 'Mots-clés FR']].rename(columns={'Titre FR':'title', 
                                                                            'RésuméFR':'abstract',
                                                                             'Mots-clés FR':'keyword'})

In [4]:
en_eat['keyword'] = [x.lower().replace(',',';') for x in en_eat['keyword']]
fr_eat['keyword'] = [x.lower().replace(',',';') for x in fr_eat['keyword'] ]
en_set['keyword'] = [x.lower().replace(',',';') for x in en_set['keyword'] ]
fr_set['keyword'] = [x.lower().replace(',',';') for x in fr_set['keyword'] ]

In [8]:
en_eat.to_json('./tntKID_test_data/en_eat.json', orient='records', lines=True)
fr_eat.to_json('./tntKID_test_data/fr_eat.json', orient='records', lines=True)
en_set.to_json('./tntKID_test_data/en_set.json', orient='records', lines=True)
fr_set.to_json('./tntKID_test_data/fr_set.json', orient='records', lines=True)

In [9]:
pwd

'/home/tranthh/termirad/termiraid_ate'

In [14]:
en_eat_preds = pd.read_csv('./tntKid_preds/en_eat_preds.csv', delimiter='\t')
fr_eat_preds = pd.read_csv('./tntKid_preds/fr_eat_preds.csv', delimiter='\t')
en_set_preds = pd.read_csv('./tntKid_preds/en_set_preds.csv', delimiter='\t')
fr_set_preds = pd.read_csv('./tntKid_preds/fr_set_preds.csv', delimiter='\t')

In [19]:
def pred_list(df):
    preds = []
    for x in [str(x).split(';') for x in df.keywords]:
        preds.extend(x)
    return preds

In [21]:
computeTermEvalMetrics(pred_list(en_eat_preds), gold_standard_eat_en)
computeTermEvalMetrics(pred_list(fr_eat_preds), gold_standard_eat_fr)
computeTermEvalMetrics(pred_list(en_set_preds), gold_standard_set_en)
computeTermEvalMetrics(pred_list(fr_set_preds), gold_standard_set_fr)

2276 , 1361 , 339 , 14.89 , 24.91 , 18.64
945 , 1339 , 64 , 6.77 , 4.78 , 5.6
1076 , 650 , 139 , 12.92 , 21.38 , 16.11
712 , 638 , 35 , 4.92 , 5.49 , 5.19


(712, 638, 35, 4.92, 5.49, 5.19)

In [22]:
computeTermEvalMetrics(pred_list(en_eat_preds), gold_standard_eat_en_)
computeTermEvalMetrics(pred_list(fr_eat_preds), gold_standard_eat_fr_)
computeTermEvalMetrics(pred_list(en_set_preds), gold_standard_set_en_)
computeTermEvalMetrics(pred_list(fr_set_preds), gold_standard_set_fr_)

2276 , 595 , 282 , 12.39 , 47.39 , 19.64
945 , 565 , 57 , 6.03 , 10.09 , 7.55
1076 , 314 , 124 , 11.52 , 39.49 , 17.84
712 , 277 , 30 , 4.21 , 10.83 , 6.06


(712, 277, 30, 4.21, 10.83, 6.06)

In [23]:
en_eat_preds_kptimes = pd.read_csv('./tntKid_preds/en_eat_preds_kptimes.csv', delimiter='\t')
fr_eat_preds_kptimes = pd.read_csv('./tntKid_preds/fr_eat_preds_kptimes.csv', delimiter='\t')
en_set_preds_kptimes = pd.read_csv('./tntKid_preds/en_set_preds_kptimes.csv', delimiter='\t')
fr_set_preds_kptimes = pd.read_csv('./tntKid_preds/fr_set_preds_kptimes.csv', delimiter='\t')

In [24]:
computeTermEvalMetrics(pred_list(en_eat_preds_kptimes), gold_standard_eat_en)
computeTermEvalMetrics(pred_list(fr_eat_preds_kptimes), gold_standard_eat_fr)
computeTermEvalMetrics(pred_list(en_set_preds_kptimes), gold_standard_set_en)
computeTermEvalMetrics(pred_list(fr_set_preds_kptimes), gold_standard_set_fr)

computeTermEvalMetrics(pred_list(en_eat_preds_kptimes), gold_standard_eat_en_)
computeTermEvalMetrics(pred_list(fr_eat_preds_kptimes), gold_standard_eat_fr_)
computeTermEvalMetrics(pred_list(en_set_preds_kptimes), gold_standard_set_en_)
computeTermEvalMetrics(pred_list(fr_set_preds_kptimes), gold_standard_set_fr_)

450 , 1361 , 96 , 21.33 , 7.05 , 10.6
104 , 1339 , 12 , 11.54 , 0.9 , 1.67
260 , 650 , 42 , 16.15 , 6.46 , 9.23
80 , 638 , 4 , 5.0 , 0.63 , 1.12
450 , 595 , 86 , 19.11 , 14.45 , 16.46
104 , 565 , 11 , 10.58 , 1.95 , 3.29
260 , 314 , 37 , 14.23 , 11.78 , 12.89
80 , 277 , 4 , 5.0 , 1.44 , 2.24


(80, 277, 4, 5.0, 1.44, 2.24)

In [1]:
import pandas as pd
test = pd.read_csv('/home/tranthh/termirad/termiraid_ate/tntKid_preds/en_eat_preds_kptimes.csv', delimiter='\t')
test.head(20)

Unnamed: 0.1,Unnamed: 0,text,keywords
0,0,Development of the Rhône and reserved flow. Op...,
1,1,"Amenity: quality in the social relationships, ...",political
2,2,"Amenities, non-market functions and forest man...",
3,3,Relationships with other abiotic and biotic ec...,algae;water
4,4,Linking between different scales of space and ...,environment;space
5,5,Nonmarket benefits from Brest natural harbour ...,water;france;brest natural;agricultural
6,6,Assessment of the workings of a pig slurry tre...,pig
7,7,Environmental assessement of landfill GHG emis...,evolution;greenhouse gas emissions;landfills
8,8,Natural reforestation and growth of brushwood ...,natural;agricultural
9,9,"Design, implementation and operation of sand f...",filters;france
