# Sanity check

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
eat_vf_df = pd.read_excel('./sample_data/ingénieriesEAT-VF-2021030.xlsx').rename(columns={'Gestion et traitement des déjections animales en Italie = ':'fr_title_s'})
set_vf_df = pd.read_excel('./sample_data/SET-VF20210211.xlsx')

In [3]:
eat_vf_df.info()
eat_vf_df.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 696 entries, 0 to 695
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   docType_s       696 non-null    object
 1   halId_s         696 non-null    object
 2   fr_title_s      696 non-null    object
 3   en_title_s      696 non-null    object
 4   fr_abstract_s   696 non-null    object
 5   en_abstract_s4  696 non-null    object
 6   fr_keyword_s    696 non-null    object
 7   en_keyword_s    696 non-null    object
 8   producedDate_s  696 non-null    object
 9   volume_s        9 non-null      object
 10  issue_s         688 non-null    object
 11  uri_s           696 non-null    object
dtypes: object(12)
memory usage: 65.4+ KB


Unnamed: 0,docType_s,halId_s,fr_title_s,en_title_s,fr_abstract_s,en_abstract_s4,fr_keyword_s,en_keyword_s,producedDate_s,volume_s,issue_s,uri_s
0,ART,hal-02582981,Aménagement du Rhône et débit réservé,Development of the Rhône and reserved flow,Les opérations d'augmentation de débits réserv...,Operations to increase the reserve flows and t...,"Aménagement de cours d'eau, débit réservé, éne...","River management, instream flow, hydroelectric...",2004.0,,38 supplément,https://hal.inrae.fr/hal-02582981
1,ART,hal-02580960,"Aménité : qualité des relations sociales, qual...","Amenity: quality in the social relationships, ...","Dans un contexte où le mot ""aménités"" circule ...",In a context where the word 'amenities' is use...,"Aménité, sciences humaines et sociales, dévelo...","Amenity, human and social sciences, rural deve...",2002.0,,spécial Aménités rurales : une nouvelle lectur...,https://hal.inrae.fr/hal-02580960


In [4]:
set_vf_df.info()
set_vf_df.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 287 entries, 0 to 286
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Titre FR      287 non-null    object
 1   Titre GB      287 non-null    object
 2   RésuméFR      287 non-null    object
 3   Résumé GB     287 non-null    object
 4   Mots-clés FR  287 non-null    object
 5   Mots clés GB  287 non-null    object
dtypes: object(6)
memory usage: 13.6+ KB


Unnamed: 0,Titre FR,Titre GB,RésuméFR,Résumé GB,Mots-clés FR,Mots clés GB
0,Une approche innovante de modélisation du risq...,An innovative approach to modelling forest fir...,Une méthode de cartographie des interfaces hab...,A method to characterize and to map wildland-u...,"Interface habitat-forêt, risque d’incendie, dé...","Wildland-urban interface, fire risk, fire star..."
1,Mise en œuvre de deux mesures agrienvironnemen...,Implementation of two result-based agrienviron...,Afin de répondre aux exigences de la société e...,In order to meet society's demands for a more ...,"Mesure agroenvironnementale, prairie fleurie, ...","Agrienvironmental measure, flowering meadow, p..."


In [5]:
def create_gold_standard(df, col):
    gold_standard = []
    for x in df[col]:
        gold_standard.extend(x.split(',')) 
    gold_standard = set([x.strip().lower() for x in gold_standard])
    return gold_standard

def create_gold_standard_(df, col):
    gold_standard = []
    for x in df[col]:
        gold_standard.extend(x) 
    gold_standard = set([x.strip().lower() for x in gold_standard])
    return gold_standard

In [6]:
gold_standard_eat_en = create_gold_standard(eat_vf_df, 'en_keyword_s')
gold_standard_eat_fr = create_gold_standard(eat_vf_df, 'fr_keyword_s')
gold_standard_set_en = create_gold_standard(set_vf_df, 'Mots clés GB')
gold_standard_set_fr = create_gold_standard(set_vf_df, 'Mots-clés FR')
gold_standard_eat_en.remove('')
gold_standard_eat_fr.remove('')

In [7]:
def get_true_kw(df, keywords, corpus, groundtruth):
    df[groundtruth] = pd.Series(dtype='object')
    for i in range(len(df)):
        df[groundtruth].iloc[i] = [x for x in df[keywords].iloc[i].split(',') if x.strip().lower() in df[corpus].iloc[i].lower()]
    return df

eat_vf_df['en_title_abstract'] = eat_vf_df['en_title_s'] + eat_vf_df['en_abstract_s4']
eat_vf_df['fr_title_abstract'] = eat_vf_df['fr_title_s'] + eat_vf_df['fr_abstract_s']
set_vf_df['en_title_abstract'] = set_vf_df['Titre GB'] + set_vf_df['Résumé GB']
set_vf_df['fr_title_abstract'] = set_vf_df['Titre FR'] + set_vf_df['RésuméFR']

eat_vf_df = get_true_kw(eat_vf_df, 'en_keyword_s', 'en_title_abstract', 'en_kw_known')
eat_vf_df = get_true_kw(eat_vf_df, 'fr_keyword_s', 'fr_title_abstract', 'fr_kw_known')
set_vf_df = get_true_kw(set_vf_df,  'Mots clés GB', 'en_title_abstract', 'en_kw_known')
set_vf_df = get_true_kw(set_vf_df, 'Mots-clés FR', 'fr_title_abstract', 'fr_kw_known')

In [8]:
gold_standard_eat_en_ = create_gold_standard_(eat_vf_df, 'en_kw_known')
gold_standard_eat_fr_ = create_gold_standard_(eat_vf_df, 'fr_kw_known')
gold_standard_set_en_ = create_gold_standard_(set_vf_df, 'en_kw_known')
gold_standard_set_fr_ = create_gold_standard_(set_vf_df, 'fr_kw_known')
gold_standard_eat_en_.remove('')
gold_standard_eat_fr_.remove('')

# Inference

In [9]:
# import os
# import json
# import pickle
# import argparse
# import pandas as pd
# from transformers import AutoTokenizer, AutoModelForTokenClassification   
# from transformers import Trainer, TrainingArguments
# from transformers import EarlyStoppingCallback

# from utils import *
# import warnings
# warnings.filterwarnings("ignore", category=FutureWarning)

# def compute_metrics(p):
#     predictions, labels = p
#     predictions = np.argmax(predictions, axis=2)

#     # Remove ignored index (special tokens)
#     true_predictions = [
#         [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
#         for prediction, label in zip(predictions, labels)
#     ]

#     extracted_terms = extract_terms(true_predictions, val)
#     extracted_terms = set([item.lower() for item in extracted_terms])
#     gold_set=gold_validation     


#     true_pos=extracted_terms.intersection(gold_set)
    
#     recall=len(true_pos)/len(gold_set) if len(gold_set) != 0 else 0
#     precision=len(true_pos)/len(extracted_terms) if len(extracted_terms) != 0 else 0
#     f1 = 2*(precision*recall)/(precision+recall) if (precision+recall) != 0 else 0

#     return {
#         "precision": precision,
#         "recall": recall,
#         "f1": f1,
#     }

# label_list=["O", "B", "I"]
# label_to_id = {l: i for i, l in enumerate(label_list)}
# tokenizer = AutoTokenizer.from_pretrained('./pretrained_models/model_en_roberta/')
# model = AutoModelForTokenClassification.from_pretrained('./pretrained_models/model_en_roberta/', num_labels=len(label_list))

# test_args = TrainingArguments(
#     output_dir= './',          # output directory
#     do_train = False,
#     do_predict = True,
#     per_device_eval_batch_size=32,   # batch size for evaluation  
#     dataloader_drop_last = False   
# )

# # init trainer
# trainer = Trainer(
#               model = model, 
#               args = test_args, 
#               compute_metrics = compute_metrics)

In [10]:
# texts = [x.split() for x in eat_vf_df['en_title_abstract']]
# tags =  [['O']*len(x.split()) for x in eat_vf_df['en_title_abstract']]
# val = texts
# gold_validation = 'O'
# input_and_labels = tokenize_and_align_labels(texts, tags, tokenizer, label_to_id)
# dataset = OurDataset(input_and_labels, input_and_labels["labels"])
# predictions, labels, metrics = trainer.predict(dataset)
# predictions1 = np.argmax(predictions, axis=2)
# true_predictions = [[label_list[p] for (p, l) in zip(prediction1, label) if l != -100]
#                     for prediction1, label in zip(predictions1, labels)]
# extracted_terms = extract_terms(true_predictions, texts)
# computeTermEvalMetrics(extracted_terms, set(gold_standard_eat_en))
# computeTermEvalMetrics(extracted_terms, set(gold_standard_eat_en_))

In [11]:
# texts = [x.split() for x in set_vf_df['en_title_abstract']]
# tags =  [['O']*len(x.split()) for x in set_vf_df['en_title_abstract']]
# val = texts
# gold_validation = 'O'
# input_and_labels = tokenize_and_align_labels(texts, tags, tokenizer, label_to_id)
# dataset = OurDataset(input_and_labels, input_and_labels["labels"])
# predictions, labels, metrics = trainer.predict(dataset)
# predictions1 = np.argmax(predictions, axis=2)
# true_predictions = [[label_list[p] for (p, l) in zip(prediction1, label) if l != -100]
#                     for prediction1, label in zip(predictions1, labels)]
# extracted_terms = extract_terms(true_predictions, texts)
# computeTermEvalMetrics(extracted_terms, set(gold_standard_set_en))
# computeTermEvalMetrics(extracted_terms, set(gold_standard_set_en_))

In [12]:
# texts = [x.split() for x in eat_vf_df['en_abstract_s4']]
# tags =  [['O']*len(x.split()) for x in eat_vf_df['en_abstract_s4']]
# val = texts
# gold_validation = 'O'
# input_and_labels = tokenize_and_align_labels(texts, tags, tokenizer, label_to_id)
# dataset = OurDataset(input_and_labels, input_and_labels["labels"])
# predictions, labels, metrics = trainer.predict(dataset)
# predictions1 = np.argmax(predictions, axis=2)
# true_predictions = [[label_list[p] for (p, l) in zip(prediction1, label) if l != -100]
#                     for prediction1, label in zip(predictions1, labels)]
# extracted_terms = extract_terms(true_predictions, texts)
# computeTermEvalMetrics(extracted_terms, set(gold_standard_eat_en))

In [13]:
# texts = [x.split() for x in eat_vf_df['en_title_s']]
# tags =  [['O']*len(x.split()) for x in eat_vf_df['en_title_s']]
# val = texts
# gold_validation = 'O'
# input_and_labels = tokenize_and_align_labels(texts, tags, tokenizer, label_to_id)
# dataset = OurDataset(input_and_labels, input_and_labels["labels"])
# predictions, labels, metrics = trainer.predict(dataset)
# predictions1 = np.argmax(predictions, axis=2)
# true_predictions = [[label_list[p] for (p, l) in zip(prediction1, label) if l != -100]
#                     for prediction1, label in zip(predictions1, labels)]
# extracted_terms = extract_terms(true_predictions, texts)
# computeTermEvalMetrics(extracted_terms, set(gold_standard_eat_en))

In [14]:
# texts = [x.split() for x in set_vf_df['Titre GB']]
# tags =  [['O']*len(x.split()) for x in set_vf_df['Titre GB']]
# val = texts
# gold_validation = 'O'
# input_and_labels = tokenize_and_align_labels(texts, tags, tokenizer, label_to_id)
# dataset = OurDataset(input_and_labels, input_and_labels["labels"])
# predictions, labels, metrics = trainer.predict(dataset)
# predictions1 = np.argmax(predictions, axis=2)
# true_predictions = [[label_list[p] for (p, l) in zip(prediction1, label) if l != -100]
#                     for prediction1, label in zip(predictions1, labels)]
# extracted_terms = extract_terms(true_predictions, texts)
# computeTermEvalMetrics(extracted_terms, set(gold_standard_set_en))

In [15]:
# texts = [x.split() for x in set_vf_df['Résumé GB']]
# tags =  [['O']*len(x.split()) for x in set_vf_df['Résumé GB']]
# val = texts
# gold_validation = 'O'
# input_and_labels = tokenize_and_align_labels(texts, tags, tokenizer, label_to_id)
# dataset = OurDataset(input_and_labels, input_and_labels["labels"])
# predictions, labels, metrics = trainer.predict(dataset)
# predictions1 = np.argmax(predictions, axis=2)
# true_predictions = [[label_list[p] for (p, l) in zip(prediction1, label) if l != -100]
#                     for prediction1, label in zip(predictions1, labels)]
# extracted_terms = extract_terms(true_predictions, texts)
# computeTermEvalMetrics(extracted_terms, set(gold_standard_set_en))

# FR

In [16]:
import os
import json
import pickle
import argparse
import pandas as pd
from transformers import AutoTokenizer, AutoModelForTokenClassification   
from transformers import Trainer, TrainingArguments
from transformers import EarlyStoppingCallback

from utils import *
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    extracted_terms = extract_terms(true_predictions, val)
    extracted_terms = set([item.lower() for item in extracted_terms])
    gold_set=gold_validation     


    true_pos=extracted_terms.intersection(gold_set)
    
    recall=len(true_pos)/len(gold_set) if len(gold_set) != 0 else 0
    precision=len(true_pos)/len(extracted_terms) if len(extracted_terms) != 0 else 0
    f1 = 2*(precision*recall)/(precision+recall) if (precision+recall) != 0 else 0

    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

label_list=["O", "B", "I"]
label_to_id = {l: i for i, l in enumerate(label_list)}
tokenizer = AutoTokenizer.from_pretrained('./pretrained_models/model_fr_camembert/')
model = AutoModelForTokenClassification.from_pretrained('./pretrained_models/model_fr_camembert/', num_labels=len(label_list))

test_args = TrainingArguments(
    output_dir= './',          # output directory
    do_train = False,
    do_predict = True,
    per_device_eval_batch_size=32,   # batch size for evaluation  
    dataloader_drop_last = False   
)

# init trainer
trainer = Trainer(
              model = model, 
              args = test_args, 
              compute_metrics = compute_metrics)

  from .autonotebook import tqdm as notebook_tqdm


In [17]:
texts = [x.split() for x in eat_vf_df['fr_title_abstract']]
tags =  [['O']*len(x.split()) for x in eat_vf_df['fr_title_abstract']]
val = texts
gold_validation = 'O'
input_and_labels = tokenize_and_align_labels(texts, tags, tokenizer, label_to_id)
dataset = OurDataset(input_and_labels, input_and_labels["labels"])
predictions, labels, metrics = trainer.predict(dataset)
predictions1 = np.argmax(predictions, axis=2)
true_predictions = [[label_list[p] for (p, l) in zip(prediction1, label) if l != -100]
                    for prediction1, label in zip(predictions1, labels)]
extracted_terms = extract_terms(true_predictions, texts)
computeTermEvalMetrics(extracted_terms, set(gold_standard_eat_fr))
computeTermEvalMetrics(extracted_terms, set(gold_standard_eat_fr_))

***** Running Prediction *****
  Num examples = 696
  Batch size = 32


Extracted 2791
Gold 1339
Intersection 233
Precision: 8.35
Recall: 17.4
F1: 11.28
2791 | 1339 | 233 | 8.35 & 17.4 & 11.28
Extracted 2791
Gold 565
Intersection 202
Precision: 7.24
Recall: 35.75
F1: 12.04
2791 | 565 | 202 | 7.24 & 35.75 & 12.04


(2791, 565, 202, 7.24, 35.75, 12.04)

In [18]:
texts = [x.split() for x in set_vf_df['fr_title_abstract']]
tags =  [['O']*len(x.split()) for x in set_vf_df['fr_title_abstract']]
val = texts
gold_validation = 'O'
input_and_labels = tokenize_and_align_labels(texts, tags, tokenizer, label_to_id)
dataset = OurDataset(input_and_labels, input_and_labels["labels"])
predictions, labels, metrics = trainer.predict(dataset)
predictions1 = np.argmax(predictions, axis=2)
true_predictions = [[label_list[p] for (p, l) in zip(prediction1, label) if l != -100]
                    for prediction1, label in zip(predictions1, labels)]
extracted_terms = extract_terms(true_predictions, texts)
computeTermEvalMetrics(extracted_terms, set(gold_standard_set_fr))
computeTermEvalMetrics(extracted_terms, set(gold_standard_set_fr_))

***** Running Prediction *****
  Num examples = 287
  Batch size = 32


Extracted 1540
Gold 638
Intersection 112
Precision: 7.27
Recall: 17.55
F1: 10.28
1540 | 638 | 112 | 7.27 & 17.55 & 10.28
Extracted 1540
Gold 277
Intersection 106
Precision: 6.88
Recall: 38.27
F1: 11.66
1540 | 277 | 106 | 6.88 & 38.27 & 11.66


(1540, 277, 106, 6.88, 38.27, 11.66)

In [19]:
# texts = [x.split() for x in eat_vf_df['fr_abstract_s']]
# tags =  [['O']*len(x.split()) for x in eat_vf_df['fr_abstract_s']]
# val = texts
# gold_validation = 'O'
# input_and_labels = tokenize_and_align_labels(texts, tags, tokenizer, label_to_id)
# dataset = OurDataset(input_and_labels, input_and_labels["labels"])
# predictions, labels, metrics = trainer.predict(dataset)
# predictions1 = np.argmax(predictions, axis=2)
# true_predictions = [[label_list[p] for (p, l) in zip(prediction1, label) if l != -100]
#                     for prediction1, label in zip(predictions1, labels)]
# extracted_terms = extract_terms(true_predictions, texts)
# computeTermEvalMetrics(extracted_terms, set(gold_standard_eat_fr))

In [20]:
# texts = [x.split() for x in eat_vf_df['fr_title_s']]
# tags =  [['O']*len(x.split()) for x in eat_vf_df['fr_title_s']]
# val = texts
# gold_validation = 'O'
# input_and_labels = tokenize_and_align_labels(texts, tags, tokenizer, label_to_id)
# dataset = OurDataset(input_and_labels, input_and_labels["labels"])
# predictions, labels, metrics = trainer.predict(dataset)
# predictions1 = np.argmax(predictions, axis=2)
# true_predictions = [[label_list[p] for (p, l) in zip(prediction1, label) if l != -100]
#                     for prediction1, label in zip(predictions1, labels)]
# extracted_terms = extract_terms(true_predictions, texts)
# computeTermEvalMetrics(extracted_terms, set(gold_standard_eat_fr))

In [21]:
# texts = [x.split() for x in set_vf_df['Titre FR']]
# tags =  [['O']*len(x.split()) for x in set_vf_df['Titre FR']]
# val = texts
# gold_validation = 'O'
# input_and_labels = tokenize_and_align_labels(texts, tags, tokenizer, label_to_id)
# dataset = OurDataset(input_and_labels, input_and_labels["labels"])
# predictions, labels, metrics = trainer.predict(dataset)
# predictions1 = np.argmax(predictions, axis=2)
# true_predictions = [[label_list[p] for (p, l) in zip(prediction1, label) if l != -100]
#                     for prediction1, label in zip(predictions1, labels)]
# extracted_terms = extract_terms(true_predictions, texts)
# computeTermEvalMetrics(extracted_terms, set(gold_standard_set_fr))

In [22]:
# texts = [x.split() for x in set_vf_df['RésuméFR']]
# tags =  [['O']*len(x.split()) for x in set_vf_df['RésuméFR']]
# val = texts
# gold_validation = 'O'
# input_and_labels = tokenize_and_align_labels(texts, tags, tokenizer, label_to_id)
# dataset = OurDataset(input_and_labels, input_and_labels["labels"])
# predictions, labels, metrics = trainer.predict(dataset)
# predictions1 = np.argmax(predictions, axis=2)
# true_predictions = [[label_list[p] for (p, l) in zip(prediction1, label) if l != -100]
#                     for prediction1, label in zip(predictions1, labels)]
# extracted_terms = extract_terms(true_predictions, texts)
# computeTermEvalMetrics(extracted_terms, set(gold_standard_set_fr))