In [1]:
import timeit
import numpy as np
import pandas as pd
from transformers import TokenClassificationPipeline, AutoModelForTokenClassification,Text2TextGenerationPipeline, AutoModelForSeq2SeqLM, AutoTokenizer
from transformers.pipelines import AggregationStrategy
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the two raw corpora. In the EAT sheet, the column labelled
# 'Gestion et traitement des déjections animales en Italie = ' is renamed to
# 'fr_title_s', the name preprocess() reads for the French title.
eat_df = (
    pd.read_excel('./termirad_data/ingénieriesEAT-VF-2021030.xlsx')
    .rename(
        columns={
            'Gestion et traitement des déjections animales en Italie = ': 'fr_title_s',
        }
    )
)
set_df = pd.read_excel('./termirad_data/SET-VF20210211.xlsx')
# eat_df.head(2), set_df.head(5)

In [3]:
def preprocess(df, eat = True):
    """Build the modelling frame (concatenated texts + gold keywords) for one corpus.

    For each language the title and the abstract are joined with a single
    space into ``en_texts`` / ``fr_texts``; the gold keyword columns are
    renamed to ``fr_keywords`` / ``en_keywords``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw sheet. With ``eat=True`` the columns ``uri_s``, ``en_title_s``,
        ``en_abstract_s4``, ``fr_title_s``, ``fr_abstract_s``,
        ``fr_keyword_s`` and ``en_keyword_s`` are read. Otherwise
        ``Titre GB``, ``Résumé GB``, ``Titre FR``, ``RésuméFR``,
        ``Mots-clés FR`` and ``Mots clés GB`` are read.
    eat : bool, default True
        Selects the EAT column layout (truthy) or the SET layout (falsy).

    Returns
    -------
    pandas.DataFrame
        A new frame with columns ``[uri_s,] en_texts, fr_texts, fr_keywords,
        en_keywords`` (``uri_s`` only for EAT). ``df`` itself is left
        untouched, so re-running the cell on the same raw frame is idempotent.

    Raises
    ------
    KeyError
        If one of the expected source columns is missing from ``df``.
    """
    if eat:
        passthrough = ['uri_s']
        # NOTE(review): 'en_abstract_s4' looks like it could be a typo for
        # 'en_abstract_s' (cf. 'fr_abstract_s') -- confirm against the sheet header.
        text_sources = {'en_texts': ('en_title_s', 'en_abstract_s4'),
                        'fr_texts': ('fr_title_s', 'fr_abstract_s')}
        keyword_sources = {'fr_keywords': 'fr_keyword_s',
                           'en_keywords': 'en_keyword_s'}
    else:
        passthrough = []
        text_sources = {'en_texts': ('Titre GB', 'Résumé GB'),
                        'fr_texts': ('Titre FR', 'RésuméFR')}
        keyword_sources = {'fr_keywords': 'Mots-clés FR',
                           'en_keywords': 'Mots clés GB'}

    # Assemble the result in a fresh frame instead of adding columns to the
    # caller's raw DataFrame.
    out = pd.DataFrame(index=df.index)
    for col in passthrough:
        out[col] = df[col]
    for new_col, (title_col, abstract_col) in text_sources.items():
        # A missing title or abstract propagates as NaN, as with plain `+`.
        out[new_col] = df[title_col] + ' ' + df[abstract_col]
    for new_col, source_col in keyword_sources.items():
        out[new_col] = df[source_col]
    return out

In [4]:
# Reduce both raw sheets to the shared text/keyword schema.
eat_data = preprocess(eat_df, eat=True)
set_data = preprocess(set_df, eat=False)

## Report

We treat keyword extraction as token classification with B-I-O annotation (as in NER) and evaluate Transformer-based language models as baselines in two settings:
- **Extractive keywords**: use encoder-only models to tag the keyword spans that occur in the document.
- **Abstractive keywords**: use an encoder-decoder model (e.g. BART, T5) to generate keywords.

All models are pretrained on English text, but we evaluate them on both English and French to also measure their cross-lingual ability.

**Reference**: 
- [bloomberg/KBIR](https://huggingface.co/bloomberg/KBIR) from [Learning Rich Representations of Keyphrases from Text](https://aclanthology.org/2022.findings-naacl.67.pdf).
- [distilbert-base-uncased](https://huggingface.co/distilbert-base-uncased?text=The+goal+of+life+is+%5BMASK%5D.) from [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/pdf/1910.01108.pdf).
- [bloomberg/KeyBART](https://huggingface.co/bloomberg/KeyBART) from [Learning Rich Representations of Keyphrases from Text](https://aclanthology.org/2022.findings-naacl.67.pdf) .
- [t5-small](https://huggingface.co/t5-small?text=My+name+is+Sarah+and+I+live+in+London) from [Exploring Transfer Learning with T5: the Text-To-Text Transfer Transformer](https://ai.googleblog.com/2020/02/exploring-transfer-learning-with-t5.html).

The results are summarized in the tables below; model names follow the pattern `keyphrase-<type>-<model>-<dataset>`.

### English version

|Types|Model|EAT Precision|EAT Recall|EAT F1-score|SET Precision|SET Recall|SET F1-score|
|---|---|---|---|---|---|---|---|
|**Extraction**|[TNT-KID KP20k](https://github.com/EMBEDDIA/tnt_kid)| 12.39| **47.39**| **19.64**|11.52 | **39.49** | **17.84**|
||[keyphrase-extraction-kbir-inspec](https://huggingface.co/ml6team/keyphrase-extraction-kbir-inspec)| 8.07 | 27.97 | 12.53|7.30 | 23.38 | 11.13|
||[keyphrase-extraction-distilbert-kptimes](https://huggingface.co/ml6team/keyphrase-extraction-distilbert-kptimes)|  17.6 | 3.01 | 5.14|9.83 | 2.62 | 4.14|
||[keyphrase-extraction-kbir-kpcrowd](https://huggingface.co/ml6team/keyphrase-extraction-kbir-kpcrowd)| 5.44 | 36.05 | 9.45|4.30 | 32.46 | 7.59|
|**Generation**|[keyphrase-generation-keybart-inspec](https://huggingface.co/ml6team/keyphrase-generation-keybart-inspec)|10.71 | 20.7 | 14.12|-|-|-|
||[keyphrase-generation-t5-small-inspec](https://huggingface.co/ml6team/keyphrase-generation-t5-small-inspec)|8.42 | 9.25 | 8.82| 8.99 | 12.0 | 10.28|
||[keyphrase-generation-t5-small-openkp](https://huggingface.co/ml6team/keyphrase-generation-t5-small-openkp)|**14.61** | 13.88 | **14.24**|**12.82** | 11.54 | **12.15**|

### French version

|Types|Model|EAT Precision|EAT Recall|EAT F1-score|SET Precision|SET Recall|SET F1-score|
|---|---|---|---|---|---|---|---|
|**Extraction**|CamemBERT|7.24 |**35.75**| **12.04**|6.88| **38.27**| **11.66**|
||[keyphrase-extraction-kbir-inspec](https://huggingface.co/ml6team/keyphrase-extraction-kbir-inspec) |8.02 | 13.06 | 9.94|4.82 | 9.56 | 6.41|
||[keyphrase-extraction-distilbert-kptimes](https://huggingface.co/ml6team/keyphrase-extraction-distilbert-kptimes)| 9.68 | 0.22 | 0.43|6.67 | 0.16 | 0.31|
||[keyphrase-extraction-kbir-kpcrowd](https://huggingface.co/ml6team/keyphrase-extraction-kbir-kpcrowd)|4.26 | 26.42 | 7.34|2.84 | 21.63 | 5.02|
|**Generation**|[keyphrase-generation-t5-small-inspec](https://huggingface.co/ml6team/keyphrase-generation-t5-small-inspec)|8.42| 9.25 | 8.82|7.14 | 7.52 | 7.33|
||[keyphrase-generation-t5-small-openkp](https://huggingface.co/ml6team/keyphrase-generation-t5-small-openkp)|12.29 | 9.33 | **10.61**| 8.78 | 6.43 | **7.42**|


**OpenKeyPhrase (OpenKP)**: The dataset contains 148,124 real-world web documents, each with a human annotation indicating its 1-3 most relevant keyphrases (https://github.com/microsoft/OpenKP).

### 1. Extract and generate the keywords

In [22]:
# Define keyword extraction pipeline
class KeywordExtractionPipeline(TokenClassificationPipeline):
    """Token-classification pipeline that returns the distinct keyphrases of a text.

    The model and its tokenizer are both loaded from the checkpoint name given
    as ``model``; any further positional/keyword arguments are forwarded to
    ``TokenClassificationPipeline``.
    """

    def __init__(self, model, *args, **kwargs):
        token_classifier = AutoModelForTokenClassification.from_pretrained(model)
        checkpoint_tokenizer = AutoTokenizer.from_pretrained(model)
        super().__init__(
            *args,
            model=token_classifier,
            tokenizer=checkpoint_tokenizer,
            **kwargs
        )

    def postprocess(self, model_outputs):
        """Group tagged tokens into phrases and return them de-duplicated.

        Returns a NumPy array holding the stripped ``"word"`` of every
        aggregated result, with duplicates removed by ``np.unique`` (which
        also sorts the values).
        """
        grouped = super().postprocess(
            model_outputs=model_outputs,
            aggregation_strategy=AggregationStrategy.SIMPLE,
        )
        phrases = []
        for entity in grouped:
            phrases.append(entity.get("word").strip())
        return np.unique(phrases)
    
class KeywordGenerationPipeline(Text2TextGenerationPipeline):
    """Seq2seq pipeline that turns the generated text into a list of keyphrases.

    Parameters
    ----------
    model : str
        Checkpoint name; both the seq2seq model and its tokenizer are loaded from it.
    keyword_sep_token : str, default ";"
        Separator on which the generated text is split into keyphrases.
    *args, **kwargs
        Forwarded to ``Text2TextGenerationPipeline``.
    """

    def __init__(self, model, keyword_sep_token=";", *args, **kwargs):
        # NOTE(review): `padding` and `truncation` are handed to
        # `from_pretrained` here; confirm they actually take effect when the
        # pipeline tokenizes its inputs (they may need to be call-time arguments).
        super().__init__(
            *args,
            model=AutoModelForSeq2SeqLM.from_pretrained(model),
            tokenizer=AutoTokenizer.from_pretrained(model,
                                                    model_max_length=1024,
                                                    padding='max_length',
                                                    truncation=True),
            **kwargs
        )
        self.keyword_sep_token = keyword_sep_token

    def _split_keywords(self, generated_text):
        """Split one generated string on the separator, dropping blank pieces."""
        stripped = (piece.strip() for piece in generated_text.split(self.keyword_sep_token))
        # Filter AFTER stripping: a trailing separator followed by whitespace
        # ("a; b; ") must not yield an empty-string keyword.
        return [keyword for keyword in stripped if keyword]

    def postprocess(self, model_outputs):
        """Return one list of stripped, non-empty keyphrases per generated sequence."""
        results = super().postprocess(
            model_outputs=model_outputs
        )
        return [self._split_keywords(result.get("generated_text")) for result in results]


In [6]:
# Checkpoint names, given without the "ml6team/" organisation prefix.
extractive_model_list = [
    'keyphrase-extraction-kbir-inspec',
    # 'keyphrase-extraction-distilbert-inspec',   (disabled)
    # 'keyphrase-extraction-kbir-kptimes',        (disabled)
    'keyphrase-extraction-distilbert-kptimes',
    # 'keyphrase-extraction-kbir-semeval2017',    (disabled)
    'keyphrase-extraction-kbir-kpcrowd',
]
abstractive_model_list = [
    'keyphrase-generation-keybart-inspec',
    'keyphrase-generation-t5-small-inspec',
    'keyphrase-generation-t5-small-openkp',
]

In [28]:
def extract_prediction(df, model_name):
    """Run one extractive checkpoint over the English and French texts of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``en_texts`` and ``fr_texts``. It is modified in place:
        two columns ``en_<suffix>`` and ``fr_<suffix>`` are added, where
        ``<suffix>`` is ``model_name`` without its leading ``keyphrase-``.
    model_name : str
        Checkpoint name without the ``ml6team/`` organisation prefix.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    print(model_name)
    start = timeit.default_timer()
    extractor = KeywordExtractionPipeline(model="ml6team/" + model_name)

    # Strip the common prefix by name instead of slicing a fixed 10 characters,
    # which would silently mangle any model id that lacks the prefix.
    prefix = "keyphrase-"
    column_suffix = model_name[len(prefix):] if model_name.startswith(prefix) else model_name

    for lang in ("en", "fr"):
        df[lang + '_' + column_suffix] = [extractor(text) for text in df[lang + '_texts']]

    stop = timeit.default_timer()
    print('Done Time: ', stop - start)
    return df

def generate_prediction(df, model_name):
    """Run one generative checkpoint over the English and French texts of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``en_texts`` and ``fr_texts``. It is modified in place:
        two columns ``en_<suffix>`` and ``fr_<suffix>`` are added, where
        ``<suffix>`` is ``model_name`` without its leading ``keyphrase-``.
        Each cell holds whatever the pipeline returns for that text.
    model_name : str
        Checkpoint name without the ``ml6team/`` organisation prefix.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    print(model_name)
    start = timeit.default_timer()
    generator = KeywordGenerationPipeline(model="ml6team/" + model_name)

    # Strip the common prefix by name instead of slicing a fixed 10 characters,
    # which would silently mangle any model id that lacks the prefix.
    prefix = "keyphrase-"
    column_suffix = model_name[len(prefix):] if model_name.startswith(prefix) else model_name

    for lang in ("en", "fr"):
        df[lang + '_' + column_suffix] = [generator(text) for text in df[lang + '_texts']]

    stop = timeit.default_timer()
    print('Done Time: ', stop - start)
    return df

In [8]:
# Run the three extractive checkpoints on the EAT corpus, one after another,
# and write a CSV snapshot after each model.
# extract_prediction adds its columns to the frame it receives and returns that
# same frame, so eat_data, eat_results, eat_results1 and eat_results2 all refer
# to one DataFrame that accumulates every prediction column.
eat_results = extract_prediction(eat_data, 'keyphrase-extraction-kbir-inspec')
eat_results.to_csv('./results/eat_kw_prediction1.csv', index=False)
eat_results1 = extract_prediction(eat_results, 'keyphrase-extraction-distilbert-kptimes')
eat_results1.to_csv('./results/eat_kw_prediction2.csv', index=False)
eat_results2 = extract_prediction(eat_results1, 'keyphrase-extraction-kbir-kpcrowd')
eat_results2.to_csv('./results/eat_kw_prediction3.csv', index=False)

keyphrase-extraction-kbir-inspec
Done Time:  1991.3434752998874
keyphrase-extraction-distilbert-kptimes
Done Time:  286.52993946615607
keyphrase-extraction-kbir-kpcrowd
Done Time:  1026.1492735492066


In [9]:
# Same extractive run for the SET corpus: three checkpoints in sequence, with a
# CSV snapshot after each one.
# extract_prediction returns the frame it was given, so set_data, set_results,
# set_results1 and set_results2 are the same DataFrame object.
set_results = extract_prediction(set_data, 'keyphrase-extraction-kbir-inspec')
set_results.to_csv('./results/set_kw_prediction1.csv', index=False)
set_results1 = extract_prediction(set_results, 'keyphrase-extraction-distilbert-kptimes')
set_results1.to_csv('./results/set_kw_prediction2.csv', index=False)
set_results2 = extract_prediction(set_results1, 'keyphrase-extraction-kbir-kpcrowd')
set_results2.to_csv('./results/set_kw_prediction3.csv', index=False)

keyphrase-extraction-kbir-inspec
Done Time:  320.7594917360693
keyphrase-extraction-distilbert-kptimes
Done Time:  91.66723519796506
keyphrase-extraction-kbir-kpcrowd
Done Time:  345.52236298425123


In [None]:
# Generative models on the EAT corpus, continuing from the frame that already
# holds the extractive predictions (eat_results2).
# Only the two T5 checkpoints are run here; 'keyphrase-generation-keybart-inspec'
# from abstractive_model_list is not.
# eat_results4 is the name the evaluation cells read.
eat_results3 = generate_prediction(eat_results2, 'keyphrase-generation-t5-small-inspec')
eat_results3.to_csv('./results/eat_kw_prediction4.csv', index=False)
eat_results4 = generate_prediction(eat_results3, 'keyphrase-generation-t5-small-openkp')
eat_results4.to_csv('./results/eat_kw_prediction_final.csv', index=False)

keyphrase-generation-t5-small-inspec


In [None]:
# Generative models on the SET corpus, continuing from the frame that already
# holds the extractive predictions (set_results2).
# Only the two T5 checkpoints are run here.
# set_results4 is the name the evaluation cells read.
set_results3 = generate_prediction(set_results2, 'keyphrase-generation-t5-small-inspec')
set_results3.to_csv('./results/set_kw_prediction4.csv', index=False)
set_results4 = generate_prediction(set_results3, 'keyphrase-generation-t5-small-openkp')
set_results4.to_csv('./results/set_kw_prediction_final.csv', index=False)

### 2. Evaluate the results

In [37]:
def compute_metrics(candidate_kw_col, true_kw_col):
    """Corpus-level precision / recall / F1 between predicted and gold keywords.

    All predictions are pooled into one set and all gold keywords into another
    (both stripped and lower-cased, empty strings dropped); the scores are
    computed on these two sets, not per document.

    Parameters
    ----------
    candidate_kw_col : iterable of iterables of str
        One collection of predicted keywords per document.
    true_kw_col : iterable of str
        One comma-separated gold keyword string per document. Entries that are
        not strings (e.g. NaN for a missing cell) are treated as "no keywords".

    Returns
    -------
    tuple
        ``(precision, recall, fscore)`` as percentages rounded to 2 decimals.
        A score is ``0`` when its denominator would be zero. ``fscore`` is
        computed from the already rounded precision and recall.

    Side effect: prints
    ``#predicted , #gold , #matched , precision , recall , fscore``.
    """
    candidate_kw_list = []
    for keywords in candidate_kw_col:
        candidate_kw_list.extend(keywords)

    true_kw_list = []
    for cell in true_kw_col:
        if isinstance(cell, str):  # skip missing gold cells instead of crashing on .split
            true_kw_list.extend(cell.split(','))

    # An empty string is not a keyword: drop it so that stray separators
    # ("a,,b", trailing commas) neither inflate the sets nor count as a match.
    extracted_kw = {item.strip().lower() for item in candidate_kw_list} - {''}
    true_kw = {item.strip().lower() for item in true_kw_list} - {''}

    true_pos = extracted_kw & true_kw
    # Guard on emptiness of the sets; comparing a set with 0 is always True
    # and therefore never protected against division by zero.
    recall = round(len(true_pos)*100/len(true_kw),2) if true_kw else 0
    precision = round(len(true_pos)*100/len(extracted_kw),2) if extracted_kw else 0
    fscore = round(2*(precision*recall)/(precision+recall),2) if precision + recall != 0 else 0
    print(str(len(extracted_kw))+ ' , ' + str(len(true_kw)) +' , ' + str(len(true_pos)) +' , ' + str(precision)+' , ' +  str(recall)+' , ' +  str(fscore))
    return precision, recall, fscore

In [43]:
# Score every extractive model on EAT, English column first, then French.
# Each call prints one line:
#   #predicted , #gold , #matched , precision , recall , F1
# The frame is read through the name eat_results4, which is bound in the
# generation cell; that cell therefore has to have run before this one.
compute_metrics(eat_results4['en_extraction-kbir-inspec'], eat_results4['en_keywords'])
compute_metrics(eat_results4['fr_extraction-kbir-inspec'], eat_results4['fr_keywords'])
compute_metrics(eat_results4['en_extraction-distilbert-kptimes'], eat_results4['en_keywords'])
compute_metrics(eat_results4['fr_extraction-distilbert-kptimes'], eat_results4['fr_keywords'])
compute_metrics(eat_results4['en_extraction-kbir-kpcrowd'], eat_results4['en_keywords'])
compute_metrics(eat_results4['fr_extraction-kbir-kpcrowd'], eat_results4['fr_keywords'])

4724 , 1362 , 381 , 8.07 , 27.97 , 12.53
2183 , 1340 , 175 , 8.02 , 13.06 , 9.94
233 , 1362 , 41 , 17.6 , 3.01 , 5.14
31 , 1340 , 3 , 9.68 , 0.22 , 0.43
9032 , 1362 , 491 , 5.44 , 36.05 , 9.45
8319 , 1340 , 354 , 4.26 , 26.42 , 7.34


(4.26, 26.42, 7.34)

In [44]:
# Score every extractive model on SET, English column first, then French.
# Each call prints one line:
#   #predicted , #gold , #matched , precision , recall , F1
# The frame is read through the name set_results4, which is bound in the
# generation cell; that cell therefore has to have run before this one.
compute_metrics(set_results4['en_extraction-kbir-inspec'], set_results4['en_keywords'])
compute_metrics(set_results4['fr_extraction-kbir-inspec'], set_results4['fr_keywords'])
compute_metrics(set_results4['en_extraction-distilbert-kptimes'], set_results4['en_keywords'])
compute_metrics(set_results4['fr_extraction-distilbert-kptimes'], set_results4['fr_keywords'])
compute_metrics(set_results4['en_extraction-kbir-kpcrowd'], set_results4['en_keywords'])
compute_metrics(set_results4['fr_extraction-kbir-kpcrowd'], set_results4['fr_keywords'])

2081 , 650 , 152 , 7.3 , 23.38 , 11.13
1266 , 638 , 61 , 4.82 , 9.56 , 6.41
173 , 650 , 17 , 9.83 , 2.62 , 4.14
15 , 638 , 1 , 6.67 , 0.16 , 0.31
4904 , 650 , 211 , 4.3 , 32.46 , 7.59
4855 , 638 , 138 , 2.84 , 21.63 , 5.02


(2.84, 21.63, 5.02)

In [53]:
def compute_abstractive_metrics(candidate_kw_col, true_kw_col):
    """Corpus-level precision / recall / F1 for generated keywords.

    Same scoring as for the extractive models, except that each prediction
    cell is a list of keyword lists and only its first element is scored.
    All predictions are pooled into one set and all gold keywords into
    another (both stripped and lower-cased, empty strings dropped).

    Parameters
    ----------
    candidate_kw_col : iterable of sequences
        One entry per document; ``entry[0]`` is the list of generated
        keywords. Empty entries contribute no keywords.
    true_kw_col : iterable of str
        One comma-separated gold keyword string per document. Entries that are
        not strings (e.g. NaN for a missing cell) are treated as "no keywords".

    Returns
    -------
    tuple
        ``(precision, recall, fscore)`` as percentages rounded to 2 decimals.
        A score is ``0`` when its denominator would be zero. ``fscore`` is
        computed from the already rounded precision and recall.

    Side effect: prints
    ``#predicted , #gold , #matched , precision , recall , fscore``.
    """
    candidate_kw_list = []
    for generated in candidate_kw_col:
        if len(generated) > 0:  # an empty prediction has no first sequence to read
            candidate_kw_list.extend(generated[0])

    true_kw_list = []
    for cell in true_kw_col:
        if isinstance(cell, str):  # skip missing gold cells instead of crashing on .split
            true_kw_list.extend(cell.split(','))

    # An empty string is not a keyword: drop it so that stray separators
    # neither inflate the sets nor count as a match.
    extracted_kw = {item.strip().lower() for item in candidate_kw_list} - {''}
    true_kw = {item.strip().lower() for item in true_kw_list} - {''}

    true_pos = extracted_kw & true_kw
    # Guard on emptiness of the sets; comparing a set with 0 is always True
    # and therefore never protected against division by zero.
    recall = round(len(true_pos)*100/len(true_kw),2) if true_kw else 0
    precision = round(len(true_pos)*100/len(extracted_kw),2) if extracted_kw else 0
    fscore = round(2*(precision*recall)/(precision+recall),2) if precision + recall != 0 else 0
    print(str(len(extracted_kw))+ ' , ' + str(len(true_kw)) +' , ' + str(len(true_pos)) +' , ' + str(precision)+' , ' +  str(recall)+' , ' +  str(fscore))
    return precision, recall, fscore

In [None]:
# Score the two T5 generation models on the French texts: EAT then SET for
# t5-small-inspec, followed by EAT then SET for t5-small-openkp.
# Only the 'fr_' prediction columns are evaluated in this cell.
# Each call prints one line:
#   #predicted , #gold , #matched , precision , recall , F1
compute_abstractive_metrics(eat_results4['fr_generation-t5-small-inspec'], eat_results4['fr_keywords'])
compute_abstractive_metrics(set_results4['fr_generation-t5-small-inspec'], set_results4['fr_keywords'])
compute_abstractive_metrics(eat_results4['fr_generation-t5-small-openkp'], eat_results4['fr_keywords'])
compute_abstractive_metrics(set_results4['fr_generation-t5-small-openkp'], set_results4['fr_keywords'])