In [1]:
from transformers import (
    TokenClassificationPipeline,
    AutoModelForTokenClassification,
    Text2TextGenerationPipeline, # keybart
    AutoModelForSeq2SeqLM, # keybart
    AutoTokenizer,
)
from transformers.pipelines import AggregationStrategy
import numpy as np
import time
import tqdm

import warnings
warnings.filterwarnings("error")

  from .autonotebook import tqdm as notebook_tqdm


## Keyphrase Extraction

https://huggingface.co/ml6team/keyphrase-extraction-kbir-inspec

### kbir-inspec

In [3]:
class KeyphraseExtractionPipeline(TokenClassificationPipeline):
    """Token-classification pipeline that returns the distinct extracted words.

    Both the model and the tokenizer are loaded with ``from_pretrained`` from
    the single ``model`` identifier handed to the constructor.
    """

    def __init__(self, model, *args, **kwargs):
        """Load model and tokenizer for ``model``; forward everything else to the base class."""
        token_classifier = AutoModelForTokenClassification.from_pretrained(model)
        tokenizer = AutoTokenizer.from_pretrained(model)
        super().__init__(*args, model=token_classifier, tokenizer=tokenizer, **kwargs)

    def postprocess(self, all_outputs):
        """Aggregate with the SIMPLE strategy and de-duplicate the resulting words.

        Returns:
            The result of ``np.unique`` over the whitespace-stripped ``"word"``
            value of every aggregated entry.
        """
        aggregated = super().postprocess(
            all_outputs=all_outputs,
            aggregation_strategy=AggregationStrategy.SIMPLE,
        )
        words = []
        for entity in aggregated:
            words.append(entity.get("word").strip())
        return np.unique(words)

In [4]:
# Hugging Face model id; the pipeline class passes it to from_pretrained for both model and tokenizer.
model_name = "ml6team/keyphrase-extraction-kbir-inspec"
# Module-level extractor instance; kbir_inspec() below calls it directly.
kbir_extractor = KeyphraseExtractionPipeline(model=model_name)

config.json: 100%|██████████| 855/855 [00:00<00:00, 1.17MB/s]
pytorch_model.bin: 100%|██████████| 1.42G/1.42G [00:55<00:00, 25.5MB/s]
tokenizer_config.json: 100%|██████████| 1.16k/1.16k [00:00<00:00, 1.45MB/s]
vocab.json: 100%|██████████| 798k/798k [00:00<00:00, 4.24MB/s]
merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 2.40MB/s]
tokenizer.json: 100%|██████████| 2.11M/2.11M [00:00<00:00, 6.63MB/s]
special_tokens_map.json: 100%|██████████| 772/772 [00:00<00:00, 3.15MB/s]


### keybart

In [5]:
# https://arxiv.org/abs/2112.08547

class KeyphraseGenerationPipeline(Text2TextGenerationPipeline):
    """Text-to-text pipeline that splits each generated text into keyphrases.

    Both the model and the tokenizer are loaded with ``from_pretrained`` from
    the single ``model`` identifier handed to the constructor.
    """

    def __init__(self, model, keyphrase_sep_token=";", *args, **kwargs):
        """Load model/tokenizer and remember the keyphrase separator.

        Args:
            model: Identifier passed to ``from_pretrained`` for model and tokenizer.
            keyphrase_sep_token: String on which generated text is split.
            *args, **kwargs: Forwarded unchanged to the base pipeline.
        """
        super().__init__(
            model=AutoModelForSeq2SeqLM.from_pretrained(model),
            tokenizer=AutoTokenizer.from_pretrained(model),
            *args,
            **kwargs
        )
        self.keyphrase_sep_token = keyphrase_sep_token

    def postprocess(self, model_outputs):
        """Turn each generated text into a list of non-empty, stripped keyphrases.

        Returns:
            A list with one list of keyphrase strings per base-class result.
        """
        results = super().postprocess(
            model_outputs=model_outputs
        )
        keyphrase_lists = []
        for result in results:
            pieces = result.get("generated_text").split(self.keyphrase_sep_token)
            # Strip BEFORE filtering: a whitespace-only segment (e.g. after a
            # trailing "; ") would otherwise survive as an empty keyphrase.
            stripped = (piece.strip() for piece in pieces)
            keyphrase_lists.append([phrase for phrase in stripped if phrase])
        return keyphrase_lists

In [6]:
# Re-binds model_name to the KeyBART model id (passed to from_pretrained by the pipeline class).
model_name = "ml6team/keyphrase-generation-keybart-inspec"
# Module-level generator instance; keybart_inspec() below calls it directly.
keybart_generator = KeyphraseGenerationPipeline(model=model_name)

config.json: 100%|██████████| 1.68k/1.68k [00:00<00:00, 12.0MB/s]
pytorch_model.bin: 100%|██████████| 1.63G/1.63G [01:12<00:00, 22.5MB/s]
tokenizer_config.json: 100%|██████████| 353/353 [00:00<00:00, 312kB/s]
vocab.json: 100%|██████████| 798k/798k [00:00<00:00, 6.31MB/s]
merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 3.95MB/s]
tokenizer.json: 100%|██████████| 2.11M/2.11M [00:00<00:00, 8.41MB/s]
special_tokens_map.json: 100%|██████████| 239/239 [00:00<00:00, 727kB/s]


In [7]:
def kbir_inspec(text):
    """Run the module-level ``kbir_extractor`` pipeline on ``text`` and return its output."""
    extracted = kbir_extractor(text)
    return extracted

def keybart_inspec(text):
    """Run the module-level ``keybart_generator`` on ``text`` and return the first element of its output."""
    generated = keybart_generator(text)
    return generated[0]

In [8]:
# Sample input; newlines are replaced by spaces so the models receive one continuous string.
text = """
This is a no holds-barred thrilling drama mixed with killing, mayhem and manipulation among working professionals.
This film sheds light on a man's downfall from the pinnacles of success into the depths of his damaged character.
His insecurities lead him into a series of troubled romantic relationships and eventually a web of events that include betrayal and murder.
""".replace("\n", " ")

# time.perf_counter() is the clock intended for measuring elapsed durations;
# time.time() is wall-clock time and can jump if the system clock is adjusted.
t = time.perf_counter()
keyphrases_kbir = kbir_inspec(text)
print("kbir duration:", time.perf_counter() - t)

t = time.perf_counter()
keyphrases_keybart = keybart_inspec(text)
print("keybart duration:", time.perf_counter() - t)

kbir duration: 5.394212007522583
keybart duration: 2.6310577392578125


In [9]:
# Show the keyphrases each model produced for the same input, one labelled line per model.
print(f"kbir {keyphrases_kbir}")
print(f"keybart {keyphrases_keybart}")

kbir ['betrayal' 'romantic relationships' 'working professionals']
keybart ['insecurities', 'troubled romantic relationships', 'betrayal', 'murder', 'professional', 'personal']


# Semantic Scholar

> Rate limit:    
> 1 request per second for the following endpoints:    
> /paper/batch   
> /paper/search   
> /recommendations   
> 10 requests / second for all other calls    

In [12]:
import requests
import os
import sys

In [14]:
def print_papers(papers):
    """Print one line per paper: zero-based position, title and URL.

    Args:
        papers: Iterable of mappings that each provide ``'title'`` and ``'url'``.
    """
    position = 0
    for entry in papers:
        title, link = entry['title'], entry['url']
        print(f"{position}  {title} {link}")
        position += 1


def get_papers(search_words, result_limit):
    """Query the Semantic Scholar paper-search endpoint.

    Args:
        search_words: Iterable of strings; they are joined with ``'+'`` to
            form the ``query`` request parameter.
        result_limit: Value sent as the ``limit`` request parameter.

    Returns:
        The value stored under ``'data'`` in the JSON response.

    Raises:
        requests.HTTPError: If the response carries an error status.
        ValueError: If the response reports zero total matches.
    """
    query_words = '+'.join(search_words)
    # HTTPS, so the X-API-KEY header is not transmitted in clear text.
    url = 'https://api.semanticscholar.org/graph/v1/paper/search'
    rsp = requests.get(
        url,
        headers={'X-API-KEY': os.getenv('S2_API_KEY')},
        params={'query': query_words, 'limit': result_limit, 'fields': 'title,authors,url'},
        # Without a timeout a stalled connection would block the notebook forever.
        timeout=30,
    )
    rsp.raise_for_status()
    results = rsp.json()

    total = results["total"]
    if not total:
        # Raising a bare string is itself a TypeError in Python 3; use a real exception.
        raise ValueError('No matches found. Please try another query.')

    print(f'Found {total} results. Showing up to {result_limit}.')
    return results['data']

# Value passed to get_papers as its result_limit argument.
result_limit = 10
# Try the search with a hand-written keyword list; as the last expression of the
# cell, the returned value is what the notebook displays below.
get_papers(['Amazon', 'Transparency', 'Twitter', 'accountability',
       'artificial intelligence', 'ethical', 'explainability',
       'healthcare', 'history', 'interpretability', 'privacy',
       'transparency'], result_limit)

Found 2 results. Showing up to 10.


[{'paperId': '3602a1acbad352baafedaf8bea10675e9027d334',
  'url': 'https://www.semanticscholar.org/paper/3602a1acbad352baafedaf8bea10675e9027d334',
  'title': 'Decoding the Black Box: A Comprehensive Review of Explainable Artificial Intelligence',
  'authors': [{'authorId': '8003685', 'name': 'Ossama H. Embarak'}]},
 {'paperId': '7dfa7d32d8ffa777095e6aa56aa629bc80742dd1',
  'url': 'https://www.semanticscholar.org/paper/7dfa7d32d8ffa777095e6aa56aa629bc80742dd1',
  'title': 'Artificial Intelligence in Medicine: Revolutionizing Healthcare for Improved Patient Outcomes',
  'authors': [{'authorId': '38680767', 'name': 'Varshil Mehta'}]}]

## Together now

In [50]:
def text_recommendations(text, keyphrase_f, result_limit=10):
    """Extract keyphrases from ``text`` and print papers found for them.

    Args:
        text: Input text; newlines are replaced by spaces before extraction.
        keyphrase_f: Callable that takes the text and returns a sized
            collection of keyphrase strings.
        result_limit: Forwarded to ``get_papers``; defaults to 10.

    Returns:
        None. The keyphrases and the papers are printed.
    """
    text = text.replace("\n", " ")
    keyphrases = keyphrase_f(text)
    print("Keyphrases:", keyphrases)
    # len() instead of truthiness: a multi-element NumPy array has no
    # unambiguous truth value.
    if len(keyphrases) == 0:
        # Nothing to search for; avoid sending an empty query.
        print("No keyphrases extracted; skipping paper search.")
        return
    papers = get_papers(keyphrases, result_limit=result_limit)
    print_papers(papers)

In [52]:
# Paper: A Contextual Latent Space Model: Subsequence Modulation in Melodic Sequence
abstract = "Some generative models for sequences such as music and text allow us to edit only subsequences, given surrounding context sequences, which plays an important part in steering generation interactively. However, editing subsequences mainly involves randomly resampling subsequences from a possible generation space. We propose a contextual latent space model (CLSM) in order for users to be able to explore subsequence generation with a sense of direction in the generation space, e.g., interpolation, as well as exploring variations—semantically similar possible subsequences. A context-informed prior and decoder constitute the generative model of CLSM, and a context position-informed encoder is the inference model. In experiments, we use a monophonic symbolic music dataset, demonstrating that our contextual latent space is smoother in interpolation than baselines, and the quality of generated samples is superior to baseline models. The generation examples are available online."
# time.perf_counter() is the clock intended for measuring elapsed durations;
# time.time() is wall-clock time and can jump if the system clock is adjusted.
t = time.perf_counter()
text_recommendations(abstract, keybart_inspec)
print(time.perf_counter() - t)

Keyphrases: ['generative models', 'sequences', 'music', 'context sequences', 'randomly resampling subsequences']
Found 9 results. Showing up to 10.
0  A Contextual Latent Space Model: Subsequence Modulation in Melodic Sequence https://www.semanticscholar.org/paper/36d1aeff3f1e57f2f2bf3cd4f596d7797862bc86
1  Music Generation using Deep Generative Modelling https://www.semanticscholar.org/paper/7547fa19612fa7371093df94fbd0d2108f0578b6
2  Predictive models for music https://www.semanticscholar.org/paper/de344703e7fc244a55715cd4c8e461c5262f3c8c
3  RESEARCH ARTICLE Predictive Models for Music https://www.semanticscholar.org/paper/d526dd4a1e9c258185a4593175c50555af224ecb
4  Learning to Surprise: A Composer-Audience Architecture https://www.semanticscholar.org/paper/414d6998a5e838acf3c30a183e99cf8031032a79
5  On the use of FastMap for Audio Retrieval and Browsing https://www.semanticscholar.org/paper/c9d0942b8aa3f30a625fec1bbce25b52f12cf3b7
6  Learning to Surprise https://www.semanticscholar.