# 具体实现

In [1]:
# Sample document used by the cells below: an English paragraph describing
# supervised learning. The bracketed citation markers ([1], [2]) and the
# leading indentation are part of the string and are passed on unchanged.
doc = """
         Supervised learning is the machine learning task of 
         learning a function that maps an input to an output based 
         on example input-output pairs.[1] It infers a function 
         from labeled training data consisting of a set of 
         training examples.[2] In supervised learning, each 
         example is a pair consisting of an input object 
         (typically a vector) and a desired output value (also 
         called the supervisory signal). A supervised learning 
         algorithm analyzes the training data and produces an 
         inferred function, which can be used for mapping new 
         examples. An optimal scenario will allow for the algorithm 
         to correctly determine the class labels for unseen 
         instances. This requires the learning algorithm to  
         generalize from the training data to unseen situations 
         in a 'reasonable' way (see inductive bias).
      """

## Text preprocessing

In [41]:
from sklearn.feature_extraction.text import CountVectorizer

# (min_n, max_n) size of the word n-grams to extract; use (1, 1) to look at
# single keywords instead of three-word phrases.
n_gram_range = (3, 3)
# Stop-word list handed to CountVectorizer; when trying longer keyphrases,
# do not remove stop words.
stop_words = "english"

# Extract candidate words/phrases from the document
count = CountVectorizer(ngram_range=n_gram_range, stop_words=stop_words).fit([doc])

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer `get_feature_names_out()` when it exists and convert its array to a
# plain list so `candidates` keeps the type the rest of the notebook expects.
if hasattr(count, "get_feature_names_out"):
    candidates = count.get_feature_names_out().tolist()  # candidate keywords list
else:
    candidates = count.get_feature_names()  # older scikit-learn releases

In [42]:
# Display the candidate phrases produced by the vectorizer
candidates

['algorithm analyzes training',
 'algorithm correctly determine',
 'algorithm generalize training',
 'allow algorithm correctly',
 'analyzes training data',
 'based example input',
 'called supervisory signal',
 'class labels unseen',
 'consisting input object',
 'consisting set training',
 'correctly determine class',
 'data consisting set',
 'data produces inferred',
 'data unseen situations',
 'desired output value',
 'determine class labels',
 'example input output',
 'example pair consisting',
 'examples optimal scenario',
 'examples supervised learning',
 'function labeled training',
 'function maps input',
 'function used mapping',
 'generalize training data',
 'inferred function used',
 'infers function labeled',
 'input object typically',
 'input output based',
 'input output pairs',
 'instances requires learning',
 'labeled training data',
 'labels unseen instances',
 'learning algorithm analyzes',
 'learning algorithm generalize',
 'learning example pair',
 'learning functio

## Bert Embeddings

In [44]:
# Use the sentence-transformers package to embed both the document and the
# list of candidate keyphrases with the same model.

from sentence_transformers import SentenceTransformer

# Author's note: 'distilbert-base-nli-stsb-mean-tokens' or
# 'xlm-r-distilroberta-base-paraphrase-v1' are reported to perform well on
# semantic similarity and paraphrase identification.
# NOTE(review): the model actually loaded below is
# 'distilbert-base-nli-mean-tokens' (no "stsb") — confirm which one is intended.
model = SentenceTransformer('distilbert-base-nli-mean-tokens')
# The document is wrapped in a one-element list; the result is later passed
# to cosine_similarity together with candidate_embeddings.
doc_embedding = model.encode([doc])
candidate_embeddings = model.encode(candidates)

## Cosine Similarity

In [45]:
# Compare the document embedding with every candidate embedding using
# cosine similarity.
from sklearn.metrics.pairwise import cosine_similarity

top_n = 5
# Despite its name, `distances` holds cosine similarities (larger = closer).
distances = cosine_similarity(doc_embedding, candidate_embeddings)
# argsort() is ascending, so the last `top_n` indices of row 0 are the most
# similar candidates; the list is ordered from least to most similar of those.
keywords = [candidates[index] for index in distances.argsort()[0][-top_n:]]
# top 5 most similar candidates to input document
keywords

['learning algorithm generalize',
 'algorithm analyzes training',
 'learning algorithm analyzes',
 'learning machine learning',
 'algorithm generalize training']

## Diversification
- The top keyphrases are all very similar to each other, so the results need diversification
- The goal is a balance between the accuracy of the keywords/keyphrases and their diversity
- Two algorithms
    - Max Sum Similarity
    - Maximal Marginal Relevance

### Max Sum Similarity

In [24]:
# Select the top 20 keywords/keyphrases
# from those 20, select the 5 that are the least similar to each other
import numpy as np
import itertools

def max_sum_sim(doc_embedding, word_embeddings, words, top_n, nr_candidates):
    """Pick the ``top_n`` mutually least-similar phrases among the best candidates.

    The ``nr_candidates`` phrases most similar to the document are shortlisted.
    Every ``top_n``-sized combination of that shortlist is then scored by the
    sum of its pairwise cosine similarities, and the combination with the
    lowest score is returned.

    Args:
        doc_embedding: Embedding of the document, passed to
            ``cosine_similarity``. Only row 0 of the resulting similarity
            matrix is used, so a single-row 2-D array is assumed.
        word_embeddings: Embeddings of the candidate phrases; row ``i`` is
            taken to belong to ``words[i]``.
        words: Candidate phrases, indexable by integer position.
        top_n: Number of phrases to return.
        nr_candidates: Number of most document-similar phrases to shortlist
            before searching for the least similar combination.

    Returns:
        A list of ``top_n`` phrases, ordered by increasing similarity to the
        document.

    Raises:
        ValueError: If ``top_n`` is larger than the shortlist, in which case
            no combination of that size exists.
    """
    # Work on the arguments that were passed in rather than on notebook-level
    # globals, so the function gives correct results for any inputs.
    doc_similarity = cosine_similarity(doc_embedding, word_embeddings)
    pairwise_similarity = cosine_similarity(word_embeddings, word_embeddings)

    # Shortlist the nr_candidates phrases closest to the document
    # (argsort is ascending, so they sit at the end of row 0).
    words_idx = list(doc_similarity.argsort()[0][-nr_candidates:])
    words_vals = [words[index] for index in words_idx]

    if top_n > len(words_idx):
        raise ValueError(
            "top_n ({}) must not exceed the number of shortlisted candidates ({})".format(
                top_n, len(words_idx)
            )
        )

    # Restrict the pairwise matrix to the shortlist, re-indexed from 0.
    pairwise_similarity = pairwise_similarity[np.ix_(words_idx, words_idx)]

    # Exhaustively search for the combination whose members are the least
    # similar to each other. The strict '<' keeps the first minimum found.
    min_sim = np.inf
    candidate = None
    for combination in itertools.combinations(range(len(words_idx)), top_n):
        sim = sum([pairwise_similarity[i][j] for i in combination for j in combination if i != j])
        if sim < min_sim:
            candidate = combination
            min_sim = sim

    return [words_vals[idx] for idx in candidate]

In [48]:
# Shortlist the 10 candidates closest to the document, then keep the 5 that
# are least similar to each other.
max_sum_sim(doc_embedding, candidate_embeddings,candidates,top_n = 5,nr_candidates = 10)

['signal supervised learning',
 'requires learning algorithm',
 'learning function maps',
 'algorithm analyzes training',
 'learning machine learning']

In [47]:
# Same selection with a larger shortlist of 20 candidates.
max_sum_sim(doc_embedding, candidate_embeddings,candidates,top_n = 5,nr_candidates = 20)

['set training examples',
 'generalize training data',
 'requires learning algorithm',
 'supervised learning algorithm',
 'learning machine learning']

**Accuracy vs. diversity Trade-off**
- Increasing `nr_candidates` makes it more likely that you get very diverse keywords, but ones that are not very good representations of the document
- Keep `nr_candidates` below 20% of the total number of unique words

### Maximal Marginal Relevance
MMR tries to minimize redundancy and maximize the diversity of results in text summarization tasks.

In [49]:
import numpy as np

def mmr(doc_embedding, word_embeddings, words, top_n, diversity):
    """Select keyphrases with Maximal Marginal Relevance.

    The phrase most similar to the document is chosen first. Each following
    pick maximizes
    ``(1 - diversity) * sim(phrase, doc) - diversity * max sim(phrase, chosen)``,
    trading relevance to the document against redundancy with the phrases
    already selected.

    Args:
        doc_embedding: Embedding of the document, passed to
            ``cosine_similarity``. A single-row 2-D array is assumed, since
            the flat argmax over the word/document similarities is used as a
            word index.
        word_embeddings: Embeddings of the candidate phrases; row ``i`` is
            taken to belong to ``words[i]``.
        words: Candidate phrases, indexable by integer position.
        top_n: Maximum number of phrases to return.
        diversity: Weight of the redundancy penalty; the relevance term is
            weighted by ``1 - diversity``.

    Returns:
        A list of at most ``top_n`` phrases in selection order. Fewer are
        returned when ``words`` has fewer than ``top_n`` entries, and an
        empty list when ``top_n <= 0`` or ``words`` is empty.
    """
    # Nothing can or should be selected.
    if top_n <= 0 or len(words) == 0:
        return []

    # Similarity between each word and the document, and between all words
    word_doc_similarity = cosine_similarity(word_embeddings, doc_embedding)
    word_similarity = cosine_similarity(word_embeddings)

    # Start with the phrase most similar to the document; all others remain
    # available as candidates.
    keywords_idx = [int(np.argmax(word_doc_similarity))]
    candidates_idx = [i for i in range(len(words)) if i != keywords_idx[0]]

    # Never iterate more often than there are candidates left; otherwise the
    # argmax below would run on an empty array.
    for _ in range(min(top_n - 1, len(words) - 1)):
        # Relevance of each remaining candidate to the document, and its
        # highest similarity to any phrase already selected
        candidate_similarities = word_doc_similarity[candidates_idx, :]
        target_similarities = np.max(word_similarity[candidates_idx][:, keywords_idx], axis=1)

        # MMR score per remaining candidate (named so it does not shadow
        # the function itself)
        mmr_scores = (1 - diversity) * candidate_similarities - diversity * target_similarities.reshape(-1, 1)
        mmr_idx = candidates_idx[np.argmax(mmr_scores)]

        # Move the winner from the candidates to the selected keywords
        keywords_idx.append(mmr_idx)
        candidates_idx.remove(mmr_idx)

    return [words[idx] for idx in keywords_idx]

In [50]:
# Low diversity (0.2): relevance to the document is weighted 0.8, the
# penalty for similarity to already selected phrases 0.2.
mmr(doc_embedding, candidate_embeddings, candidates, top_n = 5, diversity = 0.2)

['algorithm generalize training',
 'learning machine learning',
 'learning algorithm analyzes',
 'supervised learning algorithm',
 'algorithm analyzes training']

In [51]:
# High diversity (0.7): the penalty for similarity to already selected
# phrases (0.7) outweighs relevance to the document (0.3).
mmr(doc_embedding, candidate_embeddings, candidates, top_n = 5, diversity = 0.7)

['algorithm generalize training',
 'labels unseen instances',
 'new examples optimal',
 'determine class labels',
 'supervised learning algorithm']

# keybert package
https://github.com/MaartenGr/KeyBERT/

In [52]:
# Repeat the keyword extraction with the KeyBERT package.
from keybert import KeyBERT

# Rebinds `doc`; the wording appears to be the same paragraph as the earlier
# `doc`, only wrapped at different points.
doc = """
         Supervised learning is the machine learning task of learning a function that
         maps an input to an output based on example input-output pairs.[1] It infers a
         function from labeled training data consisting of a set of training examples.[2]
         In supervised learning, each example is a pair consisting of an input object
         (typically a vector) and a desired output value (also called the supervisory signal). 
         A supervised learning algorithm analyzes the training data and produces an inferred function, 
         which can be used for mapping new examples. An optimal scenario will allow for the 
         algorithm to correctly determine the class labels for unseen instances. This requires 
         the learning algorithm to generalize from the training data to unseen situations in a 
         'reasonable' way (see inductive bias).
      """
# From here on `model` is a KeyBERT instance, replacing the
# SentenceTransformer that was bound to the same name earlier.
model = KeyBERT('distilbert-base-nli-mean-tokens')
# Extraction with default arguments; this also rebinds `keywords`.
keywords = model.extract_keywords(doc)

### Extract Keywords

In [53]:
# Single keywords (keyphrase length 1) with English stop words removed.
# NOTE(review): `keyphrase_length` looks like the argument name of early
# KeyBERT releases; later releases appear to use `keyphrase_ngram_range`
# instead — confirm against the installed version.
model.extract_keywords(doc, keyphrase_length=1, stop_words='english')

['learning', 'training', 'algorithm', 'class', 'mapping']

In [54]:
# Two-word keyphrases; stop_words=None disables stop-word removal.
model.extract_keywords(doc, keyphrase_length=2, stop_words=None)

['learning algorithm',
 'machine learning',
 'supervised learning',
 'learning function',
 'algorithm analyzes']

### Diversification

In [56]:
# Max Sum Similarity: three-word keyphrases, 5 results chosen from
# 20 candidates (use_maxsum=True).
model.extract_keywords(doc, keyphrase_length=3, stop_words='english', 
                           use_maxsum=True, nr_candidates=20, top_n=5)

['set training examples',
 'generalize training data',
 'requires learning algorithm',
 'supervised learning algorithm',
 'learning machine learning']

In [57]:
# Maximal Marginal Relevance: three-word keyphrases with diversity=0.7
# (use_mmr=True).
model.extract_keywords(doc, keyphrase_length=3, stop_words='english', use_mmr=True, diversity=0.7)

['algorithm generalize training',
 'labels unseen instances',
 'new examples optimal',
 'determine class labels',
 'supervised learning algorithm']