In [1]:
import numpy as np
import itertools

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

In [2]:
# Input text for the rest of the notebook: candidate phrases are extracted
# from it and it is embedded as a single document.
# NOTE(review): the bracketed "[1]" / "[2]" markers inside the text look like
# leftover citation marks from the original source — confirm whether they
# should be stripped before embedding.
doc = """
         Supervised learning is the machine learning task of 
         learning a function that maps an input to an output based 
         on example input-output pairs.[1] It infers a function 
         from labeled training data consisting of a set of 
         training examples.[2] In supervised learning, each 
         example is a pair consisting of an input object 
         (typically a vector) and a desired output value (also 
         called the supervisory signal). A supervised learning 
         algorithm analyzes the training data and produces an 
         inferred function, which can be used for mapping new 
         examples. An optimal scenario will allow for the algorithm 
         to correctly determine the class labels for unseen 
         instances. This requires the learning algorithm to  
         generalize from the training data to unseen situations 
         in a 'reasonable' way (see inductive bias).
      """


In [4]:
# Candidate generation: every trigram left in the document after English
# stop-word removal becomes a keyphrase candidate.
stop_words = "english"
n_gram_range = (3, 3)  # min and max n-gram length are both 3 -> trigrams only

# Build the vectorizer first, then fit it on the single document.
cv = CountVectorizer(stop_words=stop_words, ngram_range=n_gram_range)
cv.fit([doc])
candidates = cv.get_feature_names_out()

# Show the candidate phrases, followed by how many there are.
print(candidates, len(candidates), sep="\n")

['algorithm analyzes training' 'algorithm correctly determine'
 'algorithm generalize training' 'allow algorithm correctly'
 'analyzes training data' 'based example input'
 'called supervisory signal' 'class labels unseen'
 'consisting input object' 'consisting set training'
 'correctly determine class' 'data consisting set'
 'data produces inferred' 'data unseen situations' 'desired output value'
 'determine class labels' 'example input output' 'example pair consisting'
 'examples optimal scenario' 'examples supervised learning'
 'function labeled training' 'function maps input' 'function used mapping'
 'generalize training data' 'inferred function used'
 'infers function labeled' 'input object typically' 'input output based'
 'input output pairs' 'instances requires learning'
 'labeled training data' 'labels unseen instances'
 'learning algorithm analyzes' 'learning algorithm generalize'
 'learning example pair' 'learning function maps'
 'learning machine learning' 'learning task lea

In [6]:
# Encode the candidate phrases and the full document with the same sentence
# encoder so their vectors can be compared against each other.
model = SentenceTransformer('distilbert-base-nli-mean-tokens')
candidate_embeddings = model.encode(candidates)  # one embedding per candidate phrase
doc_embedding = model.encode([doc])  # wrapped in a list: a batch containing one document

In [7]:
# Pick the top_n candidate phrases closest to the doc.
top_n = 5

# NOTE: despite its name, `distances` holds cosine *similarities* between the
# document and each candidate (larger value = closer match).
distances = cosine_similarity(doc_embedding, candidate_embeddings)

# argsort is ascending, so the best matches sit at the END of the ranking.
ranked_indices = distances.argsort()[0]

# Slice from an explicit start index instead of `[-top_n:]`: with top_n == 0
# the negative form becomes `[-0:]` and would return every candidate rather
# than none. For top_n >= 1 the selection and its order are unchanged
# (least similar of the top_n first, most similar last).
cutoff = max(len(ranked_indices) - top_n, 0)
keywords = [candidates[index] for index in ranked_indices[cutoff:]]
print(keywords)

['algorithm analyzes training', 'learning algorithm generalize', 'learning machine learning', 'learning algorithm analyzes', 'algorithm generalize training']
