## Problem: Because KeyBERT removes stopwords, the keyphrases it finds are not necessarily subphrases of the text.

In [None]:
# Sample course description used as the input document for the keyphrase extraction below.
text = """This seminar will be based on two novels by the Argentine writer Roberto Arlt: "The Rabid Toy" (1926) and "The Seven Fools" (1929) and will be devoted to researching these novels under aspects taken from cultural and literary studies. This is how the question of the socio-historical and cultural context of Argentina at the beginning of the 20th century will come into play, as well as of social and cultural changes or scenes of individual identity crisis in a society that is increasingly directed and driven by money. , for constant failures, unstoppable progress but without prospects for the individual. It will try to investigate how Arlt's texts collect these themes and how he represents them, that is, the central method will be discourse analysis and the starting point will be the question of how to represent and capture the profound changes of that time in literature. essential attendance: 1) reading of the texts until the first class 2) regular and active attendance 3) organization of a class in groups (2-3 people, 60 min. in total) & written work of 12-15 pages The seminar will take place in Spanish (with the possibility of clarifying some points in German if necessary). Recommended editions: Arlt, Roberto: The rabid toy, ed. by Rita Gnutzmann. Madrid: Cátedra Letras Hispánica 2011 (sixth edition). [ISBN: 978-84-376-0511-1] - The seven madmen, ed. by Flora Guzmán. Madrid: Cátedra Letras Hispánica 2011 (fifth edition). [ISBN: 978-84-376-1119-8]"""
text

In [None]:
# Extract 1- to 3-word keyphrases from the lower-cased text, passing NLTK's
# English stop-word list to KeyBERT.
from keybert import KeyBERT
# Imported here because this cell uses `stopwords` before the NLTK import cell
# further down; without it a top-to-bottom run fails with NameError.
from nltk.corpus import stopwords

kw_model = KeyBERT()
keywords = kw_model.extract_keywords(text.lower(), keyphrase_ngram_range=(1, 3), stop_words=stopwords.words("english"))

In [None]:
# Show the extraction result; element 0 of each entry is the phrase that the cells below search for.
keywords

In [None]:
from nltk.corpus import stopwords
# NOTE(review): sent_tokenize is not used in any of the cells shown here.
from nltk.tokenize import sent_tokenize, word_tokenize
# Peek at the first ten entries of NLTK's English stop-word list.
stopwords.words("english")[:10]

In [None]:
# Collect (token index, token) pairs for every token that is not a stop word.
#
# The lower-cased text is tokenised because
#   * KeyBERT was given text.lower(), so capitalised stop words ("This", "The",
#     "It") have to be dropped here as well, and
#   * the indices are later applied to word_tokenize(text.lower()), so they must
#     come from exactly that token sequence.
# The stop-word list is loaded once into a set instead of once per token.
english_stopwords = set(stopwords.words("english"))
indwords = [
    (ind, word)
    for ind, word in enumerate(word_tokenize(text.lower()))
    if word not in english_stopwords
]
# Transpose into parallel tuples: original token positions and the tokens themselves.
inds, words = list(zip(*indwords))
# Tokens are joined with single spaces later on, so none may contain a space itself.
assert not any(" " in i for i in words)
words[:10]

In [None]:
# Lower-cased, space-joined text with the stop words removed; the keyphrases are searched in this string.
withoutstops = " ".join(words).lower()

In [None]:
# For every keyphrase, record the (start, end) character offsets of its first
# occurrence inside the stop-word-free text.
start_positions = [
    ((start := withoutstops.find(phrase)), start + len(phrase))
    for phrase in (entry[0] for entry in keywords)
]
start_positions

In [None]:
# Sanity check: slicing with the recorded offsets should reproduce the keyphrases.
[withoutstops[begin:end] for begin, end in start_positions]

In [None]:
# The number of spaces before a phrase's start offset is the index of its first
# word within the stop-word-free word list.
start_indices_withoutstops = [
    withoutstops.count(" ", 0, begin) for begin, _end in start_positions
]
start_indices_withoutstops

In [None]:
# Rebuild every keyphrase from the full token sequence: start at the original
# position of its first word and run up to the first occurrence of its last
# word, so the stop words in between are included again.
lowered_tokens = word_tokenize(text.lower())  # tokenise once, not once per keyphrase
actual_keyphrases = []
for start_ind, (startpos, stoppos) in zip(start_indices_withoutstops, start_positions):
    full_phrase = withoutstops[startpos:stoppos]
    last_word = full_phrase.split(" ")[-1]
    # inds maps a position in the stop-word-free word list back to the full token list.
    from_start = lowered_tokens[inds[start_ind]:]
    # NOTE(review): .index() raises ValueError when last_word does not occur in
    # from_start, e.g. for a phrase that find() could not locate (offset -1).
    actual_keyphrases.append(" ".join(from_start[:from_start.index(last_word)+1]))

actual_keyphrases

In [None]:
# Check, phrase by phrase, whether the rebuilt keyphrase occurs verbatim in the lower-cased text.
[phrase in text.lower() for phrase in actual_keyphrases]

# Putting the steps above into one function:

In [None]:
# Sample course description passed to extract() in the cells below.
text = """This seminar will be based on two novels by the Argentine writer Roberto Arlt: "The Rabid Toy" (1926) and "The Seven Fools" (1929) and will be devoted to researching these novels under aspects taken from cultural and literary studies. This is how the question of the socio-historical and cultural context of Argentina at the beginning of the 20th century will come into play, as well as of social and cultural changes or scenes of individual identity crisis in a society that is increasingly directed and driven by money. , for constant failures, unstoppable progress but without prospects for the individual. It will try to investigate how Arlt's texts collect these themes and how he represents them, that is, the central method will be discourse analysis and the starting point will be the question of how to represent and capture the profound changes of that time in literature. essential attendance: 1) reading of the texts until the first class 2) regular and active attendance 3) organization of a class in groups (2-3 people, 60 min. in total) & written work of 12-15 pages The seminar will take place in Spanish (with the possibility of clarifying some points in German if necessary). Recommended editions: Arlt, Roberto: The rabid toy, ed. by Rita Gnutzmann. Madrid: Cátedra Letras Hispánica 2011 (sixth edition). [ISBN: 978-84-376-0511-1] - The seven madmen, ed. by Flora Guzmán. Madrid: Cátedra Letras Hispánica 2011 (fifth edition). [ISBN: 978-84-376-1119-8]"""
text

In [None]:
from keybert import KeyBERT
from nltk.corpus import stopwords as nlstopwords
from nltk.tokenize import word_tokenize

# KeyBERT instance (default constructor arguments) handed to extract() below.
kw_model = KeyBERT()
# Maps the language codes accepted by extract() to NLTK stop-word corpus names.
stopwordlanguages = {"en": "english", "de": "german"}
# Language code used for the example call below.
lang = "en"

def extract(kw_model, text: str, lang: str = "en") -> list:
    """Return KeyBERT keyphrases of *text* as literal substrings of ``text.lower()``.

    The stop-word list is passed to KeyBERT, so a multi-word candidate it
    returns (e.g. "novels argentine writer") need not occur verbatim in the
    text.  Each candidate is therefore mapped back to the span of the
    lower-cased text that runs from its first word to its last word, with the
    stop words and punctuation in between included again.

    Args:
        kw_model: Object with a KeyBERT-style ``extract_keywords`` method whose
            result entries carry the phrase at index 0.
        text: Document to extract keyphrases from.
        lang: Key of the module-level ``stopwordlanguages`` mapping.

    Returns:
        The lower-cased keyphrases, without duplicates, in the order in which
        the candidates were first returned by ``kw_model``.  Candidates that
        cannot be located in the text are omitted.

    Raises:
        AssertionError: If *lang* is not a key of ``stopwordlanguages``.
    """
    import re

    assert lang in stopwordlanguages, f"unsupported language: {lang!r}"
    stopwords = nlstopwords.words(stopwordlanguages[lang])
    stopword_set = set(stopwords)  # O(1) membership tests below

    # A dict keeps insertion order, so duplicates are dropped without the
    # run-to-run reordering a set of strings would introduce.
    candidates = {}
    for nwords in range(1, 4):
        found = kw_model.extract_keywords(
            text, keyphrase_ngram_range=(1, nwords), stop_words=stopwords
        )
        for entry in found:
            candidates.setdefault(entry[0], None)

    lowered = text.lower()
    # Tokenise the way scikit-learn's default CountVectorizer does (lower-case,
    # words of two or more characters, stop words dropped before n-grams are
    # formed) and keep the match objects for their character offsets.
    # NOTE(review): assumes kw_model uses that default vectorizer — confirm if
    # a custom vectorizer is ever passed to KeyBERT.
    content = [
        match
        for match in re.finditer(r"\b\w\w+\b", lowered)
        if match.group() not in stopword_set
    ]
    content_words = [match.group() for match in content]

    actual_keyphrases = []
    for phrase in candidates:
        wanted = phrase.split()
        if not wanted:
            continue
        width = len(wanted)
        # Whole-word match of the candidate against consecutive content words;
        # the first occurrence wins.
        for first in range(len(content_words) - width + 1):
            if content_words[first:first + width] == wanted:
                begin = content[first].start()
                end = content[first + width - 1].end()
                # Slicing the lower-cased text guarantees a real substring.
                actual_keyphrases.append(lowered[begin:end])
                break
    return actual_keyphrases


In [None]:
# Run the combined function on the sample text and display the resulting phrases.
keyphrases = extract(kw_model, text, lang)
keyphrases

In [None]:
# True only if every returned keyphrase occurs verbatim in the lower-cased text.
all(phrase in text.lower() for phrase in keyphrases)