In [None]:
from gensim.corpora import Dictionary

# Load the previously saved gensim Dictionary from disk. It is used later to
# turn tokenized text into bag-of-words vectors (doc2bow) and is passed to the
# LSI model as id2word.
dictionary = Dictionary.load('./tokened_corpus.dict')

In [None]:
import spacy

# Load spaCy's "en_core_web_sm" pipeline. custom_text_tokenizer runs text
# through it to get per-token flags (is_alpha, is_stop, is_punct) and lemmas.
nlp = spacy.load("en_core_web_sm")

In [None]:
def custom_text_tokenizer(text):
    """Tokenize ``text`` with the module-level spaCy pipeline ``nlp``.

    Returns a list with the lower-cased lemma of every token that is purely
    alphabetic, is not a stop word, is not punctuation, and is longer than
    one character. Tokens are kept in document order.
    """
    return [
        tok.lemma_.lower()
        for tok in nlp(text)
        if tok.is_alpha
        and not tok.is_stop
        and not tok.is_punct
        and len(tok.text) > 1
    ]

In [None]:
from gensim.corpora.mmcorpus import MmCorpus
from gensim.test.utils import datapath

# Open the pre-vectorized corpus stored on disk ('.mm' file); it is the
# training input for the LSI model built in a later cell.
vectorized_corpus = MmCorpus('./vectorized_corpus.mm')

In [None]:
from gensim import models

# Train a 50-topic LSI model on the vectorized corpus, using `dictionary` as
# the id -> word mapping.
# NOTE(review): the model is re-trained on every run, while the similarity
# index used for queries is loaded from disk in a later cell. If that index
# was built from a different trained model instance, query vectors and indexed
# vectors may not share the same topic space — confirm, or persist and load
# the model that produced the index.
lsi = models.LsiModel(vectorized_corpus, id2word=dictionary, num_topics=50)

In [None]:
from gensim import similarities

# Load the prebuilt similarity index from disk; it is queried with LSI vectors
# in print_top3_most_similar.
# NOTE(review): presumably built from the LSI vectors of the same documents,
# in the same order, as text_corpus — confirm.
lsi_similarity_index = similarities.MatrixSimilarity.load('./lsi_similarity.index')

In [None]:
import json


def json_save(data, filename):
    """Serialize ``data`` as pretty-printed JSON and write it to ``filename``.

    The file is written as UTF-8 with a four-space indent, and non-ASCII
    characters are emitted as-is instead of being escaped.
    """
    dump_options = {'ensure_ascii': False, 'indent': 4}
    with open(filename, mode='w', encoding='utf-8') as out_file:
        json.dump(data, out_file, **dump_options)


def json_load(filename):
    """Read and parse the JSON document stored at ``filename``.

    The file is decoded explicitly as UTF-8 so that it round-trips with
    ``json_save`` (which writes UTF-8 with ``ensure_ascii=False``) even on
    platforms whose default text encoding is not UTF-8.

    Returns the parsed Python object. Raises ``json.JSONDecodeError`` if the
    content is not valid JSON, and ``OSError`` if the file cannot be opened.
    """
    with open(filename, encoding='utf-8') as f:
        return json.load(f)

    
# Load the original documents. print_top3_most_similar indexes this by the
# position returned from the similarity index.
# NOTE(review): assumes document order here matches the order used to build
# the index — verify.
text_corpus = json_load('./text_corpus.json')

In [None]:
def print_top3_most_similar(example_doc):
    """Print the three corpus documents scoring highest against ``example_doc``.

    The query text is tokenized with ``custom_text_tokenizer``, converted to a
    bag-of-words vector via ``dictionary``, projected through ``lsi`` and
    looked up in ``lsi_similarity_index``. For each of the three best-scoring
    positions, the score and the matching ``text_corpus`` entry are printed.
    """
    query_tokens = custom_text_tokenizer(example_doc)
    query_lsi = lsi[dictionary.doc2bow(query_tokens)]

    # Pair every index position with its score, then order by descending score.
    ranked = list(enumerate(lsi_similarity_index[query_lsi]))
    ranked.sort(key=lambda pair: -pair[1])

    for position, score in ranked[:3]:
        print(score, text_corpus[position])

In [None]:
# Example query: print the three documents in text_corpus that score highest
# against this text in the LSI similarity index.
example_doc_1 = "Amazon Web Services"
print_top3_most_similar(example_doc_1)