In [None]:
from pathlib import Path

In [None]:
# Directory holding the input files, given relative to the notebook's working directory.
input_data_path = Path("../data")

In [None]:
import pandas as pd

In [None]:
# Load the raw spreadsheet from the configured data directory.
df = pd.read_excel(input_data_path / "input.xlsx")
# DEV ONLY: iterate on a reproducible random subset; remove this line for a full run.
# The size is capped at len(df) because DataFrame.sample raises ValueError when
# asked for more rows than the frame holds (sampling is without replacement).
df = df.sample(min(10000, len(df)), random_state=42)
df.head(3)

In [None]:
# Empty spreadsheet cells are read back as NaN (a float), which the spaCy
# pipeline refuses to process. Drop those rows and coerce any remaining
# non-string cell to str so every corpus entry is tokenizable text.
text_corpus = df['texts'].dropna().astype(str).values.tolist()

In [None]:
import spacy

# This notebook only reads lexical flags (is_alpha / is_stop / is_punct) and
# lemmas from the parsed documents, so the named-entity recognizer is disabled:
# it runs independently of the other components and only adds per-document cost.
# The parser is left enabled on purpose, since it can influence English POS
# mapping and therefore the lemmas.
nlp = spacy.load("en_core_web_sm", disable=["ner"])

In [None]:
def custom_corpus_tokenizer(text_corpus):
    """Lazily tokenize every document of a corpus.

    Args:
        text_corpus: Iterable of raw documents; each item is handed unchanged
            to ``custom_text_tokenizer``.

    Yields:
        The value returned by ``custom_text_tokenizer`` for each document, in
        corpus order.
    """
    yield from map(custom_text_tokenizer, text_corpus)

def custom_text_tokenizer(text):
    """Turn one raw document into a list of normalized tokens.

    The text is run through the module-level ``nlp`` pipeline. A token is kept
    only when it is alphabetic, is not a stop word, is not punctuation and is
    longer than one character; each kept token contributes its lower-cased
    lemma.

    Args:
        text: Raw document passed to ``nlp``.

    Returns:
        list: Lower-cased lemmas of the retained tokens, in document order.
    """
    return [
        tok.lemma_.lower()
        for tok in nlp(text)
        if tok.is_alpha
        and not tok.is_stop
        and not tok.is_punct
        and len(tok.text) > 1
    ]

In [None]:
# The tokenized corpus is consumed twice: once to build the dictionary and once
# to build the vectorized corpus. A generator would be exhausted after the first
# pass, so the tokens are materialized here. (To save RAM, the tokens could
# instead be streamed to disk and re-read for each pass.)
tokened_corpus = [*custom_corpus_tokenizer(text_corpus)]

In [None]:
from gensim import corpora

In [None]:
# Build the token <-> integer-id vocabulary from the tokenized documents.
dictionary = corpora.Dictionary(tokened_corpus)
# NOTE(review): this snapshot is written before the single-document tokens are
# filtered out in the following cells, so it holds the unfiltered vocabulary.
dictionary.save('./tokened_corpus.dict')

In [None]:
# Summary of the vocabulary before any filtering.
print(dictionary)
# Uncomment to inspect the full token -> id mapping: dictionary.token2id

In [None]:
# Ids of tokens whose document frequency is exactly one.
once_ids = [
    token_id
    for token_id, doc_count in dictionary.dfs.items()
    if doc_count == 1
]

In [None]:
# Remove the tokens collected in once_ids from the vocabulary (mutates dictionary in place).
dictionary.filter_tokens(bad_ids=once_ids)

In [None]:
# Reassign token ids so they are contiguous again after the removal above.
dictionary.compactify()
# Persist the final vocabulary. The snapshot written right after construction
# predates the filtering, so its token ids do not line up with the
# bag-of-words vectors that are built from this filtered dictionary.
dictionary.save('./tokened_corpus.dict')

In [None]:
# Summary of the vocabulary after filtering, for comparison with the earlier printout.
print(dictionary)

In [None]:
class VectorizedCorpus:
    """Re-iterable bag-of-words view of the tokenized corpus.

    Every iteration reads the module-level ``tokened_corpus`` and converts each
    tokenized document with the module-level ``dictionary``.
    """

    def __iter__(self):
        """Yield ``dictionary.doc2bow(doc)`` for each tokenized document, in order."""
        yield from (dictionary.doc2bow(doc_tokens) for doc_tokens in tokened_corpus)

In [None]:
# Materialize the bag-of-words vectors once; several later cells iterate over them.
vectorized_corpus = [*VectorizedCorpus()]

In [None]:
# Write the bag-of-words corpus to disk in Matrix Market format.
corpora.MmCorpus.serialize('./vectorized_corpus.mm', vectorized_corpus)

In [None]:
# Assuming that taking n-grams into account is not necessary for now

In [None]:
from gensim import models

# Fit a TF-IDF model on the bag-of-words corpus.
tfidf = models.TfidfModel(vectorized_corpus)

In [None]:
# Apply the TF-IDF model to every bag-of-words vector and keep the result in memory.
tfidf_vectorized_corpus = [*tfidf[vectorized_corpus]]

In [None]:
# TODO: Potentially omit tokens whose TF-IDF score falls below a chosen threshold

In [None]:
# Peek at the first few TF-IDF vectors only; echoing the whole list would flood
# the cell output with one entry per corpus document.
tfidf_vectorized_corpus[:3]

In [None]:
# Fit a 50-topic LSI model. NOTE(review): it is trained on the raw bag-of-words
# vectors (vectorized_corpus), not on the TF-IDF weighted ones — confirm this is intended.
lsi = models.LsiModel(vectorized_corpus, id2word=dictionary, num_topics=50)

In [None]:
# Project every bag-of-words vector through the LSI model and keep the result in memory.
lsi_vectorized_corpus = [*lsi[vectorized_corpus]]

In [None]:
# Using cosine similarity for now, but consider Jensen-Shannon

# Also compare lsi/lda/tfidf

#https://www.kaggle.com/ktattan/lda-and-document-similarity

In [None]:
from gensim import similarities

In [None]:
# lsi_vectorized_corpus already holds the documents in LSI space. Passing it
# through `lsi[...]` again would treat topic ids as word ids and index the
# wrong vectors, so the index is built directly from the projected documents.
# num_features is given explicitly so the index width always equals the number
# of LSI topics, even if no document has a non-zero weight on the last topic.
lsi_similarity_index = similarities.MatrixSimilarity(
    lsi_vectorized_corpus, num_features=lsi.num_topics
)

In [None]:
# Persist the similarity index next to the notebook.
lsi_similarity_index.save('./lsi_similarity.index')
# To reload it later: lsi_similarity_index = similarities.MatrixSimilarity.load('./lsi_similarity.index')

In [None]:
def print_top3_most_similar(example_doc):
    """Print the three corpus documents that score highest against a query text.

    The query is tokenized with ``custom_text_tokenizer``, converted to a
    bag-of-words vector with ``dictionary``, projected with ``lsi`` and looked
    up in ``lsi_similarity_index``. For each of the three best-scoring
    positions, the score and the matching entry of ``text_corpus`` are printed.

    Args:
        example_doc: Raw query text.
    """
    query_bow = dictionary.doc2bow(custom_text_tokenizer(example_doc))
    scores = lsi_similarity_index[lsi[query_bow]]

    def by_descending_score(pair):
        # Negating the score makes the ascending sort put the best match first.
        return -pair[1]

    ranked = sorted(enumerate(scores), key=by_descending_score)
    for position, score in ranked[:3]:
        print(score, text_corpus[position])

In [None]:
# Example doc to compare against lsi similarity index
example_doc_1 = "The company’s cloud services business combined with a surge in Prime subscriptions to increase revenue 31% year on year  This article is more than 3 years old  This article is more than 3 years old  Amazon Web Services, the company’s cloud service division, has long provided the infrastructure for vast retail websites and plucky startups alike, from Netflix and Airbnb to Nasa and the Royal Opera House, but is now seen as the company’s biggest driver of growth.  AWS combined with enthusiastic take-up of its premium Prime service to generate better than expected revenue for the second quarter of the year.  Total revenue reached $30.4bn, up 31% from the same period in 2015 and higher than analysts’ expectations of $29.55bn, while AWS surged 58.2% to $2.89bn – slightly higher than the estimate of $2.83bn predicted by market research firm FactSet StreetAccount.  Prime offers free shipping on products from the site as well as exclusive film and TV content, advertising-free content and unlimited photo storage. A dedicated promotional Prime Day on 12 July is also expected to help drive sales of between $31.0bn and $33.5bn for the current quarter.  The company’s net sales rose 31.1% to $30.40bn in the second quarter ending 30 June. Sales in North America, its biggest market, jumped 28.1% to $17.67bn.    Why so much coverage of Amazon Prime Day. The incentives, of course Read more  Amazon also saw its net profit reach a record high of $857m, continuing a relatively new strategy of recording profit rather than reinvesting in its business – though it last quarter committed to investing $5bn into its business in India.  It’s been a busy few months for Amazon around the world, and particularly in India, said the CEO, Jeff Bezos, in a statement.  
We launched a new AWS region, introduced Prime with unlimited free shipping, and announced that Prime Video is coming soon, offering Prime members in India exclusive access to Amazon Original Series and Movies – including original content featuring top Indian creators and talent.  The world’s biggest online retailer’s shares were up 2% in after-hours trading on Thursday."
# Query the index with the Amazon (company) earnings text and print its three best matches.
print_top3_most_similar(example_doc_1)

In [None]:
example_doc_2 = " The Amazon makes up almost a third of all tropical rainforests left on Earth, and is a vital carbon sink Three hundred million people worldwide live in forests and 1.6 billion depend on them for their livelihoods.Forests provide habitat for a vast array of plants and animals, many of which are still undiscovered.These ecosystems are so much more than a collection of trees, they are home to 80 per cent of the world’s terrestrial biodiversity.The Amazon Rainforest makes up nearly a third of all the tropical rainforests left on Earth and it plays a vital role in sustaining life on the planet to help stabilise the climate.Yet, this vast, tropical wilderness and its inhabitants are losing the fight for survival. The Amazon’s 2020 fire season is imminent and set to be at least as bad as last year.This follows continued deforestation and illegal invaders who take advantage of reduced law enforcement to snatch indigenous and protected lands, destroying this precious and irreplaceable natural habitat"
# Query the index with the Amazon rainforest text and print its three best matches.
print_top3_most_similar(example_doc_2)