In [None]:
import random
import gensim
import string
from itertools import groupby
import re
from nltk.stem.porter import PorterStemmer
random.seed(123)
stemmer = PorterStemmer()

In [None]:
with open("pg3300.txt", "r", encoding="utf-8") as file:
    fileString = file.read()

In [None]:
def paragraph(lines):
    for group_separator, line_iteration in groupby(lines.splitlines(True), key=str.isspace):
        if not group_separator:
            yield ''.join(line_iteration)

In [None]:
def make_paragraphs(file, filter_word):
    paragraph_list = []
    for p in paragraph(file):
        if filter_word.casefold() not in p.casefold():
            paragraph_list.append(p)
    return paragraph_list

In [None]:
def tokenize_document(documents):
    tokenized_documents = []
    for d in documents:
        # Also the punctuation is removed
        tokenized_documents.append(re.sub("[^\w]", " ", d).split())
    return tokenized_documents

In [None]:
def stem(document):
    stemmed_document = []
    for d in document:
        words_stemmed = []
        for word in d:
            words_stemmed.append(stemmer.stem(word).lower())
        stemmed_document.append(words_stemmed)
    return stemmed_document

In [None]:
documents = make_paragraphs(fileString, "Gutenberg")
# Copy of the original document
documents_edited = documents.copy()
documents_edited = tokenize_document(documents_edited)
documents_edited = stem(documents_edited)

In [None]:
stopString = 'a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,' \
             'cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,' \
             'how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,' \
             'not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,' \
             'their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,' \
             'who,whom,why,will,with,would,yet,you,your'
stop_word_list = stopString.split(',')
dictionary = gensim.corpora.Dictionary(documents_edited)

In [None]:
def stop_word_ids(stop_words, dictionary):
    ids = []
    for word in stop_words:
        try:
            ids.append(dictionary.token2id[word])
        except:
            pass
    return ids

In [None]:
bags=[]
# list of the id's
stop_ids = stop_word_ids(stop_word_list, dictionary)

# filter out the stopwords in the dictionary
dictionary.filter_tokens(stop_ids)

for p in documents_edited:
    bags.append(dictionary.doc2bow(p))

Task 3

In [None]:
# 3.1
tfidf_model = gensim.models.TfidfModel(bags)
# 3.2
tfidf_corpus = tfidf_model[bags]
# 3.3
matrix_sim = gensim.similarities.MatrixSimilarity(tfidf_corpus)
# 3.4
lsi_model = gensim.models.LsiModel(tfidf_corpus, id2word=dictionary, num_topics=100)
lsi_corpus = lsi_model[bags]
lsi_matrix = gensim.similarities.MatrixSimilarity(lsi_corpus)
# 3.5
print("First 3 LSI topics")
topics = lsi_model.show_topics(3)
for topic in topics:
    print(topic)



First 3 LSI topics
(0, '0.146*"labour" + 0.137*"price" + 0.127*"produc" + 0.127*"employ" + 0.122*"capit" + 0.121*"tax" + 0.121*"countri" + 0.118*"trade" + 0.118*"hi" + 0.115*"land"')
(1, '-0.258*"rent" + -0.231*"labour" + -0.207*"land" + 0.205*"silver" + 0.191*"gold" + -0.176*"profit" + -0.174*"stock" + -0.161*"employ" + -0.155*"capit" + 0.152*"coin"')
(2, '0.352*"price" + 0.227*"silver" + -0.211*"trade" + 0.199*"quantiti" + -0.167*"coloni" + 0.163*"labour" + 0.152*"valu" + 0.150*"gold" + -0.137*"capit" + 0.133*"corn"')


Task 4

In [None]:
def remove_punctuations_list(word_list):
    words = []
    for word in word_list:
        w = ""
        for char in word:
            if (string.punctuation + "\n\r\t").__contains__(char):
                if w != "":
                    words.append(w.lower())
                    w = ""
                continue
            w += char
        if w != "":
            words.append(w)
    return words

In [None]:
def stem_list(words):
    for i, word in enumerate(words):
        words[i] = stemmer.stem(word.lower())
    return words

In [None]:
def preprocessing(query):
    query = query.lower()
    query = query.split()
    query = remove_punctuations_list(query)
    query = stem_list(query)
    return query

In [None]:
# 4.1
query = "What is the function of money?"
query = preprocessing(query)
query = dictionary.doc2bow(query)

In [None]:
# 4.2
tfidf_index = tfidf_model[query]
print("\nTF_IDF Weights")
for word in tfidf_index:
    word_index = word[0]
    word_weight = word[1]
    print("index", word_index, ", word:", dictionary.get(word_index, word_weight), ", weight:", word_weight)



TF_IDF Weights
index 52 , word: money , weight: 0.3126887267826082
index 1153 , word: function , weight: 0.9498556522667386


In [None]:
# 4.3
print("\n Top 3 Relevant Documents", end="")
# similar documents
doc2sim = enumerate(matrix_sim[tfidf_index])
# sorting
top_results = sorted(doc2sim, key=lambda x: x[1], reverse=True)[:3]
# printing top 3 most relevant documents
for result in top_results:
    doc = documents[result[0]]
    doc = doc.split('\n')
    print("\n[Paragraph %d]" % result[0])
    # printing only 5 lines of the document
    for line in range(4):
      try:
        print(doc[line])
      except:
        pass



 Top 3 Relevant Documents
[Paragraph 682]
      The general stock of any country or society is the same with that of all
      its inhabitants or members; and, therefore, naturally divides itself into
      the same three portions, each of which has a distinct function or office.


[Paragraph 993]
      That wealth consists in money, or in gold and silver, is a popular notion
      which naturally arises from the double function of money, as the
      instrument of commerce, and as the measure of value. In consequence of its
      being the instrument of commerce, when we have money we can more readily

[Paragraph 817]
      Whatever part of his stock a man employs as a capital, he always expects
      it to be replaced to him with a profit. He employs it, therefore, in
      maintaining productive hands only; and after having served in the function
      of a capital to him, it constitutes a revenue to them. Whenever he employs


In [None]:
#4.4
print("\n[4.4.1 - Top 3 Topics with the most Significant Weights]",end="")
lsi_query = lsi_model[query]
topics = sorted(lsi_query, key=lambda kv: -abs(kv[1]))[:3]
for topic in enumerate(topics):
    t = topic[1][0]
    print("\n[Topic %d]" % t)
    print(lsi_model.show_topics()[t])

print("\n[4.4.2 - Top 3 Most Relevant Paragraphs]", end="")
lsi_doc2sim = enumerate(lsi_matrix[lsi_query])
lsi_documents = sorted(lsi_doc2sim, key=lambda kv: -abs(kv[1]))[:3]
for result in lsi_documents:
    doc = documents[result[0]]
    doc = doc.split('\n')
    print("\n[Paragraph %d]" %result[0])
    for line in range(5):
        print(doc[line])



[4.4.1 - Top 3 Topics with the most Significant Weights]
[Topic 4]
(4, '0.262*"bank" + 0.212*"circul" + -0.212*"price" + 0.181*"money" + 0.174*"capit" + -0.170*"corn" + 0.168*"gold" + -0.160*"import" + -0.160*"export" + 0.136*"coin"')

[Topic 12]
(12, '-0.353*"bank" + 0.209*"coin" + -0.184*"money" + -0.175*"tax" + -0.164*"commod" + 0.156*"profit" + -0.148*"paper" + 0.144*"duti" + 0.139*"silver" + 0.134*"gold"')

[Topic 16]
(16, '0.303*"coloni" + 0.263*"circul" + -0.190*"increas" + 0.178*"price" + -0.149*"coin" + -0.146*"cent" + 0.143*"money" + -0.143*"per" + 0.134*"work" + 0.130*"materi"')

[4.4.2 - Top 3 Most Relevant Paragraphs]
[Paragraph 993]
      That wealth consists in money, or in gold and silver, is a popular notion
      which naturally arises from the double function of money, as the
      instrument of commerce, and as the measure of value. In consequence of its
      being the instrument of commerce, when we have money we can more readily
      obtain whatever else we hav