In [None]:
import random
import codecs
import os
import nltk
import gensim
from nltk.probability import FreqDist
from gensim import models
from gensim import similarities

# 1.0 - Seed Python's built-in `random` module with a fixed value.
# NOTE(review): nothing visible in this file calls `random` directly; confirm
# whether the gensim models need their own seeding for reproducible results.
random.seed(123)

## Part 1: Data loading and preprocessing

Partition document: This function partitions the document by paragraphs into separate documents.

In [None]:
# ---- Partition document collection
def partition(input_file):
    """Split a UTF-8 text file into paragraphs and write each to its own file.

    A paragraph is a run of consecutive non-blank lines. Its lines are
    stripped and joined with single spaces. Paragraph number ``i`` (counting
    from 1) is written to ``paragraph_<i>.txt`` in the current working
    directory.

    Args:
        input_file: Path of the UTF-8 encoded text file to partition.

    Returns:
        The list of paragraph strings, in the order they appear in the file.
    """
    # 1.1 - open and load utf-8 encoded file
    with codecs.open(input_file, "r", "utf-8") as file:
        lines = file.readlines()

    chunks = []
    paragraph = []
    # 1.2 - Partition file into separate paragraphs. Each paragraph will be a separate document.
    for line in lines:
        line = line.strip()
        if line:
            # Non-empty line: it belongs to the paragraph being collected.
            paragraph.append(line)
        elif paragraph:
            # Blank line after some text: the current paragraph is complete.
            chunks.append(' '.join(paragraph))
            paragraph = []
    # Add the last paragraph if the file doesn't end with an empty line
    if paragraph:
        chunks.append(' '.join(paragraph))

    for index, text in enumerate(chunks, start=1):
        output_filename = f"paragraph_{index}.txt"
        with codecs.open(output_filename, 'w', "utf-8") as out_file:
            out_file.write(text)

    print(f"Partitioned {len(chunks)} into seperate files.")

    # Hand the paragraphs back so callers can use them without re-reading the files.
    return chunks


def preprocess_collection(target_word, directory_path):
    """Delete paragraph files containing ``target_word`` and list the survivors.

    Only regular files in ``directory_path`` whose names start with
    ``paragraph_`` and end with ``.txt`` are considered.

    Args:
        target_word: Substring to look for (case-sensitive) in each file.
        directory_path: Directory holding the paragraph files.

    Returns:
        Sorted list of the file names (relative to ``directory_path``) that
        were kept. Files that were deleted are not included.
    """
    # List the requested directory (not the current working directory) and
    # sort so that the resulting document order is deterministic.
    candidates = sorted(
        name for name in os.listdir(directory_path)
        if name.startswith("paragraph_") and name.endswith(".txt")
        and os.path.isfile(os.path.join(directory_path, name))
    )

    collection = []
    # 1.3 - Remove paragraph (document) if it contains target word
    for name in candidates:
        path = os.path.join(directory_path, name)
        with open(path, 'r', encoding='utf-8') as f:
            contains_target = target_word in f.read()
        # Delete only after the handle is closed; removing an open file fails
        # on some platforms.
        if contains_target:
            os.remove(path)
        else:
            collection.append(name)

    return collection







Preprocess collection: This function removes all paragraphs containing the word "Gutenberg".

In [None]:
def preprocess_collection(target_word, directory_path):
    """Delete paragraph files containing ``target_word`` and list the survivors.

    Only regular files in ``directory_path`` whose names start with
    ``paragraph_`` and end with ``.txt`` are considered.

    Args:
        target_word: Substring to look for (case-sensitive) in each file.
        directory_path: Directory holding the paragraph files.

    Returns:
        Sorted list of the file names (relative to ``directory_path``) that
        were kept. Files that were deleted are not included.
    """
    # List the requested directory (not the current working directory) and
    # sort so that the resulting document order is deterministic.
    candidates = sorted(
        name for name in os.listdir(directory_path)
        if name.startswith("paragraph_") and name.endswith(".txt")
        and os.path.isfile(os.path.join(directory_path, name))
    )

    collection = []
    # 1.3 - Remove paragraph (document) if it contains target word
    for name in candidates:
        path = os.path.join(directory_path, name)
        with open(path, 'r', encoding='utf-8') as f:
            contains_target = target_word in f.read()
        # Delete only after the handle is closed; removing an open file fails
        # on some platforms.
        if contains_target:
            os.remove(path)
        else:
            collection.append(name)

    return collection

Tokenize: One function accepts a collection as parameter, and iterates through the collection by calling the function tokenize_doc, which accepts a document as a parameter.

In [None]:
# ---- Tokenize document collection
def tokenize(collection):
    """Tokenize every document in ``collection``.

    Each entry of ``collection`` is passed to ``tokenize_doc``; the resulting
    token lists are gathered in input order.

    Args:
        collection: Iterable of file paths accepted by ``tokenize_doc``.

    Returns:
        A list with one token list per document.
    """
    # Running token frequency count; it is filled here but not returned.
    fdist = FreqDist()
    processed_files = []

    for path in collection:
        tokens = tokenize_doc(path)

        # 1.7 - Add processed tokens to list and update fdist
        processed_files.append(tokens)
        fdist.update(tokens)

    return processed_files

def tokenize_doc(file):
    """Read a UTF-8 text file and return its lower-cased, stemmed tokens.

    The text is lower-cased, split with ``nltk.word_tokenize``, stripped of
    tokens that are exactly one of the listed punctuation marks, and stemmed
    with the Porter stemmer. The same steps are applied to documents and to
    the query file.

    Args:
        file: Path of the file to tokenize.

    Returns:
        List of stemmed tokens in reading order.
    """
    # Tokens equal to one of these marks are dropped before stemming.
    punctuation = {',', '.', ';', ':', '?', '!', '(', ')', '[', ']', '{', '}', '"', "'", "’"}
    stemmer = nltk.stem.PorterStemmer()

    with open(file, 'r', encoding='utf-8') as handle:
        # 1.5 - Convert to lower case
        text = handle.read().lower()

    # 1.4 - Tokenize words
    words = nltk.word_tokenize(text)

    # 1.6 - Stem tokens and remove punctuation
    return [stemmer.stem(word) for word in words if word not in punctuation]

## Part 2: Dictionary building

The following three functions are used to build a dictionary based on the processed files and stopwords, which in turn is used to create a bag-of-words. 

In [None]:
def build_dictionary(processed_files):
    """Build a gensim dictionary from tokenized documents, minus stopwords.

    Args:
        processed_files: List of token lists, one per document.

    Returns:
        A ``gensim.corpora.Dictionary`` from which every stopword returned by
        ``retrieve_stopwords`` that occurs in the vocabulary has been removed.
    """
    # 2.1 - Build the dictionary
    dictionary = gensim.corpora.Dictionary(processed_files)

    # Collect the ids of the stopwords that actually occur in the vocabulary.
    # A membership test replaces the former bare ``except: pass``, which would
    # also have hidden unrelated errors.
    # NOTE(review): the stopwords are used as read from the file; if the
    # documents were stemmed, stopwords whose stem differs from the word
    # itself will not match - confirm whether they should be stemmed too.
    token2id = dictionary.token2id
    stop_ids = [token2id[word] for word in retrieve_stopwords() if word in token2id]
    dictionary.filter_tokens(stop_ids)

    return dictionary

def to_bow(processed_files, dictionary):
    """Convert tokenized documents into bag-of-words vectors.

    Args:
        processed_files: List of token lists, one per document.
        dictionary: Object providing ``doc2bow`` (a gensim ``Dictionary``).

    Returns:
        List with one bag-of-words vector per document, as produced by
        ``dictionary.doc2bow``.
    """
    # 2.2 - Map paragraphs into Bags-of-Words
    # The dictionary must NOT be updated here: with ``allow_update=True`` every
    # token missing from the dictionary (i.e. the stopwords that were filtered
    # out earlier) would be added back to it.
    return [dictionary.doc2bow(tokens) for tokens in processed_files]


def retrieve_stopwords(path='./common-english-words.txt'):
    """Load a comma-separated stopword list.

    Args:
        path: Location of the stopword file. Defaults to the file downloaded
            from TextFixer:
            https://www.textfixer.com/tutorials/common-english-words.php

    Returns:
        List of stopwords in file order. Surrounding whitespace is stripped
        and empty entries (blank rows, doubled commas) are skipped.
    """
    stopwords = []
    # The context manager guarantees the handle is closed.
    with open(path, 'r', encoding='utf-8') as file:
        for row in file:
            for word in row.split(','):
                word = word.strip()
                if word:
                    stopwords.append(word)
    return stopwords


## Part 3: Retrieval models

The similarity function below creates the TF-IDF model from the bag-of-words corpus, which is used to initialize the LSI model.

In [None]:
def similarity(query, collection, num_topics=100, top_n=3):
    """Rank the documents in ``collection`` against ``query`` with LSI.

    Builds a dictionary and bag-of-words corpus, weights it with TF-IDF,
    trains an LSI model on the weighted corpus, prints the first 3 LSI topics
    and then prints the ``top_n`` paragraphs most similar to the query.

    Args:
        query: Tokenized query (list of tokens).
        collection: Tokenized documents (list of token lists).
        num_topics: Number of LSI topics to train. Defaults to 100.
        top_n: Number of best-matching paragraphs to print. Defaults to 3.

    Returns:
        None. Results are printed.
    """
    # 2.1 - Build dictionary
    dictionary = build_dictionary(collection)

    # 2.2 - Map paragraphs to Bag-of-Words
    corpus = to_bow(collection, dictionary)

    # -- TF-IDF conversion --
    # 3.1 - Initialize TF-IDF model using Bag-of-Words
    tfidf_model = gensim.models.TfidfModel(corpus, normalize=True)
    # 3.2 - Map bow into TF-IDF weights
    tfidf_corpus = tfidf_model[corpus]

    # --- LSI model ---
    # 3.4 - Initialize LSI model using the TF-IDF corpus
    lsi_model = gensim.models.LsiModel(tfidf_corpus, id2word=dictionary, num_topics=num_topics)
    lsi_corpus = lsi_model[tfidf_corpus]

    # ---- Model similarity ---
    # 3.3 - Construct MatrixSimilarity object of the LSI corpus
    index = gensim.similarities.MatrixSimilarity(lsi_corpus)

    # 3.5 - Printing first 3 LSI topics
    result = lsi_model.show_topics(3)
    print(f"LSI results: {result}")

    # 4.2 - Convert the query to LSI space. The LSI model was trained on
    # TF-IDF vectors, so the query has to be TF-IDF weighted as well before
    # it is projected; feeding it raw counts would compare vectors from
    # different spaces.
    query_vec = dictionary.doc2bow(query)
    lsi_vec = lsi_model[tfidf_model[query_vec]]

    # Perform similarity query against the corpus, best matches first
    sims = index[lsi_vec]
    sims = sorted(enumerate(sims), key=lambda kv: -kv[1])[:top_n]

    # 4.3 - Report the most relevant paragraphs for the query
    # NOTE(review): doc_position is the 0-based position of the document in
    # ``collection``, but it is used as a paragraph file number. The files
    # written by ``partition`` are numbered from 1 and some may have been
    # removed, so confirm that position and file number really line up.
    for doc_position, _doc_score in sims:
        paragraph = retrieve_paragraph(doc_position)
        print(f"[paragraph {doc_position}] \n {paragraph} \n ")
    
def retrieve_paragraph(doc_position):
    """
    Returns the start of the original, unprocessed paragraph file.

    Looks for ``paragraph_<doc_position>.txt`` in the current directory and
    returns at most its first five lines as one string, or None when the
    file does not exist.
    """
    filepath = os.path.join("./", f"paragraph_{doc_position}.txt")

    if not os.path.exists(filepath):
        return None

    with open(filepath, 'r', encoding='utf-8') as handle:
        # readline() yields '' once the file is exhausted, so short files
        # simply contribute nothing for the remaining reads.
        return "".join(handle.readline() for _ in range(5))


#### 3.5 - Results from printing the first 3 LSI topics:

LSI results: 
[
    (0, '0.116*"labour" + 0.109*"price" + 0.105*"their" + 0.103*"is" + 0.102*"employ" + 0.101*"hi" + 0.101*"produc" + 0.101*"countri" + 0.100*"it" + 0.100*"a"'), 
    (1, '-0.287*"labour" + -0.203*"rent" + -0.187*"stock" + -0.186*"land" + -0.180*"employ" + -0.178*"profit" + -0.160*"wage" + -0.155*"capit" + -0.153*"produc" + 0.145*"coloni"'), 
    (2, '0.342*"price" + 0.276*"silver" + 0.203*"quantiti" + 0.201*"gold" + 0.173*"coin" + 0.161*"valu" + 0.149*"corn" + 0.144*"money" + -0.143*"trade" + -0.140*"capit"')
]

From the results above, we can deduce the following information:

    Topic 0: Focuses mostly on labor, price, and country-related aspects. Words like "labour," "price," and "countri" have relatively high weights, suggesting that these are important concepts within this topic.
    
    Topic 1: Seems to concentrate on aspects of labor, but also includes terms related to rent, stock, land, and profits. All of these terms carry negative weights, whereas "coloni" is the only term with a positive weight, suggesting that it stands in contrast to the other terms in this topic or possibly that it is more related to another topic.

    Topic 2: Appears to be about currency and commodities, with "price," "silver," "gold," and "coin" being heavily weighted terms. The negative weights for "trade" and "capit" (capital) suggest these terms are less relevant to this particular topic.

This indicates that the numbers before each word (e.g., 0.116*"labour") are weights that signify the importance of each word in defining the respective topic. A higher absolute value of the weight indicates higher relevance to the topic. Negative values often indicate that the word is relevant but in a different context or opposite sense in comparison to other words in the topic.

# ----- PART 4: Querying ------

In [None]:


def preprocess_query(query):
    """
    Writes the query to a file and returns that file's name.

    The returned name is what gets passed on for tokenizing; an empty string
    is returned if the file is not found after writing.
    """
    # 4.1 - Store the query in a file so it can go through the same
    # tokenizing steps as the documents
    query_file = query_to_file(query)

    if not os.path.isfile(os.path.join("./", query_file)):
        return ''
    return query_file


def query_to_file(query):
    """
    Writes query to a file and returns the filename query.txt

    The file is created if missing and overwritten otherwise. It is written
    as UTF-8 so that it can be read back with ``encoding='utf-8'`` whatever
    the platform's default encoding is.
    """
    filename = "query.txt"

    # "w" both creates and truncates, so no exists-check is needed.
    with open(filename, "w", encoding="utf-8") as query_file:
        query_file.write(query)

    return filename


#### 4.3 - Results: the top 3 most relevant paragraphs for the query "What is the function of money?"

[paragraph 248] 
 That wealth consists in money, or in gold and silver, is a popular notion which naturally arises from the double function of money, as the instrument of commerce, and as the measure of value. In consequence of its being the instrument of commerce, when we have money we can more readily obtain whatever else we have occasion for, than by means of any other commodity. The great affair, we always find, is to get money. When that is obtained, there is no difficulty in making any subsequent purchase. In consequence of its being the measure of value, we estimate that of all other commodities by the quantity of money which they will exchange for. We say of a rich man, that he is worth a great deal, and of a poor man, that he is worth very little money. A frugal man, or a man eager to be rich, is said to love money; and a careless, a generous, or a profuse man, is said to be indifferent about it. To grow rich is to get money; and wealth and money, in short, are, in common language, considered as in every respect synonymous. 
 
[paragraph 807] 
 It would be too ridiculous to go about seriously to prove, that wealth does not consist in money, or in gold and silver; but in what money purchases, and is valuable only for purchasing. Money, no doubt, makes always a part of the national capital; but it has already been shown that it generally makes but a small part, and always the most unprofitable part of it. 
 
[paragraph 1487] 
 When, by any particular sum of money, we mean not only to express the amount of the metal pieces of which it is composed, but to include in its signification some obscure reference to the goods which can be had in exchange for them, the wealth or revenue which it in this case denotes, is equal only to one of the two values which are thus intimated somewhat ambiguously by the same word, and to the latter more properly than to the former, to the money’s worth more properly than to the money. 
 
 

In [None]:
def main():
    """Run the pipeline.

    In its current state only STEP 1 (partitioning the input text into
    paragraph files) is active; the later steps are kept below, commented
    out.
    """
    # Pipeline settings
    directory_path = "./"
    target_word = "Gutenberg"
    filename = "pg3300.txt"

    # Example queries
    query1 = "What is the function of money?"
    query2 = "How taxes influence Economics?"

    # STEP 1 - Partition the input file into one file per paragraph
    paragraphs = partition(filename)

    # STEP 2 - Filter collection (disabled)
    # processed_collection = preprocess_collection(target_word, directory_path)
    # processed_query = preprocess_query(query1)

    # STEP 3 - Tokenize (disabled)
    # tokenized_collection = tokenize(processed_collection)
    # tokenized_query = tokenize_doc(processed_query)

    # STEP 4 - Run similarity check (disabled)
    # similarity(tokenized_query, tokenized_collection)


In [None]:
# Call main() only when this file is executed as the main program,
# not when it is imported as a module.
if __name__ == "__main__":
    main()