In [2]:
import os
from bs4 import BeautifulSoup
import re
from nltk.util import ngrams

In [3]:
# Text of every <body> element found in the .sgm files, in load order.
loaded_documents = []

DATA_DIR = "./reuters/"
EXTENSION = ".sgm"

# Show what is in the data directory (listing order is OS-dependent).
dir_entries = os.listdir(DATA_DIR)
print(dir_entries)

# Iterate in sorted order: os.listdir() returns entries in arbitrary order,
# and the later cells identify documents by their position in
# loaded_documents, so the load order has to be deterministic.
for entry in sorted(dir_entries):
    if not entry.endswith(EXTENSION):
        continue

    filename = os.path.join(DATA_DIR, entry)

    # errors='ignore' silently drops bytes that are not valid UTF-8.
    with open(filename, mode='r', encoding='utf-8', errors='ignore') as data_file:
        data = data_file.read()

    # The file is fully read at this point, so parsing happens after it is closed.
    soup = BeautifulSoup(data, 'html.parser')
    contents = soup.find_all('body')

    for content in contents:
        loaded_documents.append(content.text)


['all-exchanges-strings.lc.txt', 'all-orgs-strings.lc.txt', 'all-people-strings.lc.txt', 'all-places-strings.lc.txt', 'all-topics-strings.lc.txt', 'cat-descriptions_120396.txt', 'feldman-cia-worldfactbook-data.txt', 'lewis.dtd', 'README.txt', 'reut2-000.sgm', 'reut2-001.sgm', 'reut2-002.sgm', 'reut2-003.sgm', 'reut2-004.sgm', 'reut2-005.sgm', 'reut2-006.sgm', 'reut2-007.sgm', 'reut2-008.sgm', 'reut2-009.sgm', 'reut2-010.sgm', 'reut2-011.sgm', 'reut2-012.sgm', 'reut2-013.sgm', 'reut2-014.sgm', 'reut2-015.sgm', 'reut2-016.sgm', 'reut2-017.sgm', 'reut2-018.sgm', 'reut2-019.sgm', 'reut2-020.sgm', 'reut2-021.sgm', 'text-extractions.ipynb']


In [4]:
# Start from the raw loaded texts. This only binds a second name to the same
# list object; no copy is made.
documents = loaded_documents

def decontract(phrase):
    """Expand common English contractions in ``phrase``.

    The irregular forms "won't" and "can't" are matched case-insensitively
    and keep a leading capital ("Won't" -> "Will not"); previously a
    capitalised form fell through to the generic "n't" rule and produced
    "Wo not" / "Ca not".

    The generic suffix rules are purely textual. In particular every "'s" is
    rewritten to " is", so possessives are expanded as well
    ("dollar's" -> "dollar is").

    Args:
        phrase: Text to process.

    Returns:
        The text with the contractions replaced by their expanded forms.
    """
    def _keep_case(expansion):
        # Build a re.sub callback that capitalises the expansion when the
        # matched contraction started with an upper-case letter.
        def _replace(match):
            if match.group(0)[0].isupper():
                return expansion[0].upper() + expansion[1:]
            return expansion
        return _replace

    # specific: must run before the generic "n't" rule below
    phrase = re.sub(r"won\'t", _keep_case("will not"), phrase, flags=re.IGNORECASE)
    phrase = re.sub(r"can\'t", _keep_case("can not"), phrase, flags=re.IGNORECASE)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

def clean_doc(text):
    """Normalise a raw document into lower-case alphanumeric words.

    Steps, in order: strip links, strip tags, lower-case, expand
    contractions, and replace every run of non-alphanumeric characters with a
    single space.

    Lower-casing happens before ``decontract`` so that upper-case
    contractions such as "Won't" or "I'M" are expanded too instead of being
    split into fragments by the final substitution.

    Args:
        text: Raw document text.

    Returns:
        The cleaned text. It may start or end with a single space when the
        input started or ended with non-alphanumeric characters.
    """
    text = re.sub(r'http\S+', '', text)  # remove links
    text = re.sub(r'<.*?>', '', text)  # remove HTML tags (a tag spanning a newline is not matched)
    text = text.lower()  # lowercase before expanding contractions
    text = decontract(text)  # expand contractions
    text = re.sub(r'[^A-Za-z0-9]+', ' ', text)  # remove special characters
    return text

# Clean every document; the result is a new list.
documents = [clean_doc(raw_text) for raw_text in documents]


In [5]:
def wordcount_filter(text, min_words=700):
    """Tell whether ``text`` has at least ``min_words`` whitespace-separated words.

    Args:
        text: Document text, or ``None``.
        min_words: Minimum number of words required. Defaults to 700, the
            threshold that used to be hard-coded here.

    Returns:
        ``False`` when ``text`` is ``None`` or has fewer than ``min_words``
        words, ``True`` otherwise.
    """
    if text is None:
        return False
    return len(text.split()) >= min_words

# Keep only the documents that pass the word-count filter.
filtered_documents = [text for text in documents if wordcount_filter(text)]
print(len(filtered_documents))

237


In [6]:
def generate_ngrams(documents, n):
    """Collect the distinct word n-grams of each document.

    Each document is split on single spaces and empty tokens are discarded
    before the n-grams are built with ``ngrams``.

    Args:
        documents: Iterable of document strings.
        n: Number of tokens per n-gram.

    Returns:
        A list with one set of n-grams per document, in input order.
    """
    # Splitting on " " yields empty strings for repeated spaces; drop them.
    token_lists = ([word for word in text.split(" ") if word] for text in documents)
    return [set(ngrams(words, n)) for words in token_lists]

def calc_similarity(doc_id, documents, ngrams, top_count):
    """Rank the other documents by Jaccard similarity to document ``doc_id``.

    The similarity of two documents is the size of the intersection of their
    n-gram sets divided by the size of their union, expressed as a
    percentage. A pair whose n-gram sets are both empty scores 0.0 instead of
    raising ``ZeroDivisionError``.

    Args:
        doc_id: Index of the reference document.
        documents: Sequence of documents; only its length is used.
        ngrams: Sequence of n-gram sets, indexed like ``documents``. Inside
            this function the name shadows the ``ngrams`` function imported
            at module level.
        top_count: Maximum number of results to return.

    Returns:
        A list of at most ``top_count`` ``(index, similarity_percent)``
        tuples sorted by descending similarity. Ties keep ascending index
        order.
    """
    similarity_map = {}

    ngrams_for_document = ngrams[doc_id]

    # calc Jaccard Similarity
    for i in range(len(documents)):
        if i == doc_id:
            continue

        ngrams_for_other_doc = ngrams[i]
        shared_count = len(ngrams_for_document.intersection(ngrams_for_other_doc))
        union_count = len(ngrams_for_document.union(ngrams_for_other_doc))

        # An empty union means neither document has any n-gram.
        jaccard_similarity = shared_count / union_count if union_count else 0.0
        similarity_map[i] = jaccard_similarity * 100

    # Extract top
    sorted_similarity_map = sorted(similarity_map.items(), key=lambda x: x[1], reverse=True)

    return sorted_similarity_map[:top_count]


In [7]:
NGRAM_SIZE = 3  # number of words per n-gram
TOP_COUNT = 9   # number of most similar documents kept per document

calculated_ngrams = generate_ngrams(filtered_documents, NGRAM_SIZE)

# Pick the document whose weakest match among its top TOP_COUNT similar
# documents is the strongest. best_index stays -1 and best_doc_similarities
# stays empty when no document has a positive score.
best_last_similarity = 0
best_index = -1
best_doc_similarities = []

for i in range(len(filtered_documents)):
    similarity = calc_similarity(i, filtered_documents, calculated_ngrams, TOP_COUNT)

    # With a single document there is nothing to compare against.
    if not similarity:
        continue

    # similarity is sorted in descending order, so [-1] is the weakest match kept.
    if similarity[-1][1] > best_last_similarity:
        best_last_similarity = similarity[-1][1]
        best_index = i
        # Keep the winning list instead of recomputing it after the loop.
        best_doc_similarities = similarity

print(f'Best index: {best_index}. Related documents: {best_doc_similarities}')

Best index: 173. Related documents: [(175, 86.89740420271941), (183, 3.6036036036036037), (198, 2.172338884866039), (125, 2.072192513368984), (62, 2.030456852791878), (197, 1.6869095816464237), (109, 1.6284233900814211), (148, 1.5460295151089247), (102, 1.4522821576763485)]


In [49]:
# Best documents (document 173 followed by its nine most similar documents):
# 173, 175, 183, 198, 125, 62, 197, 109, 148, 102

top officials of leading industrial nations appear deeply worried that financial markets have ignored their efforts to coordinate policies which they believe they strengthened in talks last week monetary sources said officials were exasperated that the markets which drove the dollar rapidly lower and severely disrupted bond and stock markets too did not take heed of the policy commitments of the group of seven the united states japan west germany france britain italy and canada treasury secretary james baker went out of his way to reassure markets of his commitment to a stable dollar with a statement and french finance minister edouard balladur underscored that by saying i do not believe at all that the americans want a weaker dollar west german finance minister gerhard stoltenberg said the dollar is latest rapid descent involves the risk now already a tangible threat of a new strong surge of inflation leading to a renewed rise in interest rates but there were signs too that while poli

In [9]:
# The best document's index first, then the index of each related document
# (the first element of every similarity entry).
best_related_documents = [best_index, *(entry[0] for entry in best_doc_similarities)]

print(best_related_documents)

[173, 175, 183, 198, 125, 62, 197, 109, 148, 102]


In [10]:
OUTPUT_DIR = './documents/'

# open(..., mode='w') does not create missing directories, so make sure the
# target directory exists before writing.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Write each selected document to "<OUTPUT_DIR><doc_id>.txt".
for doc_id in best_related_documents:
    output_path = os.path.join(OUTPUT_DIR, f'{doc_id}.txt')
    # Explicit encoding so the result does not depend on the platform default.
    with open(output_path, mode='w', encoding='utf-8') as file:
        file.write(filtered_documents[doc_id])