In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
import re
import os

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ethanhaley/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# reference for help:
# https://towardsdatascience.com/calculating-document-similarities-using-bert-and-other-models-b2c1a29c9630

In [3]:
stop_words_lower = set(stopwords.words('english'))

In [4]:
# make list of all files, one for each article in wikispeedia.
# NOTE: Hopefully there is an order/index here that matches with other files in project!
doctexts = os.listdir('./plaintext_articles/')

In [5]:
len(doctexts) # So far so good--this is the right number of articles at least

4604

Check if the indexes align.  `articles` file has index 6 attached to "10th_century" article

In [6]:
# hopefully "10th_century.txt"
doctexts[6]

'John_W._Campbell.txt'

OK well that's not good.  Next best hope is that "10th_century", exactly as in `articles`, at least has an exact name match in `doctexts`

In [7]:
"10th_century.txt" in doctexts

True

(breath of relief)

In [8]:
# Inspect and clean sample article
with open('plaintext_articles/10th_century.txt', 'r') as txt:
    doc = txt.readlines()

In [9]:
doc[:5]

['   #copyright\n',
 '\n',
 '10th century\n',
 '\n',
 '2007 Schools Wikipedia Selection. Related subjects: General history\n']

In [10]:
doc = ' '.join([line.strip('\n') for line in doc])

In [11]:
doc_clean = " ".join(re.sub(r'[^a-zA-Z]', ' ' , w).lower() for w in doc.split() if re.sub(r'[^a-zA-Z]',' ',w).lower() not in stop_words_lower)
doc_clean = re.sub(r'\s+', ' ' , doc_clean)
doc_clean

' copyright th century schools wikipedia selection related subjects general history millennium st millennium bc st millennium nd millennium centuries th century th century th century decades s s s s s s s s s s means recording passage time th century century lasted overview tenth century usually regarded low point european history china also period political upheaval islamdom however cultural zenith especially spain medievalist historian technology lynn white said to modern eye nearly darkest dark ages concluded dark darkness womb similarly helen waddell wrote tenth century in textbooks disputes seventh bad eminence nadir human intellect even fifteenth century lorenzo valla described century lead iron events statue yogini goddess created kaveripakkam tamil nadu india th century enlarge statue yogini goddess created kaveripakkam tamil nadu india th century beginning medieval warm period viking groups settle northern france norse become normans foundation cluny first federated monastic o

**We've removed all numbers there.  Seems like 4 digit years would be meaningful, but then what about 3 digit years like in this tenth-century article?  And how much info is gained by having specific years in the article, semantically?  All those individual "s"es, by the way, are from decades.  Are those meaningful without just being a central node through which any unrelated topic can pass?  Very open for debate....**

To join each article with other data in other notebooks, we'll make a DataFrame with `article` column and cleaned text column.  The word "cleaned" seems a bit dirty at this point, but nevermind.

In [12]:
# collect all article names and cleaned texts in lists to make a DF
# (((((THIS IS A BIT SLOPPY, BOTH PROCESS AND CODE, SO TAKES ABOUT 30-60 SECS)))))
names = []
texts = []
for dt in doctexts:
    # read in single textfile, into a list of lines
    with open(f'plaintext_articles/{dt}', 'r') as txt:
        doc = txt.readlines()
    # remove endlines
    doc = ' '.join([line.strip('\n') for line in doc])
    # eliminate stopwords and non-alpha characters
    doc_clean = " ".join(re.sub(r'[^a-zA-Z]', ' ' , w).lower() for w in doc.split() 
                         if re.sub(r'[^a-zA-Z]',' ',w).lower() not in stop_words_lower)
    # reduce long whitespace to single spaces
    doc_clean = re.sub(r'\s+', ' ' , doc_clean)
    # append article name and "clean" text to lists
    names.append(dt.rstrip('.txt'))
    texts.append(doc_clean)

In [13]:
len(names) == len(texts) == 4604

True

In [14]:
docDF = pd.DataFrame({'article': names, 'texts': texts})

In [15]:
docDF.head(2)

Unnamed: 0,article,texts
0,Second_Crusade,copyright second crusade schools wikipedia se...
1,Navassa_Island,copyright navassa island schools wikipedia se...


In [16]:
docDF.texts[0][:100]

' copyright second crusade schools wikipedia selection related subjects military history war religiou'

That constant reflexivity of "copyright" and "wikipedia" is probably just the tip of the annoying iceberg, but let's erase it since we easily can.

In [17]:
docDF.texts = docDF.texts.apply(lambda txt: txt.replace('copyright', '').replace
                                ('schools wikipedia selection related subjects', ''))

In [18]:
docDF.head(2)

Unnamed: 0,article,texts
0,Second_Crusade,second crusade military history war religio...
1,Navassa_Island,navassa island north american geography nav...


In [19]:
# clean up memory
texts = ''
names = ''

**Use Tfidf to encode documents for similarities**

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances

In [28]:
t_vec=TfidfVectorizer()
t_vec.fit(docDF.texts)
tfidf_vectors=t_vec.transform(docDF.texts)
# outer product to get all similarities into a matrix
pairwise_similarities=np.dot(tfidf_vectors, tfidf_vectors.T).toarray()
# somehow the euc.dists build in the outer product step??
pairwise_differences=euclidean_distances(tfidf_vectors)

In [31]:
# adapted/stolen from https://towardsdatascience.com/calculating-document-similarities-using-bert-and-other-models-b2c1a29c9630
def most_similar_N(doc_id, similarity_matrix, matrix, N):
    print(f'Document: {docDF.iloc[doc_id]["article"]}\n')
    print('Similar Documents:')
    if matrix == 'Cosine Similarity':
        similar_ix = np.argsort(similarity_matrix[doc_id])[::-1][:N+1]
    elif matrix == 'Euclidean Distance':
        similar_ix = np.argsort(similarity_matrix[doc_id])[:N+1] # same as previous line, essentially
    for ix in similar_ix:
        if ix == doc_id:  # don't measure self vs self
            continue
        print('\n')
        print(f'Document: {docDF.iloc[ix]["article"]}')
        print(f'{matrix} : {similarity_matrix[doc_id][ix]}')

# Second Crusade article, for example
most_similar_N(0, pairwise_similarities, 'Cosine Similarity', 20)
print("===========")
print("=========== Check Euclidean distances, in reverse, i.e. same order as above")
print("===========")
most_similar_N(0, pairwise_differences, 'Euclidean Distance', 20)       

Document: Second_Crusade

Similar Documents:


Document: First_Crusade
Cosine Similarity : 0.5637263879771479


Document: Crusades
Cosine Similarity : 0.5085422037504989


Document: Manuel_I_Komnenos
Cosine Similarity : 0.385435350924748


Document: Children%27s_Crusade
Cosine Similarity : 0.37271318568969597


Document: German_Crusade%2C_1096
Cosine Similarity : 0.3373274276031265


Document: Byzantine_Empire
Cosine Similarity : 0.2982105637354369


Document: Siege_of_Antioch
Cosine Similarity : 0.28376038533800585


Document: Frederick_II%2C_Holy_Roman_Emperor
Cosine Similarity : 0.2537583205723986


Document: Albigensian_Crusade
Cosine Similarity : 0.23347395069057522


Document: Louis_XIV_of_France
Cosine Similarity : 0.2135986288889125


Document: Richard_I_of_England
Cosine Similarity : 0.19852069910136796


Document: Jerusalem
Cosine Similarity : 0.1903148584833818


Document: List_of_French_monarchs
Cosine Similarity : 0.19025322019136176


Document: Damascus
Cosine Similarity 

**That just prints out the most similar documents, without assigning them to any variable.  The Euclidean Distance is just ordered opposite from the Cosine Similarity, so they measure the same thing, in reverse, but the scale is different.**

### TODO -- Try to measure document similarity with BERT.  Installing `sentence-transformers` and then importing `sentence_transformers` didn't find the module that installed