In [7]:
import json

# Load the article records from disk. The encoding is pinned to UTF-8 because
# the text contains non-ASCII characters (e.g. em-dashes) and the platform
# default encoding is not UTF-8 everywhere.
with open("aquinas_new.json", "r", encoding="utf-8") as handle:
    data = json.load(handle)
data[0]

{'volume': 'Volume 1',
 'volumeKey': 'v1',
 'questionTitle': 'Question 1. The nature and extent of sacred doctrine',
 'question': 'Question 1.',
 'questionKey': 'q1',
 'articleTitle': 'Article 1. Whether, besides philosophy, any further doctrine is required?',
 'article': 'Article 1.',
 'articleKey': 'a1',
 'articleObjections': ['Objection 1. It seems that, besides philosophical science, we have no need of any further knowledge. For man should not seek to know what is above reason: "Seek not the things that are too high for thee" (Sirach 3:22). But whatever is not above reason is fully treated of in philosophical science. Therefore any other knowledge besides philosophical science is superfluous.',
  'Objection 2. Further, knowledge can be concerned only with being, for nothing can be known, save what is true; and all that is, is true. But everything that is, is treated of in philosophical science—even God Himself; so that there is a part of philosophy called theology, or the divine sc

In [11]:
# Flatten every article into one newline-separated document: question title,
# article title, objections, body, and replies to the objections, in that order.
# NOTE(review): the joins assume "articleBody" (like the other two fields) is a
# list of strings; if it were a single str, join would put a newline between
# every character -- confirm against the JSON schema.
corpus = []
for article in data:
    sections = [
        article["questionTitle"],
        article["articleTitle"],
        "\n".join(article["articleObjections"]),
        "\n".join(article["articleBody"]),
        "\n".join(article["articleReplyToObjections"]),
    ]
    corpus.append("\n".join(sections))
len(corpus)

3148

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn the vocabulary / IDF weights and build the document-term matrix in a
# single pass; calling fit() and then transform() on the same corpus would
# tokenize every document twice.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
fit = vectorizer  # kept for compatibility: `fit` previously held the fitted vectorizer
X.shape

(3148, 18878)

In [13]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between all rows of the TF-IDF matrix X
# (one row and one column per article).
S = cosine_similarity(X)
S.shape

(3148, 3148)

In [14]:
# Display the similarity matrix as the cell output.
S

array([[1.        , 0.4542111 , 0.46876597, ..., 0.14597611, 0.1963869 ,
        0.20251295],
       [0.4542111 , 1.        , 0.6913056 , ..., 0.10421883, 0.13060916,
        0.1421492 ],
       [0.46876597, 0.6913056 , 1.        , ..., 0.143358  , 0.15990545,
        0.1941399 ],
       ...,
       [0.14597611, 0.10421883, 0.143358  , ..., 1.        , 0.48288614,
        0.58769024],
       [0.1963869 , 0.13060916, 0.15990545, ..., 0.48288614, 1.        ,
        0.47758759],
       [0.20251295, 0.1421492 , 0.1941399 , ..., 0.58769024, 0.47758759,
        1.        ]])

In [14]:
import numpy as np
from sklearn.manifold import TSNE
# t-SNE is stochastic: pin the seed so the 2-D layout is reproducible across
# kernel restarts.
TSNE_RANDOM_STATE = 42
# Each row of S (an article's similarity to every article) is passed to t-SNE
# as that article's feature vector.
X_embedded = TSNE(n_components=2, random_state=TSNE_RANDOM_STATE).fit_transform(S)
X_embedded.shape

(3148, 2)

In [16]:
# Attach each article's 2-D t-SNE coordinates under an "embedding" key.
data_with_embeddings = []
for i, article in enumerate(data):
    # float(...) turns the NumPy scalars into plain Python floats; json.dump
    # cannot serialize NumPy float32 values.
    embedding = {"x1": float(X_embedded[i, 0]), "x2": float(X_embedded[i, 1])}
    # Shallow-copy the record instead of mutating `data`, so the loaded data
    # stays as displayed earlier and re-running this cell is idempotent.
    data_with_embeddings.append({**article, "embedding": embedding})
data_with_embeddings[0]

{'volume': 'Volume 1',
 'volumeKey': 'v1',
 'questionTitle': 'Question 1. The nature and extent of sacred doctrine',
 'question': 'Question 1.',
 'questionKey': 'q1',
 'articleTitle': 'Article 1. Whether, besides philosophy, any further doctrine is required?',
 'article': 'Article 1.',
 'articleKey': 'a1',
 'articleObjections': ['Objection 1. It seems that, besides philosophical science, we have no need of any further knowledge. For man should not seek to know what is above reason: "Seek not the things that are too high for thee" (Sirach 3:22). But whatever is not above reason is fully treated of in philosophical science. Therefore any other knowledge besides philosophical science is superfluous.',
  'Objection 2. Further, knowledge can be concerned only with being, for nothing can be known, save what is true; and all that is, is true. But everything that is, is treated of in philosophical science—even God Himself; so that there is a part of philosophy called theology, or the divine sc

In [None]:
import json
# Write the articles together with their embedding coordinates to disk.
# default=float lets the encoder convert NumPy float scalars (such as float32
# coordinates), which json cannot serialize natively.
with open("aquinas_new_with_embeddings.json", "w", encoding="utf-8") as handle:
    json.dump(data_with_embeddings, handle, default=float)

In [40]:
import numpy as np
# Indices of the ten highest-scoring columns in row 0 of S, best match first.
topHits = np.argsort(S[0])[::-1][:10]

In [41]:
# Print the title of each top hit, one per line, in ranked order.
for hit in topHits:
    hit_title = data[hit]["articleTitle"]
    print(hit_title)

Article 1. Whether, besides philosophy, any further doctrine is required?
Article 7. Whether God is the object of this science?
Article 4. Whether it is necessary to believe those things which can be proved by natural reason?
Article 3. Whether sacred doctrine is one science?
Article 4. Whether sacred doctrine is a practical science?
Article 2. Whether sacred doctrine is a science?
Article 5. Whether those things that are of faith can be an object of science [Science is certain knowledge of a demonstrated conclusion through its demonstration]?
Article 5. Whether sacred doctrine is nobler than other sciences?
Article 6. Whether this doctrine is the same as wisdom?
Article 2. Whether there are only three habits of the speculative intellect, viz. wisdom, science and understanding?


In [15]:
# For every article, record its nearest neighbors by cosine similarity.
NUM_NEIGHBORS = 9  # neighbors kept per article (a top-10 hit list minus the article itself)


def _article_keys(entry):
    """Return the volume/question/article keys that identify an article record."""
    return {"volumeKey": entry["volumeKey"],
            "questionKey": entry["questionKey"],
            "articleKey": entry["articleKey"]}


similarity_records = []
for i in range(len(S)):
    order = S[i, :].argsort()[::-1]  # article indices, most similar first
    # Drop the article itself by index rather than assuming it sorts first:
    # with tied scores (e.g. duplicated texts) another article can take slot 0,
    # which would leave the article listed as its own neighbor.
    neighbors = order[order != i][:NUM_NEIGHBORS]
    simObject = _article_keys(data[i])
    # Ranks start at 1 for the closest neighbor.
    simObject["ranks"] = [{"rank": rank, **_article_keys(data[hit])}
                          for rank, hit in enumerate(neighbors, start=1)]
    similarity_records.append(simObject)

print(len(similarity_records))

3148


In [16]:
# Persist the per-article neighbor lists as JSON.
with open("aquinas_similarity.json", "w") as out_file:
    json.dump(similarity_records, out_file)