In [130]:
import nltk
import os
import _sqlite3
from nltk.corpus import PlaintextCorpusReader
from nltk import sent_tokenize,word_tokenize 
from gensim import corpora, models, similarities
from gensim.models.ldamodel import LdaModel
from gensim.parsing.preprocessing import STOPWORDS
from gensim.similarities.docsim import Similarity

In [131]:
def get_philosophers(filename):
    
    import codecs
    from bs4 import BeautifulSoup
    f = codecs.open(filename, 'r', 'utf-8')
    soup = BeautifulSoup(f.read(),'lxml')
    
    philosopher_list = []
    
    tableRows = soup.find_all('table',class_='wikitable sortable')[0]
    data_rows = tableRows.find_all('tr')[1:]
    philosopher_name = [] #list containing each philosopher's name
    # grabs philosopher name from title an appends to list
    for data in data_rows:
        philosopher_name.append(data.find('a').get('title'))

    # adding path and filename at the end of philosopher's name
    for name in philosopher_name:
        x = name
        y = name,"Philosophers/"+ x + '.html'
        philosopher_list.append(y)

    # returns philosopher list as tuple
    return philosopher_list

# Once done, try this:
filenames = get_philosophers("Index.html")
# filenames

# Single-P Tag

In [132]:
# def get_text(file):
#     import codecs
#     from bs4 import BeautifulSoup
#     f = codecs.open(file, 'r', 'utf-8')
#     page_soup = BeautifulSoup(f.read(),'lxml')

#     all_text = ""

#     tag = page_soup.find('p')
#     all_text += tag.get_text()
#     return all_text
# get_text("Philosophers/Adrastus of Aphrodisias.html")

# Multiple-P tag

In [133]:
def get_text(file):
    import codecs
    from bs4 import BeautifulSoup
    f = codecs.open(file, 'r', 'utf-8')
    page_soup = BeautifulSoup(f.read(),'lxml')

    all_text = ""

    for tag in page_soup.find_all('p'):
        all_text += tag.get_text()
    return all_text
# Once done, try this:
get_text("Philosophers/Adrastus of Aphrodisias.html")

'Adrastus of Aphrodisias (Greek: Ἄδραστος ὁ Ἀφροδισιεύς; fl. 2nd century) was a Peripatetic philosopher who lived in the 2nd century AD.  He was the author of a treatise on the arrangement of Aristotle\'s writings and his system of philosophy, quoted by Simplicius,[1] and by Achilles Tatius.  Some commentaries of his on the Timaeus of Plato are also quoted by Porphyry,[2] and a treatise on the Categories of Aristotle by Galen. None of these have survived.[3] He was a competent mathematician, whose writings on harmonics are frequently cited by Theon of Smyrna in the surviving sections of his On Mathematics Useful for the Understanding of Plato.[4]  In the 17th century, a work by Adrastus on harmonics, Περὶ Ἁρμονικῶν ("On Harmonics"), was said by Gerhard Johann Vossius to have been preserved, in manuscript, in the Vatican Library, although the manuscript appears to be no longer extant, if indeed this was not an error on Vossius\' part.[5]\nAdrastus of Philippi is also reported by Stephan

In [134]:
# list of all philosophers' file path
philosopher_files = list()
for filename in filenames:
#     grabs filepath of philosopher
    philosopher_files.append(filename[1])

# list of philosophers' text 
documents = [get_text(philosopher_path) for philosopher_path in philosopher_files]

In [135]:
# # text preprocessing
# # philosopher_text_list = contains multiple indexes of philosophers' description
# strip_text_list = list()

# for philosopher_text in documents:
#     striptext = philosopher_text.replace('\n\n', ' ')
#     striptext = striptext.replace('\n', ' ')

# ##     tokenize all texts: only used when creating LDA (sent_tokenize)
# #     sentences = sent_tokenize(striptext)

#     strip_text_list.append(striptext)
# documents

In [136]:
# lowercase all characters, split the words and remove all special characters
# removed: "word not in STOPWORDS and"
texts = [[word for word in document.lower().split()
         if word.isalnum()]
         for document in documents]

# create word dictionary for each word in a text 
# tokenizing
dictionary = corpora.Dictionary(texts)

# create corpus: assigns ints to words
# https://radimrehurek.com/gensim/corpora/dictionary.html#gensim.corpora.dictionary.Dictionary.doc2bow
# returns (token_id,token_count)
corpus = [dictionary.doc2bow(text) for text in texts]


# texts

In [139]:
# train LSI model
# id2word takes a dictionary of words
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)
   
# create list of philosophers description
philosophers_description = []
for philosopher_strip_text in strip_text_list:
    philosophers_description.append(philosopher_strip_text.split())
   
# creating bow for all philosophers
philosophers_bow = []
for single_description in philosophers_description:
    single_isalnum = []
    for word in single_description:
        if word.isalnum() == True:
            single_isalnum.append(word)
    philosophers_bow.append(single_isalnum)

# vectorize new bow (bag of words)
sims_list = []
for each_philosopher_bow in philosophers_bow:
    vec_bow = dictionary.doc2bow(each_philosopher_bow)
    vec_lsi = lsi[vec_bow]
    index = similarities.MatrixSimilarity(lsi[corpus])
    sims = index[vec_lsi]
    sims = sorted(enumerate(sims),key=lambda item: -item[1])
    sims_list.append(sims)

# print(lsi.show_topics(num_topics=-1))
# philosophers_bow[0]

# Output most similar philosopher

In [140]:
# find position of all sims then name of philosopher
sim_position_list = []
for sim_position in sims_list:
    sim_position_list.append(sim_position[0][0])

sim_philosopher_list = []
for sim_position in sim_position_list:
    sim_philosopher_list.append(filenames[sim_position][0])

philosopher_names = []
for name in filenames:
    philosopher_names.append(name[0])
final_output = [(x,y) for x,y in zip(philosopher_names,sim_philosopher_list)]
final_output


[('Acrion', 'Acrion'),
 ('Adrastus of Aphrodisias', 'Archytas'),
 ('Aedesia', 'Aedesia'),
 ('Aedesius', 'Agapius (philosopher)'),
 ('Aeneas of Gaza', 'Tisias'),
 ('Aenesidemus', 'Xenophanes'),
 ('Aesara', 'Aesara'),
 ('Aeschines of Neapolis', 'Apollodorus of Athens'),
 ('Aeschines of Sphettus', 'Agathosthenes'),
 ('Aetius of Antioch', 'Aristippus the Younger'),
 ('Agapius (philosopher)', 'Attalus (Stoic)'),
 ('Agathobulus', 'Xenophanes'),
 ('Agathosthenes', 'Anaxilaus'),
 ('Agrippa the Skeptic', 'Xenocrates'),
 ('Albinus (philosopher)', 'Zenobius'),
 ('Alcinous (philosopher)', 'Basilides (Stoic)'),
 ('Alcmaeon of Croton', 'Aristippus'),
 ('Alexamenus of Teos', 'Albinus (philosopher)'),
 ('Alexander of Aegae', 'Alexinus'),
 ('Alexander of Aphrodisias', 'Atticus (philosopher)'),
 ('Alexicrates', 'Xeniades'),
 ('Alexinus', 'Zenodotus (philosopher)'),
 ('Amelius', 'Amelius'),
 ('Ammonius Hermiae', 'Anaxarchus'),
 ('Ammonius of Athens', 'Anniceris'),
 ('Ammonius Saccas', 'Aristo of Alexandr