## There are some instructions you need to follow:
<li> You only need to write your code in the comment area "Your Code Here".</li>
<li>Do not upload your own file. Please make the necessary changes in the Jupyter notebook file already present in the server.</li>
<li>Please note, there are several cells in the Assignment Jupyter notebook that are empty and read only. Do not attempt to remove them or   edit them. They are used in grading your notebook. Doing so might lead to 0 points.</li>

In [11]:
import nltk
import os
import _sqlite3
from nltk.corpus import PlaintextCorpusReader
from nltk import sent_tokenize,word_tokenize 
from gensim import corpora, models, similarities
from gensim.models.ldamodel import LdaModel
from gensim.parsing.preprocessing import STOPWORDS
from gensim.similarities.docsim import Similarity

# Question 1

In [12]:
"""
Question 1

Write a function that takes the file name of the Wikipedia page containing all Greek ancient
philosophers (saved as "Index.html" in your workspace) and returns a list tuples containing 
the name of the philosopher and the path to its individual article file.

Example of use: get_philosophers("Index.html")

The output should be a list of tuples:

[('Acrion', 'Philosophers/Acrion.html'),
 ('Adrastus of Aphrodisias', 'Philosophers/Adrastus of Aphrodisias.html'),
 ('Aedesia', 'Philosophers/Aedesia.html'),
 ('Aedesius', 'Philosophers/Aedesius.html'),
 ('Aeneas of Gaza', 'Philosophers/Aeneas of Gaza.html'),
 ('Aenesidemus', 'Philosophers/Aenesidemus.html'),
 ...]
 
  
NOTE: For processing speed purposes, the table in "Index.html" has been shortened compared
to the one online on wikipedia.org. Do not worry if you do not find some philosophers in 
your results, this is made on purpose. 

"""

def get_philosophers(filename):
    
    import codecs
    from bs4 import BeautifulSoup
    f = codecs.open(filename, 'r', 'utf-8')
    soup = BeautifulSoup(f.read(),'lxml')
    
    philosopher_list = []
    
    tableRows = soup.find_all('table',class_='wikitable sortable')[0]
    data_rows = tableRows.find_all('tr')[1:]
    philosopher_name = [] #list containing each philosopher's name
    # grabs philosopher name from title an appends to list
    for data in data_rows:
        philosopher_name.append(data.find('a').get('title'))

    # adding path and filename at the end of philosopher's name
    for name in philosopher_name:
        x = name
        y = name,"Philosophers/"+ x + '.html'
        philosopher_list.append(y)

    # returns philosopher list as tuple
    return philosopher_list

# Once done, try this:
filenames = get_philosophers("Index.html")
# filenames

# Question 2

In [13]:
"""
Question 2


Write a function that scrapes the text on a philosophers’s page and returns it as a text 
string. The input is the name of the file that contains the philosoph's page.

Example of use: get_text('Philosophers/Acrion.html')
should output the text of the page.
'Acrion was a Locrian and a Pythagorean philosopher...'
"""

def get_text(file):
    import codecs
    from bs4 import BeautifulSoup
    f = codecs.open(file, 'r', 'utf-8')
    page_soup = BeautifulSoup(f.read(),'lxml')

    all_text = ""

    tag = page_soup.find('p')
    all_text += tag.get_text()
    return all_text
# Once done, try this:
get_text("Philosophers/Acrion.html")

'Acrion was a Locrian and a Pythagorean philosopher.[1]  He is mentioned by Valerius Maximus[2] under the name of Arion. According to William Smith, Arion is a false reading, instead of Acrion.[3]\n'

# Question 3

In [26]:
"""
Question 3

Use the files under "Philosophers" folder to construct an LSI model.
Then, use the LSI model to find the most similar philosopher for each of the philosophers
found in Question 1, based on the content of their Wikipedia articles. You should not go
online to scrape the data; everything you need is in your Jupyter notebook working directory.

The function should have as input the list of tuples created in Question 1.

The output format should be a list of tuples too. Each tuple should contain a philosopher's name
and its most similar other philosopher. Please note both names can't be the same.

The output should look like that:

[('Acrion', 'Arignote'),
 ('Adrastus of Aphrodisias', 'Lycophron (Sophist)'),
 ('Aedesia', 'Heliodorus of Alexandria'),
 ('Aedesius', 'Chrysanthius'),
 ('Aeneas of Gaza', 'Archytas'),
 ...]


"""

def run(filenames):
    # return philosopher's filepath 
    philosopher_files = list()
    for filename in filenames:
        philosopher_files.append(filename[1])
     
    # grab philosophers' description
    documents = [get_text(philosopher_path) for philosopher_path in philosopher_files]
    
    # text processing
    strip_text_list = list()

    for philosopher_text in documents:
        striptext = philosopher_text.replace('\n\n', ' ')
        striptext = striptext.replace('\n', ' ')
        strip_text_list.append(striptext)
        
    # process text and split each word
    texts = [[word for word in document.lower().split()
         if word not in STOPWORDS and word.isalnum()]
         for document in strip_text_list]
    
    # create a dictionary (tokenizing)
    dictionary = corpora.Dictionary(texts)
    
    # create corpus
    corpus = [dictionary.doc2bow(text) for text in texts]
    
    
    # create LSI model
    lsi = models.LsiModel(corpus, id2word=dictionary,num_topics=7)
    
    # create list of philosophers description
    philosophers_description = []
    for philosopher_strip_text in strip_text_list:
        philosophers_description.append(philosopher_strip_text.split())

    # create bag of word for each philosopher
    philosophers_bow = []
    for single_description in philosophers_description:
        single_isalnum = []
        for word in single_description:
            if word.isalnum() == True:
                single_isalnum.append(word)
        philosophers_bow.append(single_isalnum)
        
    # vectorize philosopher bows
    sims_list = []
    for each_philosopher_bow in philosophers_bow:
        vec_bow = dictionary.doc2bow(each_philosopher_bow)
        vec_lsi = lsi[vec_bow]
        index = similarities.MatrixSimilarity(lsi[corpus])
        sims = index[vec_lsi]
        sims = sorted(enumerate(sims),key=lambda item: -item[1])
        sims_list.append(sims)
        
    # find position of all sims then name of philosopher
    sim_position_list = []
    for sim_position in sims_list:
        sim_position_list.append(sim_position[0][0])

    sim_philosopher_list = []
    for sim_position in sim_position_list:
        sim_philosopher_list.append(filenames[sim_position][0])
    
    # append philosopher name and similar philosopher to list
    philosopher_names = []
    for name in filenames:
        philosopher_names.append(name[0])

    final_output = [(x,y) for x,y in zip(philosopher_names,sim_philosopher_list)]
    return final_output
# Once done, try this:
run(filenames)

[('Acrion', 'Anaxilaus'),
 ('Adrastus of Aphrodisias', 'Alexander of Aegae'),
 ('Aedesia', 'Aedesia'),
 ('Aedesius', 'Asclepigenia'),
 ('Aeneas of Gaza', 'Aeneas of Gaza'),
 ('Aenesidemus', 'Aenesidemus'),
 ('Aesara', 'Apollonius Cronus'),
 ('Aeschines of Neapolis', 'Attalus (Stoic)'),
 ('Aeschines of Sphettus', 'Aeschines of Sphettus'),
 ('Aetius of Antioch', 'Aetius of Antioch'),
 ('Agapius (philosopher)', 'Asclepiodotus of Alexandria'),
 ('Agathobulus', 'Agathobulus'),
 ('Agathosthenes', 'Agathosthenes'),
 ('Agrippa the Skeptic', 'Agrippa the Skeptic'),
 ('Albinus (philosopher)', 'Albinus (philosopher)'),
 ('Alcinous (philosopher)', 'Alcinous (philosopher)'),
 ('Alcmaeon of Croton', 'Alcmaeon of Croton'),
 ('Alexamenus of Teos', 'Alexicrates'),
 ('Alexander of Aegae', 'Alexander of Aegae'),
 ('Alexander of Aphrodisias', 'Alexander of Aphrodisias'),
 ('Alexicrates', 'Alexicrates'),
 ('Alexinus', 'Alexinus'),
 ('Amelius', 'Asclepigenia'),
 ('Ammonius Hermiae', 'Apollonius of Tyana'),


In [15]:
###
### AUTOGRADER TEST - DO NOT REMOVE
###


In [16]:
###
### AUTOGRADER TEST - DO NOT REMOVE
###


In [17]:
###
### AUTOGRADER TEST - DO NOT REMOVE
###
