# Extracting and storing RDF information and text

In [1]:
# # internal note: uncomment these rows to install packages... colaboratory says they 
# # are not present. we only get max 12 hour connection to the cloud each time. 
# # whenever we reconnect, we get a new instance of the virtual machine, which 
# # won't have these non-standard packages. 
# ! pip install wikipedia
# ! pip install wptools
# ! pip install SPARQLWrapper
# ! pip install textblob
# ! pip install nltk

import collections
import csv
import datetime
import random
import re
import string
import time
import urllib
import urllib.error
import urllib.parse
import urllib.request

import requests
import wikipedia
import wptools
from bs4 import BeautifulSoup
from pymongo import MongoClient
from rdflib import Graph
from SPARQLWrapper import JSON, SPARQLWrapper
from textblob import TextBlob

### General overview

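1. There are 2 sets of classes: one handles PG author-title mining, filtering and selection; the other pre-processes each author's books and collects the author's Wikipedia abstract and DBpedia literary-movement tags.
2. The resulting corpus is stored in MongoDB via pymongo.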

In [None]:
# TO DO LIST: 

# 2. write a function to randomly select min_books number of books from each author's pool of books. Effectively, this only affects 
# authors with more than the min_books level specified. 

# 4. complete populate_corpus in GutenbergCorpusBuilder

# 5. complete the database question - MongoDB (2 options - load a full GutenbergCorpusBuilder output, or author by author)

# 6. review and add to documentation and comments for clarity and completeness 

# 7. draw a diagram of the database model 



## DONE 
# 1. Complete comments and docstring for _cleansegment_book 

# 3. complete _get_literarymovement, _get_authorabstract, _build_subcorpus, populate_attributes and write_to_file of the Author class (note that for _get_authorabstract
# the GutenbergCorpusBuilder's _get_bookswiki_info method has been expanded to grab wikipedia links from the PG website, where available. These can be passed into 
# the wikipedia package to easily get the pages. However, a small problem exists: some of PG's wikilinks are percent-encoded (URL-escaped), 
# e.g. href="http://en.wikipedia.org/wiki/Fern%C3%A1n_Caballero"; passing "Fern%C3%A1n_Caballero" takes us to the wrong wikipedia page. Solved by decoding the name with urllib's unquote.)


### 1. A class to store a corpus obtained from the Project Gutenberg website. 

1. The corpus is built with functions within the class that filter the authors and books on the Project Gutenberg website. 
2. It also calls on the Author class, to process and generate information about sentences from an author's books. 

In [None]:
class GutenbergCorpusBuilder: 
    '''
    Collects author and book listings from the Project Gutenberg (PG) 'Browse by Author' pages, 
    filters them by language, author role and minimum number of books, and keeps the result 
    keyed by PG author number. 
    '''
    
    def __init__(self, corpusname):
        '''
        initiates a Gutenberg object which stores information about selected authors available on the 
        PG website. Authors are stored based on their unique PG numerical code. For each author,  
        filtered books and their respective PG URL is also stored. 
        inputs | corpusname:str - a free-text label for this corpus. 
        '''
        self.corpusname = corpusname
        # zero-padded date stamp such as "v20200105". A single now() call is used so that the 
        # year, month and day always come from the same instant, and the padding keeps e.g. 
        # 11 January and 1 November distinguishable. 
        self.corpusversion = datetime.datetime.now().strftime("v%Y%m%d")
        
        self.authors = {}
        # a dictionary nested with dictionaries. the top level keys are the unique numbers for authors 
        # on the Project Gutenberg website, the values are dictionaries containing author information,
        # names, books (in a dictionary). 
        
        self.corpus = {}
        # a dictionary keyed by the unique numbers for authors. each value is the Author instance 
        # built by populate_corpus for that author (its processed_subcorpus attribute holds the 
        # sentences selected from the author's filtered books). 
        
    
    def populate_corpus(self):
        '''
        for each of the author in self.authors, generate an Author class instance, populates all 
        attributes of the Author class, adds to self.corpus.
        
        inputs | nil (reads self.authors, so get_library should be run first)
        outputs | nil (results are saved to self.corpus, keyed by PG author number)
        
        '''
        for authornum, authorbooks_info in self.authors.items():
            author = Author(authorbooks_info)
            author.populate_attributes()
            self.corpus[authornum] = author
        
    
    def get_library(self, min_books = 1, languages = "all", roles = "all"):
        '''
        
        Goes through the PG website's 'sort by author' pages. Extracts author and corresponding book 
        information that meet a number of selection criterion (see inputs). 
        inputs | 
        1. min_books:int - the minimum number of books available for an author, which meets the languages 
        and roles parameters. default value is 1. 
        2. languages: either a str "all", or a list containing the languages to count towards 
        the author's min_books level. The list of languages available can be found here 
        https://www.gutenberg.org/catalog/. default is "all". 
        3. roles: either a str "all", or a list containing the roles that an author can have in a book, 
        written the way they appear in parentheses in the book listing, e.g. "as author". 
        default value is "all".
        outputs | saves the results to self.authors
        raises | requests.HTTPError if one of the PG pages answers with an error status. 
        
        '''
        charlist = list(string.ascii_lowercase) + ["other"]

        library = dict()
        for char in charlist:
            # Team comment: we select the authors and books via the "Browse by Author" lists instead of the 
            # "Browse by Books" list. Although the latter has a more predictable page structure 
            # (i.e. 1 book name, followed by 1 author name, recursively), the former includes 
            # information about the Author's role in the book. We believe that this could have
            # a meaningful impact on the predictive capabilities for models on different tasks, 
            # especially at larger scale.  
            link = 'https://www.gutenberg.org/browse/authors/'+ char
            page = requests.get(link)
            # an error page would otherwise be parsed as "0 authors" and silently leave a gap in the library
            page.raise_for_status()
            soup = BeautifulSoup(page.content, 'html.parser')
            
            # parse the page once per helper instead of once per returned list
            authornums, authornames = self._get_authors_numsnames(soup)
            books_info, wiki_info = self._get_bookswiki_info(soup)
            one_letter = self._unite_authors_nums_books(authornums, authornames, books_info, wiki_info,
                                                        min_books, languages, roles)
            
            library.update(one_letter)
            print("{} authors from the {} alphabetical category have been added. ".format(len(one_letter),char))
            
            # put the function to sleep for a randomised number of seconds (non-integer number between 2 and 8)
            # to mimic human surfing patterns. some ethical considerations here 
            time.sleep(random.uniform(2,8))
            
        self.authors = library 
    
    def _get_authors_numsnames(self, soup):
        '''
        A helper function for get_library. Extracts all author names from a BeautifulSoup copy  
        of a 'Browse by Author' page on the PG website. 
        inputs | soup:a BeautifulSoup object - containg a copy of the PG 'Browse by Author' page. 
        outputs | a tuple containing two lists. the first contains author numbers on the page, the second contains
        corresponding author names on the page. 
        '''
        # the author names are stored within the "name" attribute under each "a" class
        # use regex wildcard so that find_all will catch and return all "a names" with values
        authorname_BSlist = soup.find_all('a', {"name":re.compile(r"\w*")})

        # digits, commas, hyphens and question marks are removed from the link text (this drops 
        # the life dates that follow the name). .rstrip to remove trailing whitespaces. 
        authornames = [re.sub(r'[0-9,\-\?]*', '', authorname.text).rstrip()
                       for authorname in authorname_BSlist]

        # the author numbers are stored within the "href" attribute. Every line for a book 
        # on the page has a "title" attribute with the value "Link to this author". We will use
        # this to sift for only the lines with the the author number. 
        authornums_BSlist = soup.find_all('a', {"title":"Link to this author"})
        authornums = [authornum["href"].lstrip("#") for authornum in authornums_BSlist]

        return authornums, authornames

    @staticmethod
    def _wiki_language(PG_wikilink):
        '''
        A helper function for _get_bookswiki_info. 
        inputs | PG_wikilink:str - a wikipedia URL, e.g. "http://pt.wikipedia.org/wiki/X"
        outputs | str - the first label of the URL's host name, e.g. "pt". an empty string is returned 
        when the link has no host name. 
        '''
        hostname = urllib.parse.urlparse(PG_wikilink).hostname
        return hostname.split(".")[0] if hostname else ""

    @staticmethod
    def _wiki_name(PG_wikilink):
        '''
        A helper function for _get_bookswiki_info. 
        inputs | PG_wikilink:str - a wikipedia URL 
        outputs | str - the last path segment of the URL with percent-escapes decoded, e.g. 
        "Fern%C3%A1n_Caballero" becomes "Fernán_Caballero". 
        '''
        return urllib.parse.unquote(PG_wikilink.split("/")[-1])

    def _get_bookswiki_info(self, soup):
        '''
        A helper function for get_library. Extracts all the book titles and numbers from a 
        BeautifulSoup copy of a 'Browse by Author' page on the PG website. Also extracts author wikipedia 
        link information if it is available on the PG website. 
        inputs | soup:a BeautifulSoup object - containg a copy of the PG 'Browse by Author' page. 
        outputs | a tuple containing two lists, each with one entry per 'ul' tag on the page. 
        1. The first list contains dictionaries. Each dictionary maps a PG book number to 
        {"PG_booktitle": text of the book listing}. 
        2. The second list also contains dictionaries. Each dictionary maps a wikipedia language prefix to 
        {"PG_wikilink": full link, "PG_wikiname": decoded page name}. An author's wiki dictionary may be empty, 
        contain 1 link, or more than 1 link. 
        '''
        books_info = []
        wiki_info = []

        # content under the 'ul' tags: books, links as one list organized by ul
        for author in soup.find_all('ul'):
            # there are two classes of attributes within each ul tag. 
            # 1. title and book PG number is under the 'pgdbetext' class. 
            authorbooks_info = {}
            for book in author.find_all(class_='pgdbetext'):
                # the book numbers are stored in the href attribute. e.g. "ebooks/19323"
                booknum = book.find('a')['href'].split("/")[-1]
                authorbooks_info[booknum] = {"PG_booktitle": book.text}
            books_info.append(authorbooks_info)
            
            # 2. the wikipedia link(s) for the author is/are under the 'pgdbxlink' class. 
            authorwiki_info = {}
            for wiki in author.find_all(class_='pgdbxlink'):
                PG_wikilink = wiki.find('a')['href'] # get the whole link
                # the language is read from the host name. the previous character-class regex 
                # returned ".wikipedia" for prefixes starting with h, t or p (e.g. pt, pl, th, hu). 
                wikilang = self._wiki_language(PG_wikilink)
                # PG stores the link percent-encoded, which cannot be passed into the wikipedia 
                # package as a page name, so the name is decoded here. 
                authorwiki_info[wikilang] = {"PG_wikilink": PG_wikilink,
                                             "PG_wikiname": self._wiki_name(PG_wikilink)}
            wiki_info.append(authorwiki_info)
            
        return books_info, wiki_info

    @staticmethod
    def _as_filter_set(values):
        '''
        A helper function for _unite_authors_nums_books. 
        inputs | values: the str "all", a single str, or an iterable of str 
        outputs | None when values is "all" (meaning: do not filter), otherwise a set with the 
        lowercased values. a single str is treated as a one-item list rather than as a set of characters. 
        '''
        if isinstance(values, str):
            if values == "all":
                return None
            values = [values]
        return {value.lower() for value in values}
    
    def _unite_authors_nums_books(self, authornums, authornames, books_info, wiki_info, min_books = 1, 
                                     languages = "all", roles = "all"):
        '''
        A helper function for get_library. 
        inputs | 
        1. authornums:list - list of author numbers obtained from a "sort by author" page on the PG website. 
        2. authornames:list - list of author names obtained from a "sort by author" page on the PG website. 
        3. books_info: list - a list containing dictionaries, each of which has information about one author's books 
        4. wiki_info: list - a list containing dictionaries, each of which has information about one author's wikipedia
        page, as provided by the PG website. There may be none, one, or more wikilinks for an author. 
        5. min_books:int - the minimum number of books available for an author, which meets the languages 
        and roles parameters. default value is 1. None is treated as 0, i.e. no minimum. 
        6. languages:either a str "all", or a list containing the languages to count towards the author's 
        min_books level (compared case-insensitively). The list of languages available can be found here 
        https://www.gutenberg.org/catalog/ default is "all". 
        7. roles: either a str "all", or a list containing the roles that an author can have in a book, written 
        the way they appear in parentheses in the book listing, e.g. "as author" (compared case-insensitively). 
        default value is "all".
        outputs | a dictionary containing PG numbers for authors who meet the min_books, languages and roles requirements, 
        as well as information each of these author's books. The input lists and dictionaries are not modified. 
        raises | AssertionError if the four input lists do not have the same length. 
        '''
        # we want to be sure that the authornums, authornames, books_info, and wiki_info are aligned before proceeding 
        # to merge them. raised explicitly so that the check also runs under python -O. 
        if not (len(authornums) == len(authornames) == len(books_info) == len(wiki_info)):
            raise AssertionError("The length of authornums, authornames, books_info and wiki_info do not match.")

        if min_books is None:
            min_books = 0
        languages_set = self._as_filter_set(languages)
        roles_set = self._as_filter_set(roles)
            
        authorbooks_info = dict()
        for authornum, authorname, author_bookset, author_wikiset in zip(authornums, authornames, books_info, wiki_info):
            kept_books = {}
            for booknum, book in author_bookset.items():
                # using regex to find text in parentheses. book language e.g. (English) and author role 
                # e.g. (as Author) are contained in parentheses. Some books, that are part of a series, 
                # have (of N) in their titles too, where N is the number of books in that series. 
                title_text_in_parentheses = {
                    text.lower()
                    for text in re.findall(r'\(([a-zA-Z]+\s*[a-zA-Z]*[0-9]*)\)', book["PG_booktitle"])
                }
                
                # a book is kept only when it passes every active filter 
                if languages_set is not None and not title_text_in_parentheses & languages_set:
                    continue
                if roles_set is not None and not title_text_in_parentheses & roles_set:
                    continue
                kept_books[booknum] = book
                
            #check if number of books meeting the language and role requirements meet the min_book requirement 
            if len(kept_books) >= min_books:
                authorbooks_info[authornum] =\
                        {"authorname": authorname, "books_info": kept_books, "wiki_info": author_wikiset}
                    
        return authorbooks_info 
    
    def __str__(self):
        return "There are {} authors in this corpus".format(len(self.authors))

### 2. A class to store subcorpora obtained from the Project Gutenberg website for each Author. 

1. The subcorpus is built with functions within the class that pre-process each .txt file for an author's filtered books on the Project Gutenberg website. 
2. It also obtains the abstracts and literary movement tags for each author from Wikipedia and DBPedia respectively. 

In [None]:
class Author:
    '''
    Stores one Project Gutenberg (PG) author: the author's books, the sentences extracted from 
    those books, a wikipedia abstract and DBpedia tags. 
    '''
    
    def __init__(self, authorbooks_info):
        '''
        initiates the Author object from one author's record. 
        input | authorbooks_info: dict - must contain "authorname" (str) and "wiki_info" (dict). 
        may contain "books_info", a dict mapping a PG book number to {"PG_booktitle": title}. 
        '''
        self.name = authorbooks_info["authorname"]
        self.wiki_info = authorbooks_info["wiki_info"]
        
        # a dictionary with the keys as the book number and the value as the title of the book. 
        self.books = {booknum: book["PG_booktitle"]
                      for booknum, book in authorbooks_info.get("books_info", {}).items()}
        
        # a dictionary with the keys as the book number and the value as a list 
        # (containing the sentence strings returned by _cleansegment_book)
        self.processed_subcorpus = {}        
        
        self.authorabstract = "" 
        self.literarymovements = []
        
    def populate_attributes(self):
        '''
        A convenience function to call _build_subcorpus, _get_authorabstract and  _get_literarymovement, 
        which will respectively populate the processed_subcorpus, authorabstract and literarymovements
        attributes for this Author instance. All three steps access the network. 
        input | nil
        output | nil 
        '''
        self._build_subcorpus()
        # the languages looked up are the ones for which self.wiki_info holds a link 
        self._get_authorabstract(list(self.wiki_info), self.name)
        self._get_literarymovement()
    
    def _build_subcorpus(self,):
        '''
        A helper function for .populate_attributes. Runs _cleansegment_book on every book in self.books 
        and stores the resulting sentence list in self.processed_subcorpus under the book number. 
        A book whose .txt file cannot be downloaded is reported with print and skipped. 
        input | nil
        output | nil
        '''
        for position, booknum in enumerate(self.books):
            if position:
                # pause for a randomised number of seconds between two downloads, 
                # so as not to hammer the PG server 
                time.sleep(random.uniform(2,8))
            try:
                self.processed_subcorpus[booknum] = self._cleansegment_book(booknum)
            except urllib.error.URLError as error:
                print("Book {} could not be downloaded and was skipped: {}".format(booknum, error))

    @staticmethod
    def _trim_gutenberg_metadata(book_content):
        '''
        A helper function for _cleansegment_book. 
        input | book_content: list - the lines of a PG .txt file 
        output | list - the lines after the last "* START" line found in the first half of the file and 
        before the "* END" line found furthest from the end in the second half. When a marker is 
        missing, that side of the file is left untrimmed. 
        '''
        total = len(book_content)
        start_index = 0
        # defaults to the end of the file so that a file without an END marker is not emptied 
        stop_index = total
        
        # the front and the back halves are scanned in the same loop. (total + 1) // 2 makes sure 
        # the middle line of a file with an odd number of lines is looked at. 
        for offset in range((total + 1) // 2):
            if re.match(r'\*+\s*START ', book_content[offset]):
                start_index = offset + 1
            
            # offset 0 must be the LAST line of the file, hence total - 1 - offset 
            back_index = total - 1 - offset
            if re.match(r'\*+\s*END ', book_content[back_index]):
                stop_index = back_index

        return book_content[start_index:stop_index]

    def _cleansegment_book(self, booknum, urlpath = "https://www.gutenberg.org/files/{}/{}.txt"):
        '''
        takes a booknum, navigates to the PG page with the .txt file for this book. uses urlopen to retrieve 
        the contents of this file, removes the metadata before the "* START" line and after the "* END" 
        line, and splits the remaining text into sentences. 
        input | 
        1. booknum: the PG book number 
        2. urlpath: str - URL template, both placeholders are filled with booknum 
        output | a list of sentences (str). The first and last 5 sentences are dropped as a buffer 
        against left-over PG metadata. 
        '''    

        target_url = urlpath.format(booknum,booknum)
        
        with urllib.request.urlopen(target_url) as response: 
            # urlopen reads as bytes, to ease processing, we decode to string.
            # most PG .txt files are encoded in latin-1 format. 
            book_content = [line.decode("latin-1") for line in response]
        
        clean_book_content = self._trim_gutenberg_metadata(book_content)
        
        # join all the text without "\r\n" i.e. return carriage and newline. blank lines are left out. 
        joined_content = " ".join(line.strip("\r\n") for line in clean_book_content if line.strip())
        
        # splits the text into sentences using the TextBlob package, it in turns calls on the nltk package 
        textblob_sentsegs = [sentence.string for sentence in TextBlob(joined_content).sentences]
        
        # strip to first and last 5 lines (as a buffer to avoid collecting overflow PG metadata)
        return textblob_sentsegs[5:-5]

    def write_to_file(self, save_datapath):
        '''
        takes the list of sentences from a single book of an author (this is the value of the dictionary that is nested under an author's number in 
        self.processed_subcorpus) and writes it to a text file. 

        TODO: not implemented yet, calling this method currently does nothing. 
        '''
        pass
    
    def _get_authorabstract(self, languages, author_wikiname):
        '''
        A helper function for .populate_attributes. takes the author's wikiname passes it through the wikipedia package to
        retrieve the abstract of the author's wiki page for each of the languages passes. 
        input | 
        1. languages: list - a list of languages (use language prefixes listed here 
        https://meta.wikimedia.org/wiki/List_of_Wikipedias)
        2. author_wikiname: str - name of the author, obtained from the wikipedia links posted on the PG website. 
        it is used for the languages that have no "PG_wikiname" entry in self.wiki_info. 
        output | a list, containing the abstracts for an author in the order of the languages passed into the argument. 
        a language whose page is missing or ambiguous contributes an empty string. the list is empty when 
        self.wiki_info is empty. self.authorabstract is set to the first non-empty abstract. 

        '''
        if not self.wiki_info:
            self.authorabstract = "Project Gutenberg does not list any wikipedia pages for the author"
            return []
            
        abstracts = []
        for lang in languages: 
            wikipedia.set_lang(lang)
            # page names differ between wikipedia editions, so prefer the name PG lists for this language 
            wikiname = self.wiki_info.get(lang, {}).get("PG_wikiname", author_wikiname)
            try: 
                # auto_suggest is switched off because the exact page name is already known 
                wikipage = wikipedia.page(title=wikiname, auto_suggest=False)
                abstracts.append(wikipage.summary)
            except (wikipedia.exceptions.DisambiguationError, wikipedia.exceptions.PageError):
                abstracts.append("")
                
        self.authorabstract = next((abstract for abstract in abstracts if abstract), "")
        return abstracts
    
    def _literarymovement_query(self):
        '''
        A helper function for _get_literarymovement. 
        output | str - the SPARQL query, restricted to writers whose foaf:name equals self.name 
        '''
        # backslashes and double quotes are escaped so the name cannot break out of the SPARQL string literal 
        safe_name = self.name.replace("\\", "\\\\").replace('"', '\\"')
        return """
        SELECT DISTINCT ?genre
        WHERE {
        ?writer rdf:type dbo:Writer ;
        foaf:name ?name ;
        dbo:genre ?genre .
        FILTER (STR(?name) = "%s")
        }
        """ % safe_name
    
    def _get_literarymovement(self):
        '''
        A helper function for .populate_attributes. takes an author's name, makes a DBpedia query 
        with the name using the SPARQLWrapper package, 
        returns the dbo:genre values that the author is associated with. 

        input | nil (uses self.name)
        output | list - the dbo:genre values found. the same list is saved to self.literarymovements. 
        
        NOTE(review): the name is matched exactly against foaf:name. presumably the PG spelling of 
        a name can differ from the DBpedia one, in which case nothing is found - to be confirmed. 
        '''

        sparql = SPARQLWrapper("http://dbpedia.org/sparql")
        sparql.setQuery(self._literarymovement_query())
        sparql.setReturnFormat(JSON)
        results = sparql.query().convert()

        self.literarymovements = [result["genre"]["value"] for result in results["results"]["bindings"]]
        return self.literarymovements


### 3. pymongo implementation to store the corpus. 


In [None]:

# instantiate a MongoClient object. using the URI for the Mongo server. If it is locally hosted, 
# it is by default on the 27017 port. If using cloud, use the provided URI
client = MongoClient('mongodb://localhost:27017/')

corpusdb = client["corpus"]
authorcollection = corpusdb["author"]

# insert the documents into the collection. NOTE: PGcorpus must already exist, i.e. the 
# "Code execution" cell has to be run before this one. 
for author_record in PGcorpus.authors.values(): 
    # insert_one adds an "_id" key to the dictionary it is given. a shallow copy is inserted so 
    # that PGcorpus.authors stays unchanged and this cell can be run again without sending 
    # the same "_id" twice. 
    authorcollection.insert_one(dict(author_record))

# some test code to check insertions 
print(authorcollection.find_one())
print(authorcollection.estimated_document_count())

### 4. Code execution

In [None]:
if __name__ == "__main__":
    # Step 1: create the corpus builder for English-language books written "as author".
    PGcorpus = GutenbergCorpusBuilder("PG-eng-author-min2")

    # Step 2: scrape the Project Gutenberg author pages and keep only authors
    # with at least two matching books.
    PGcorpus.get_library(roles=["as author"], languages=["english"], min_books=2)

    # Steps not enabled yet:
    #   - PGcorpus.populate_corpus(): read text files, select and pre-process
    #     sentences, store them as subcorpora
    #   - write the subcorpora to file
    #   - import to mongoDB, export the mongoDB database
  

### 5. Test code - informal

In [None]:
# Language check: every stored title should mention "English", so nothing should be printed.
for author_record in PGcorpus.authors.values():
    for book_record in author_record["books_info"].values():
        listed_title = book_record["PG_booktitle"]
        if "English" not in listed_title:
            print(listed_title)

# Role check: every stored title should mention "Author", so nothing should be printed.
for author_record in PGcorpus.authors.values():
    for book_record in author_record["books_info"].values():
        listed_title = book_record["PG_booktitle"]
        if "Author" not in listed_title:
            print(listed_title)