# session 7 analyse AWOIF Characters pages

To analyze the pages we need to:

1. Read all the HTML and create a DOM tree with BeautifulSoup
2. Look inside the DOM tree to find the infobox
3. Inside the infobox, get the various fields that we need, including:
  * full name (title of the page)
  * caption name (title of the infobox)
  * aliases
  * books
  * titles
  * infobox items count (field of the infobox)
  * role (how character appears in the book, books field)
  * role score (score of the character role) 
  * page size (how many characters in the page) 
  * links of the page  

if no infobox is found, the character is discarded

The code uses a variety of techniques to extract, clean and split each field into a list. The biggest technical difficulty is the cleaning code:

_Aliases, titles and books come in a format like 'BranStark (appears)[N 1]' that needs to be transformed into ['Bran', 'Stark']. This is achieved with regular expressions matching certain patterns in the text. An example of the regex used here is available on regex101.com._

_books volume title is converted into a number with the use of a dictionary_

As usual we begin by declaring the various helper libraries we might need to achieve the analysis.
Typically we will need the DOM tree helper *BeautifulSoup*, regular expressions (*re*) and directory listing (*os*), as well as two self-defined dictionaries.

In [51]:
from bs4 import BeautifulSoup
import re
import os

# Lower-cased volume title -> volume number. The position of each title in
# the tuple is its number, so 'the world of ice & fire' is 0.
books_number = {
    title: number
    for number, title in enumerate((
        'the world of ice & fire',
        'a game of thrones',
        'a clash of kings',
        'a storm of swords',
        'a feast for crows',
        'a dance with dragons',
    ))
}

# Role label -> score. Keys are matched verbatim, so every spelling and
# capitalisation variant of a label gets its own entry.
role_scoring = {
    'Appears': 0.8,
    'Mentioned': 0.5,
    'POV': 1,
    'apendix': 0.2,
    'appears': 0.8,
    'appedix': 0.2,
    'appendi': 0.2,
    'appendix': 0.2,
    'mentioned': 0.5,
    'mentions': 0.5,
}

### infobox class
The class packs all the useful extraction steps into one neat package. `self` is a reference to the current instance of the class, needed to share data between its methods.

In [43]:
class Record:
    """Fields extracted from one character page and its infobox.

    The page HTML is parsed once in the constructor; ``analyze_infobox``
    then fills ``name``, ``fullname``, ``titles``, ``aliases``, ``books``,
    ``roleInBook`` and ``infoboxSize``. Fields that are not found keep
    their initial value (``None``, or ``0`` for ``infoboxSize``).

    NOTE(review): ``roleScore`` is initialised but never computed in this
    class, and ``informations()`` does not export ``links``,
    ``infoboxSize`` or ``roleScore``.
    """

    # Patterns are compiled once on the class. Calling the compiled
    # pattern's methods also avoids passing ``count``/``flags``
    # positionally to ``re.sub``, which is deprecated since Python 3.13.
    _ROLE_RE = re.compile(r"\((?P<role>\w+)\)")   # "(appears)" -> "appears"
    _ROLE_PAREN_RE = re.compile(r"\(\w+\)")       # separator between books
    _PAREN_RE = re.compile(r"\s?\([\w\s]+\)")     # any "(...)" annotation
    _REFERENCE_RE = re.compile(r"\[[\w\s]+\]")    # "[N 1]" style references
    _LOWER_UPPER_RE = re.compile(r"([a-z])([A-Z])")  # "BranStark" boundary

    def __init__(self, html):
        """Parse ``html`` with the ``html5lib`` parser and record page size."""
        self.name = None
        self.fullname = None
        self.titles = None
        self.aliases = None
        self.books = None
        self.roleInBook = None
        self.roleScore = None
        self.links = []
        self.infoboxSize = 0
        # Set by analyze_infobox(); initialised here so the attribute
        # exists even when that method has not run or found nothing.
        self.infobox = None
        self.soup = BeautifulSoup(html, 'html5lib')

        # Page size is the number of characters of the page's text.
        self.pageSize = len(self.soup.text)

    def informations(self):
        """Return the extracted fields as a plain dictionary."""
        return {'name': self.name,
                'fullname': self.fullname,
                'titles': self.titles,
                'aliases': self.aliases,
                'books': self.books,
                'roleInBooks': self.roleInBook,
                'pageSize': self.pageSize}

    def getLinks(self):
        """Append to ``self.links`` every link target that is a known page.

        The target is the last ``/``-separated segment of each ``href``.
        It is kept only if it appears in the module-level ``names``
        collection, which must be defined before this method is called.
        """
        for link in self.soup.find_all('a'):
            href = link.get('href')
            if href is None:
                continue
            target = href.split('/')[-1]
            if target in names:
                self.links.append(target)

    def clean_and_split_list_like_cell(self, llcell, books=False):
        """Split the text of a list-like infobox cell into clean items.

        With ``books=True`` the cell is treated as a list of book titles,
        each followed by a one-word role in parentheses. The roles are
        stored in ``self.roleInBook`` (only when at least one is found)
        and the titles are returned. Otherwise parenthesised annotations
        and bracketed references are removed and the text is split where
        a lower-case letter is directly followed by an upper-case one.

        In both modes items are stripped and empty items are dropped.
        """
        if books:
            roles = [m.group('role') for m in self._ROLE_RE.finditer(llcell)]
            if roles:
                self.roleInBook = roles

            # Drop "[N 1]" references so they do not end up glued to the
            # following title, then split on the "(role)" markers.
            without_refs = self._REFERENCE_RE.sub('', llcell)
            pieces = self._ROLE_PAREN_RE.split(without_refs)
            # The text after the last "(role)" is usually empty; keeping
            # it would produce a bogus extra book.
            return [p.strip() for p in pieces if p.strip()]

        text = self._PAREN_RE.sub('', llcell)
        text = self._REFERENCE_RE.sub('', text)
        # Mark each lowerUPPER boundary with '|' and split on it.
        pieces = self._LOWER_UPPER_RE.sub("\\1|\\2", text).split('|')
        return [p.strip() for p in pieces if p.strip()]

    def getBooks(self, value):
        """Store the volume numbers of the books listed in ``value``.

        Titles are looked up case-insensitively in the module-level
        ``books_number`` dictionary; titles that are not found map to 0.
        ``self.books`` is left untouched when no title is found.
        """
        titles = self.clean_and_split_list_like_cell(value, books=True)
        books = [books_number.get(title.lower(), 0) for title in titles]
        if books:
            self.books = books

    def getName(self):
        """Store the infobox caption text, if there is one, as ``name``."""
        try:
            self.name = self.infobox.caption.get_text().strip()
        except AttributeError:
            # No infobox, or an infobox without a caption.
            pass

    def getFullName(self, value):
        """Store ``value`` unchanged as ``fullname``."""
        self.fullname = value

    def getAliases(self, value):
        """Store the cleaned alias list, unless it is empty."""
        aliases = self.clean_and_split_list_like_cell(value)
        if aliases:
            self.aliases = aliases

    def getTitles(self, value):
        """Store the cleaned title list, unless it is empty."""
        titles = self.clean_and_split_list_like_cell(value)
        if titles:
            self.titles = titles

    def analyze_infobox(self):
        """Locate the infobox table and extract its fields.

        Returns immediately when the page has no ``table`` with class
        ``infobox``. Rows lacking either a ``th`` or a ``td`` are skipped.
        ``infoboxSize`` grows by the number of items found in every row.
        """
        self.infobox = self.soup.find("table", class_="infobox")
        if self.infobox is None:
            return

        self.getName()

        for tr in self.infobox.find_all('tr'):
            try:
                key = tr.th.get_text().strip().lower()
                value = tr.td.get_text().strip()
            except AttributeError:
                continue

            if 'full name' in key:
                self.getFullName(value)
                # NOTE(review): this adds the character count of the full
                # name, unlike the item counts used for the other rows.
                self.infoboxSize += len(self.fullname)
            elif 'alias' in key:
                self.getAliases(value)
                # The field stays None when the cell cleans to nothing.
                self.infoboxSize += len(self.aliases or ())
            elif 'title' in key:
                self.getTitles(value)
                self.infoboxSize += len(self.titles or ())
            elif 'book' in key:
                self.getBooks(value)
                self.infoboxSize += len(self.books or ())
            else:
                self.infoboxSize += len(
                    self.clean_and_split_list_like_cell(value))

In [46]:
# Build one information dictionary per HTML page in the current directory.
# `names` collects the part of each file name before its first dot.
records = []
names = []
for file in os.listdir(path='.'):
    if not file.endswith('.html'):
        continue

    with open(file) as fp:
        names.append(file.split('.')[0])
        html = fp.read()

    # Parsing happens after the file has been read and closed.
    record = Record(html)
    record.analyze_infobox()
    records.append(record.informations())

In [50]:
# Collect every distinct role label found across all records.
role = set()
for record in records:
    roles_in_books = record['roleInBooks']
    # 'roleInBooks' is None when no role was found for the page. Testing
    # for that directly, rather than swallowing every exception, lets
    # genuine errors surface.
    if roles_in_books:
        role.update(roles_in_books)
role

{'Appears',
 'Mentioned',
 'POV',
 'apendix',
 'appears',
 'appedix',
 'appendi',
 'appendix',
 'mentioned',
 'mentions'}

[]
