In [1]:
import nltk
import spacy
from nltk.tokenize.toktok import ToktokTokenizer
from ebooklib import epub
import ebooklib
from bs4 import BeautifulSoup
import re
import itertools
import matplotlib.pyplot as plt
import numpy as np
from contractions import CONTRACTION_MAP
import unicodedata
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
nlp = spacy.load('en_core_web_md', parse=True, tag=True, entity=True)
tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')
stopword_list.remove('no')
stopword_list.remove('not')

In [3]:
def remove_accented_chars(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text


In [4]:
def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())), 
                                      flags=re.IGNORECASE|re.DOTALL)
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match)\
                                if contraction_mapping.get(match)\
                                else contraction_mapping.get(match.lower())                       
        expanded_contraction = first_char+expanded_contraction[1:]
        return expanded_contraction
        
    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text

In [5]:
def remove_special_characters(text, remove_digits=False):
    pattern = r'[^a-zA-z0-9\s]' if not remove_digits else r'[^a-zA-z\s]'
    text = re.sub(pattern, '', text)
    return text


In [6]:
def simple_stemmer(text):
    ps = nltk.porter.PorterStemmer()
    text = ' '.join([ps.stem(word) for word in text.split()])
    return text

In [7]:
def lemmatize_text(text):
    text = nlp(text)
    text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
    return text

In [8]:
def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text


In [9]:
def normalize_corpus(corpus,contraction_expansion=True,
                     accented_char_removal=True, text_lower_case=False, 
                     text_lemmatization=True, special_char_removal=True, 
                     stopword_removal=True, remove_digits=True):
    
    normalized_corpus = []
    # normalize each document in the corpus
    for doc in corpus:
        # remove accented characters
        if accented_char_removal:
            doc = remove_accented_chars(doc)
        # expand contractions    
        if contraction_expansion:
            doc = expand_contractions(doc)
        # lowercase the text    
        if text_lower_case:
            doc = doc.lower()
        # remove extra newlines
        doc = re.sub(r'[\r|\n|\r\n]+', ' ',doc)
        # lemmatize text
        if text_lemmatization:
            doc = lemmatize_text(doc)
        # remove special characters and\or digits    
        if special_char_removal:
            # insert spaces between special characters to isolate them    
            special_char_pattern = re.compile(r'([{.(-)!}])')
            doc = special_char_pattern.sub(" \\1 ", doc)
            doc = remove_special_characters(doc, remove_digits=remove_digits)  
        # remove extra whitespace
        doc = re.sub(' +', ' ', doc)
        # remove stopwords
        if stopword_removal:
            doc = remove_stopwords(doc, is_lower_case=text_lower_case)
            
        normalized_corpus.append(doc)
        
    return normalized_corpus

In [11]:
##Way of Kings processing
book = epub.read_epub('wok.epub')

toc = book.get_item_with_id('con01')

toc_pageSource = toc.get_content().decode('utf-8')

soup = BeautifulSoup(toc_pageSource, 'html.parser')

book1_chapters = { file['id'].replace('a',''):{'chapterName':re.sub(".*\: ","",file.text)} for file in soup.find_all('a') if 'id' in file.attrs  }

for k in ['pt01','pt02','pt03','pt04','pt05','pt06','pt07','pt08','pt09',]:
    book1_chapters.pop(k,None)

for chId in book1_chapters:
    item = book.get_item_with_id(chId)
    pageSource = item.get_content().decode('utf-8')
    soup = BeautifulSoup(pageSource, 'html.parser')
    book1_chapters[chId]['chapterHead'] = [para.text.strip()+'. ' for para in soup.find('div',attrs={'class':'chapterHead'}).find_all('p')]
    book1_chapters[chId]['chapterBody'] = [para.text.strip()+'. ' for para in soup.find('div',attrs={'class':'chapterBody'}).find_all('p')]
    print(chId,len(book1_chapters[chId]['chapterHead']),len(book1_chapters[chId]['chapterBody']))
    

fm02 0 45
ch01 3 129
ch02 3 142
ch03 3 143
ch04 3 123
ch05 3 113
ch06 3 134
ch07 3 221
ch08 3 186
ch09 3 236
ch10 3 67
ch11 1 73
ch12 3 112
ch13 1 67
ch14 1 32
ch15 1 73
ch16 2 254
ch17 2 118
ch18 2 142
ch19 2 312
ch20 1 189
ch21 2 210
ch22 2 293
ch23 2 193
ch24 1 30
ch25 2 183
ch26 2 183
ch27 2 209
ch28 2 132
ch29 1 114
ch30 2 191
ch31 2 306
ch32 2 330
ch33 1 88
ch34 1 69
ch35 1 83
ch36 3 252
ch37 3 121
ch38 1 61
ch39 3 156
ch40 3 201
ch41 3 74
ch42 3 46
ch43 3 179
ch44 1 229
ch45 3 63
ch46 3 87
ch47 3 173
ch48 1 81
ch49 3 205
ch50 3 176
ch51 1 173
ch52 3 273
ch53 3 254
ch54 1 165
ch55 3 138
ch56 3 111
ch57 3 52
ch58 1 72
ch59 1 48
ch60 1 59
ch61 1 79
ch62 3 233
ch63 3 82
ch64 3 188
ch65 3 192
ch66 3 118
ch67 3 327
ch68 3 232
ch69 3 229
ch70 3 150
ch71 3 130
ch72 3 169
ch73 3 81
ch74 3 112
ch75 3 104
ch76 3 69
ch77 3 287
ch78 3 204
ch79 3 258
ch80 1 85
ch81 1 86
ch82 1 42
ch83 1 149
ch84 1 31
ch85 1 70
bm01 1 49
end01 1 9
bm02 0 31


In [60]:
##Words of Radiants processing
book = epub.read_epub('wor.epub')

toc = book.get_item_with_id('toc')

toc_pageSource = toc.get_content().decode('utf-8')

soup = BeautifulSoup(toc_pageSource, 'html.parser')

book2_chapters = { re.sub(r'xhtml/(chapter\d+|inter\d+-\d+|epilogue|ars|endnotes|prologue).html.*','\g<1>',t['src']):{ 'chapterName': re.sub('\d+\. (.*)','\g<1>',t.find_parent().text ) } for t in soup.find_all('content') if re.match(r'xhtml/(chapter\d+|inter\d+-\d+|epilogue|ars|endnotes|prologue).html.*',t['src'])}

for chId in book2_chapters:
    item = book.get_item_with_id(chId)
    
    pageSource = item.get_content().decode('utf-8')
    soup = BeautifulSoup(pageSource, 'html.parser')
    cep  = soup.find('p',attrs={'class':'CEP'}).text.strip() if ( soup.find('p',attrs={'class':'CEP'}) ) else ''
    cepc =  soup.find('p',attrs={'class':'CEPC'}).text.strip() if ( soup.find('p',attrs={'class':'CEP'}) ) else ''
    book2_chapters[chId]['chapterHead'] = cep+'. '+cepc 
    co = soup.find('p',attrs={'class':'CO'}).text.strip() if ( soup.find('p',attrs={'class':'CO'}) ) else ''
    book2_chapters[chId]['chapterBody'] =   [co+'. '] + [para.text.strip()+'. ' for para in soup.find_all('p',attrs={'class':'TX'})]
    print(chId,len(book2_chapters[chId]['chapterHead']),len(book2_chapters[chId]['chapterBody']))

    
    



prologue 2 195
chapter1 252 248
chapter2 266 165
chapter3 350 79
chapter4 157 152
chapter5 168 285
chapter6 177 163
chapter7 257 113
chapter8 292 77
chapter9 226 121
chapter10 2 16
chapter11 294 162
chapter12 306 175
inter1-1 2 100
inter1-2 2 127
inter1-3 2 279
inter1-4 2 155
chapter13 2 111
chapter14 2 117
chapter15 2 78
chapter16 2 165
chapter17 2 187
chapter18 2 166
chapter19 2 56
chapter20 2 103
chapter21 2 89
chapter22 2 184
chapter23 2 66
chapter24 2 156
chapter25 2 171
chapter26 2 132
chapter27 2 30
chapter28 2 124
chapter29 2 97
chapter30 2 98
chapter31 2 93
chapter32 2 147
chapter33 2 119
chapter34 2 105
inter2-1 2 43
inter2-2 2 21
inter2-3 2 58
inter2-4 2 22
chapter35 474 138
chapter36 570 175
chapter37 454 94
chapter38 315 182
chapter39 2 118
chapter40 306 88
chapter41 528 256
chapter42 377 87
chapter43 472 154
chapter44 441 232
chapter45 2 247
chapter46 393 266
chapter47 235 116
chapter48 2 162
chapter49 307 232
chapter50 474 92
chapter51 443 93
chapter52 476 286
chapter53 

In [13]:
corpus = []
for chapter in chapters:
    corpus.append(chapters[chapter]['chapterName'])
    corpus.extend(chapters[chapter]['chapterHead'])
    corpus.extend(chapters[chapter]['chapterBody'])
    
clean_corpus = normalize_corpus(corpus)


560
The crowds thinned as they reached the upper quarter of the city, and eventually her porter pulled her to a massive building at the very apex of the city. Painted white, it was carved from the rock face itself, rather than built of bricks or clay. The pillars out front grew seamlessly from the stone, and the back side of the building melded smoothly into the cliff. The outcroppings of roof had squat domes atop them, and were painted in metallic colors. Lighteyed women passed in and out, carrying scribing utensils and wearing dresses like Shallan’s, their left hands properly cuff ed. The men entering or leaving the building wore military-style Vorin coats and stiff trousers, buttons up the sides and ending in a stiff collar that wrapped the entire neck. Many carried swords at their waists, the belts wrapping around the knee-length coats.. 
crowd thin reach upper quarter city eventually porter pull massive building apex city paint white carve rock face rather build brick clay pillar 

In [19]:
tokenised_sentences = [ sent.split() for sent in clean_corpus ]
vocab = list(set(sorted([ tok for sent in tokenised_sentences for tok in sent])))
print( "|N|="+str(len(tokenised_sentences))+"|V|="+str(len(vocab)) )


|N|=13052|V|=11962
