In [1]:
import nltk
import spacy
from nltk.tokenize.toktok import ToktokTokenizer
from ebooklib import epub
import ebooklib
from bs4 import BeautifulSoup
import re
import itertools
import matplotlib.pyplot as plt
import numpy as np
from contractions import CONTRACTION_MAP
import unicodedata
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
from matplotlib import pyplot as plt

In [2]:
nlp = spacy.load('en_core_web_md', parse=True, tag=True, entity=True)
tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')
stopword_list.remove('no')
stopword_list.remove('not')

In [3]:
def remove_accented_chars(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text


In [4]:
def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())), 
                                      flags=re.IGNORECASE|re.DOTALL)
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match)\
                                if contraction_mapping.get(match)\
                                else contraction_mapping.get(match.lower())                       
        expanded_contraction = first_char+expanded_contraction[1:]
        return expanded_contraction
        
    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text

In [5]:
def remove_special_characters(text, remove_digits=False):
    pattern = r'[^a-zA-z0-9\s]' if not remove_digits else r'[^a-zA-z\s]'
    text = re.sub(pattern, '', text)
    return text


In [6]:
def simple_stemmer(text):
    ps = nltk.porter.PorterStemmer()
    text = ' '.join([ps.stem(word) for word in text.split()])
    return text

In [7]:
def lemmatize_text(text):
    text = nlp(text)
    text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
    return text

In [8]:
def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text


In [35]:
def normalize_corpus(corpus,contraction_expansion=True,
                     accented_char_removal=True, text_lower_case=False, 
                     text_lemmatization=True, special_char_removal=True, 
                     stopword_removal=True, remove_digits=True):
    progress=0
    normalized_corpus = []
    # normalize each document in the corpus
    for doc in corpus:
        # remove accented characters
        if accented_char_removal:
            doc = remove_accented_chars(doc)
        # expand contractions    
        if contraction_expansion:
            doc = expand_contractions(doc)
        # lowercase the text    
        if text_lower_case:
            doc = doc.lower()
        # remove extra newlines
        doc = re.sub(r'[\r|\n|\r\n]+', ' ',doc)
        # lemmatize text
        if text_lemmatization:
            doc = lemmatize_text(doc)
        # remove special characters and\or digits    
        if special_char_removal:
            # insert spaces between special characters to isolate them    
            special_char_pattern = re.compile(r'([{.(-)!}])')
            doc = special_char_pattern.sub(" \\1 ", doc)
            doc = remove_special_characters(doc, remove_digits=remove_digits)  
        # remove extra whitespace
        doc = re.sub(' +', ' ', doc)
        # remove stopwords
        if stopword_removal:
            doc = remove_stopwords(doc, is_lower_case=text_lower_case)
            
        normalized_corpus.append(doc)
        progress+=1
        if progress%500==0:
            print(progress)
        
    return normalized_corpus

In [36]:
##Way of Kings processing
book = epub.read_epub('wok.epub')

toc = book.get_item_with_id('con01')

toc_pageSource = toc.get_content().decode('utf-8','ignore')

soup = BeautifulSoup(toc_pageSource, 'html.parser')

book1_chapters = { file['id'].replace('a',''):{'chapterName':re.sub(".*\: ","",file.text)} for file in soup.find_all('a') if 'id' in file.attrs  }

for k in ['pt01','pt02','pt03','pt04','pt05','pt06','pt07','pt08','pt09',]:
    book1_chapters.pop(k,None)

for chId in book1_chapters:
    item = book.get_item_with_id(chId)
    pageSource = item.get_content().decode('utf-8')
    soup = BeautifulSoup(pageSource, 'html.parser')
    book1_chapters[chId]['chapterHead'] = [para.text.strip()+'. ' for para in soup.find('div',attrs={'class':'chapterHead'}).find_all('p')]
    book1_chapters[chId]['chapterBody'] = [para.text.strip()+'. ' for para in soup.find('div',attrs={'class':'chapterBody'}).find_all('p')]
    print(chId,len(book1_chapters[chId]['chapterHead']),len(book1_chapters[chId]['chapterBody']))
    

fm02 0 45
ch01 3 129
ch02 3 142
ch03 3 143
ch04 3 123
ch05 3 113
ch06 3 134
ch07 3 221
ch08 3 186
ch09 3 236
ch10 3 67
ch11 1 73
ch12 3 112
ch13 1 67
ch14 1 32
ch15 1 73
ch16 2 254
ch17 2 118
ch18 2 142
ch19 2 312
ch20 1 189
ch21 2 210
ch22 2 293
ch23 2 193
ch24 1 30
ch25 2 183
ch26 2 183
ch27 2 209
ch28 2 132
ch29 1 114
ch30 2 191
ch31 2 306
ch32 2 330
ch33 1 88
ch34 1 69
ch35 1 83
ch36 3 252
ch37 3 121
ch38 1 61
ch39 3 156
ch40 3 201
ch41 3 74
ch42 3 46
ch43 3 179
ch44 1 229
ch45 3 63
ch46 3 87
ch47 3 173
ch48 1 81
ch49 3 205
ch50 3 176
ch51 1 173
ch52 3 273
ch53 3 254
ch54 1 165
ch55 3 138
ch56 3 111
ch57 3 52
ch58 1 72
ch59 1 48
ch60 1 59
ch61 1 79
ch62 3 233
ch63 3 82
ch64 3 188
ch65 3 192
ch66 3 118
ch67 3 327
ch68 3 232
ch69 3 229
ch70 3 150
ch71 3 130
ch72 3 169
ch73 3 81
ch74 3 112
ch75 3 104
ch76 3 69
ch77 3 287
ch78 3 204
ch79 3 258
ch80 1 85
ch81 1 86
ch82 1 42
ch83 1 149
ch84 1 31
ch85 1 70
bm01 1 49
end01 1 9
bm02 0 31


In [37]:
##Words of Radiants processing
book = epub.read_epub('wor.epub')

toc = book.get_item_with_id('toc')

toc_pageSource = toc.get_content().decode('utf-8')

soup = BeautifulSoup(toc_pageSource, 'html.parser')

book2_chapters = { re.sub(r'xhtml/(chapter\d+|inter\d+-\d+|epilogue|ars|prologue).html.*','\g<1>',t['src']):{ 'chapterName': re.sub('\d+\. (.*)','\g<1>',t.find_parent().text ) } for t in soup.find_all('content') if re.match(r'xhtml/(chapter\d+|inter\d+-\d+|epilogue|ars|prologue).html.*',t['src']) and 'image' not in t['src']}

for chId in book2_chapters:
    item = book.get_item_with_id(chId)
    pageSource = item.get_content().decode('utf-8')
    soup = BeautifulSoup(pageSource, 'html.parser')
    cep  = soup.find('p',attrs={'class':'CEP'}).text.strip() if ( soup.find('p',attrs={'class':'CEP'}) ) else ''
    cepc =  soup.find('p',attrs={'class':'CEPC'}).text.strip() if ( soup.find('p',attrs={'class':'CEP'}) ) else ''
    book2_chapters[chId]['chapterHead'] = cep+'. '+cepc 
    co = soup.find('p',attrs={'class':'CO'}).text.strip() if ( soup.find('p',attrs={'class':'CO'}) ) else ''
    book2_chapters[chId]['chapterBody'] =   [co+'. '] + [para.text.strip()+'. ' for para in soup.find_all('p')]
    print(chId,len(book2_chapters[chId]['chapterHead']),len(book2_chapters[chId]['chapterBody']))


prologue 2 198
chapter1 252 252
chapter2 266 171
chapter3 350 86
chapter4 157 158
chapter5 168 294
chapter6 177 167
chapter7 257 117
chapter8 292 81
chapter9 226 125
chapter10 2 19
chapter11 294 171
chapter12 306 183
inter1-1 2 102
inter1-2 2 129
inter1-3 2 283
inter1-4 2 157
chapter13 2 118
chapter14 2 125
chapter15 2 85
chapter16 2 172
chapter17 2 195
chapter18 2 173
chapter19 2 59
chapter20 2 110
chapter21 2 96
chapter22 2 194
chapter23 2 75
chapter24 2 165
chapter25 2 180
chapter26 2 143
chapter27 2 33
chapter28 2 133
chapter29 2 106
chapter30 2 108
chapter31 2 104
chapter32 2 160
chapter33 2 134
chapter34 2 116
inter2-1 2 45
inter2-2 2 23
inter2-3 2 61
inter2-4 2 24
chapter35 474 145
chapter36 570 181
chapter37 454 98
chapter38 315 186
chapter39 2 121
chapter40 306 92
chapter41 528 262
chapter42 377 91
chapter43 472 158
chapter44 441 236
chapter45 2 251
chapter46 393 270
chapter47 235 120
chapter48 2 165
chapter49 307 237
chapter50 474 98
chapter51 443 99
chapter52 476 310
chapter

In [38]:
corpus = []
for chapter in book1_chapters:
    corpus.append(book1_chapters[chapter]['chapterName'])
    corpus.extend(book1_chapters[chapter]['chapterHead'])
    corpus.extend(book1_chapters[chapter]['chapterBody'])
    
for chapter in book2_chapters:
    corpus.append(book2_chapters[chapter]['chapterName'])
    corpus.extend(book2_chapters[chapter]['chapterHead'])
    corpus.extend(book2_chapters[chapter]['chapterBody'])

print(len(corpus),corpus[45150:45159])

45159 ['FULL LASHING: BINDING OBJECTS TOGETHER. ', 'A Full Lashing might seem very similar to a Basic Lashing, but they worked on very different principles. While one had to do with gravitation, the other had to do with the force (or Surge, as the Radiants called them) of Adhesion—binding objects together as if they were one. I believe this Surge may have had something to do with atmospheric pressure.. ', 'To create a Full Lashing, a Windrunner would infuse an object with Stormlight, then press another object to it. The two objects would become bound together with an extremely powerful bond, nearly impossible to break. In fact, most materials would themselves break before the bond holding them together would.. ', 'REVERSE LASHING: GIVING AN OBJECT A GRAVITATIONAL PULL. ', 'I believe this may actually be a specialized version of the Basic Lashing. This type of Lashing required the least amount of Stormlight of any of the three Lashings. The Windrunner would infuse something, give a ment

In [39]:
clean_corpus = normalize_corpus(corpus)
print(len(clean_corpus),clean_corpus[45150:45159])

500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500
9000
9500
10000
10500
11000
11500
12000
12500
13000
13500
14000
14500
15000
15500
16000
16500
17000
17500
18000
18500
19000
19500
20000
20500
21000
21500
22000
22500
23000
23500
24000
24500
25000
25500
26000
26500
27000
27500
28000
28500
29000
29500
30000
30500
31000
31500
32000
32500
33000
33500
34000
34500
35000
35500
36000
36500
37000
37500
38000
38500
39000
39500
40000
40500
41000
41500
42000
42500
43000
43500
44000
44500
45000
45159 ['full lashing binding object together', 'full lashing may seem similar basic lashing work different principle one gravitation force surge radiants call adhesionbinding object together one believe surge may something atmospheric pressure', 'create full lashing windrunner would infuse object stormlight press another object two object would become bind together extremely powerful bond nearly impossible break fact material would break bond hold together would', 'reverse la

In [40]:
tokenised_sentences = [ sent.split() for sent in clean_corpus ]
vocab = list(set(sorted([ tok for sent in tokenised_sentences for tok in sent])))
print( "|N|="+str(len(tokenised_sentences))+"|V|="+str(len(vocab)) )


|N|=45159|V|=16738
