In [1]:
import nltk
import spacy
from nltk.tokenize.toktok import ToktokTokenizer
from ebooklib import epub
import ebooklib
from bs4 import BeautifulSoup
import re
import itertools
import matplotlib.pyplot as plt
import numpy as np
from contractions import CONTRACTION_MAP
import unicodedata
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
from matplotlib import pyplot as plt

In [2]:
nlp = spacy.load('en_core_web_md', parse=True, tag=True, entity=True)
tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')
stopword_list.remove('no')
stopword_list.remove('not')

In [3]:
def remove_accented_chars(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text


In [4]:
def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())), 
                                      flags=re.IGNORECASE|re.DOTALL)
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match)\
                                if contraction_mapping.get(match)\
                                else contraction_mapping.get(match.lower())                       
        expanded_contraction = first_char+expanded_contraction[1:]
        return expanded_contraction
        
    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text

In [5]:
def remove_special_characters(text, remove_digits=False):
    pattern = r'[^a-zA-z0-9\s]' if not remove_digits else r'[^a-zA-z\s]'
    text = re.sub(pattern, '', text)
    return text


In [6]:
def simple_stemmer(text):
    ps = nltk.porter.PorterStemmer()
    text = ' '.join([ps.stem(word) for word in text.split()])
    return text

In [7]:
def lemmatize_text(text):
    text = nlp(text)
    text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
    return text

In [8]:
def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text


In [9]:
def normalize_corpus(corpus,contraction_expansion=True,
                     accented_char_removal=True, text_lower_case=False, 
                     text_lemmatization=True, special_char_removal=True, 
                     stopword_removal=True, remove_digits=True):
    progress=0
    normalized_corpus = []
    # normalize each document in the corpus
    for doc in corpus:
        # remove accented characters
        if accented_char_removal:
            doc = remove_accented_chars(doc)
        # expand contractions    
        if contraction_expansion:
            doc = expand_contractions(doc)
        # lowercase the text    
        if text_lower_case:
            doc = doc.lower()
        # remove extra newlines
        doc = re.sub(r'[\r|\n|\r\n]+', ' ',doc)
        # lemmatize text
        if text_lemmatization:
            doc = lemmatize_text(doc)
        # remove special characters and\or digits    
        if special_char_removal:
            # insert spaces between special characters to isolate them    
            special_char_pattern = re.compile(r'([{.(-)!}])')
            doc = special_char_pattern.sub(" \\1 ", doc)
            doc = remove_special_characters(doc, remove_digits=remove_digits)  
        # remove extra whitespace
        doc = re.sub(' +', ' ', doc)
        # remove stopwords
        if stopword_removal:
            doc = remove_stopwords(doc, is_lower_case=text_lower_case)
            
        normalized_corpus.append(doc)
        progress+=1
        if progress%500==0:
            print(progress)
        
    return normalized_corpus

In [10]:
##Way of Kings processing
book = epub.read_epub('wok.epub')

toc = book.get_item_with_id('con01')

toc_pageSource = toc.get_content().decode('utf-8','ignore')

soup = BeautifulSoup(toc_pageSource, 'html.parser')

book1_chapters = { file['id'].replace('a',''):{'chapterName':re.sub(".*\: ","",file.text)} for file in soup.find_all('a') if 'id' in file.attrs  }

for k in ['pt01','pt02','pt03','pt04','pt05','pt06','pt07','pt08','pt09',]:
    book1_chapters.pop(k,None)

for chId in book1_chapters:
    item = book.get_item_with_id(chId)
    pageSource = item.get_content().decode('utf-8')
    soup = BeautifulSoup(pageSource, 'html.parser')
    book1_chapters[chId]['chapterHead'] = [para.text.strip()+'. ' for para in soup.find('div',attrs={'class':'chapterHead'}).find_all('p')]
    book1_chapters[chId]['chapterBody'] = [para.text.strip()+'. ' for para in soup.find('div',attrs={'class':'chapterBody'}).find_all('p')]
    print(chId,len(book1_chapters[chId]['chapterHead']),len(book1_chapters[chId]['chapterBody']))
    

fm02 0 45
ch01 3 129
ch02 3 142
ch03 3 143
ch04 3 123
ch05 3 113
ch06 3 134
ch07 3 221
ch08 3 186
ch09 3 236
ch10 3 67
ch11 1 73
ch12 3 112
ch13 1 67
ch14 1 32
ch15 1 73
ch16 2 254
ch17 2 118
ch18 2 142
ch19 2 312
ch20 1 189
ch21 2 210
ch22 2 293
ch23 2 193
ch24 1 30
ch25 2 183
ch26 2 183
ch27 2 209
ch28 2 132
ch29 1 114
ch30 2 191
ch31 2 306
ch32 2 330
ch33 1 88
ch34 1 69
ch35 1 83
ch36 3 252
ch37 3 121
ch38 1 61
ch39 3 156
ch40 3 201
ch41 3 74
ch42 3 46
ch43 3 179
ch44 1 229
ch45 3 63
ch46 3 87
ch47 3 173
ch48 1 81
ch49 3 205
ch50 3 176
ch51 1 173
ch52 3 273
ch53 3 254
ch54 1 165
ch55 3 138
ch56 3 111
ch57 3 52
ch58 1 72
ch59 1 48
ch60 1 59
ch61 1 79
ch62 3 233
ch63 3 82
ch64 3 188
ch65 3 192
ch66 3 118
ch67 3 327
ch68 3 232
ch69 3 229
ch70 3 150
ch71 3 130
ch72 3 169
ch73 3 81
ch74 3 112
ch75 3 104
ch76 3 69
ch77 3 287
ch78 3 204
ch79 3 258
ch80 1 85
ch81 1 86
ch82 1 42
ch83 1 149
ch84 1 31
ch85 1 70
bm01 1 49
end01 1 9
bm02 0 31


In [11]:
##Words of Radiants processing
book = epub.read_epub('wor.epub')

toc = book.get_item_with_id('toc')

toc_pageSource = toc.get_content().decode('utf-8')

soup = BeautifulSoup(toc_pageSource, 'html.parser')

book2_chapters = { re.sub(r'xhtml/(chapter\d+|inter\d+-\d+|epilogue|ars|prologue).html.*','\g<1>',t['src']):{ 'chapterName': re.sub('\d+\. (.*)','\g<1>',t.find_parent().text ) } for t in soup.find_all('content') if re.match(r'xhtml/(chapter\d+|inter\d+-\d+|epilogue|ars|prologue).html.*',t['src']) and 'image' not in t['src']}

for chId in book2_chapters:
    item = book.get_item_with_id(chId)
    pageSource = item.get_content().decode('utf-8')
    soup = BeautifulSoup(pageSource, 'html.parser')
    cep  = soup.find('p',attrs={'class':'CEP'}).text.strip() if ( soup.find('p',attrs={'class':'CEP'}) ) else ''
    cepc =  soup.find('p',attrs={'class':'CEPC'}).text.strip() if ( soup.find('p',attrs={'class':'CEPC'}) ) else ''
    book2_chapters[chId]['chapterHead'] = cep+'. '+cepc 
    co = soup.find('p',attrs={'class':'CO'}).text.strip() if ( soup.find('p',attrs={'class':'CO'}) ) else ''
    book2_chapters[chId]['chapterBody'] =   [co+'. '] + [para.text.strip()+'. ' for para in soup.find_all('p')]
    print(chId,len(book2_chapters[chId]['chapterHead']),len(book2_chapters[chId]['chapterBody']))


prologue 2 198
chapter1 252 252
chapter2 266 171
chapter3 350 86
chapter4 157 158
chapter5 168 294
chapter6 177 167
chapter7 257 117
chapter8 292 81
chapter9 226 125
chapter10 2 19
chapter11 294 171
chapter12 306 183
inter1-1 2 102
inter1-2 2 129
inter1-3 2 283
inter1-4 2 157
chapter13 49 118
chapter14 48 125
chapter15 49 85
chapter16 49 172
chapter17 50 195
chapter18 49 173
chapter19 2 59
chapter20 49 110
chapter21 49 96
chapter22 46 194
chapter23 49 75
chapter24 49 165
chapter25 52 180
chapter26 46 143
chapter27 2 33
chapter28 49 133
chapter29 51 106
chapter30 51 108
chapter31 49 104
chapter32 46 160
chapter33 47 134
chapter34 50 116
inter2-1 2 45
inter2-2 2 23
inter2-3 2 61
inter2-4 2 24
chapter35 474 145
chapter36 570 181
chapter37 454 98
chapter38 315 186
chapter39 2 121
chapter40 306 92
chapter41 528 262
chapter42 377 91
chapter43 472 158
chapter44 441 236
chapter45 2 251
chapter46 393 270
chapter47 235 120
chapter48 2 165
chapter49 307 237
chapter50 474 98
chapter51 443 99
chapt

In [12]:
##Oathbringer processing
book = epub.read_epub('oath.epub')

toc = book.get_item_with_id('toc')

toc_pageSource = toc.get_content().decode('utf-8')

soup = BeautifulSoup(toc_pageSource, 'html.parser')

book3_chapters = { re.sub(r'xhtml/(chapter\d+|int\d+|epilogue|ars|prologue|ill11|ill20|ill22|ill16|ill9|ill17|ill14|ill15|ill18|backmatter).xhtml.*','\g<1>',t.find('content')['src']): { 'chapterName' : re.sub( '.*\. (.*)','\g<1>', t.find('text').text )}  for t in soup.find_all('navpoint') if re.match(r'xhtml/(chapter\d+|int\d+|epilogue|ars|prologue|ill11|ill20|ill16|ill9|ill17|ill14|ill15|ill22«ill18|backmatter).xhtml.*',t.find('content')['src']) and 'image' not in t.find('content')['src']}

for chId in book3_chapters:
    item = book.get_item_with_id(chId)
    pageSource = item.get_content().decode('utf-8')
    soup = BeautifulSoup(pageSource, 'html.parser')
    cep  = soup.find('p',attrs={'class':'EP'}).text.strip() if ( soup.find('p',attrs={'class':'EP'}) ) else ''
    cepc = soup.find('p',attrs={'class':'EPC'}).text.strip() if ( soup.find('p',attrs={'class':'EPC'}) ) else ''
    co = soup.find('p',attrs={'class':'CO'}).text.strip() if ( soup.find('p',attrs={'class':'CO'}) ) else ''
    
    book3_chapters[chId]['chapterBody'] =   [ co+'. ' ] + [ para.text.strip()+'. ' for para in soup.find_all('p') ]
    book3_chapters[chId]['chapterHead'] = cep+'. '+cepc 
    
    print(chId,len(book3_chapters[chId]['chapterHead']),len(book3_chapters[chId]['chapterBody']))


prologue 2 101
chapter1 158 83
chapter2 56 112
chapter3 2 139
chapter4 145 178
chapter5 191 78
chapter6 126 85
chapter7 42 194
chapter8 124 118
chapter9 96 49
chapter10 116 170
chapter11 2 169
chapter12 88 202
chapter13 78 146
chapter14 161 36
chapter15 139 118
chapter16 57 232
chapter17 148 115
chapter18 101 254
chapter19 2 145
chapter20 154 54
chapter21 87 152
chapter22 89 86
chapter23 79 63
chapter24 161 139
chapter25 124 135
chapter26 2 166
chapter27 140 126
chapter28 171 177
chapter29 99 237
chapter30 58 37
chapter31 98 185
chapter32 63 38
int1 2 25
int2 2 71
int3 2 68
chapter33 2 67
chapter34 2 165
chapter35 2 157
chapter36 2 79
chapter37 2 259
chapter38 2 219
ill9 2 2
chapter39 2 137
chapter40 2 133
chapter41 2 58
chapter42 2 92
chapter43 2 77
chapter44 2 164
chapter45 2 76
chapter46 2 153
chapter47 2 71
chapter48 2 61
chapter49 2 178
chapter50 2 139
chapter51 2 78
chapter52 2 102
ill11 2 2
chapter53 2 143
chapter54 2 116
chapter55 2 104
chapter56 2 77
chapter57 2 98
int4 2 108


In [13]:
corpus = []
for chapter in book1_chapters:
    corpus.append(book1_chapters[chapter]['chapterName'])
    corpus.extend(book1_chapters[chapter]['chapterHead'])
    corpus.extend(book1_chapters[chapter]['chapterBody'])
    
for chapter in book2_chapters:
    corpus.append(book2_chapters[chapter]['chapterName'])
    corpus.extend(book2_chapters[chapter]['chapterHead'])
    corpus.extend(book2_chapters[chapter]['chapterBody'])

for chapter in book3_chapters:
    corpus.append(book3_chapters[chapter]['chapterName'])
    corpus.extend(book3_chapters[chapter]['chapterHead'])
    corpus.extend(book3_chapters[chapter]['chapterBody'])
    


less_clean_corpus = list( map( lambda sent: sent.replace('\u200b', '')
                                            .replace('\xa0','')
                                            .replace('ç','')
                                            .replace('\n', '')
                                            .replace('•', '')
                                            .replace('…', '')
                                            .replace('/','')
                                            .replace('(','')
                                            .replace(')','')
                                            .replace(' ', ' ') , corpus) )

print(len(less_clean_corpus),less_clean_corpus[78820:78828])

78828 ['A second form of Surgebinding involves the manipulation of light and sound in illusory tactics common throughout the cosmere. Unlike the variations present on Sel, however, this method has a powerful Spiritual element, requiring not just a full mental picture of the intended creation, but some level of Connection to it as well. The illusion is based not simply upon what the Lightweaver imagines, but upon what they desire to create.. ', 'In many ways, this is the most similar ability to the original Yolish variant, which excites me. I wish to delve more into this ability, with the hope to gain a full understanding of how it relates to cognitive and spiritual attributes.. ', 'SOULCASTING. ', 'Essential to the economy of Roshar is the art of Soulcasting, in which one form of matter is directly transformed into another by changing its spiritual nature. This is performed on Roshar via the use of devices known as Soulcasters, and these devices the majority of which appear to be focus

In [14]:
char_corpus = [c for sent in less_clean_corpus for c in sent]

print(len(char_corpus))

char_to_num = { char:i for i,char in  enumerate(sorted(set(char_corpus))) }

num_to_char = dict(zip(char_to_num.values(),char_to_num.keys()))

SEQ_LENGTH = 50

VOCAB_SIZE = len(char_to_num.keys())

7183548


In [15]:
X = np.zeros((int(len(char_corpus)/SEQ_LENGTH), SEQ_LENGTH, VOCAB_SIZE))
y = np.zeros((int(len(char_corpus)/SEQ_LENGTH), SEQ_LENGTH, VOCAB_SIZE))
for i in range(0, int( len(char_corpus)/SEQ_LENGTH) ):
    X_sequence = char_corpus[i*SEQ_LENGTH:(i+1)*SEQ_LENGTH]
    X_sequence_ix = [char_to_num[value] for value in X_sequence]
    input_sequence = np.zeros((SEQ_LENGTH, VOCAB_SIZE))
    for j in range(SEQ_LENGTH):
        input_sequence[j][X_sequence_ix[j]] = 1.
    X[i] = input_sequence

    y_sequence = char_corpus[i*SEQ_LENGTH+1:(i+1)*SEQ_LENGTH+1]
    y_sequence_ix = [char_to_num[value] for value in y_sequence]
    target_sequence = np.zeros((SEQ_LENGTH, VOCAB_SIZE))
    for j in range(SEQ_LENGTH):
        target_sequence[j][y_sequence_ix[j]] = 1.
    y[i] = target_sequence

In [16]:
X.shape

(143670, 50, 77)

In [None]:
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM, SimpleRNN
from keras.layers.wrappers import TimeDistributed

HIDDEN_DIM = 300
LAYER_NUM = 2
GENERATE_LENGTH = 1000
BATCH_SIZE = 200
MAX_EPOCHS = 100

model = Sequential()
model.add(LSTM(HIDDEN_DIM, input_shape=(None, VOCAB_SIZE), return_sequences=True))
for i in range(LAYER_NUM - 1):
    model.add(LSTM(HIDDEN_DIM, return_sequences=True))
model.add(TimeDistributed(Dense(VOCAB_SIZE)))
model.add(Activation('softmax'))
model.compile(loss="categorical_crossentropy", optimizer="rmsprop")


def generate_text(model, length):
    ix = [np.random.randint(VOCAB_SIZE)]
    y_char = [num_to_char[ix[-1]]]
    X = np.zeros((1, length, VOCAB_SIZE))
    for i in range(length):
        X[0, i, :][ix[-1]] = 1
        print(num_to_char[ix[-1]], end="")
        ix = np.argmax(model.predict(X[:, :i+1, :])[0], 1)
        y_char.append(num_to_char[ix[-1]])
    return ('').join(y_char)

Using TensorFlow backend.
  return f(*args, **kwds)


In [None]:
nb_epoch = 0
while nb_epoch<MAX_EPOCHS:
    print('\n\n')
    model.fit(X, y, batch_size=BATCH_SIZE, verbose=1, epochs=1)
    nb_epoch += 1
    generate_text(model, GENERATE_LENGTH)
    if nb_epoch % 10 == 0:
        model.save_weights('checkpoint_{}_epoch_{}.hdf5'.format(HIDDEN_DIM, nb_epoch))




Epoch 1/1
Red to the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm the storm 