In [1]:
from textbook_analysis.helpers import *
import nltk

In [2]:
# Directory handed to get_book_txts as the source of the book texts.
input_dir = 'final_textbook_txts'
# Output path for the corpus built from all books (see generate_bert_input_file).
bert_input_file = 'all_textbook_data.txt'

In [3]:
def get_sentences(book, remove_stopwords=False, remove_numeric=False, remove_short=False):
    """Split ``book`` into sentences and return each one as a cleaned string.

    Sentence boundaries come from ``nltk.sent_tokenize``. Every sentence is
    passed through ``clean_text`` with the three ``remove_*`` flags forwarded
    unchanged, and the items it returns are joined with single spaces.

    Returns:
        A list with one cleaned string per tokenized sentence, in order.
    """
    cleaning_flags = dict(
        remove_stopwords=remove_stopwords,
        remove_numeric=remove_numeric,
        remove_short=remove_short,
    )
    cleaned_sentences = []
    for sentence in nltk.sent_tokenize(book):
        tokens = clean_text(sentence, **cleaning_flags)
        cleaned_sentences.append(' '.join(tokens))
    return cleaned_sentences

In [4]:
def generate_bert_input_file(books, bert_input_file, min_sentence_len=10):
    """Write all books as a one-sentence-per-line text file and return the lines.

    Each book is split with ``get_sentences``. Sentences shorter than
    ``min_sentence_len`` characters are dropped; the rest are emitted as
    ``sentence + '.'`` on their own line. One blank line follows each book
    (presumably the document separator the downstream BERT loader expects;
    confirm against the training script).

    Args:
        books: Mapping of book title -> full book text (a single string).
        bert_input_file: Path of the file to create or overwrite.
        min_sentence_len: Minimum length, in characters, of a cleaned sentence
            for it to be kept.

    Returns:
        The list of lines written, each including its trailing newline.
    """
    print("Generating BERT input dataset file...")
    lines = []

    for book in books.values():
        lines.extend(
            sent + '.\n'
            for sent in get_sentences(book)
            if len(sent) >= min_sentence_len
        )
        lines.append('\n')

    # Open the file only once all lines are built, so a tokenization failure
    # neither truncates an existing file nor leaks a handle. UTF-8 is explicit
    # so the output does not depend on the platform's default encoding.
    with open(bert_input_file, "w", encoding="utf-8") as out_file:
        out_file.writelines(lines)
    return lines

In [5]:
# Load every book found under input_dir, keyed by title.
# NOTE(review): splitlines=False presumably yields one string per book (the values
# are later passed to nltk.sent_tokenize and .lower()) — confirm against get_book_txts.
books = get_book_txts(input_dir, splitlines=False)
# Write the full, unsplit corpus to bert_input_file and keep the written lines.
lines = generate_bert_input_file(books, bert_input_file)

Getting books...
America_A_Narrative_History_WWNorton_10th
America_Past_And_Present_Pearson_10th
Americas_History_Bedford_8th
Give_Me_Liberty_An_American_History_WWNorton_3rd
The_American_Pageant_Cengage_14th
The_Unfinished_Nation_A_Concise_History_of_the_American_People_McGraw-Hill_8th
Visions_of_America_A_History_of_the_United_States_Pearson_2nd
american_history_connecting_with_the_past
by_the_people
history_alive_united_states_thru_industrialism
hmh_the_americans_us_history_since_1877
mastering_the_teks
pearson_us_history
teks_us_history
us_history_early_colonial_period_through_reconstruction
Finished getting books.
Generating BERT input dataset file...


In [6]:
# Number of lines in the full corpus: kept sentences plus one blank line per book.
len(lines)

310000

In [31]:
def split_into_chapters(text, marker='chapter'):
    """Lower-case ``text`` and cut it into chunks at every occurrence of ``marker``.

    The first chunk is whatever precedes the first ``marker``; every later
    chunk starts with ``marker`` itself, so ``''.join(chunks)`` reproduces the
    lower-cased text exactly.

    Caveats (unchanged from the previous hand-written loop):
      * the whole text is lower-cased, so the chunks lose their original case;
      * ``marker`` is matched anywhere, including in the middle of a sentence,
        so chunks only approximate real chapter boundaries.

    Args:
        text: Full text of one book.
        marker: Non-empty, lower-case substring to split on.

    Returns:
        A list with at least one chunk.
    """
    # str.split does the scan in one pass and removes the need for a
    # hard-coded marker length when advancing through the text.
    head, *rest = text.lower().split(marker)
    return [head] + [marker + chunk for chunk in rest]


# One list of chunks per book, in the same key order as `books`.
book_chapters = {title: split_into_chapters(text) for title, text in books.items()}

In [33]:
import random

EVAL_FRACTION = 0.2  # share of each book's chunks held out for evaluation
SPLIT_SEED = 0       # fixed seed so Restart & Run All reproduces the same split


def split_train_eval(chapters, eval_fraction, rng):
    """Randomly hold out a fraction of ``chapters`` for evaluation.

    ``int(len(chapters) * eval_fraction)`` chunks are drawn without
    replacement; a book with too few chunks therefore gets an empty eval list.
    Both returned lists keep the chunks in their original document order.

    Args:
        chapters: List of text chunks for one book.
        eval_fraction: Fraction of chunks to hold out, between 0 and 1.
        rng: ``random.Random`` instance used for sampling.

    Returns:
        A ``(train_chunks, eval_chunks)`` tuple of lists.
    """
    n_ch = len(chapters)
    # Sample from a range, not a set: random.sample rejects sets on Python 3.11+.
    eval_idx = set(rng.sample(range(n_ch), int(n_ch * eval_fraction)))
    train_chunks = [ch for i, ch in enumerate(chapters) if i not in eval_idx]
    eval_chunks = [ch for i, ch in enumerate(chapters) if i in eval_idx]
    return train_chunks, eval_chunks


# A dedicated generator keeps the split reproducible without touching the
# global `random` state.
split_rng = random.Random(SPLIT_SEED)
book_chapters_train = dict()
book_chapters_eval = dict()
for b, chapters in book_chapters.items():
    book_chapters_train[b], book_chapters_eval[b] = split_train_eval(
        chapters, EVAL_FRACTION, split_rng
    )

In [41]:
# Glue each book's chunks back into a single string per split, so the results
# have the same {title: text} shape that generate_bert_input_file consumes.
train_books = {
    b: ''.join(chunks) for b, chunks in book_chapters_train.items()
}
# Keyed off the training dict, so a title missing from the eval dict raises KeyError.
eval_books = {
    b: ''.join(book_chapters_eval[b]) for b in book_chapters_train
}

In [44]:
# Output paths for the training and evaluation corpora.
train_input_file = 'train_textbook_data.txt'
eval_input_file = 'eval_textbook_data.txt'

In [45]:
# Write the training split to train_input_file and keep the written lines.
train_lines = generate_bert_input_file(train_books, train_input_file)

Generating BERT input dataset file...


In [46]:
# Number of lines in the training corpus (sentences plus one blank line per book).
len(train_lines)

242155

In [47]:
# Write the evaluation split to eval_input_file and keep the written lines.
eval_lines = generate_bert_input_file(eval_books, eval_input_file)

Generating BERT input dataset file...


In [48]:
# Number of lines in the evaluation corpus (sentences plus one blank line per book).
len(eval_lines)

54960

In [49]:
# Sanity check: combined size of the two splits, to compare with len(lines) above.
# The totals need not match: the split text was lower-cased and cut at every
# 'chapter' before sentence tokenization, and each split file adds its own
# blank line per book.
len(train_lines)+len(eval_lines)

297115