In [1]:
from textbook_analysis.helpers import *
import nltk
import random

In [2]:
def get_sentences(book, remove_stopwords=False, remove_numeric=False, remove_short=False):
    """Split a book into sentences and return each one as a cleaned string.

    Args:
        book: Full text of one book as a single string.
        remove_stopwords: Forwarded unchanged to ``clean_text``.
        remove_numeric: Forwarded unchanged to ``clean_text``.
        remove_short: Forwarded unchanged to ``clean_text``.

    Returns:
        A list with one string per sentence found by ``nltk.sent_tokenize``,
        where each string is the output of ``clean_text`` joined with single
        spaces (``clean_text`` is presumably a token list -- confirm in
        ``textbook_analysis.helpers``).
    """
    cleaning_options = {
        'remove_stopwords': remove_stopwords,
        'remove_numeric': remove_numeric,
        'remove_short': remove_short,
    }
    cleaned_sentences = []
    for raw_sentence in nltk.sent_tokenize(book):
        tokens = clean_text(raw_sentence, **cleaning_options)
        cleaned_sentences.append(' '.join(tokens))
    return cleaned_sentences

In [3]:
def generate_bert_input_file(books, bert_input_file, encoding="utf-8", min_chars=10):
    """Write all books to a line-by-line text dataset and return the lines.

    Each book is segmented and cleaned with ``get_sentences`` (default
    options). Every cleaned sentence of at least ``min_chars`` characters
    becomes one line terminated by ``'.\\n'``. A single blank line (``'\\n'``)
    is emitted after each book, including books that contribute no sentences.

    All lines are built before the output file is opened, so a failure while
    processing a book does not truncate an existing file, and the file is
    always closed by the ``with`` block.

    Args:
        books: Mapping of book title -> full book text.
        bert_input_file: Path of the text file to (over)write. The parent
            directory must already exist.
        encoding: Text encoding of the output file. Defaults to UTF-8 so the
            result does not depend on the platform's locale encoding.
        min_chars: Minimum length, in characters, of a cleaned sentence for
            it to be kept.

    Returns:
        The list of lines written, in file order.
    """
    print("Generating BERT input dataset file...")

    lines = []
    for book in books.values():
        lines.extend(
            sent + '.\n'
            for sent in get_sentences(book)
            if len(sent) >= min_chars
        )
        # Blank line marks the boundary between consecutive books.
        lines.append('\n')

    with open(bert_input_file, "w", encoding=encoding) as out_file:
        out_file.writelines(lines)
    return lines

In [4]:
def split_books_into_chapters(books):
    """Map each book to a list of text chunks split on the word 'chapter'.

    Each book's text is lowercased and cut at every occurrence of the
    substring 'chapter'. The first chunk is whatever precedes the first
    occurrence (possibly empty); every later chunk starts with 'chapter', so
    concatenating the chunks reproduces the lowercased text.

    Args:
        books: Mapping of book title -> full book text.

    Returns:
        Dict of book title -> list of chunk strings (always at least one).
    """
    marker = 'chapter'
    chapters_by_book = dict()
    for title in books:
        # str.split yields the text before the first marker, then the text
        # following each marker; re-attach the marker to all but the first.
        leading, *following = books[title].lower().split(marker)
        chapters_by_book[title] = [leading] + [marker + piece for piece in following]
    return chapters_by_book

In [5]:
def combine_chapters_into_single_text(book_chapters):
    """Concatenate each book's chapter strings back into one text per book.

    Args:
        book_chapters: Mapping of book title -> list of chapter strings.

    Returns:
        Dict of book title -> the chapter strings joined with no separator.
    """
    return {title: ''.join(parts) for title, parts in book_chapters.items()}

### Read in all textbook data & split into chapters

In [6]:
# Directory handed to get_book_txts as the source of the textbook data.
input_dir = 'final_textbook_txts'
# `books` is used below as a mapping of book title -> text (it is iterated
# with .items() and each value has .lower() called on it).
# splitlines=False presumably keeps each book as one string -- confirm in helpers.
books = get_book_txts(input_dir, splitlines=False)

Getting books...
America_A_Narrative_History_WWNorton_10th
America_Past_And_Present_Pearson_10th
Americas_History_Bedford_8th
Give_Me_Liberty_An_American_History_WWNorton_3rd
The_American_Pageant_Cengage_14th
The_Unfinished_Nation_A_Concise_History_of_the_American_People_McGraw-Hill_8th
Visions_of_America_A_History_of_the_United_States_Pearson_2nd
american_history_connecting_with_the_past
by_the_people
history_alive_united_states_thru_industrialism
hmh_the_americans_us_history_since_1877
mastering_the_teks
pearson_us_history
teks_us_history
us_history_early_colonial_period_through_reconstruction
Finished getting books.


In [7]:
# Book title -> list of lowercased text chunks, cut at every occurrence of
# the word 'chapter'. These chunks are the units sampled by the splits below.
book_chapters = split_books_into_chapters(books)

### Generate training file with ALL textbook data

In [None]:
# Write every book (unsplit) to one file, one cleaned sentence per line.
# The path is relative to the notebook's working directory.
bert_input_file = 'all_textbook_data.txt'
# `lines` is the list of lines written to the file.
lines = generate_bert_input_file(books, bert_input_file)

In [None]:
# Total lines written; this count includes the one blank separator line per book.
print(len(lines))

### Generate 80-20 train-eval split

In [None]:
# Randomly split chapters into 80-20 train-eval for each book.
# NOTE: no random seed is set in the visible cells, so the chapters chosen for
# eval differ on every run.
book_chapters_train = dict()
book_chapters_eval = dict()
for b, chapters in book_chapters.items():
    n_ch = len(chapters)
    all_ch_idx = set(range(n_ch))
    # random.sample() requires a sequence: passing a set was deprecated in
    # Python 3.9 and raises TypeError from 3.11 on, so sample from a sorted list.
    # int() truncates, so books with fewer than 5 chunks get no eval chapters.
    eval_ch_idx = set(random.sample(sorted(all_ch_idx), int(n_ch*0.2)))
    train_ch_idx = all_ch_idx - eval_ch_idx

    # Keep chapters in their original order within each split.
    book_chapters_train[b] = [chapters[ch] for ch in sorted(train_ch_idx)]
    book_chapters_eval[b] = [chapters[ch] for ch in sorted(eval_ch_idx)]

In [None]:
# Combine list of chapters into single string of text per book.
# Result: book title -> concatenated text of that book's train / eval chapters.
train_books = combine_chapters_into_single_text(book_chapters_train)
eval_books = combine_chapters_into_single_text(book_chapters_eval)

In [None]:
# Create line-by-line train & eval txt files.
# Both paths are relative to the notebook's working directory.
train_input_file = 'train_textbook_data.txt'
eval_input_file = 'eval_textbook_data.txt'

# Each call writes its file and returns the list of lines written.
train_lines = generate_bert_input_file(train_books, train_input_file)
eval_lines = generate_bert_input_file(eval_books, eval_input_file)

In [None]:
# Line counts of the train and eval files; each count includes one blank
# separator line per book.
print(len(train_lines))
print(len(eval_lines))

### Generate 80-10-10 train-dev-test split

In [8]:
# Randomly split chapters into 80-10-10 train-dev-test for each book.
# NOTE: no random seed is set in the visible cells, so the chapters chosen for
# dev/test differ on every run. This cell also rebinds book_chapters_train,
# replacing the value assigned by the 80-20 split cell.
book_chapters_train = dict()
book_chapters_dev = dict()
book_chapters_test = dict()
for b, chapters in book_chapters.items():
    n_ch = len(chapters)
    all_ch_idx = set(range(n_ch))
    # random.sample() requires a sequence: passing a set was deprecated in
    # Python 3.9 and raises TypeError from 3.11 on, so sample from sorted lists.
    # int() truncates, so books with fewer than 10 chunks get no dev/test chapters.
    test_ch_idx = set(random.sample(sorted(all_ch_idx), int(n_ch*0.1)))
    # Dev chapters are drawn only from what test did not take, keeping the
    # three splits disjoint.
    dev_ch_idx = set(random.sample(sorted(all_ch_idx - test_ch_idx), int(n_ch*0.1)))
    train_ch_idx = all_ch_idx - dev_ch_idx - test_ch_idx

    # Keep chapters in their original order within each split.
    book_chapters_train[b] = [chapters[ch] for ch in sorted(train_ch_idx)]
    book_chapters_dev[b] = [chapters[ch] for ch in sorted(dev_ch_idx)]
    book_chapters_test[b] = [chapters[ch] for ch in sorted(test_ch_idx)]

In [9]:
# Combine list of chapters into single string of text per book.
# Note: train_books is rebound here, replacing the 80-20 version assigned in
# the earlier train-eval section.
train_books = combine_chapters_into_single_text(book_chapters_train)
dev_books = combine_chapters_into_single_text(book_chapters_dev)
test_books = combine_chapters_into_single_text(book_chapters_test)

In [11]:
# Create line-by-line train, dev, & test txt files.
# The data/80_10_10/ directory must already exist: generate_bert_input_file
# opens the path for writing and does not create parent directories.
train_input_file = 'data/80_10_10/train_textbook_data.txt'
dev_input_file = 'data/80_10_10/dev_textbook_data.txt'
test_input_file = 'data/80_10_10/test_textbook_data.txt'

# Each call writes its file and returns the list of lines written.
train = generate_bert_input_file(train_books, train_input_file)
dev = generate_bert_input_file(dev_books, dev_input_file)
test = generate_bert_input_file(test_books, test_input_file)

Generating BERT input dataset file...
Generating BERT input dataset file...
Generating BERT input dataset file...


In [12]:
# Line counts of the train, dev and test files; each count includes one blank
# separator line per book.
print(len(train))
print(len(dev))
print(len(test))

244040
24851
28231


In [15]:
# Sanity check: number of chunks per book. Each count is the number of times
# the word 'chapter' occurs anywhere in the lowercased text, plus one, so it
# is not necessarily the number of real chapters in the book.
for b, chapters in book_chapters.items():
    print(len(chapters), b)

717 America_A_Narrative_History_WWNorton_10th
513 America_Past_And_Present_Pearson_10th
930 Americas_History_Bedford_8th
132 Give_Me_Liberty_An_American_History_WWNorton_3rd
541 The_American_Pageant_Cengage_14th
309 The_Unfinished_Nation_A_Concise_History_of_the_American_People_McGraw-Hill_8th
800 Visions_of_America_A_History_of_the_United_States_Pearson_2nd
337 american_history_connecting_with_the_past
674 by_the_people
315 history_alive_united_states_thru_industrialism
572 hmh_the_americans_us_history_since_1877
342 mastering_the_teks
5 pearson_us_history
169 teks_us_history
504 us_history_early_colonial_period_through_reconstruction
