# Preprocessing
Naive Bayes will be the baseline model, and it struggles with stop words and other very common words. For that reason, we need to preprocess the data.

Preprocessing steps include:
1. Clean extra newline characters
2. Convert accented characters to ASCII
3. Expand contractions
4. Lowercase(?) text
5. Convert numbers to words
6. Remove numbers
7. Remove stop words (not for BERT)
8. Lemmatization (convert verbs to their root form and nouns to singular) (nltk package) (not for BERT)

There are a variety of python packages that will help with these steps. They are

# Table of packages here

In [None]:
# imports
import requests
import re
import pandas as pd
import os

In [None]:
# Make the parent directory the working directory so that the `src.` import
# and the relative output path used later resolve against it.
# NOTE(review): presumably the notebook lives one level below the project
# root - confirm. Re-running this cell moves up one more level each time.
os.chdir('..')
os.getcwd()


In [None]:
# get data
from src.data.make_dataset import get_book
# from data.raw.book_urls import book_urls # not working?
# Plain-text Project Gutenberg sources, keyed by a snake_case book title.
book_urls = {
    'great_gatsby': 'https://www.gutenberg.org/cache/epub/64317/pg64317.txt',
    'the_sun_also_rises': 'https://www.gutenberg.org/cache/epub/67138/pg67138.txt',
    'a_tale_of_two_cities': 'https://www.gutenberg.org/cache/epub/98/pg98.txt',
}


In [None]:
# Fetch every book with get_book, echoing each URL as progress output.
# The result maps book title -> whatever get_book returns for that URL.
data_set = {}
for title in book_urls:
    url = book_urls[title]
    print(url)
    data_set[title] = get_book(url)

# Bookends
Books in Project Gutenberg have a lot of extra text at the end of the text file. Most of it is legalese and terms-of-use information, which is not needed for language modeling.

Let's look at some examples and then create a framework to remove it from the text.

In [None]:
# Keep a handle on the downloaded Great Gatsby entry for the exploration below.
great_gatsby = data_set['great_gatsby']

In [None]:
great_gatsby[-9000:-5000]

In [None]:
# end pattern
# Literal end-of-book marker as it appears in the Great Gatsby file.
end_gutenberg = '*** END OF THE PROJECT GUTENBERG EBOOK THE GREAT GATSBY ***'
print(end_gutenberg)

In [None]:
# Search for the end-of-book marker once and reuse the match object for
# every report below.
pattern = r'\*\*\* END OF THE PROJECT GUTENBERG EBOOK [\w\d\s]+ \*\*\*'
p = re.compile(pattern)
end_match = p.search(great_gatsby)
print('the span of the found text is ', end_match.span())
print('the start of the found text is ', end_match.start())
print('we should remove everything after the start fo the end of book pattern')
end_match.start()

In [None]:
# all books in project gutenberg end with 
# *** END OF THE PROJECT GUTENBERG EBOOK {Title} ***
# *** END OF THE PROJECT GUTENBERG EBOOK THE GREAT GATSBY ***

import re
def remove_bookend(book:str)->str:
    """Strip the Project Gutenberg footer from a book.

    Everything from the
    ``*** END OF THE PROJECT GUTENBERG EBOOK <title> ***`` marker to the end
    of the text is removed.

    Args:
        book: full text of a Project Gutenberg plain-text file.

    Returns:
        The text that precedes the end marker.

    Raises:
        ValueError: if the end marker is not found.
    """
    # The title is matched as "anything except an asterisk" so that titles
    # containing punctuation (apostrophes, commas, colons, ...) are found too;
    # a word/whitespace-only character class misses them.
    end_of_book_pattern = r'\*\*\* END OF THE PROJECT GUTENBERG EBOOK [^*]+\*\*\*'
    match = re.search(end_of_book_pattern, book)
    if match is None:
        raise ValueError('could not find project gutenberg ending')
    return book[:match.start()]

### Unittest

In [None]:
# The end marker and everything after it must be stripped; the text before
# it (including the trailing space) must survive unchanged.
test_book_ending ='This is the end of the book. *** END OF THE PROJECT GUTENBERG EBOOK THE the book title with number 10 ***'
expected_no_ending = 'This is the end of the book. '
test_book_no_ending = remove_bookend(test_book_ending)
assert test_book_no_ending == expected_no_ending

# Remove book start
This is a harder problem.
We know that all Project Gutenberg books begin with boilerplate text, which we can find and remove. However, some books also have a table of contents, while others have introductions, preambles, etc.


In [None]:
def remove_book_start(book:str)->str:
    """Strip the Project Gutenberg boilerplate header from a book.

    Everything up to and including the
    ``*** START OF THE PROJECT GUTENBERG EBOOK <title> ***`` marker is
    removed. Front matter that follows the marker (title page, table of
    contents, ...) is kept.

    Args:
        book: full text of a Project Gutenberg plain-text file.

    Returns:
        The text that follows the start marker.

    Raises:
        ValueError: if the start marker is not found.
    """
    # The title is matched as "anything except an asterisk" so that titles
    # containing punctuation (apostrophes, commas, colons, ...) are found too;
    # a word/whitespace-only character class misses them.
    start_of_book_pattern = r'\*\*\* START OF THE PROJECT GUTENBERG EBOOK [^*]+\*\*\*'
    match = re.search(start_of_book_pattern, book)
    if match is None:
        raise ValueError('could not find project gutenberg beginning')
    return book[match.end():]

## Unittest

In [None]:
test_book_start = '\ufeffThe Project Gutenberg eBook of The Great Gatsby        This ebook is for the use of anyone anywhere in the United States and  most other parts of the world at no cost and with almost no restrictions  whatsoever. You may copy it, give it away or re-use it under the terms  of the Project Gutenberg License included with this ebook or online  at www.gutenberg.org. If you are not located in the United States,  you will have to check the laws of the country where you are located  before using this eBook.    Title: The Great Gatsby      Author: F. Scott Fitzgerald    Release date: January 17, 2021 [eBook #64317]    Language: English        *** START OF THE PROJECT GUTENBERG EBOOK THE GREAT GATSBY ***          The Great Gatsby        by      F. Scott Fitzgerald                                 Table of Contents    I  II  III  IV  V  VI  VII  VIII  IX                                    Once again                                    to                                   Zelda      Then wear the go'
test_book_no_start = remove_book_start(test_book_start)
assert test_book_no_start == '          The Great Gatsby        by      F. Scott Fitzgerald                                 Table of Contents    I  II  III  IV  V  VI  VII  VIII  IX                                    Once again                                    to                                   Zelda      Then wear the go'

In [None]:
# Drop the Gutenberg header first, then the footer, and preview the result.
gg = remove_bookend(remove_book_start(great_gatsby))
gg[:1000]

In [None]:
# Strip the Gutenberg header and footer from every downloaded book in place.
for title in data_set:
    book = remove_bookend(remove_book_start(data_set[title]))
    data_set[title] = book

# remove unwanted characters

In [None]:
# first, remove unwanted new line and tab characters from the text
for char in ["\n", "\r", "\d", "\t", "\s\s\s"]:
    gg = gg.replace(char, " ")

In [None]:
# remove header, footer from gutenberg
gg[900:1000].split()

In [None]:
def remove_new_line_tabs(book: str) -> str:
    """Replace newlines, carriage returns and tabs in *book* with spaces.

    The literal two-character sequences backslash-d and backslash-s are
    replaced as well. ``str.replace`` does not interpret regular expressions,
    so digits and plain spaces are left untouched.

    Args:
        book: text to clean.

    Returns:
        The text with every occurrence of the listed sequences replaced by a
        single space.
    """
    # "\\d" and "\\s" are spelled with escaped backslashes: the unescaped
    # forms are invalid escape sequences that Python warns about.
    for char in ("\n", "\r", "\\d", "\t", "\\s"):
        book = book.replace(char, " ")
    return book


## Unittest

In [None]:
# unittest
# The backslash-d and backslash-s sequences are written with escaped
# backslashes so the literal text under test is explicit and Python does not
# warn about invalid escape sequences.
test_book = 'hello \n I\'m trying\t to show \\d the new tabs \\s \r and how it gets \n broken up. \r Poetic!'
print(test_book)
test_book_ans = "hello   I'm trying  to show   the new tabs     and how it gets   broken up.   Poetic!"
assert test_book_ans == remove_new_line_tabs(test_book)
print(remove_new_line_tabs(test_book))

In [None]:
# Run the newline/tab cleanup over every book, replacing each entry in place.
for title in data_set:
    print(f'processing {title}')
    book = data_set[title]
    data_set[title] = remove_new_line_tabs(book)

# Map authors to labels

In [None]:
# Integer label <-> author name lookup tables (labels are 0, 1, 2).
id2author = dict(enumerate(('Fitzgerald', 'Hemingway', 'Dickens')))
author2id = {name: label for label, name in id2author.items()}
print(author2id)

# Tokenize sentences

In [None]:
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

In [None]:
# Split the text held in gg into sentences with NLTK's sent_tokenize and
# report the container type and sentence count.
sentences = sent_tokenize(gg)
print(f'the sentences are of {type(sentences)} and there are {len(sentences)} many sentences')

In [None]:
# Group the tokenized sentences into consecutive clusters of x_sentences and
# label each cluster with the author name. Sentences left over after the last
# full cluster are not included.
x_sentences = 3
author = 'Fitzgerald'
data = []
for i in range(int(len(sentences)/x_sentences)):
    first = i*x_sentences
    sentence_cluster = sentences[first:first + x_sentences]
    data.append((sentence_cluster, author))
print(i, len(data))

In [None]:
def convert_to_sentences(book, author_id, x_sentences=3):
    """Split *book* into labelled clusters of consecutive sentences.

    The text is tokenized with ``sent_tokenize`` and cut into consecutive,
    non-overlapping groups of ``x_sentences`` sentences. Sentences left over
    after the last full group are dropped.

    Returns a list of ``(sentence_list, author_id)`` tuples.
    """
    tokenized = sent_tokenize(book)
    n_full_clusters = int(len(tokenized)/x_sentences)
    return [
        (tokenized[k*x_sentences:(k+1)*x_sentences], author_id)
        for k in range(n_full_clusters)
    ]
# Try the function on the text held in gg, using Fitzgerald's label id.
author_id = author2id['Fitzgerald']
convert_to_sentences(gg, author_id)

In [None]:
# Build the sentence clusters for Fitzgerald's book only.
# NOTE: zip pairs titles with authors by position, so the order of `authors`
# must match the insertion order of `data_set`.
authors = ('Fitzgerald','Hemingway','Dickens')
for title, author in zip(data_set.keys(), authors):
    if author == 'Fitzgerald':
        author_id = author2id[author]
        print(title, author, author_id)
        book = data_set[title]
        data = convert_to_sentences(book, author_id, x_sentences=3)
    # Stop once the next author is reached. The check must be on `author`:
    # `title` holds a book key (e.g. 'the_sun_also_rises'), which never
    # equals an author name, so comparing it would never end the loop early.
    if author == 'Hemingway':
        break

In [None]:
data[0]

In [None]:
data[-1]

# Shuffle data into train val test

In [None]:
# Fractions of the examples assigned to each split.
train_frac = 0.6
val_frac = 0.1
test_frac = 0.3
# Compare with a tolerance: sums of binary floats are not always exactly 1.0
# (e.g. 0.7 + 0.2 + 0.1), so an exact == check rejects valid fractions.
assert abs(train_frac + val_frac + test_frac - 1.0) < 1e-9

## use numpy to shuffle data

In [None]:
import numpy as np
# Seed NumPy's global RNG so the shuffle below is reproducible.
np.random.seed(123)
print(f'first record: {data[0]}')
# np.random.shuffle reorders `data` in place.
np.random.shuffle(data)
print(f'\n\n\nfirst record after shuffle: {data[0]}')


In [None]:
# Cut the (already shuffled) examples into train / val / test slices.
# The test split takes whatever remains after train and val.
total = len(data)
train_end = int(total*train_frac)
val_end = train_end + int(total*val_frac)
train = data[0:train_end]
val = data[train_end:val_end]
test = data[val_end:]

print(len(train), len(val), len(test), len(train)+len(val)+len(test))

In [None]:
np.random.seed(123)
def split_train_test_val(train_frac:float=0.6, val_frac:float=0.1, test_frac:float=0.3, data:list[tuple]=[])->tuple[list,list,list]:
    """Shuffle a list of examples and split it into train, val and test sets.

    Each example is a tuple ``([sentences], label)``. The shuffle uses
    NumPy's global RNG, so call ``np.random.seed`` beforehand for a
    reproducible split. The input list itself is not modified.

    Args:
        train_frac: fraction of the examples placed in the training set.
        val_frac: fraction of the examples placed in the validation set.
        test_frac: fraction for the test set; it is only used to validate
            that the fractions sum to 1, the test set receives every example
            left after the train and val slices.
        data: list of ``([sentences], label)`` tuples.

    Returns:
        Three lists:
            train - list of tuples like ([train sentences], train label)
            val - list of tuples like ([validation sentences], validation label)
            test - list of tuples like ([test sentences], test label)

    Raises:
        AssertionError: if the three fractions do not sum to 1.
    """
    # Compare with a tolerance: sums of binary floats are not always exactly
    # 1.0 (e.g. 0.7 + 0.2 + 0.1), so an exact == check rejects valid input.
    assert np.isclose(train_frac + val_frac + test_frac, 1.0), 'tra, val, and test frac must sum to 1'

    # Shuffle a copy so that neither the caller's list nor the shared default
    # argument is reordered as a side effect.
    shuffled = list(data)
    np.random.shuffle(shuffled)

    total_examples = len(shuffled)
    train_end = int(total_examples*train_frac)
    val_end = train_end + int(total_examples*val_frac)

    # slice the shuffled tuples; test takes the remainder so that rounding
    # in the two int() calls never drops an example
    train = shuffled[:train_end]
    val = shuffled[train_end:val_end]
    test = shuffled[val_end:]

    # print the total tuples in each set
    print(len(train), len(val), len(test), len(train)+len(val)+len(test))

    assert len(train)+len(val)+len(test) == total_examples, 'train, val, and test examples are wrong length'

    return train, val, test

train, val, test = split_train_test_val(train_frac, val_frac, test_frac, data)
train[0]

In [None]:
train[0:3]

In [None]:
author2id

In [None]:
# Explicit title -> author mapping. Pairing titles and authors by position
# (zip over the dict keys and a separate tuple) silently mislabels books if
# either ordering changes.
title2author = {
    'great_gatsby': 'Fitzgerald',
    'the_sun_also_rises': 'Hemingway',
    'a_tale_of_two_cities': 'Dickens',
}
authors = tuple(title2author.values())
train = []
val = []
test = []
for title, author in title2author.items():
    author_id = author2id[author]
    print(title, author, author_id)
    book = data_set[title]
    data = convert_to_sentences(book, author_id, x_sentences=3)
    # split each book separately so every author contributes the same
    # train/val/test proportions
    train_, val_, test_ = split_train_test_val(train_frac, val_frac, test_frac, data)
    train.extend(train_)
    val.extend(val_)
    test.extend(test_)

# shuffle when done so the examples are no longer grouped by author
np.random.shuffle(train)
np.random.shuffle(val)
np.random.shuffle(test)

In [None]:
# NOTE(review): the three splits were already shuffled at the end of the
# previous cell; this shuffles them once more before previewing. Each run of
# this cell changes the order again.
np.random.shuffle(train)
np.random.shuffle(val)
np.random.shuffle(test)
train[0:5]

In [None]:
print(f'training examples: {len(train)}\nvalidation examples: {len(val)}\ntest examples: {len(test)}')

# save the data

In [None]:
import pickle
# Persist the three splits as one pickled list [train, val, test].
# The path is relative, so the file lands in the current working directory
# (which an earlier cell changed with os.chdir('..')).
with open('data.pkl', 'wb') as f:
    pickle.dump([train, val, test], f)