# Preprocessing text

Input:
    
    urls to the books
    
Output:

    X_train = list of examples, each a small group of sentences reduced to one bag-of-words string
    
    y_train = author labels (0, 1, 2, ..., n)
    
    *same for train, test, and validation sets

### An example
What is an example, i.e. what does `X_train[0]` look like?

Each example is a bunch of sentences that have been processed into a bag of words such that:

`'An example with two sentences. here\'s the second.'`

becomes:

`'an exampl with two sentenc here s the second'`


### The process
What I would like to do:

url -> book -> remove unwanted parts -> sentence tokenize -> stem -> reassemble -> pickle

In [None]:
# 21 Term Frequency inverse document frequency

In [None]:
# imports
import requests
import re
import pandas as pd
import os

In [None]:
# Work from the project root so the `src.` imports and the `./data/...`
# paths used later in this notebook resolve.
# Only step up when `src` is not already visible from the current directory;
# an unconditional chdir('..') climbs one level further every time the cell
# is re-run.
if not os.path.isdir('src'):
    os.chdir('..')
os.getcwd()


In [None]:
# get data
from src.data.make_dataset import get_book
# from data.raw.book_urls import book_urls # not working?

# Short title -> URL of the plain-text book.
book_urls = {
    'great_gatsby': 'https://www.gutenberg.org/cache/epub/64317/pg64317.txt',
    'the_sun_also_rises': 'https://www.gutenberg.org/cache/epub/67138/pg67138.txt',
    'a_tale_of_two_cities': 'https://www.gutenberg.org/cache/epub/98/pg98.txt',
}

# Fetch every book, keyed by the same short title.
data_set = {}
for title in book_urls:
    url = book_urls[title]
    print(url)
    data_set[title] = get_book(url)

In [None]:
from src.data.make_dataset import remove_book_start, remove_bookend, remove_new_line_tabs

In [None]:
# Run each book through the three cleaning helpers, in order, and store the
# cleaned text back under the same title.
for title in data_set:
    print(f'processing {title}')
    without_start = remove_book_start(data_set[title])
    without_ends = remove_bookend(without_start)
    book = remove_new_line_tabs(without_ends)
    data_set[title] = book
    
    

In [None]:
# split to sentences

In [None]:
import nltk
# Fetch the 'punkt' resource before sentence tokenizing.
# NOTE(review): recent NLTK releases appear to look up 'punkt_tab' instead of
# 'punkt' — confirm against the installed NLTK version.
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

In [None]:
def convert_to_sentences(book, sentences_per_example=3):
    """Split a book into examples made of consecutive sentences.

    The text is split with ``sent_tokenize`` and the sentences are grouped,
    in order, into clusters of ``sentences_per_example``. Each cluster is
    joined into a single string. Trailing sentences that do not fill a
    complete cluster are dropped.

    Args:
        book: Full text of the book as one string.
        sentences_per_example: Number of consecutive sentences per example.

    Returns:
        A list of strings, one per complete cluster. No author label is
        attached here; labels are built separately by the caller.
    """
    sentences = sent_tokenize(book)
    total_clusters = len(sentences) // sentences_per_example
    stop = total_clusters * sentences_per_example
    # Join with a space rather than '': the tokenized sentences do not carry
    # the whitespace that separated them, so an empty separator glues the
    # last token of one sentence onto the first token of the next.
    return [
        ' '.join(sentences[start:start + sentences_per_example])
        for start in range(0, stop, sentences_per_example)
    ]

In [None]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
def sentence_to_bag_of_words(sentence):
    """Reduce a piece of text to a space-separated string of stemmed tokens.

    Steps:
        1. Lower-case the text.
        2. Tokenize on runs of word characters (``\\w+``), which drops
           punctuation and splits on it, so ``here's`` becomes ``here`` + ``s``.
        3. Stem every token with the Porter stemmer.
        4. Join the stems back together with single spaces.

    TODO: lemmatization.
    """
    lowered = sentence.lower()
    words = RegexpTokenizer(r'\w+').tokenize(lowered)

    stemmer = PorterStemmer()
    stems = (stemmer.stem(word) for word in words)

    return ' '.join(stems)


In [None]:
def convert_examples_to_bag_of_words(examples:list):
    """Apply ``sentence_to_bag_of_words`` to every example, preserving order."""
    return list(map(sentence_to_bag_of_words, examples))
    

In [None]:
# Quick manual check on input with a hyphenated word, commas, a double space
# and an exclamation mark.
sentence_to_bag_of_words('Eighty-seven miles to go, yet.  Onward!')

In [None]:
# Hand-written examples exercising commas, an apostrophe, hyphens, mixed case
# and stray punctuation inside a word.
test_examples = [
    'One example, with a sentence here!',
    'An example with two sentences. here\'s the second.',
    'A third example, with some awe-inspiring Examples of punck:tua-TION!',
]
convert_examples_to_bag_of_words(test_examples)

In [None]:
# TODO: fix bug where words like here's are not properly processed
# TODO: Lemmatization

In [None]:
# Encode labels
# Book title -> author name.
title2author = {
    'great_gatsby': 'Fitzgerald',
    'the_sun_also_rises': 'Hemingway',
    'a_tale_of_two_cities': 'Dickens',
}
# Integer class id -> author name.
id2author = {
    0: 'Fitzgerald',
    1: 'Hemingway',
    2: 'Dickens',
}
# Reverse lookup: author name -> integer class id.
author2id = dict(zip(id2author.values(), id2author.keys()))
print(author2id)

In [None]:
# Build the flat example list X and the parallel label list y, book by book.
X = []
y = []
for title, book in data_set.items():
    # Every example taken from this book gets the id of the book's author.
    author_id = author2id[title2author[title]]
    examples = convert_to_sentences(book, sentences_per_example=3)
    processed_examples = convert_examples_to_bag_of_words(examples)
    labels = [author_id] * len(examples)
    X.extend(processed_examples)
    y.extend(labels)

assert len(y) == len(X)

In [None]:
from sklearn.model_selection import train_test_split

# train val test split 60%, 20%, 20%
# Step 1: hold out 20% of all examples as the test set (80% remains).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

# Step 2: take 25% of the remaining 80% as validation (0.25 x 0.8 = 0.2).
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.25,
    random_state=1,
)

In [None]:
# Peek at the first ten tokens of the fourth training example.
X_train[3].split()[0:10]

In [None]:
# Show the first two training examples.
X_train[0:2]

# Save data
X's = ['sit on tom s lap mr wilson call up sever peopl',

'on the telephon then there were no cigarett and i went out to buy',

'some at the drugstor on the corner when i came back they had' ...]

y's = [
0,

0,

2,...
]


In [None]:
import pickle

# Persist all six splits as one tuple, in the order
# (X_train, X_val, X_test, y_train, y_val, y_test); loaders must unpack in
# that same order.
splits = (X_train, X_val, X_test, y_train, y_val, y_test)
with open('./data/processed/2nb_processed_data.pkl', 'wb') as f:
    pickle.dump(splits, f)

In [None]:
# from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# cv = CountVectorizer()

# X_train = cv.fit_transform(X_train)
# X_val = cv.transform(X_val)
# X_test = cv.transform(X_test)

In [None]:
# save the data

In [None]:
# os.getcwd()