# Preprocessing text

Input:
    
    URLs to the books
    
Output:

    X_train = list of examples, each a group of sentences reduced to a bag-of-words string
    
    y_train = author labels (0, 1, 2, ..., n)
    
    *same format for the train, validation, and test sets

### An example
What is an example, i.e. `X_train[0]`?

Each example is a group of consecutive sentences (three by default) that has been processed into a bag of words such that:

`'An example with two sentences. here\'s the second.'`

becomes:

`'an exampl with two sentenc here s the second'`


### The process
What I would like to do:

url -> book -> remove unwanted parts -> sentence tokenize -> stem -> reassemble -> pickle

In [1]:
# 21 Term Frequency inverse document frequency

In [2]:
# imports
import requests
import re
import pandas as pd
import os

In [3]:
# Move up to the project root so that `src` is importable and the relative
# `./data/...` paths used later resolve.
# Guarded on the presence of `src` so that re-running this cell does not climb
# one directory higher each time (a bare os.chdir('..') is not idempotent).
if not os.path.isdir('src'):
    os.chdir('..')
os.getcwd()


'/home/denny/Documents/mids/w266_NLP/lit-shazam'

In [4]:
# Download the raw text of every book.
from src.data.make_dataset import get_book
# TODO: load this mapping from data.raw.book_urls instead; that import was not working.
# from data.raw.book_urls import book_urls # not working?
# Short title slug -> Project Gutenberg plain-text URL.
book_urls = {'great_gatsby':'https://www.gutenberg.org/cache/epub/64317/pg64317.txt',
'the_sun_also_rises':'https://www.gutenberg.org/cache/epub/67138/pg67138.txt',
'a_tale_of_two_cities':'https://www.gutenberg.org/cache/epub/98/pg98.txt'}


# title slug -> whatever get_book returns for that URL
# (presumably the book's full text as one string; verify in src.data.make_dataset).
data_set = {}
for title, url in book_urls.items():
    print(url)  # progress indicator: one line per book fetched
    data_set[title] = get_book(url)

https://www.gutenberg.org/cache/epub/64317/pg64317.txt
https://www.gutenberg.org/cache/epub/67138/pg67138.txt
https://www.gutenberg.org/cache/epub/98/pg98.txt


In [5]:
from src.data.make_dataset import remove_book_start, remove_bookend, remove_new_line_tabs

In [6]:
# Run each book through the cleaning steps in order and store the result back
# under the same title. Only existing keys are reassigned, so iterating over
# the dict while updating it is safe.
for title in data_set:
    print(f'processing {title}')
    book = data_set[title]
    for clean in (remove_book_start, remove_bookend, remove_new_line_tabs):
        book = clean(book)
    data_set[title] = book
    
    

processing great_gatsby
processing the_sun_also_rises
processing a_tale_of_two_cities


In [7]:
# split to sentences

In [8]:
import nltk
# Make sure the Punkt resource is available locally before sentence tokenizing.
# NOTE(review): newer NLTK releases look up 'punkt_tab' rather than 'punkt' —
# confirm against the NLTK version this project pins.
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package punkt to /home/denny/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [9]:
def convert_to_sentences(book, sentences_per_example=3):
    """Split a book into examples made of consecutive sentences.

    The text is sentence-tokenized with ``sent_tokenize`` and the sentences are
    grouped, in order, into non-overlapping clusters of
    ``sentences_per_example`` sentences. Each cluster becomes one string.

    Args:
        book: The text of the book as a single string.
        sentences_per_example: Number of consecutive sentences per example.

    Returns:
        A list of strings, one per complete cluster. Trailing sentences that do
        not fill a whole cluster are dropped. No author label is attached.
    """
    sentences = sent_tokenize(book)
    total_clusters = len(sentences) // sentences_per_example
    data = []
    for i in range(total_clusters):
        start = i * sentences_per_example
        sentence_cluster = sentences[start:start + sentences_per_example]
        # Join with a space: with '' the last word of one sentence is glued to
        # the first word of the next whenever no punctuation separates them.
        data.append(' '.join(sentence_cluster))

    return data

In [10]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
def sentence_to_bag_of_words(sentence):
    """Reduce a piece of text to a space-separated string of stemmed tokens.

    Steps:
        1. Lower-case the text.
        2. Tokenize on runs of word characters (``\\w+``), which discards
           punctuation and splits on it (so "here's" yields "here" and "s").
        3. Porter-stem every token.
        4. Join the stems back together with single spaces.

    TODO: lemmatization is not implemented yet.
    """
    lowered = sentence.lower()
    words = RegexpTokenizer(r'\w+').tokenize(lowered)
    stemmer = PorterStemmer()
    return ' '.join(map(stemmer.stem, words))


In [11]:
def convert_examples_to_bag_of_words(examples:list):
    """Apply ``sentence_to_bag_of_words`` to every example; return the results as a list in the same order."""
    return list(map(sentence_to_bag_of_words, examples))
    

In [12]:
# Quick check on a single string: the text is lower-cased, punctuation and
# hyphens act as token boundaries, and every token is stemmed.
sentence_to_bag_of_words('Eighty-seven miles to go, yet.  Onward!')

'eighti seven mile to go yet onward'

In [13]:
# Hand-written examples to eyeball the batch conversion. The second one
# exercises the known apostrophe issue: "here's" comes out as "here s".
test_examples = ['One example, with a sentence here!', 
 'An example with two sentences. here\'s the second.', 
 'A third example, with some awe-inspiring Examples of punck:tua-TION!']
convert_examples_to_bag_of_words(test_examples)

['one exampl with a sentenc here',
 'an exampl with two sentenc here s the second',
 'a third exampl with some awe inspir exampl of punck tua tion']

In [14]:
# TODO: fix bug where words like here's are not properly processed
# TODO: Lemmatization

In [15]:
# Encode labels: every book title maps to its author, and every author maps to
# an integer class id (plus the reverse lookup).
title2author = {
    'great_gatsby': 'Fitzgerald',
    'the_sun_also_rises': 'Hemingway',
    'a_tale_of_two_cities': 'Dickens',
}
id2author = dict(enumerate(['Fitzgerald', 'Hemingway', 'Dickens']))
author2id = {author: idx for idx, author in id2author.items()}
print(author2id)

{'Fitzgerald': 0, 'Hemingway': 1, 'Dickens': 2}


In [16]:
# Assemble the corpus: X collects one bag-of-words string per example and y
# the author id of the book that example came from, in the same order.
X, y = [], []
for title, book in data_set.items():
    examples = convert_to_sentences(book, sentences_per_example=3)
    processed_examples = convert_examples_to_bag_of_words(examples)
    author_id = author2id[title2author[title]]
    labels = [author_id] * len(examples)
    X.extend(processed_examples)
    y.extend(labels)

# Every example must have exactly one label.
assert len(y) == len(X)

In [17]:
from sklearn.model_selection import train_test_split

# Target split: 60% train / 20% validation / 20% test.
# Step 1: hold out 20% of all examples as the test set (80% remains).
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

# Step 2: carve the validation set out of the remaining 80%.
# 0.25 x 0.8 = 0.2 of the full data set.
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.25, random_state=1
)

In [18]:
# Peek at the first ten stemmed tokens of the fourth training example.
X_train[3].split()[0:10]

['it', 'is', 'veri', 'import', 'to', 'discov', 'grace', 'exit', 'like', 'that']

In [19]:
# Display the first two training examples in full.
X_train[0:2]

['sit on tom s lap mr wilson call up sever peopl on the telephon then there were no cigarett and i went out to buy some at the drugstor on the corner when i came back they had both disappear so i sat down discreetli in the live room and read a chapter of simon call peter either it wa terribl stuff or the whiski distort thing becaus it didn t make ani sens to me just as tom and myrtl after the first drink mr wilson and i call each other by our first name reappear compani commenc to arriv at the apart door',
 'that wa the last day befor the fiesta chapter 15 at noon of sunday the 6th of juli the fiesta explod there is no other way to describ it']

# Save data
X's = ['sit on tom s lap mr wilson call up sever peopl',

'on the telephon then there were no cigarett and i went out to buy',

'some at the drugstor on the corner when i came back they had' ...]

y's = [
0,

0,

2,...
]


In [20]:
import pickle

# Persist all six splits as a single tuple. Anything loading this file must
# unpack in the same order: (X_train, X_val, X_test, y_train, y_val, y_test).
processed_path = './data/processed/nb_processed_data.pkl'
# Create the output directory if it is missing (e.g. on a fresh checkout) so
# open() does not fail with FileNotFoundError.
os.makedirs(os.path.dirname(processed_path), exist_ok=True)
with open(processed_path, 'wb') as f:
    pickle.dump((X_train, X_val, X_test, y_train, y_val, y_test), f)

In [21]:
# from sklearn.feature_extraction.text import CountVectorizer

In [22]:
# cv = CountVectorizer()

# X_train = cv.fit_transform(X_train)
# X_val = cv.transform(X_val)
# X_test = cv.transform(X_test)

In [23]:
# save the data

In [24]:
# os.getcwd()