# Preprocessing text

Input:
    
    urls to the books
    
output:

    X_train = list of sentences as bag of words
    
    y_train = author labels (0, 1, 2, ..., n)
    
    *same for train, test, and validation sets

### An example
What is an example? ie. X_train[0]?

Each example is a bunch of sentences that have been processed into a bag of words such that:

`'An example with two sentences. here\'s the second.'`

becomes:

`'an exampl with two sentenc here s the second'`


### The process
What I would like to do:

url -> book -> remove unwanted parts -> sentence tokenize -> stem -> reassemble -> pickle

In [1]:
# 22 Term Frequency inverse document frequency

In [2]:
# imports
import requests
import re
import pandas as pd
import os

In [3]:
os.chdir('..')
os.getcwd()


'/home/denny/Documents/mids/w266_NLP/lit-shazam'

`book_urls` contains a dictionary of key:value pairs of book title: web_url to gutenberg

In [4]:
from data.raw.book_urls import title2author, book_urls
print('the first two books:\n',list(book_urls.items())[:2])

the first two books:
 [('The Great Gatsby', 'https://www.gutenberg.org/cache/epub/64317/pg64317.txt'), ('This Side of Paradise', 'https://www.gutenberg.org/cache/epub/805/pg805.txt')]


fetch the data with the `get_book` function

In [5]:
from src.data.make_dataset import get_book
import time
data_set = {}
for title, url in book_urls.items():
    print(title,url)
    data_set[title] = get_book(url)
    time.sleep(7)

The Great Gatsby https://www.gutenberg.org/cache/epub/64317/pg64317.txt
This Side of Paradise https://www.gutenberg.org/cache/epub/805/pg805.txt
The Beautiful and the Damned https://www.gutenberg.org/cache/epub/9830/pg9830.txt
The Sun Also Rises https://www.gutenberg.org/cache/epub/67138/pg67138.txt
Men Without Women https://www.gutenberg.org/cache/epub/69683/pg69683.txt
In Our Time https://www.gutenberg.org/cache/epub/61085/pg61085.txt
time out
The Mayor of Casterbridge https://www.gutenberg.org/cache/epub/143/pg143.txt
Jude the Obscure https://www.gutenberg.org/cache/epub/153/pg153.txt
time out
Return of the Native https://www.gutenberg.org/cache/epub/122/pg122.txt
A Tale of Two Cities https://www.gutenberg.org/cache/epub/98/pg98.txt
Great Expectations https://www.gutenberg.org/cache/epub/1400/pg1400.txt
Bleak House https://www.gutenberg.org/cache/epub/1023/pg1023.txt
Emma https://www.gutenberg.org/cache/epub/158/pg158.txt
Sense and Sensibility https://www.gutenberg.org/cache/epub/16

remove unwanted sections of the book

In [25]:
import json
with open('./data/processed/data_set.json', 'w') as f:
    json.dump(data_set, f)



In [5]:
import json
with open('./data/processed/data_set.json', 'r') as f:
    data_set = json.load(f)

In [6]:
from src.data.make_dataset import remove_bookend, remove_new_line_tabs, remove_everything_before_starting_sentence
from data.raw.book_urls import book_starting_sentence

[nltk_data] Downloading package punkt to /home/denny/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
for title, book in data_set.items():
    print(f'processing {title}')
#     book = remove_book_start(book)
    starting_sentence = book_starting_sentence[title]
    book = remove_everything_before_starting_sentence(book, starting_sentence)
    book = remove_bookend(book)
    book = remove_new_line_tabs(book)
    data_set[title] = book
    
    

processing The Great Gatsby
processing This Side of Paradise
processing The Beautiful and the Damned
processing The Sun Also Rises
processing Men Without Women
processing In Our Time
processing The Mayor of Casterbridge
processing Jude the Obscure
processing Return of the Native
processing A Tale of Two Cities
processing Great Expectations
processing Bleak House
processing Emma
processing Sense and Sensibility
processing Pride and Prejudice
processing The Wisdom of Father Brown
processing The Man Who Was Thursday
processing The Ball and the Cross
processing As You Like It
processing Julius Caesar
processing Hamlet


In [8]:
failed_book = 'The Wisdom of Father Brown'
title, url = failed_book, book_urls[failed_book]
data_set[title] = get_book(url)

NameError: name 'get_book' is not defined

In [9]:
# split to sentences

In [10]:
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package punkt to /home/denny/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [11]:
# def convert_to_sentences(book, sentences_per_example=3):
#     """returns a list of (sentences_per_example, author) pairs """
#     sentences = sent_tokenize(book)
#     total_clusters = int(len(sentences)/sentences_per_example)
#     data = []
#     for i in range(total_clusters):
#         sentence_cluster = sentences[i*sentences_per_example:(i+1)*sentences_per_example]
#         data += [''.join(sentence_cluster)]
        
#     return data

In [12]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
def sentence_to_bag_of_words(sentence):
    """Converts words in a sentence into stemmed tokens"""
    # 1. lower case
    # 2. remove punctuation
    # 3. tokenize
    # 4. stem
    # 5. TODO: lem
    # 6. combine back together with spaces
    
    result = sentence.lower()
    

    tokenizer = RegexpTokenizer(r'\w+')
    token_list = tokenizer.tokenize(result)
    
    
    porter = PorterStemmer()
    porter_tokens = [porter.stem(token) for token in token_list]
    
    bag_of_words = ' '.join(porter_tokens)
    
    return bag_of_words


In [13]:
def convert_examples_to_bag_of_words(examples:list):
    return [sentence_to_bag_of_words(batch_of_sentences) for batch_of_sentences in examples]
    

In [14]:
sentence_to_bag_of_words('Eighty-seven miles to go, yet.  Onward!')

'eighti seven mile to go yet onward'

In [15]:
test_examples = ['One example, with a sentence here!', 
 'An example with two sentences. here\'s the second.', 
 'A third example, with some awe-inspiring Examples of punck:tua-TION!']
convert_examples_to_bag_of_words(test_examples)

['one exampl with a sentenc here',
 'an exampl with two sentenc here s the second',
 'a third exampl with some awe inspir exampl of punck tua tion']

In [16]:
# TODO: fix bug where words like here's are not properly processed
# TODO: Lemmatization

{'Fitzgerald': 0, 'Hemingway': 1, 'Dickens': 2}


In [19]:
from data.raw.book_urls import title2author, author2id
from src.data.make_dataset import convert_to_sentences, convert_examples_to_bag_of_words

#Encode labels
author2id
title2author

{'The Great Gatsby': 'fitzgerald',
 'This Side of Paradise': 'fitzgerald',
 'The Beautiful and the Damned': 'fitzgerald',
 'The Sun Also Rises': 'hemingway',
 'Men Without Women': 'hemingway',
 'In Our Time': 'hemingway',
 'The Mayor of Casterbridge': 'hardy',
 'Jude the Obscure': 'hardy',
 'Return of the Native': 'hardy',
 'A Tale of Two Cities': 'dickens',
 'Great Expectations': 'dickens',
 'Bleak House': 'dickens',
 'Emma': 'austen',
 'Sense and Sensibility': 'austen',
 'Pride and Prejudice': 'austen',
 'The Wisdom of Father Brown': 'chesterton',
 'The Man Who Was Thursday': 'chesterton',
 'The Ball and the Cross': 'chesterton',
 'As You Like It': 'shakespeare',
 'Julius Caesar': 'shakespeare',
 'Hamlet': 'shakespeare'}

In [20]:
X, y = [],[]
for title, book in data_set.items():
    examples = convert_to_sentences(book, sentences_per_example=3)
    processed_examples = convert_examples_to_bag_of_words(examples)
    labels = [author2id[title2author[title]]] * len(examples)
    X += processed_examples
    y += labels
    
assert len(y) == len(X)

In [21]:
from sklearn.model_selection import train_test_split
# train val test split 60%, 20%, 20%
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1) # 100% = 80% train, 20% test

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1) # 0.25 x 0.8 = 0.2

In [22]:
X_train[3].split()[0:10]

['i',
 'am',
 'glad',
 'volumnia',
 'repeat',
 'sir',
 'leicest',
 'unmind',
 'of',
 'these']

In [23]:
X_train[0:2]

['then we are both of one mind at last she said ye repli venn gloomili but if you would tell me miss whi you take such an interest in her i should be easier',
 'am i sir no jo close hi eye mutter i m weri thank after watch him close a littl while allan put hi mouth veri near hi ear and say to him in a low distinct voic jo did you ever know a prayer never knowd nothink sir not so much as one short prayer no sir nothink at all']

# Save data
X's = ['sit on tom s lap mr wilson call up sever peopl',

'on the telephon then there were no cigarett and i went out to buy',

'some at the drugstor on the corner when i came back they had' ...]

y's = [
0,

0,

2,...
]


In [24]:
import pickle
with open('./data/processed/nb_processed_data.pkl', 'wb') as f:
    pickle.dump((X_train, X_val, X_test, y_train, y_val, y_test), f)

In [None]:
# from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# cv = CountVectorizer()

# X_train = cv.fit_transform(X_train)
# X_val = cv.transform(X_val)
# X_test = cv.transform(X_test)

In [None]:
# save the data

In [None]:
# os.getcwd()