Notebook to develop test data using the NLTK package and Project Gutenberg. The NLTK Gutenberg corpus consists of 18 works of literature, including novels, plays, poems, and the King James Bible. The version in this notebook uses a balanced training dataset selected only from the subset of novels in the corpus. The notebook stores the pandas DataFrame objects as parquet files for retrieval by downstream notebooks.

In [None]:

!pip install pydot --quiet
!pip install nltk --quiet
!pip install pyarrow -quiet



Usage:   
  pip3 install [options] <requirement specifier> [package-index-options] ...
  pip3 install [options] -r <requirements file> [package-index-options] ...
  pip3 install [options] [-e] <vcs project url> ...
  pip3 install [options] [-e] <local project path> ...
  pip3 install [options] <archive url/path> ...

no such option: -u


In [None]:
import numpy as np
import tensorflow as tf
import pandas as pd

import os
import nltk
from nltk.data import find

import matplotlib.pyplot as plt
import random
import re

nltk.download('gutenberg')
from nltk.corpus import gutenberg

nltk.download('punkt')
from nltk.tokenize import sent_tokenize

from google.colab import drive

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:

# Get a list of file IDs (book IDs) from Project Gutenberg
# Each ID has the form "author-short_title.txt" (see the displayed list).
book_ids = gutenberg.fileids()
# Bare expression so the notebook displays the list
book_ids

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [None]:
# Select the prose-fiction subset of the corpus by position in book_ids:
# indices 0-2 (the three Austen works) and 5-12 (Bryant through Melville).
# Skipped: indices 3-4 (Bible, Blake poems) and 13 onward (Milton,
# Shakespeare, Whitman).
novel_indices = [0, 1, 2, 5, 6, 7, 8, 9, 10, 11, 12]
book_list = [book_ids[k] for k in novel_indices]
book_list

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt']

In [None]:
# ***************************************************************************
# For every selected book collect the author, short title, full title and
# the book's sentences (as plain strings); summarise them in df_books
# ***************************************************************************
B = len(book_list) # Number of books
# NOTE(review): S is not referenced by any cell visible in this notebook;
# the chunking cell below uses its own chunk_size.
S = 3 # Set size of each passage, number of sentences, default is 3 sentence chunks

authors = []
short_titles = []
titles = []
books = []
data = []

for file_id in book_list:
  # File ids look like "author-short_title.txt":
  # the piece before the first '-' is the author, the piece after it
  # (up to the first '.') is the short title
  name_parts = file_id.split('-')
  author_name = name_parts[0]
  short_name = name_parts[1].split('.')[0]

  # gutenberg.sents(...) gives every sentence as a list of word tokens
  tokenized = gutenberg.sents(file_id)

  # The first "sentence" carries the full title and author
  full_title = ' '.join(tokenized[0])

  # Drop the title sentence and turn each remaining token list into one
  # space-joined string (keep tokenized[1:] itself if word tokens are needed)
  sentence_strings = [' '.join(words) for words in tokenized[1:]]

  authors.append(author_name)
  short_titles.append(short_name)
  titles.append(full_title)
  books.append(sentence_strings)

  data.append({"Author": author_name, "Short Title": short_name,
               "Title": full_title,
               "Sentence Count": len(sentence_strings)})

df_books = pd.DataFrame(data)

In [None]:
# UNIT TEST
# A bare `len(books[0])` in the middle of a cell is evaluated and discarded
# (only the last expression is displayed), so check the invariants explicitly:
# one row per book, and each stored sentence count matches the sentence list.
assert len(books) == len(df_books), "df_books should have one row per book"
assert [len(sentences) for sentences in books] == df_books["Sentence Count"].tolist(), \
    "Sentence Count column does not match the collected sentences"
df_books


Unnamed: 0,Author,Short Title,Title,Sentence Count
0,austen,emma,[ Emma by Jane Austen 1816 ],7751
1,austen,persuasion,[ Persuasion by Jane Austen 1818 ],3746
2,austen,sense,[ Sense and Sensibility by Jane Austen 1811 ],4998
3,bryant,stories,[ Stories to Tell to Children by Sara Cone Bry...,2862
4,burgess,busterbrown,[ The Adventures of Buster Bear by Thornton W ...,1053
5,carroll,alice,[ Alice ' s Adventures in Wonderland by Lewis ...,1702
6,chesterton,ball,[ The Ball and The Cross by G . K . Chesterton...,4778
7,chesterton,brown,[ The Wisdom of Father Brown by G . K . Cheste...,3805
8,chesterton,thursday,[ The Man Who Was Thursday by G . K . Chestert...,3741
9,edgeworth,parents,"[ The Parent ' s Assistant , by Maria Edgeworth ]",10229


In [None]:
#***********************************************
# Create paragraph groups of size chunk_size
#
# Each group is chunk_size consecutive sentences of one book joined into a
# single string. The last group of a book may hold fewer sentences when the
# sentence count is not a multiple of chunk_size.
#***********************************************
chunk_size = 4
book_groups = []

for book in books:
  # Join with a single space: joining with "" glues the last token of one
  # sentence to the first token of the next (e.g. "CHAPTER IEmma"), which
  # corrupts the text seen by downstream tokenizers.
  combined_sents = [
      " ".join(book[j:j+chunk_size])
      for j in range(0, len(book), chunk_size)
  ]

  book_groups.append(combined_sents)

In [None]:
# Attach each book's sentence groups to its row, plus the number of groups
df_books["Sentence Groups"] = book_groups
df_books["Group Counts"] = df_books["Sentence Groups"].map(len)
df_books

Unnamed: 0,Author,Short Title,Title,Sentence Count,Sentence Groups,Group Counts
0,austen,emma,[ Emma by Jane Austen 1816 ],7751,"[VOLUME ICHAPTER IEmma Woodhouse , handsome , ...",1938
1,austen,persuasion,[ Persuasion by Jane Austen 1818 ],3746,"[Chapter 1Sir Walter Elliot , of Kellynch Hall...",937
2,austen,sense,[ Sense and Sensibility by Jane Austen 1811 ],4998,[CHAPTER 1The family of Dashwood had long been...,1250
3,bryant,stories,[ Stories to Tell to Children by Sara Cone Bry...,2862,[TWO LITTLE RIDDLES IN RHYMEThere ' s a garden...,716
4,burgess,busterbrown,[ The Adventures of Buster Bear by Thornton W ...,1053,[IBUSTER BEAR GOES FISHINGBuster Bear yawned a...,264
5,carroll,alice,[ Alice ' s Adventures in Wonderland by Lewis ...,1702,[CHAPTER I .Down the Rabbit - HoleAlice was be...,426
6,chesterton,ball,[ The Ball and The Cross by G . K . Chesterton...,4778,[I .A DISCUSSION SOMEWHAT IN THE AIRThe flying...,1195
7,chesterton,brown,[ The Wisdom of Father Brown by G . K . Cheste...,3805,[I .The Absence of Mr GlassTHE consulting - ro...,952
8,chesterton,thursday,[ The Man Who Was Thursday by G . K . Chestert...,3741,[To Edmund Clerihew BentleyA cloud was on the ...,936
9,edgeworth,parents,"[ The Parent ' s Assistant , by Maria Edgeworth ]",10229,[THE ORPHANS .Near the ruins of the castle of ...,2558


In [None]:
# ****************************************************************
# PREPARE DATAFRAME
#
# For every book, shuffle its sentence groups (each group moves as a unit),
# then slice the shuffled list sequentially: the first part becomes
# Training, the next part Validation, and everything left over Testing.
# Because the shuffle happens first, the sequential slices are still random.
# ***************************************************************************

# Select Train, Valid, Test split
train_split = 0.7
valid_split = 0.1
test_split  = 0.2  # Test receives the remainder after Train and Valid are cut
assert abs(train_split + valid_split + test_split - 1.0) < 1e-9, \
    "split fractions must sum to 1"

# Seed the shuffle so the splits written to parquet can be regenerated
# exactly on a fresh Restart & Run All.
SPLIT_SEED = 42
random.seed(SPLIT_SEED)

# One entry per book, in the same order as the rows of df_books
data_train = []
data_valid = []
data_test  = []

for groups in book_groups:
  n = len(groups)

  train_end = int(n*train_split)
  valid_end = train_end + int(n*valid_split)

  # Shuffle a copy so the original reading order in book_groups is preserved
  group = groups.copy()
  random.shuffle(group)

  train_group = group[:train_end]
  valid_group = group[train_end:valid_end]
  test_group  = group[valid_end:]

  # Every group must land in exactly one split
  assert len(train_group) + len(valid_group) + len(test_group) == n

  data_train.append(train_group)
  data_valid.append(valid_group)
  data_test.append(test_group)

df_books["Train"] = data_train
df_books["Valid"] = data_valid
df_books["Test"]  = data_test
df_books

Unnamed: 0,Author,Short Title,Title,Sentence Count,Sentence Groups,Group Counts,Train,Valid,Test
0,austen,emma,[ Emma by Jane Austen 1816 ],7751,"[VOLUME ICHAPTER IEmma Woodhouse , handsome , ...",1938,"["" I dare say they are , sir .I am sure I do n...",[There was certainly no harm in his travelling...,"["" To a Mrs . Smallridge -- charming woman -- ..."
1,austen,persuasion,[ Persuasion by Jane Austen 1818 ],3746,"[Chapter 1Sir Walter Elliot , of Kellynch Hall...",937,"["" But it rains ."""" Oh !very little , Nothing ...","[Between these two , she could want no possibl...",[Lady Russell and Anne paid their compliments ...
2,austen,sense,[ Sense and Sensibility by Jane Austen 1811 ],4998,[CHAPTER 1The family of Dashwood had long been...,1250,"[He was not an ill - disposed young man , unle...","[A discovery took place ,""-- here he hesitated...","[Yes , Marianne , even in a man between thirty..."
3,bryant,stories,[ Stories to Tell to Children by Sara Cone Bry...,2862,[TWO LITTLE RIDDLES IN RHYMEThere ' s a garden...,716,"["" Begorra , now , I ' ll have yees Widout muc...","["" Wolf !Wolf !Come and help !The wolves are a...","["" Run !run !as fast as you can !"" You can ' t..."
4,burgess,busterbrown,[ The Adventures of Buster Bear by Thornton W ...,1053,[IBUSTER BEAR GOES FISHINGBuster Bear yawned a...,264,[When Little Joe Otter had told how Farmer Bro...,[Then he jumped on the helpless pail .With a b...,"[Peter Rabbit , sitting bolt upright under a t..."
5,carroll,alice,[ Alice ' s Adventures in Wonderland by Lewis ...,1702,[CHAPTER I .Down the Rabbit - HoleAlice was be...,426,[' As if it wasn ' t trouble enough hatching t...,[' That ' s very curious !'she thought .' But ...,[Who would not give all else for two Pennywort...
6,chesterton,ball,[ The Ball and The Cross by G . K . Chesterton...,4778,[I .A DISCUSSION SOMEWHAT IN THE AIRThe flying...,1195,[We must be eternally on our guard ; we must l...,"[cried MacIan , scornfully ."" There are a few ...",[I ' ll not fight foul for all the girls and a...
7,chesterton,brown,[ The Wisdom of Father Brown by G . K . Cheste...,3805,[I .The Absence of Mr GlassTHE consulting - ro...,952,"["" There ' s a disadvantage in a stick pointin...",[The Rev .Mr Brown broke into a rather childis...,"[Well , I ' ll simply tell you , in the fewest..."
8,chesterton,thursday,[ The Man Who Was Thursday by G . K . Chestert...,3741,[To Edmund Clerihew BentleyA cloud was on the ...,936,"[asked Syme with eager eyes ."" I will take you...",[One asked whether a bushy beard would hide my...,"[For these disguises did not disguise , but re..."
9,edgeworth,parents,"[ The Parent ' s Assistant , by Maria Edgeworth ]",10229,[THE ORPHANS .Near the ruins of the castle of ...,2558,[The Landlord and Farmer join them .)Lord J . ...,"[Don ' t you remember the circumstance ?"""" I h...",[Now I have told you all I know ; and so I hop...


In [None]:
# Sanity check: the first sentence group of the first book should be a str
type(df_books.at[0, "Sentence Groups"][0])

str

In [None]:
# Store dataframe to Drive in parquet format
# Mount a google Drive for persistent store
# NOTE(review): `drive` is already imported in the imports cell near the top
# of the notebook; the import here is redundant but harmless.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# pyarrow is installed by the first cell of this notebook; the alias `pya`
# is used by the retrieval cell below.
# NOTE(review): presumably pandas uses pyarrow as its parquet engine here — confirm.
import pyarrow as pya

# Write the per-book dataframe to the local working directory first,
# then move the file into the project folder on the mounted Drive.
df_books.to_parquet("nltk_corpus_df_chunks.parquet")
!mv "nltk_corpus_df_chunks.parquet" "/content/drive/My Drive/w266_Project/ProjectStore/nltk_corpus_df_chunks.parquet"

In [None]:
# Test parquet file retrieval
# read into a pyarrow table
# NOTE: list arrays before store get converted to numpy.ndarrays after recalling from Drive
#
# `pyarrow.parquet` is a submodule and must be imported explicitly;
# `import pyarrow as pya` alone does not guarantee that `pya.parquet` exists
# (it only works if something else happened to import the submodule first).
import pyarrow.parquet as pq

table = pq.read_table("/content/drive/My Drive/w266_Project/ProjectStore/nltk_corpus_df_chunks.parquet")
df = table.to_pandas()

# Round-trip check: same rows and same columns as the frame that was stored
assert df.shape == df_books.shape, "retrieved frame has a different shape"
assert list(df.columns) == list(df_books.columns), "retrieved frame has different columns"
df

In [None]:
# Prepare data for binary classification model
# Label 1 = passage written by Jane Austen, label 0 = passage by any other author.
# Every book contributes its own Train / Valid / Test passages to the
# corresponding split; each entry of train/valid/test is one book's list of
# (passage, label) pairs.
train = []
valid = []
test  = []

for _, row in df_books.iterrows():
  # Decide the label from the Author column rather than from the row
  # position, so the labels stay correct if book_list is reordered or extended.
  label = 1 if row["Author"] == "austen" else 0

  train.append([(passage, label) for passage in row["Train"]])
  valid.append([(passage, label) for passage in row["Valid"]])
  test.append([(passage, label) for passage in row["Test"]])

In [None]:
#flatten the list using list comprehension then shuffle
train_shuffled = [item for sublist in train for item in sublist]
random.shuffle(train_shuffled)

valid_shuffled = [item for sublist in valid for item in sublist]
random.shuffle(valid_shuffled)

test_shuffled = [item for sublist in test for item in sublist]
random.shuffle(test_shuffled)

df_binary_data_train = pd.DataFrame(train_shuffled, columns=['Train Data','Train Label'])
df_binary_data_valid = pd.DataFrame(valid_shuffled, columns=['Valid Data','Valid Label'])
df_binary_data_test  = pd.DataFrame(test_shuffled,  columns=['Test Data' ,'Test Label'])

df_binary_data_train.to_parquet("nltk_corpus_train_data_binary.parquet")
df_binary_data_valid.to_parquet("nltk_corpus_valid_data_binary.parquet")
df_binary_data_test.to_parquet("nltk_corpus_test_data_binary.parquet")

!mv "nltk_corpus_train_data_binary.parquet" "/content/drive/My Drive/w266_Project/ProjectStore/nltk_corpus_train_data_binary.parquet"
!mv "nltk_corpus_valid_data_binary.parquet" "/content/drive/My Drive/w266_Project/ProjectStore/nltk_corpus_valid_data_binary.parquet"
!mv "nltk_corpus_test_data_binary.parquet" "/content/drive/My Drive/w266_Project/ProjectStore/nltk_corpus_test_data_binary.parquet"


In [None]:
# *******************************************
# Prepare data for multiclass classification model
#
# Every passage is paired with the index of its row in df_books, so each
# book (not each author) gets its own class label.
# *******************************************
train, valid, test = [], [], []

for book_label, book_row in df_books.iterrows():
  # One list of (passage, label) pairs per book and per split
  train.append([(passage, book_label) for passage in book_row["Train"]])
  valid.append([(passage, book_label) for passage in book_row["Valid"]])
  test.append([(passage, book_label) for passage in book_row["Test"]])

In [None]:
#flatten the list using list comprehension then shuffle
train_shuffled = [item for sublist in train for item in sublist]
random.shuffle(train_shuffled)

valid_shuffled = [item for sublist in valid for item in sublist]
random.shuffle(valid_shuffled)

test_shuffled = [item for sublist in test for item in sublist]
random.shuffle(test_shuffled)

df_multi_data_train = pd.DataFrame(train_shuffled, columns=['Train Data','Train Label'])
df_multi_data_valid = pd.DataFrame(valid_shuffled, columns=['Valid Data','Valid Label'])
df_multi_data_test  = pd.DataFrame(test_shuffled,  columns=['Test Data' ,'Test Label'])

df_multi_data_train.to_parquet("nltk_corpus_train_data_multi.parquet")
df_multi_data_valid.to_parquet("nltk_corpus_valid_data_multi.parquet")
df_multi_data_test.to_parquet("nltk_corpus_test_data_multi.parquet")

!mv "nltk_corpus_train_data_multi.parquet" "/content/drive/My Drive/w266_Project/ProjectStore/nltk_corpus_train_data_multi.parquet"
!mv "nltk_corpus_valid_data_multi.parquet" "/content/drive/My Drive/w266_Project/ProjectStore/nltk_corpus_valid_data_multi.parquet"
!mv "nltk_corpus_test_data_multi.parquet" "/content/drive/My Drive/w266_Project/ProjectStore/nltk_corpus_test_data_multi.parquet"
