Notebook to develop test data using the NLTK package and Project Gutenberg. The NLTK corpus consists of 18 works of literature including novels, plays, poems, and the King James Bible. The version in this notebook uses a balanced training dataset selected only from the subset of novels from the corpus. The notebook stores the DataFrame objects as parquet format files for retrieval by downstream notebooks.

In [None]:

!pip install pydot --quiet
!pip install nltk --quiet
!pip install pyarrow -quiet



Usage:   
  pip3 install [options] <requirement specifier> [package-index-options] ...
  pip3 install [options] -r <requirements file> [package-index-options] ...
  pip3 install [options] [-e] <vcs project url> ...
  pip3 install [options] [-e] <local project path> ...
  pip3 install [options] <archive url/path> ...

no such option: -u


In [None]:
import numpy as np
import tensorflow as tf
import pandas as pd

import os
import nltk
from nltk.data import find

import matplotlib.pyplot as plt
import random
import re

nltk.download('gutenberg')
from nltk.corpus import gutenberg

nltk.download('punkt')
from nltk.tokenize import sent_tokenize

import pyarrow as pya
import pyarrow.parquet as pq

from google.colab import drive

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Mount Google Drive at /content/drive so the parquet files written later in
# this notebook are stored persistently (the Colab VM disk is not).
# Only works inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Load novels from Project Gutenberg

In [None]:
import requests

In [None]:
# LOAD INDIVIDUAL NOVELS
#
# Plain-text Project Gutenberg downloads, three works per author. The ORDER of
# this list matters: a later cell joins the texts in consecutive groups of
# three to build one combined text per author.
GUTENBERG_BOOK_URLS = [
    # F. Scott Fitzgerald
    'https://www.gutenberg.org/cache/epub/64317/pg64317.txt',  # The Great Gatsby
    'https://www.gutenberg.org/cache/epub/805/pg805.txt',      # This Side of Paradise
    'https://www.gutenberg.org/cache/epub/9830/pg9830.txt',    # The Beautiful and Damned
    # Hemingway
    'https://www.gutenberg.org/cache/epub/67138/pg67138.txt',  # The Sun Also Rises
    'https://www.gutenberg.org/cache/epub/69683/pg69683.txt',  # Men Without Women
    'https://www.gutenberg.org/cache/epub/61085/pg61085.txt',  # In Our Time
    # Dickens
    'https://www.gutenberg.org/cache/epub/98/pg98.txt',        # A Tale of Two Cities
    'https://www.gutenberg.org/cache/epub/1400/pg1400.txt',    # Great Expectations
    'https://www.gutenberg.org/cache/epub/1023/pg1023.txt',    # Bleak House
    # Thomas Hardy
    'https://www.gutenberg.org/cache/epub/143/pg143.txt',      # The Mayor of Casterbridge
    'https://www.gutenberg.org/cache/epub/153/pg153.txt',      # Jude the Obscure
    'https://www.gutenberg.org/cache/epub/122/pg122.txt',      # Return of the Native
    # Jane Austen
    'https://www.gutenberg.org/cache/epub/158/pg158.txt',      # Emma
    'https://www.gutenberg.org/cache/epub/161/pg161.txt',      # Sense and Sensibility
    'https://www.gutenberg.org/cache/epub/1342/pg1342.txt',    # Pride and Prejudice
    # Chesterton
    'https://www.gutenberg.org/cache/epub/223/pg223.txt',      # The Wisdom of Father Brown
    'https://www.gutenberg.org/cache/epub/1695/pg1695.txt',    # The Man Who Was Thursday
    'https://www.gutenberg.org/cache/epub/5265/pg5265.txt',    # The Ball and the Cross
    # Shakespeare
    'https://www.gutenberg.org/cache/epub/1786/pg1786.txt',    # As You Like It
    'https://www.gutenberg.org/cache/epub/2263/pg2263.txt',    # Julius Caesar
    'https://www.gutenberg.org/cache/epub/2265/pg2265.txt',    # Hamlet
]

# Raw text of every book, in the same order as GUTENBERG_BOOK_URLS.
bks_gutenberg = []

for book_url in GUTENBERG_BOOK_URLS:
  # A timeout keeps a stalled connection from hanging the notebook forever.
  r = requests.get(book_url, timeout=60)
  # Fail loudly on 4xx/5xx instead of silently storing an error page as a "novel".
  r.raise_for_status()
  bks_gutenberg.append(r.text)


In [None]:
# UNIT TEST
#bks_gutenberg[6]
#atotc = requests.get(r'https://www.gutenberg.org/cache/epub/98/pg98.txt')


<Response [200]>

In [None]:
# UNIT TEST: strip the Project Gutenberg header and footer from one book
# (index 1) and print what remains.
substr = 'START OF THE PROJECT GUTENBERG EBOOK'
substr2 = 'END OF THE PROJECT GUTENBERG EBOOK'

# Keep what follows the first start marker (the whole text if it is absent).
after_start_marker = bks_gutenberg[1].split(substr, maxsplit=1)
new_str = after_start_marker[-1]

# Keep what precedes the first end marker (the whole text if it is absent).
before_end_marker = new_str.split(substr2, maxsplit=1)
new_str2 = before_end_marker[0]

print(new_str2)

In [None]:
# Utility support function
def remove_new_line_tabs(book):
    """Replace newline, carriage-return and tab characters in ``book`` with spaces.

    The literal two-character sequences backslash-d and backslash-s are replaced
    as well. ``str.replace`` does plain substring matching, so those two entries
    never acted as the regex classes for digits or whitespace; they are kept
    (now as raw strings, which avoids the invalid-escape-sequence warning) so
    the function's behaviour is unchanged.

    Args:
        book: Text to clean.

    Returns:
        A new string with each matched sequence replaced by one space.
        Runs of spaces are not collapsed.
    """
    for char in ("\n", "\r", r"\d", "\t", r"\s"):
        book = book.replace(char, " ")
    return book

In [None]:
# Sanity check: character count of the first downloaded text (raw, with the
# Project Gutenberg header and footer still attached).
len(bks_gutenberg[0])

296579

In [None]:
# Cleaned text of every book, in the same order as bks_gutenberg.
bks_gutenberg_processed = []

# Markers Project Gutenberg puts around the actual book text.
start_of_ebook = 'START OF THE PROJECT GUTENBERG EBOOK'
end_of_ebook   = 'END OF THE PROJECT GUTENBERG EBOOK'

# Clean up header and footer info
for raw_text in bks_gutenberg:
  # Keep what follows the first start marker and precedes the first end marker.
  # If a marker is missing, that side of the text is left untouched.
  # NOTE(review): the remainder of the start-marker line (title and closing
  # "***") stays at the head of the text; strip it here if that matters.
  new_text = raw_text.split(start_of_ebook, 1)[-1]
  new_text = new_text.split(end_of_ebook, 1)[0]

  # Reuse the helper defined above instead of repeating its replace loop (the
  # inline copy also carried the invalid "\d" / "\s" escape sequences).
  new_text = remove_new_line_tabs(new_text)

  bks_gutenberg_processed.append(new_text)

In [None]:
# Combine each author's works: the cleaned texts are stored author by author,
# three books each, so every consecutive run of three becomes one string.
books_per_author = 3
bks_gutenberg_processed_combined = []
for first_book in range(0, len(bks_gutenberg_processed), books_per_author):
  author_books = bks_gutenberg_processed[first_book:first_book + books_per_author]
  bks_gutenberg_processed_combined.append(' '.join(author_books))

In [None]:
from nltk.tokenize import sent_tokenize

# ***************************************************************************
# Add Project Gutenberg titles to book list, tokenize sentences
# ***************************************************************************
# One row per author; the three titles of each author are comma-separated and
# listed in the same order in which the texts were combined.
df_books = pd.DataFrame({

   'Author':  ['fitzgerald',
           'hemingway',
           'dickens',
           'hardy',
           'austen',
           'chesterton',
           'shakespeare'],
   'Short Title': ['gatsby,this side of paradise,beautiful and damned',
                'sun also rises,men without women,in our time',
                'tale,great expectations,bleak house',
                'mayor,jude,native',
                'emma,sense,pride',
                'wisdom brown,thurday,ball',
                'as you like it,caesar,hamlet'],
   'Title': ['The Great Gatsby,This Side of Paradise,The Beautiful and the Damned',
          'The Sun Also Rises,Men Without Women,In Our Time',
          'A Tale of Two Cities,Great Expectations,Bleak House',
          'The Mayor of Casterbridge,Jude the Obscure,Return of the Native',
          'Emma,Sense and Sensibility,Pride and Prejudice',
          'The Wisdom of Father Brown,The Man Who Was Thursday,The Ball and the Cross',
          'As You Like It,Julius Caesar,Hamlet']
})

sens_count = []
bks_gutenberg_sentences = []

for author_text in bks_gutenberg_processed_combined:
  # sent_tokenize returns a flat list of sentence strings, so its length is the
  # sentence count. (The earlier ' '.join(sentence) pass joined the characters
  # of every sentence only to take the same length.)
  sentences = sent_tokenize(author_text)
  sens_count.append(len(sentences))

  # Each author's sentence list is wrapped in a one-element list because the
  # chunking cell reads it back as book[0].
  bks_gutenberg_sentences.append([sentences])

df_books['Sentence Count'] = sens_count

#for book in bks_gutenberg_processed:
#  books.append(book)

In [None]:
# Sanity check: number of sentences tokenized for the first author
# (should match the first 'Sentence Count' value in df_books).
len(bks_gutenberg_sentences[0][0])

16014

In [None]:
##########################################
# OLD OLD OLD
##########################################
# Legacy per-title pipeline: one df_books row per book (21 rows) instead of one
# row per author (7 rows).
#
# It is disabled by default because running it overwrites `df_books` and
# `bks_gutenberg_processed`, which breaks a top-to-bottom run: the later
# `df_books["Sentence Groups"] = book_groups` assignment then tries to put 7
# author groups into a 21-row frame. Set the flag to True only to reproduce the
# old per-title statistics, and re-run the per-author cells afterwards.
RUN_LEGACY_PER_TITLE_CELL = False

if RUN_LEGACY_PER_TITLE_CELL:
  from nltk.tokenize import sent_tokenize

  # ***************************************************************************
  # Add Project Gutenberg titles to book list
  # ***************************************************************************
  bks_gutenberg_processed = []
  sens_count = []

  df_books = pd.DataFrame({

     'Author':  ['fitzgerald','fitzgerald','fitzgerald',
             'hemingway','hemingway','hemingway',
             'dickens','dickens','dickens',
             'hardy','hardy','hardy',
             'austen','austen','austen',
             'chesterton','chesterton','chesterton',
             'shakespeare','shakespeare','shakespeare'],
     'Short Title': ['gatsby','this side of paradise','beautiful and damned',
                  'sun also rises','men without women','in our time',
                  'tale','great expectations','bleak house',
                  'mayor','jude','native',
                  'emma','sense','pride',
                  'wisdom brown','thurday','ball',
                  'as you like it','caesar','hamlet'],
     'Title': ['The Great Gatsby','This Side of Paradise','The Beautiful and the Damned',
            'The Sun Also Rises','Men Without Women','In Our Time',
            'A Tale of Two Cities','Great Expectations','Bleak House',
            'The Mayor of Casterbridge','Jude the Obscure','Return of the Native',
            'Emma','Sense and Sensibility','Pride and Prejudice',
            'The Wisdom of Father Brown','The Man Who Was Thursday','The Ball and the Cross',
            'As You Like It','Julius Caesar','Hamlet']
  })

  # Process data and get sentence counts
  start_of_ebook = 'START OF THE PROJECT GUTENBERG EBOOK'
  end_of_ebook   = 'END OF THE PROJECT GUTENBERG EBOOK'

  for indx, row in df_books.iterrows():
    # Clean up header and footer info
    new_text = bks_gutenberg[indx].split(start_of_ebook,1)[-1]
    new_text = new_text.split(end_of_ebook,1)[0]

    # Raw strings keep the literal backslash-d / backslash-s replacements of
    # the original without the invalid-escape-sequence warning.
    for char in ("\n", "\r", r"\d", "\t", r"\s"):
      new_text = new_text.replace(char, " ")

    # sent_tokenize returns a flat list of sentence strings; its length is the
    # sentence count.
    sentences = sent_tokenize(new_text)
    sens_count.append(len(sentences))

    # Wrapped in a one-element list, matching the per-author structure.
    bks_gutenberg_processed.append([sentences])

  df_books['Sentence Count'] = sens_count

#for book in bks_gutenberg_processed:
#  books.append(book)



# Process Data

In [None]:
#***********************************************
# Build passages of chunk_size consecutive sentences
#***********************************************
chunk_size = 3
book_groups = []

for wrapped_sentences in bks_gutenberg_sentences:
  # Every entry is a one-element list holding the author's sentence list.
  author_sentences = wrapped_sentences[0]

  # Step through the sentences chunk_size at a time; the last passage may be
  # shorter when the count is not a multiple of chunk_size.
  book_groups.append([
      " ".join(author_sentences[start:start + chunk_size])
      for start in range(0, len(author_sentences), chunk_size)
  ])

In [None]:
# Attach each author's passages to the frame and record how many there are
df_books["Sentence Groups"] = book_groups
df_books["Group Counts"] = df_books["Sentence Groups"].apply(len)
df_books

Unnamed: 0,Author,Short Title,Title,Sentence Count,Sentence Groups,Group Counts
0,fitzgerald,"gatsby,this side of paradise,beautiful and damned","The Great Gatsby,This Side of Paradise,The Bea...",16014,[ THE GREAT GATSBY *** The Great Gats...,5338
1,hemingway,"sun also rises,men without women,in our time","The Sun Also Rises,Men Without Women,In Our Time",9221,[ THE SUN ALSO RISES *** ...,3074
2,dickens,"tale,great expectations,bleak house","A Tale of Two Cities,Great Expectations,Bleak ...",28308,[ A TALE OF TWO CITIES *** A TALE OF TW...,9436
3,hardy,"mayor,jude,native","The Mayor of Casterbridge,Jude the Obscure,Ret...",18374,[ THE MAYOR OF CASTERBRIDGE *** cover ...,6125
4,austen,"emma,sense,pride","Emma,Sense and Sensibility,Pride and Prejudice",14667,[ EMMA *** Emma by Jane Austen ...,4889
5,chesterton,"wisdom brown,thurday,ball","The Wisdom of Father Brown,The Man Who Was Thu...",10134,[ THE WISDOM OF FATHER BROWN *** Produc...,3378
6,shakespeare,"as you like it,caesar,hamlet","As You Like It,Julius Caesar,Hamlet",6472,[ AS YOU LIKE IT *** ********************...,2158


In [None]:
#df_books['Sentence Groups'] = df_books['Sentence Groups'].apply(', '.join)
#df_books_combined = df_books.groupby(df_books.index // 3).agg({
#    'Author': 'first',
#    'Title': lambda x: ', '.join(x),
#    'Sentence Count': 'sum',
#    'Group Counts': 'sum',
#    'Sentence Groups': 'first'
#})

#df_books_combined

In [None]:
# ****************************************************************
# PREPARE DATAFRAME
#
# For every author, shuffle a copy of the passages (sentence groups) as
# units, then take the first slice as Training, the next slice as Validation
# and everything that remains as Testing. Because the passages are shuffled
# first, taking the slices sequentially is still a random split.
# ***************************************************************************

# Select Train, Valid, Test split
train_split = 0.7
valid_split = 0.1
test_split  = 0.2
assert abs(train_split + valid_split + test_split - 1.0) < 1e-9, \
    "train/valid/test fractions must sum to 1"

# A dedicated, seeded generator makes the split identical on every run, so
# regenerated parquet files always hold the same partition. The global
# `random` state is left alone.
SPLIT_SEED = 42
split_rng = random.Random(SPLIT_SEED)

# One list per author, collected in df_books row order
data_train = []
data_valid = []
data_test  = []

for group in book_groups:
  # group holds every passage of one author
  n = len(group)

  train_split_index = int(n*train_split)
  valid_split_index = int(n*valid_split)
  valid_end_index   = train_split_index + valid_split_index

  # Shuffle a copy so the original passage order in book_groups is preserved
  temp_group = group.copy()
  split_rng.shuffle(temp_group)

  train_group = temp_group[:train_split_index]
  valid_group = temp_group[train_split_index:valid_end_index]
  # Test receives the remainder, so the int() rounding of the other two
  # slices never drops a passage (it can be slightly more than test_split).
  test_group  = temp_group[valid_end_index:]

  # The three slices must partition the author's passages exactly
  assert len(train_group) + len(valid_group) + len(test_group) == n

  data_train.append(train_group)
  data_valid.append(valid_group)
  data_test.append(test_group)

df_books["Train"] = data_train
df_books["Valid"] = data_valid
df_books["Test"]  = data_test
df_books

Unnamed: 0,Author,Short Title,Title,Sentence Count,Sentence Groups,Group Counts,Train,Valid,Test
0,fitzgerald,"gatsby,this side of paradise,beautiful and damned","The Great Gatsby,This Side of Paradise,The Bea...",16014,[ THE GREAT GATSBY *** The Great Gats...,5338,[Under the glass portcullis of a theatre Amory...,"[“What?” Confused, he stared at us as we laugh...","[The house, its furnishings, the manner in wh..."
1,hemingway,"sun also rises,men without women,in our time","The Sun Also Rises,Men Without Women,In Our Time",9221,[ THE SUN ALSO RISES *** ...,3074,"[I figured that all out once, and for six mont...","[“Have another port?” “All right,” said Har...","[He side-stepped, swung the cape in back of hi..."
2,dickens,"tale,great expectations,bleak house","A Tale of Two Cities,Great Expectations,Bleak ...",28308,[ A TALE OF TWO CITIES *** A TALE OF TW...,9436,"[I have stated to the magistrates, ‘Gentlemen...",[I don’t understand these places.” Turning ...,"[Being quite alone, I cried a little again, t..."
3,hardy,"mayor,jude,native","The Mayor of Casterbridge,Jude the Obscure,Ret...",18374,[ THE MAYOR OF CASTERBRIDGE *** cover ...,6125,"[This was Susan herself, occupied in preparin...","[They came in from the country, and the steam...",[The solitary exception was an empty envelope...
4,austen,"emma,sense,pride","Emma,Sense and Sensibility,Pride and Prejudice",14667,[ EMMA *** Emma by Jane Austen ...,4889,[CHAPTER V “I do not know what your opini...,"[Poor Perry is bilious, and he has not time t...",[In hastily forming and giving his opinion of ...
5,chesterton,"wisdom brown,thurday,ball","The Wisdom of Father Brown,The Man Who Was Thu...",10134,[ THE WISDOM OF FATHER BROWN *** Produc...,3378,[But the girl had only brushed Evan's hand wi...,"[And Cray lay in a deck-chair, gasping as for...",[Even at the start I thought he was a bit too...
6,shakespeare,"as you like it,caesar,hamlet","As You Like It,Julius Caesar,Hamlet",6472,[ AS YOU LIKE IT *** ********************...,2158,"[Alas, how is't with you? That you bend your e...","[Guild. O my Lord, if my Dutie be too bold, my...",[Priest. No more be done: We should prophane ...


In [None]:
# Write the per-author frame to a local parquet file first ...
df_books.to_parquet("gutenberg_corpus_df_3chunk.parquet")
# (previous Drive destination, kept for reference)
#!mv "nltk_corpus_df_chunks.parquet" "/content/drive/My Drive/w266_Project/ProjectStore/gutenberg_corpus_df_3chunk.parquet"
# ... then move it into the mounted Drive data folder (requires the Drive mount above).
!mv "gutenberg_corpus_df_3chunk.parquet" "/content/drive/My Drive/w266/data/gutenberg_corpus_df_3chunk.parquet"

In [None]:
# Test parquet file retrieval: read back the file stored by the previous cell.
# (This used to point at a stale path from an earlier version of the notebook,
# .../w266_Project/ProjectStore/nltk_corpus_df_chunks.parquet, which this
# notebook never writes.)
# NOTE: list arrays before store get converted to numpy.ndarrays after recalling from Drive
table = pq.read_table("/content/drive/My Drive/w266/data/gutenberg_corpus_df_3chunk.parquet")
df = table.to_pandas()
df

# PREPARE BINARY CLASS DATASETS

In [None]:
# Prepare data for binary classification model
def create_bin_data(df, index, seed=None):
  """Build shuffled one-vs-rest train/valid/test frames for one row of ``df``.

  Every passage of the row whose index label equals ``index`` gets label 1;
  the passages of all other rows get label 0. Each split is flattened across
  rows and shuffled before it is put into a DataFrame.

  Args:
    df: DataFrame with list-like "Train", "Valid" and "Test" columns (one
      sequence of passages per row).
    index: Index label of the row to treat as the positive class.
    seed: Optional seed for the shuffles. With the default ``None`` the
      module-level ``random`` state is used, exactly as before.

  Returns:
    Tuple ``(train_df, valid_df, test_df)`` with columns
    ``['Train Data', 'Train Label']``, ``['Valid Data', 'Valid Label']`` and
    ``['Test Data', 'Test Label']`` respectively.

  Raises:
    ValueError: If ``index`` is not a label of ``df.index``; the result would
      otherwise silently contain no positive examples.
  """
  if index not in df.index:
    raise ValueError(f"index {index!r} is not a row label of df")

  # `random` (the module) and a random.Random instance both expose shuffle().
  rng = random if seed is None else random.Random(seed)

  split_frames = []
  # Splits are handled (and shuffled) in the order train, valid, test.
  for split_name in ("Train", "Valid", "Test"):
    labelled_passages = [
        (passage, 1 if row_label == index else 0)
        for row_label, passages in df[split_name].items()
        for passage in passages
    ]
    rng.shuffle(labelled_passages)
    split_frames.append(
        pd.DataFrame(labelled_passages,
                     columns=[f"{split_name} Data", f"{split_name} Label"]))

  return tuple(split_frames)


In [None]:
df_binary_data_train, df_binary_data_valid, df_binary_data_test = create_bin_data(df_books,0)
df_binary_data_train.to_parquet("gut_corpus_train_data_binary0.parquet")
df_binary_data_valid.to_parquet("gut_corpus_valid_data_binary0.parquet")
df_binary_data_test.to_parquet("gut_corpus_test_data_binary0.parquet")
!mv "gut_corpus_train_data_binary0.parquet" "/content/drive/My Drive/w266/data/gut_corpus_train_data_binary0.parquet"
!mv "gut_corpus_valid_data_binary0.parquet" "/content/drive/My Drive/w266/data/gut_corpus_valid_data_binary0.parquet"
!mv "gut_corpus_test_data_binary0.parquet"  "/content/drive/My Drive/w266/data/gut_corpus_test_data_binary0.parquet"

df_binary_data_train, df_binary_data_valid, df_binary_data_test = create_bin_data(df_books,1)
df_binary_data_train.to_parquet("gut_corpus_train_data_binary1.parquet")
df_binary_data_valid.to_parquet("gut_corpus_valid_data_binary1.parquet")
df_binary_data_test.to_parquet("gut_corpus_test_data_binary1.parquet")
!mv "gut_corpus_train_data_binary1.parquet" "/content/drive/My Drive/w266/data/gut_corpus_train_data_binary1.parquet"
!mv "gut_corpus_valid_data_binary1.parquet" "/content/drive/My Drive/w266/data/gut_corpus_valid_data_binary1.parquet"
!mv "gut_corpus_test_data_binary1.parquet"  "/content/drive/My Drive/w266/data/gut_corpus_test_data_binary1.parquet"

df_binary_data_train, df_binary_data_valid, df_binary_data_test = create_bin_data(df_books,2)
df_binary_data_train.to_parquet("gut_corpus_train_data_binary2.parquet")
df_binary_data_valid.to_parquet("gut_corpus_valid_data_binary2.parquet")
df_binary_data_test.to_parquet("gut_corpus_test_data_binary2.parquet")
!mv "gut_corpus_train_data_binary2.parquet" "/content/drive/My Drive/w266/data/gut_corpus_train_data_binary2.parquet"
!mv "gut_corpus_valid_data_binary2.parquet" "/content/drive/My Drive/w266/data/gut_corpus_valid_data_binary2.parquet"
!mv "gut_corpus_test_data_binary2.parquet"  "/content/drive/My Drive/w266/data/gut_corpus_test_data_binary2.parquet"

df_binary_data_train, df_binary_data_valid, df_binary_data_test = create_bin_data(df_books,3)
df_binary_data_train.to_parquet("gut_corpus_train_data_binary3.parquet")
df_binary_data_valid.to_parquet("gut_corpus_valid_data_binary3.parquet")
df_binary_data_test.to_parquet("gut_corpus_test_data_binary3.parquet")
!mv "gut_corpus_train_data_binary3.parquet" "/content/drive/My Drive/w266/data/gut_corpus_train_data_binary3.parquet"
!mv "gut_corpus_valid_data_binary3.parquet" "/content/drive/My Drive/w266/data/gut_corpus_valid_data_binary3.parquet"
!mv "gut_corpus_test_data_binary3.parquet"  "/content/drive/My Drive/w266/data/gut_corpus_test_data_binary3.parquet"

df_binary_data_train, df_binary_data_valid, df_binary_data_test = create_bin_data(df_books,4)
df_binary_data_train.to_parquet("gut_corpus_train_data_binary4.parquet")
df_binary_data_valid.to_parquet("gut_corpus_valid_data_binary4.parquet")
df_binary_data_test.to_parquet("gut_corpus_test_data_binary4.parquet")
!mv "gut_corpus_train_data_binary4.parquet" "/content/drive/My Drive/w266/data/gut_corpus_train_data_binary4.parquet"
!mv "gut_corpus_valid_data_binary4.parquet" "/content/drive/My Drive/w266/data/gut_corpus_valid_data_binary4.parquet"
!mv "gut_corpus_test_data_binary4.parquet"  "/content/drive/My Drive/w266/data/gut_corpus_test_data_binary4.parquet"

df_binary_data_train, df_binary_data_valid, df_binary_data_test = create_bin_data(df_books,5)
df_binary_data_train.to_parquet("gut_corpus_train_data_binary5.parquet")
df_binary_data_valid.to_parquet("gut_corpus_valid_data_binary5.parquet")
df_binary_data_test.to_parquet("gut_corpus_test_data_binary5.parquet")
!mv "gut_corpus_train_data_binary5.parquet" "/content/drive/My Drive/w266/data/gut_corpus_train_data_binary5.parquet"
!mv "gut_corpus_valid_data_binary5.parquet" "/content/drive/My Drive/w266/data/gut_corpus_valid_data_binary5.parquet"
!mv "gut_corpus_test_data_binary5.parquet"  "/content/drive/My Drive/w266/data/gut_corpus_test_data_binary5.parquet"

df_binary_data_train, df_binary_data_valid, df_binary_data_test = create_bin_data(df_books,6)
df_binary_data_train.to_parquet("gut_corpus_train_data_binary6.parquet")
df_binary_data_valid.to_parquet("gut_corpus_valid_data_binary6.parquet")
df_binary_data_test.to_parquet("gut_corpus_test_data_binary6.parquet")
!mv "gut_corpus_train_data_binary6.parquet" "/content/drive/My Drive/w266/data/gut_corpus_train_data_binary6.parquet"
!mv "gut_corpus_valid_data_binary6.parquet" "/content/drive/My Drive/w266/data/gut_corpus_valid_data_binary6.parquet"
!mv "gut_corpus_test_data_binary6.parquet"  "/content/drive/My Drive/w266/data/gut_corpus_test_data_binary6.parquet"

In [None]:

#df_binary_data_test.to_parquet(model_filename)
#!mv $model_filename "/content/drive/My Drive/w266/"

# Spot-check one (passage, label) pair from the most recently built binary
# test set. This used to read `test_shuffled[15]`, but `test_shuffled` only
# exists at module level after the multiclass cell further down, so the cell
# raised NameError on a fresh Restart & Run All.
tuple(df_binary_data_test.iloc[15])


('Both of which,” said Joe, quite charmed  with his logical arrangement, “being done, now this to you a true  friend, say. Namely. You mustn’t go a overdoing on it, but you must  have your supper and your wine and water, and you must be put betwixt  the sheets.”    The delicacy with which Joe dismissed this theme, and the sweet tact  and kindness with which Biddy—who with her woman’s wit had found me out  so soon—had prepared him for it, made a deep impression on my mind.',
 2)

# PREPARE MULTICLASS DATASETS

In [None]:
# ********************************************************
# Prepare data for multiclass classification model
#
# Every passage is labelled with the index of its df_books row, i.e. one
# integer class per author.
# ********************************************************
train, valid, test = [], [], []

for author_label, author_row in df_books.iterrows():
  # Pair each passage of this author with the author's label, split by split
  for per_author_lists, split_name in ((train, "Train"), (valid, "Valid"), (test, "Test")):
    per_author_lists.append(
        [(passage, author_label) for passage in author_row[split_name]])

# Flatten the per-author lists into one list per split ...
train_shuffled = [pair for author_pairs in train for pair in author_pairs]
valid_shuffled = [pair for author_pairs in valid for pair in author_pairs]
test_shuffled  = [pair for author_pairs in test for pair in author_pairs]

# ... and shuffle each split so the authors are interleaved
random.shuffle(train_shuffled)
random.shuffle(valid_shuffled)
random.shuffle(test_shuffled)

df_multi_data_train = pd.DataFrame(train_shuffled, columns=['Train Data','Train Label'])
df_multi_data_valid = pd.DataFrame(valid_shuffled, columns=['Valid Data','Valid Label'])
df_multi_data_test  = pd.DataFrame(test_shuffled,  columns=['Test Data' ,'Test Label'])


In [None]:
# Save to Drive: write the three multiclass frames to local parquet files ...
df_multi_data_train.to_parquet("gut_corpus_train_data_multi.parquet")
df_multi_data_valid.to_parquet("gut_corpus_valid_data_multi.parquet")
df_multi_data_test.to_parquet("gut_corpus_test_data_multi.parquet")

# (previous Drive destination, kept for reference)
#!mv "gut_corpus_train_data_multi.parquet" "/content/drive/My Drive/w266/gut_corpus_train_data_multi.parquet"
#!mv "gut_corpus_valid_data_multi.parquet" "/content/drive/My Drive/w266/gut_corpus_valid_data_multi.parquet"
#!mv "gut_corpus_test_data_multi.parquet" "/content/drive/My Drive/w266/gut_corpus_test_data_multi.parquet"
# ... then move them into the mounted Drive data folder.
!mv "gut_corpus_train_data_multi.parquet" "/content/drive/My Drive/w266/data/gut_corpus_train_data_multi.parquet"
!mv "gut_corpus_valid_data_multi.parquet" "/content/drive/My Drive/w266/data/gut_corpus_valid_data_multi.parquet"
!mv "gut_corpus_test_data_multi.parquet" "/content/drive/My Drive/w266/data/gut_corpus_test_data_multi.parquet"


# PREPARE MULTICLASS BALANCED DATASETS
## requires portions of previous sections

In [None]:
# Build class-balanced training and validation sets from the shuffled
# multiclass (passage, label) pairs:
#   1. sort the pairs by label (the sort is stable, so passages of one author
#      keep their shuffled order),
#   2. keep at most a fixed number of passages per label, derived from the
#      author with the fewest passages,
#   3. reshuffle so the labels are interleaved again.
# Test data is not balanced and keeps its full size.

# Per-label cap: sample_factor of the smallest author's share of each split
sample_factor = 0.2
smallest_group_count = np.min(df_books['Group Counts'])
MIN_GROUP_COUNT_TRAIN = int(sample_factor * smallest_group_count * train_split)
MIN_GROUP_COUNT_VALID = int(sample_factor * smallest_group_count * valid_split)
NUM_OF_LABELS = 7

# Sort by label
sorted_train = sorted(train_shuffled, key=lambda pair: pair[1])
sorted_valid = sorted(valid_shuffled, key=lambda pair: pair[1])

train_balanced = []
valid_balanced = []

for label in range(NUM_OF_LABELS):
  train_for_label = [pair for pair in sorted_train if pair[1] == label]
  valid_for_label = [pair for pair in sorted_valid if pair[1] == label]
  train_balanced += train_for_label[:MIN_GROUP_COUNT_TRAIN]
  valid_balanced += valid_for_label[:MIN_GROUP_COUNT_VALID]

# shuffle labels
random.shuffle(train_balanced)
random.shuffle(valid_balanced)

In [None]:
# Per-label cap on the number of balanced training passages
MIN_GROUP_COUNT_TRAIN

302

In [None]:
# Rebuild the multiclass frames from the balanced train/valid pairs.
# NOTE: this reuses (overwrites) the df_multi_data_* names of the unbalanced
# frames; the test frame is built from the same full test_shuffled list as before.
df_multi_data_train = pd.DataFrame(train_balanced, columns=['Train Data','Train Label'])
df_multi_data_valid = pd.DataFrame(valid_balanced, columns=['Valid Data','Valid Label'])
df_multi_data_test  = pd.DataFrame(test_shuffled,  columns=['Test Data' ,'Test Label'])

# Write the frames to local parquet files ...
df_multi_data_train.to_parquet("gut_corpus_train_data_multi_bal.parquet")
df_multi_data_valid.to_parquet("gut_corpus_valid_data_multi_bal.parquet")
df_multi_data_test.to_parquet("gut_corpus_test_data_multi_bal.parquet")

# ... then move them into the mounted Drive data folder.
!mv "gut_corpus_train_data_multi_bal.parquet" "/content/drive/My Drive/w266/data/gut_corpus_train_data_multi_bal.parquet"
!mv "gut_corpus_valid_data_multi_bal.parquet" "/content/drive/My Drive/w266/data/gut_corpus_valid_data_multi_bal.parquet"
!mv "gut_corpus_test_data_multi_bal.parquet" "/content/drive/My Drive/w266/data/gut_corpus_test_data_multi_bal.parquet"