# Imports

In [1]:
import os

import numpy as np
import pandas as pd

from collections import Counter
from sklearn.neighbors import KernelDensity
from scipy.stats import iqr

# Natural Language Processing
import nltk as nl
from nltk.corpus import stopwords
import regex as re
import spacy
import en_core_web_sm
from spacy.parts_of_speech import PROPN

# Charts
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
folder = 'source'

# Functions

In [3]:
def load_text(file):
    '''
    Opens file by name, reads text into a string and 
    then closes that file. Returns a string of full text.
    
    File must be in .txt format.
    '''
    
    path = './{}/{}'.format(folder, file) # Folder set in imports
    fragments = ['\n', '\ufeff']
    
    s = open(path, 'r').read().strip()
    
    for item in fragments:
        s = s.replace(item, ' ')
    
    return s

In [4]:
def update_author_dict(dictionary, file):
    '''
    Update the dictionary with passed file. Fills the author, 
    title, and text of dictionary.
    '''

    # Split the title from the filepath at the hyphen
    author = file.split('-')[0]
    # Split the author name from the filepath after the hyphen
    title = file.split('-')[1]
    # Remove the .txt from the author name
    title = title.split('.')[0]
    # Call the load text function on the given file
    text = load_text(file)
    
    # Enter text from file with respective author and title
    if author in dictionary:
        dictionary[author].update({title:text})
    else:
        dictionary.update({author: {title: text}})

In [5]:
def mask_proper_nouns(s):
    '''
    Finds proper nouns in row, masks the nouns with nnnn
    then returns the rows
    '''
    
    nlp = spacy.load('en_core_web_sm')
    s = nlp(s)
    
    tokens = []
    
    for token in s:
        if token.pos_ == 'PROPN':
            tokens.append('nnnn')
        else:
            tokens.append(token.text)
    
    return ' '.join(tokens)

In [6]:
def clean_text(s):
    '''
    Convert special quotes and apostrophes to basic quotes and apostrophes,
    select only text and symbols specified by the regex below, and finally
    replace any quotes with mask strings of 'qqqq'
    '''
    s = s.replace('”', '"')
    s = s.replace('“', '"')
    s = s.replace('’', "'")
#     s = re.sub("\".*?\"", ' ', s)
#     s = re.sub("[^A-z _.,;:\\p{Pd}!'\"]", ' ', s) #\\p{Pd} should pick up all types of dash chars
    
    return s

In [7]:
def char_only(row):
    '''
    Uses regex to keep only characters in given text
    '''
    return re.sub("[^A-z]", ' ', row).lower()

In [8]:
def get_sentence_lengths(author):
    lengths = []
    
    [lengths.append(len(row.split())) for row in data[data['author'] == author]['sentence']]
    
    return lengths

In [9]:
def get_kde_output(X_plot, series, bandwidth):
    '''
    Creates plottable y-axis from KDE
    '''
    X = np.array(series).reshape(-1, 1)
    kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth).fit(X)
    log_dens = kde.score_samples(X_plot)
    
    return np.exp(log_dens)

In [10]:
def word_count(word_list, author, target):
    '''
    
    '''
    word_count = 0
    
    for word in word_list:
        if word[1] == author:
            if word[0] == target:
                word_count += 1
            
    return word_count

In [11]:
def get_noun_sequence(df, author):
    target = 'nnnn'
    
    sequence = []
    
    for sentence in df[df['author']==author]['sentence']:
        for word in sentence.split():
            if word == target:
                sequence.append(1)
            else:
                sequence.append(0)
            
    return sequence

# Read-in Data
Read .txt files into strings for each novel

In [12]:
author_dict = {}

path = f'./{folder}'
files = [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))]

for file in files[1:]:
    update_author_dict(author_dict, file)

# Processing

NLTK calls the PUNKT unsupervised model for splitting a string into sentences. This output is saved as *_sentences that will be labeled and later, further tokenized.

This process is done on every title for every author in the author_dict.

In [13]:
for author in author_dict:
    for title in author_dict[author]:
        author_dict[author][title] = clean_text(author_dict[author][title])
        author_dict[author][title] = mask_proper_nouns(author_dict[author][title])
        author_dict[author][title] = nl.sent_tokenize(author_dict[author][title])

Create DataFrame, data, that will hold set of random sentences for model training.

In [25]:
# This parameter controls the minimum character count to include a sentence
min_sentence_length = 60
# Instantiate DataFrame to hold set of sentences to train model on
data = pd.DataFrame()

# Loop through all authors and their works, select random sentences
# Add these random sentences to data
for author in author_dict: 
    for title in author_dict[author]:
        # Create temp DataFrame work with before adding to data
        temp = pd.DataFrame(author_dict[author][title], columns=['sentence'])
        temp['author'] = author
        temp['title'] = title
        
        # Only keep sentences equal to or longer than the specified length
        temp = temp[temp['sentence'].str.len() > min_sentence_length].copy()
        
        # Split sentences over specified length
        
        
        # Concatenate the temp df onto the primary DataFrame
        data = pd.concat([data, temp], axis=0, ignore_index=True)
        
# Force all sentences to be lowercase
data['sentence'] = data['sentence'].map(char_only)

# Find the number of rows in the limiting class
row_limit = min(data['author'].value_counts())

# Balance classes
drop_indices = []

for author in data['author'].unique():
    # Count of rows in current author class
    class_count = len(data[data['author'] == author].values)
    # Calculate rows to drop as the difference between current class and limiting class
    drop_count = class_count - row_limit
    drop_indices.extend(np.random.choice(data[data['author'] == author].index, 
                                         size=drop_count, 
                                         replace=False))

# Drop the randomly chosen indices to balance classes
data.drop(data.index[drop_indices], inplace=True)

# Sanity check value vount
data['author'].value_counts()

woolf        6348
nabokov      6348
vonnegut     6348
hemingway    6348
Name: author, dtype: int64

# Exploratory Data Analysis

In [15]:
data.head()

Unnamed: 0,sentence,author,title
0,In the late summer of that year we lived in a ...,hemingway,a farewell to arms
1,In the bed of the river there were pebbles and...,hemingway,a farewell to arms
2,Troops went by the house and down the road and...,hemingway,a farewell to arms
3,The trunks of the trees too were dusty and the...,hemingway,a farewell to arms
4,The plain was rich with crops ; there were man...,hemingway,a farewell to arms


In [18]:
data.shape[0]

78212

#### Create word list for use in EDA

In [19]:
# Words are stored with their authors in a tuple
word_list = []
for index, row in enumerate(data['sentence']):
    
    for word in row.split():
        entry = (word, data['author'][index])
    
        word_list.append(entry)

#### Create author list

In [None]:
# List of authors to be used in EDA loops
authors = ['hemingway', 'nabokov', 'vonnegut', 'woolf']

#### Sentence length analysis

In [20]:
hmn_sentence_lengths = get_sentence_lengths('hemingway')
nbk_sentence_lengths = get_sentence_lengths('nabokov')
vng_sentence_lengths = get_sentence_lengths('vonnegut')
wlf_sentence_lengths = get_sentence_lengths('woolf')

In [21]:
# Max needed was 256 from virginia woolf
X_sent = np.array(np.linspace(0, 256, 1000)).reshape(-1, 1)

In [53]:
# Used to create KDE for sentence length distribution
y_plot = get_kde_output(X_sent, wlf_sentence_lengths, 5.0)

#### Color analysis

In [126]:
color_list = ['white', 'black', 'red', 'blue', 'green', 'yellow', 'purple', 'orange']

for author in authors:
    for color in color_list:
        print(f'{author} {color} {word_count(word_list, author, color)}')

hemingway white 143
hemingway black 50
hemingway red 57
hemingway blue 54
hemingway green 50
hemingway yellow 44
hemingway purple 21
hemingway orange 3
nabokov white 306
nabokov black 460
nabokov red 240
nabokov blue 266
nabokov green 191
nabokov yellow 65
nabokov purple 41
nabokov orange 37
vonnegut white 109
vonnegut black 60
vonnegut red 44
vonnegut blue 78
vonnegut green 45
vonnegut yellow 45
vonnegut purple 6
vonnegut orange 19
woolf white 283
woolf black 129
woolf red 193
woolf blue 148
woolf green 191
woolf yellow 135
woolf purple 56
woolf orange 16


#### Word length analysis

In [206]:
hmn_word_lengths = []

for word in word_list:
    if word[1] == 'hemingway':
        hmn_word_lengths.append(len(word[0]))

In [207]:
nbk_word_lengths = []

for word in word_list:
    if word[1] == 'nabokov':
        nbk_word_lengths.append(len(word[0]))

In [208]:
vng_word_lengths = []

for word in word_list:
    if word[1] == 'vonnegut':
        vng_word_lengths.append(len(word[0]))

In [209]:
wlf_word_lengths = []

for word in word_list:
    if word[1] == 'woolf':
        wlf_word_lengths.append(len(word[0]))

In [173]:
# Bounded by 0 to the largest word at 18 chars
X_word = np.array(np.linspace(0, 18, 1000)).reshape(-1, 1)

In [216]:
# Used to create KDE for word length distribution
y_plot = get_kde_output(X_word, wlf_word_lengths, 5.0)

#### Getting counts of nouns

In [23]:
for author in authors:
    nouns = []
    
    for row in data[data['author'] == author]['sentence']:
        for word in row.split():
            if word == 'nnnn':
                nouns.append(word)
                
    print(f'{author} total nouns: {len(nouns)}')

hemingway total nouns: 8326
nabokov total nouns: 22413
vonnegut total nouns: 16109
woolf total nouns: 11812


In [24]:
# Divide by authors word count for rate
8326/317265

0.026243046034072463

#### Get totals for scaling charts

In [213]:
for author in authors:
    total_sents = []

    for row in data[data['author'] == author]['sentence']:
        total_sents.append(row)

    print(f'{author} total sentences: {len(total_sents)}')

hemingway total sentences: 26661
nabokov total sentences: 17504
vonnegut total sentences: 18172
woolf total sentences: 15560


In [68]:
for author in authors:
    total_words = []

    for row in data[data['author'] == author]['sentence']:
        for word in row.split():
            total_words.append(word)

    print(f'{author} total words: {len(total_words)}')

hemingway total words: 317265
nabokov total words: 506732
vonnegut total words: 274263
woolf total words: 332285


# Output
Save the DataFrame to a .csv file to be read by the modeling notebook

In [2]:
# file = './REPLACE.csv'

# data.to_csv(file)