In [1]:
# setting the random seed for reproducibility
import random
random.seed(493)

# for manipulating dataframes
import pandas as pd
import numpy as np

# natural language processing: n-gram ranking
import re
import unicodedata
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords

# to print out all the outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dd\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Dd\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
lines = open('../data/in/emily_dickinson_poems_only.txt', 'r')

line_list = []
for line in lines:
    line_list.append(line)

In [3]:
line_list[:10]

['I.\n',
 '\n',
 'SUCCESS.\n',
 '\n',
 'Success is counted sweetest\n',
 "By those who ne'er succeed.\n",
 'To comprehend a nectar\n',
 'Requires sorest need.\n',
 '\n',
 'Not one of all the purple host\n']

In [4]:
lines = open('../data/in/emily_dickinson_poems_only.txt', 'r')

line_list = []
for line in lines:
    line_list.append(line.strip('\n'))

In [5]:
line_list[:10]

['I.',
 '',
 'SUCCESS.',
 '',
 'Success is counted sweetest',
 "By those who ne'er succeed.",
 'To comprehend a nectar',
 'Requires sorest need.',
 '',
 'Not one of all the purple host']

In [18]:
def remove_roman(regex, text):
    clean_line = text
    clean_line = re.sub(regex, '', clean_line)
    return clean_line

In [19]:
clean_lines = []
for line in line_list:
    clean_lines.append(remove_roman('^(?=[MDCLXVI])M*(C[MD]|D?C{0,3})(X[CL]|L?X{0,3})(I[XV]|V?I{0,3}).$', line))

In [20]:
clean_lines

['',
 '',
 'SUCCESS.',
 '',
 'Success is counted sweetest',
 "By those who ne'er succeed.",
 'To comprehend a nectar',
 'Requires sorest need.',
 '',
 'Not one of all the purple host',
 'Who took the flag to-day',
 'Can tell the definition,',
 'So clear, of victory,',
 '',
 'As he, defeated, dying,',
 'On whose forbidden ear',
 'The distant strains of triumph',
 'Break, agonized and clear!',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 'Our share of night to bear,',
 'Our share of morning,',
 'Our blank in bliss to fill,',
 'Our blank in scorning.',
 '',
 'Here a star, and there a star,',
 'Some lose their way.',
 'Here a mist, and there a mist,',
 'Afterwards â€” day!',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 'ROUGE ET NOIR.',
 '',
 'Soul, wilt thou toss again?',
 'By just such a hazard',
 'Hundreds have lost, indeed,',
 'But tens have won an all.',
 '',
 "Angels' breathless ballot",
 'Lingers to record thee;',
 'Imps in eager caucus',
 'Raffle for my soul.',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 'R

In [21]:
ADDITIONAL_STOPWORDS = ['â€”']

def clean(sentence):
    """
    Takes a string and returns a list of cleaned words. All the words that
    are not designated as a stop word is then lemmatized after
    encoding and basic regex parsing are performed.

            Parameters:
                    sentence (str): A list of words

            Returns:
                    word_list (list): A list of cleaned words
    """
    wnl = nltk.stem.WordNetLemmatizer()
    stopwords = nltk.corpus.stopwords.words('english') + ADDITIONAL_STOPWORDS
    sentence = (unicodedata.normalize('NFKD', sentence)
        .encode('ascii', 'ignore')
        .decode('utf-8', 'ignore')
        .lower())
    words = re.sub(r'[^\w\s]', '', sentence).split()
    word_list = [wnl.lemmatize(word) for word in words if word not in stopwords]
    return word_list

In [22]:
clean('The quick brown fox jumps â€” over the lazy dog. Ignore me.')

['quick', 'brown', 'fox', 'jump', 'lazy', 'dog', 'ignore']

In [23]:
def get_words(line_list):
    """
    Takes a dataframe and a column and returns a list of
    cleaned words that is returned by clean().

            Parameters:
                    df (dataframe): A pandas dataframe
                    column (series): A pandas series

            Returns:
                    word_list (list): A list of cleaned words
    """
    return clean(''.join(str(line_list)))

In [24]:
all_words = get_words(line_list)

In [25]:
type(all_words)

list

In [26]:
df = pd.DataFrame(pd.Series(all_words))

In [27]:
type(df)

pandas.core.frame.DataFrame

In [28]:
df.head()

Unnamed: 0,0
0,success
1,success
2,counted
3,sweetest
4,neer


In [29]:
df.columns=['word']

In [30]:
df.head()

Unnamed: 0,word
0,success
1,success
2,counted
3,sweetest
4,neer


### _What are the most frequently occuring words?_

In [31]:
df.word.value_counts()[:20]

like      116
one       106
little     91
upon       90
day        87
could      82
know       71
would      65
life       63
away       62
go         58
time       58
till       57
never      55
tell       52
eye        52
night      50
heaven     49
sun        49
summer     48
Name: word, dtype: int64

In [32]:
def get_bigrams(words):
    """
    Takes in a list of words and returns a series of
    bigrams with value counts.
    """
    return (pd.Series(nltk.ngrams(words, 2)).value_counts())[:20]

def get_trigrams(words):
    """
    Takes in a list of words and returns a series of
    trigrams with value counts.
    """
    return (pd.Series(nltk.ngrams(words, 3)).value_counts())[:20]

def get_qualgrams(words):
    """
    Takes in a list of words and returns a series of
    qualgrams with value counts.
    """
    return (pd.Series(nltk.ngrams(words, 4)).value_counts())[:20]

def get_ngrams(words):
    """
    Takes in a dataframe with column name and generates a
    dataframe of unigrams, bigrams, trigrams, and qualgrams.
    """
    return get_bigrams(words).to_frame().reset_index().rename(columns={'index':'bigram','0':'count'}), \
           get_trigrams(words).to_frame().reset_index().rename(columns={'index':'trigram','0':'count'}), \
           get_qualgrams(words).to_frame().reset_index().rename(columns={'index':'qualgram','0':'count'})

### _Do the first lines vary greatly?_

In [33]:
get_ngrams(all_words)

(                 bigram   0
 0         (summer, day)  10
 1          (could, see)   7
 2       (human, nature)   6
 3        (little, maid)   5
 4       (little, heart)   5
 5        (little, boat)   5
 6      (little, figure)   5
 7         (one, little)   4
 8          (come, back)   4
 9            (one, one)   4
 10         (every, day)   4
 11        (life, death)   4
 12         (never, saw)   4
 13  (thanksgiving, day)   3
 14       (setting, sun)   3
 15        (could, find)   3
 16         (hast, thou)   3
 17         (dont, know)   3
 18        (wild, night)   3
 19        (bring, thee)   3,
                           trigram  0
 0               (red, upon, hill)  2
 1             (one, little, boat)  2
 2     (among, stooping, plucking)  2
 3             (shall, live, vain)  2
 4            (went, thank, slept)  2
 5          (hide, within, flower)  2
 6                (men, made, hay)  2
 7               (fly, upon, pane)  2
 8              (eye, could, find)  2
 9        