In [1]:
# setting the random seed for reproducibility
import random
random.seed(493)

# for manipulating dataframes
import pandas as pd
import numpy as np

# natural language processing: n-gram ranking
import re
import unicodedata
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords

# to print out all the outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dd\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Dd\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
lines = open('../data/in/emily_dickinson_first_lines.txt', 'r')

line_list = []
for line in lines:
    line_list.append(line)

In [3]:
line_list[:10]

['A bird came down the walk:\n',
 'A charm invests a face\n',
 "A clock stopped â€” not the mantel's;\n",
 'A death-blow is a life-blow to some\n',
 'A deed knocks first at thought,\n',
 'A dew sufficed itself\n',
 'A door just opened on a street â€”\n',
 'A drop fell on the apple tree,\n',
 'A face devoid of love or grace,\n',
 'A lady red upon the hill\n']

In [4]:
lines = open('../data/in/emily_dickinson_first_lines.txt', 'r')

line_list = []
for line in lines:
    line_list.append(line.strip('\n'))

In [5]:
line_list[:10]

['A bird came down the walk:',
 'A charm invests a face',
 "A clock stopped â€” not the mantel's;",
 'A death-blow is a life-blow to some',
 'A deed knocks first at thought,',
 'A dew sufficed itself',
 'A door just opened on a street â€”',
 'A drop fell on the apple tree,',
 'A face devoid of love or grace,',
 'A lady red upon the hill']

In [6]:
ADDITIONAL_STOPWORDS = ['â€”', 'ignore']

def clean(sentence):
    """
    Takes a string and returns a list of cleaned words. All the words that
    are not designated as a stop word is then lemmatized after
    encoding and basic regex parsing are performed.

            Parameters:
                    sentence (str): A list of words

            Returns:
                    word_list (list): A list of cleaned words
    """
    wnl = nltk.stem.WordNetLemmatizer()
    stopwords = nltk.corpus.stopwords.words('english') + ADDITIONAL_STOPWORDS
    sentence = (unicodedata.normalize('NFKD', sentence)
        .encode('ascii', 'ignore')
        .decode('utf-8', 'ignore')
        .lower())
    words = re.sub(r'[^\w\s]', '', sentence).split()
    word_list = [wnl.lemmatize(word) for word in words if word not in stopwords]
    return word_list

In [7]:
clean('The quick brown fox jumps â€” over the lazy dog. Ignore me.')

['quick', 'brown', 'fox', 'jump', 'lazy', 'dog']

In [8]:
def get_words(line_list):
    """
    Takes a dataframe and a column and returns a list of
    cleaned words that is returned by clean().

            Parameters:
                    df (dataframe): A pandas dataframe
                    column (series): A pandas series

            Returns:
                    word_list (list): A list of cleaned words
    """
    return clean(''.join(str(line_list)))

In [9]:
all_words = get_words(line_list)

In [10]:
type(all_words)

list

In [11]:
df = pd.DataFrame(pd.Series(all_words))

In [12]:
type(df)

pandas.core.frame.DataFrame

In [13]:
df.head()

Unnamed: 0,0
0,bird
1,came
2,walk
3,charm
4,invests


In [14]:
df.columns=['word']

In [15]:
df.head()

Unnamed: 0,word
0,bird
1,came
2,walk
3,charm
4,invests


### _What are the most frequently occuring words?_

In [16]:
df.word.value_counts()[:20]

little     17
like       16
one        13
day        11
know       10
went       10
heart      10
night      10
life        9
time        9
death       9
never       9
summer      9
heaven      8
come        8
bird        8
year        8
died        7
face        7
thought     7
Name: word, dtype: int64

In [17]:
def get_bigrams(words):
    """
    Takes in a list of words and returns a series of
    bigrams with value counts.
    """
    return (pd.Series(nltk.ngrams(words, 2)).value_counts())[:20]

def get_trigrams(words):
    """
    Takes in a list of words and returns a series of
    trigrams with value counts.
    """
    return (pd.Series(nltk.ngrams(words, 3)).value_counts())[:20]

def get_qualgrams(words):
    """
    Takes in a list of words and returns a series of
    qualgrams with value counts.
    """
    return (pd.Series(nltk.ngrams(words, 4)).value_counts())[:20]

def get_ngrams(words):
    """
    Takes in a dataframe with column name and generates a
    dataframe of unigrams, bigrams, trigrams, and qualgrams.
    """
    return get_bigrams(words).to_frame().reset_index().rename(columns={'index':'bigram','0':'count'}), \
           get_trigrams(words).to_frame().reset_index().rename(columns={'index':'trigram','0':'count'}), \
           get_qualgrams(words).to_frame().reset_index().rename(columns={'index':'qualgram','0':'count'})

### _Do the first lines vary greatly?_

In [18]:
get_ngrams(all_words)

(               bigram  0
 0     (little, heart)  2
 1    (except, heaven)  2
 2         (see, face)  2
 3       (summer, day)  2
 4       (every, bird)  2
 5     (dropped, like)  2
 6       (never, know)  2
 7      (little, boat)  2
 8         (much, joy)  2
 9       (wild, night)  2
 10       (many, time)  2
 11      (never, lost)  2
 12       (never, saw)  2
 13      (lip, divine)  1
 14         (sea, one)  1
 15       (honey, rat)  1
 16   (started, early)  1
 17  (world, occasion)  1
 18        (god, gave)  1
 19         (got, eye)  1,
                               trigram  0
 0   (little, implement, presentiment)  1
 1                (face, lived, dread)  1
 2                   (made, man, long)  1
 3             (sunrise, little, maid)  1
 4               (repeat, summer, day)  1
 5               (poor, little, heart)  1
 6                   (like, star, say)  1
 7               (come, back, dropped)  1
 8                (summer, bird, fate)  1
 9                (didst, break, 