In [None]:

%run -m spacy download en_core_web_sm

In [1]:
import os
import spacy
nlp = spacy.load("en_core_web_sm")
from spacy import displacy
import string
import pandas as pd
punctuations = string.punctuation
from spacy.lang.en import English
parser = English()
from spacy.lang.en.stop_words import STOP_WORDS

from IPython.display import display, HTML
display(HTML(data="""
<style> div#notebook-container { width: 95%; } div#menubar-container { width: 95%; } div#maintoolbar-container { width: 99%; } </style>

"""))
nlp.max_length = 21000000
parser.max_length = 21000000

In [2]:
# Creating our tokenzer function
def spacy_tokenizer(sentence):
    """This function will accepts a sentence as input and processes the sentence into tokens, performing lemmatization, 
    lowercasing, removing stop words and punctuations."""
    
    # Creating our token object which is used to create documents with linguistic annotations
    mytokens = parser(sentence)
    
    # lemmatizing each token and converting each token in lower case
    # Note that spaCy uses '-PRON-' as lemma for all personal pronouns lkike me, I etc
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]
    
    # Removing stop words
    mytokens = [ word for word in mytokens if word not in STOP_WORDS and word not in punctuations]
    
    # Return preprocessed list of tokens
    return mytokens    


def create_word_counts_by_pos(raw_text, list_of_pos, word_count_dict_input = None):
    """
    takes a raw text file
    tokenizes and lemmatizes it
    limits inspection to list_of_pos types of words
    counts the individual lemmas
    returns a dictionary, keys are pos's in list_of_pos
    values are dictinaries with word counts
    """

    doc = nlp(raw_text)

    if word_count_dict_input is None: 
        word_count_dict = {}
        for part_of_speech in list_of_pos:
            word_count_dict[part_of_speech] = {}
    else:
        word_count_dict = word_count_dict_input

    for token in doc: 
        part_of_speech = token.pos_

        if part_of_speech in list_of_pos and token.is_stop == False:
            word_lemma = token.lemma_
            current_count = word_count_dict[part_of_speech].get(word_lemma, 0)
            current_count += 1
            word_count_dict[part_of_speech][word_lemma] = current_count

    return word_count_dict

# Basic function to clean the text
def clean_text(text):
    """Removing spaces and converting the text into lowercase"""
    return text.strip().lower()    

In [3]:
path = '../data/plaintext-example-files/'
os.listdir('../data/plaintext-example-files')


['adjs.txt',
 'austen.txt',
 'frankenstein.txt',
 'frost.txt',
 'genesis.txt',
 'nouns.txt',
 'plain_text_wikipedia.txt',
 'README.md',
 'samatar.txt',
 'sea_rose.txt',
 'sonnets.txt',
 'sowpods.txt',
 'verbs.txt']

In [4]:
with open(path+'genesis.txt') as infile:
    infile = infile.read()
    doc = nlp(infile)
doc

In the beginning God created the heaven and the earth. 
And the earth was without form, and void; and darkness was upon the face of the deep. And the Spirit of God moved upon the face of the waters. 
And God said, Let there be light: and there was light. 
And God saw the light, that it was good: and God divided the light from the darkness. 
And God called the light Day, and the darkness he called Night. And the evening and the morning were the first day. 
And God said, Let there be a firmament in the midst of the waters, and let it divide the waters from the waters. 
And God made the firmament, and divided the waters which were under the firmament from the waters which were above the firmament: and it was so. 
And God called the firmament Heaven. And the evening and the morning were the second day. 
And God said, Let the waters under the heaven be gathered together unto one place, and let the dry land appear: and it was so. 
And God called the dry land Earth; and the gathering together

In [5]:
displacy.render(doc[0:1000], style="ent")

In [16]:
doc[0:500]

PRIDE AND PREJUDICE

By Jane Austen



Chapter 1


It is a truth universally acknowledged, that a single man in possession
of a good fortune, must be in want of a wife.

However little known the feelings or views of such a man may be on his
first entering a neighbourhood, this truth is so well fixed in the minds
of the surrounding families, that he is considered the rightful property
of some one or other of their daughters.

"My dear Mr. Bennet," said his lady to him one day, "have you heard that
Netherfield Park is let at last?"

Mr. Bennet replied that he had not.

"But it is," returned she; "for Mrs. Long has just been here, and she
told me all about it."

Mr. Bennet made no answer.

"Do you not want to know who has taken it?" cried his wife impatiently.

"_You_ want to tell me, and I have no objection to hearing it."

This was invitation enough.

"Why, my dear, you must know, Mrs. Long says that Netherfield is taken
by a young man of large fortune from the north of England; that he

In [27]:
doc_counts = create_word_counts_by_pos(infile, ['NOUN', 'VERB','ADJ', 'ADV'], word_count_dict_input = None)
df = pd.DataFrame(doc_counts)
nouns = df['NOUN'].sort_values(ascending=False)
verbs = df['VERB'].sort_values(ascending=False)
adjectives = df['ADJ'].sort_values(ascending=False)
adverbs = df['ADV'].sort_values(ascending=False)

In [30]:
pd.DataFrame(verbs)

Unnamed: 0,VERB
let,14.0
say,11.0
see,7.0
yield,5.0
call,5.0
...,...
male,
green,
sixth,
forth,


In [31]:
list(doc.noun_chunks)

[God,
 the heaven,
 the earth,
 the earth,
 form,
 darkness,
 the face,
 the Spirit,
 God,
 the face,
 the waters,
 God,
 God,
 the light,
 it,
 God,
 the light,
 the darkness,
 God,
 the light Day,
 the darkness,
 he,
 the evening,
 the morning,
 the first day,
 God,
 a firmament,
 the midst,
 the waters,
 it,
 the waters,
 the waters,
 God,
 the firmament,
 the waters,
 the firmament,
 the waters,
 the firmament,
 it,
 God,
 the firmament Heaven,
 the evening,
 the morning,
 the second day,
 God,
 the waters,
 the heaven,
 one place,
 the dry land,
 it,
 God,
 the dry land,
 Earth,
 the gathering,
 the waters,
 God,
 it,
 God,
 the earth,
 grass,
 the herb,
 seed,
 the fruit tree,
 fruit,
 his kind,
 whose seed,
 itself,
 the earth,
 it,
 the earth,
 grass,
 seed,
 his kind,
 the tree,
 fruit,
 whose seed,
 itself,
 his kind,
 God,
 it,
 the evening,
 the morning,
 the third day,
 God,
 lights,
 the firmament,
 the heaven,
 the day,
 the night,
 them,
 signs,
 seasons,
 days,
 them,


In [43]:
for s in doc.sents:
    print(s)

In the beginning God created the heaven and the earth. 

And the earth was without form, and void; and darkness was upon the face of the deep.
And the Spirit of God moved upon the face of the waters. 

And God said, Let there be light: and there was light. 

And God saw the light, that it was good: and God divided the light from the darkness. 

And God called the light Day, and the darkness he called Night.
And the evening and the morning were the first day. 

And God said, Let there be a firmament in the midst of the waters, and let it divide the waters from the waters. 

And God made the firmament, and divided the waters which were under the firmament from the waters which were above the firmament: and it was so. 

And God called the firmament Heaven.
And the evening and the morning were the second day. 

And God said, Let the waters under the heaven be gathered together unto one place, and let the dry land appear: and it was so. 

And God called the dry land Earth; and the gathering

In [42]:
for e in doc.ents:
    print(e)

God
God
God
the light Day
Night
the evening
the first day
God
God
God
the evening
the second day
God
one
Earth
God
the evening
the third day
God
the day
the night
two
the day
the night
the day
the night
the evening
the fourth day
God
God
the evening
the fifth day
God
God
God
God
Behold
the evening
the sixth day
