In [1]:
import spacy
import glob
from collections import Counter

In [2]:
# Collect every plain-text file under test/. glob.glob() returns matches in
# arbitrary, filesystem-dependent order, so the names are sorted: the next
# cell indexes into `texts` (texts[0]) and must see the same document on
# every run and on every machine.
filenames = sorted(glob.glob('test/*.txt'))
texts = list()
for fname in filenames:
    # NOTE(review): no encoding is passed, so the platform default is used —
    # confirm the corpus files decode correctly under it.
    with open(fname,'r') as f:
        texts.append(f.read())
print('read', len(texts), 'texts.')

read 10 texts.


In [3]:
# Use the first loaded document as the working text and print its opening
# characters to confirm which one it is.
PREVIEW_CHARS = 48
single_text = texts[0]
print(single_text[:PREVIEW_CHARS])

[Emma by Jane Austen 1816]

VOLUME I

CHAPTER I



## Parse a Single Document

Now we will load a pretrained English language model with spaCy. Installing spaCy takes a few more steps than most Python packages (see the [spaCy install instructions](https://spacy.io/usage)), but once it is set up the library is extremely powerful.
For me, installing the model required only one command:

python -m spacy download en

In [4]:
# Load spaCy's small English pipeline by its full package name. The bare
# 'en' shortcut link was removed in spaCy 3.0 and makes spacy.load() raise
# there; 'en_core_web_sm' is the package that `python -m spacy download en`
# installs, so this name loads on both spaCy 2.x and 3.x.
nlp = spacy.load('en_core_web_sm')
# Bare last expression: display the loaded pipeline object.
nlp

<spacy.lang.en.English at 0x7f9e982cdf28>

In [5]:
# Run the spaCy pipeline over the whole document once; the resulting Doc is
# reused by the lemma and named-entity cells below.
doc = nlp(single_text)

# Two parallel views of the same tokens: the spaCy Token objects and their
# plain-string text.
tokens = list(doc)
token_str = [tok.text for tok in tokens]

# Show the first five of each for comparison.
print(tokens[:5])
print(token_str[:5])

[[, Emma, by, Jane, Austen]
['[', 'Emma', 'by', 'Jane', 'Austen']


## Lemmas
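A lemma is the base (dictionary) form of a word: in the output below, for example, *seemed* becomes *seem* and *blessings* becomes *blessing*. See a full explanation on the [Stanford NLP page](https://nlp.stanford.edu/IR-book/html/htmledition/stemming-and-lemmatization-1.html).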

In [6]:
# Lemma string for every token in the parsed document.
lemmas = [tok.lemma_ for tok in doc]

# Print the first 100 lemmas, then the first 100 raw token strings, one list
# per line so the two can be compared position by position.
print(lemmas[:100], token_str[:100], sep='\n')

['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']', '\n\n', 'volume', '-PRON-', '\n\n', 'chapter', '-PRON-', '\n\n\n', 'Emma', 'Woodhouse', ',', 'handsome', ',', 'clever', ',', 'and', 'rich', ',', 'with', 'a', 'comfortable', 'home', '\n', 'and', 'happy', 'disposition', ',', 'seem', 'to', 'unite', 'some', 'of', 'the', 'good', 'blessing', '\n', 'of', 'existence', ';', 'and', 'have', 'live', 'nearly', 'twenty', '-', 'one', 'year', 'in', 'the', 'world', '\n', 'with', 'very', 'little', 'to', 'distress', 'or', 'vex', '-PRON-', '.', '\n\n', '-PRON-', 'be', 'the', 'young', 'of', 'the', 'two', 'daughter', 'of', 'a', 'most', 'affectionate', ',', '\n', 'indulgent', 'father', ';', 'and', 'have', ',', 'in', 'consequence', 'of', '-PRON-', 'sister', "'s", 'marriage', ',', '\n', 'be', 'mistress', 'of', '-PRON-']
['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']', '\n\n', 'VOLUME', 'I', '\n\n', 'CHAPTER', 'I', '\n\n\n', 'Emma', 'Woodhouse', ',', 'handsome', ',', 'clever', ',', 'and', 'rich', ',', 'with',

## Named Entity Recognition

In [7]:
# Entity spans recognised by the pipeline, and the text of each span.
ents = list(doc.ents)
ent_str = [span.text for span in ents]

# Count how often each entity string occurs, order the (text, count) pairs by
# ascending count, and display the ten most frequent (highest count last).
ent_cts = Counter(ent_str)
sort_ents = sorted(ent_cts.items(), key=lambda pair: pair[1])
sort_ents[-10:]

[('two', 127),
 ('one', 143),
 ('first', 169),
 ('Jane', 184),
 ('Woodhouse', 287),
 ('Knightley', 315),
 ('Elton', 370),
 ('Weston', 424),
 ('Harriet', 438),
 ('Emma', 799)]