# The butterflies of the Florentine Codex

### Creating DataFrame

In [43]:
import pandas as pd
import re

### Sentences with spaCy

In [44]:
import spacy

In [45]:
# Read the whole source text into memory. The context manager closes the
# file handle as soon as the read finishes instead of leaving it open for
# the lifetime of the kernel.
with open('papalotl.txt') as papalotl:
    butterflies = papalotl.read()

In [46]:
from spacy.lang.en import English

# Build a blank English pipeline whose only added component is the
# 'sentencizer', then use it to split the raw text into sentences.
nlp = English()
sentencizer = nlp.create_pipe('sentencizer')
nlp.add_pipe(sentencizer)

raw_text = butterflies
doc = nlp(raw_text)

# Keep each sentence as a plain string with surrounding whitespace removed.
sentences = [span.text.strip() for span in doc.sents]

In [47]:
# Preview the first ten sentences produced by the split.
sentences[:10]

['Whatever the kind of butterfly, it is long and straight, the abdomen is slender, the neck is constricted.',
 'lt is fuzzy, like fat; winged.',
 'Its wings are twofold.',
 'It has arms, it has legs, it has antennae.',
 'lt is a flyer, a constant flyer, a flutterer, a sucker of the different flowers, and a sucker of liquid.',
 'It is fuzzy.',
 'It trembles, it beats its wings together, it constantly flies.',
 'It sucks, it sucks liquid.',
 'It is not solid.',
 'There are many kinds of butterflies.']

In [48]:
# Flatten every sentence onto a single line: split on embedded newlines
# and stitch the pieces back together with single spaces.
sentences = [" ".join(item.split('\n')) for item in sentences]

In [49]:
# One row per sentence in a single 'BUTTERFLY' column. Naming the column in
# the constructor (rather than assigning `df.columns` afterwards) also works
# when `sentences` is empty, where the two-step form raises a length-mismatch
# ValueError.
df = pd.DataFrame(sentences, columns=['BUTTERFLY'])
df.head(10)

Unnamed: 0,BUTTERFLY
0,"Whatever the kind of butterfly, it is long and..."
1,"lt is fuzzy, like fat; winged."
2,Its wings are twofold.
3,"It has arms, it has legs, it has antennae."
4,"lt is a flyer, a constant flyer, a flutterer, ..."
5,It is fuzzy.
6,"It trembles, it beats its wings together, it c..."
7,"It sucks, it sucks liquid."
8,It is not solid.
9,There are many kinds of butterflies.


In [50]:
# Rebind `nlp` to the 'en_core_web_sm' model loaded via spacy.load; from here
# on `nlp` no longer refers to the English() pipeline created earlier.
nlp = spacy.load('en_core_web_sm')

In [63]:
# Run the loaded pipeline over the full text; this rebinds `doc`.
doc = nlp(butterflies)

In [66]:
# One output line per token: the token itself followed by its pos_, dep_
# and lemma_ attributes, separated by spaces.
for tok in doc:
    columns = (tok, tok.pos_, tok.dep_, tok.lemma_)
    print(*columns)

Whatever PRON advcl whatever
the DET det the
kind NOUN nsubj kind
of ADP prep of
butterfly NOUN pobj butterfly
, PUNCT punct ,
it PRON nsubj -PRON-
is AUX ccomp be
long ADJ acomp long
and CCONJ cc and
straight ADJ conj straight
, PUNCT punct ,
the DET det the
abdomen NOUN nsubj abdomen
is AUX ccomp be
slender ADJ acomp slender
, PUNCT punct ,
the DET det the
neck NOUN nsubjpass neck
is AUX auxpass be
constricted VERB ROOT constrict
. PUNCT punct .
lt PROPN nsubj lt
is AUX ccomp be
fuzzy ADJ acomp fuzzy
, PUNCT punct ,
like SCONJ prep like
fat NOUN pobj fat
; PUNCT punct ;
winged VERB ROOT wing
. PUNCT punct .
Its PRON poss -PRON-
wings NOUN nsubj wing
are AUX ROOT be
twofold ADJ acomp twofold
. PUNCT punct .
It PRON nsubj -PRON-
has AUX ccomp have
arms NOUN dobj arm
, PUNCT punct ,
it PRON nsubj -PRON-
has AUX ccomp have
legs NOUN dobj leg
, PUNCT punct ,
it PRON nsubj -PRON-
has AUX ROOT have
antennae NOUN dobj antennae
. PUNCT punct .
lt PROPN nsubj lt
is AUX ROOT be
a DET det a
flye

### How many terms?

In [53]:
# Number of non-overlapping, case-sensitive occurrences of 'papalotl'
# anywhere in the text (including inside longer words).
butterflies.count('papalotl')

In [54]:
# Count case-sensitive matches of 'butterfl' followed by at least one word
# character.
butterfl_pattern = re.compile(r'butterfl\w+')
len(butterfl_pattern.findall(butterflies))

### Where are they?

In [55]:
from spacy.matcher import PhraseMatcher
from spacy import displacy

In [56]:
# Turn each search term into a Doc pattern for the PhraseMatcher.
phrase_list = ['butterfly', 'papalotl']
phrase_patterns = list(map(nlp, phrase_list))

# Register both patterns under the single key 'mariposa'.
matcher = PhraseMatcher(nlp.vocab)
matcher.add('mariposa', None, *phrase_patterns)

# Parse the full text and collect the matches; each one is a
# (match_id, start, end) tuple.
papalotl = nlp(butterflies)
found_matches = matcher(papalotl)
print(found_matches)

[(4860986508236918519, 4, 5), (4860986508236918519, 132, 133), (4860986508236918519, 134, 135)]


In [57]:
# Print the start index, end index and matched text of every hit.
for match in found_matches:
    match_id, start, end = match
    # The key string is looked up from the vocab but is not printed.
    string_id = nlp.vocab.strings[match_id]
    span = papalotl[start:end]
    print(start, end, span.text)

4 5 butterfly
132 133 papalotl
134 135 butterfly


### Dependencies

In [58]:
# Parse only the first sentence; `doc` is rebound to this shorter parse.
doc = nlp(sentences[0])

In [59]:
# Draw the current `doc` with displaCy's 'dep' style, rendered inside the
# notebook (jupyter=True).
displacy.render(doc, style='dep', jupyter=True)

### Part-of-speech tagging

In [60]:
# Parse each sentence in turn and print every word tagged 'ADJ',
# one per line, in order of appearance.
for sentence in sentences:
    doc = nlp(sentence)
    for word in doc:
        if word.pos_ == 'ADJ':
            print(word.text)

long
straight
slender
fuzzy
twofold
constant
different
fuzzy
solid
many
large
yellow
yellow
fuzzy
black
beautiful
coveted
desirable
desirable
fragile
firm
similar
same
black
little
little
smoky
tawny
smoky
smoky
yellow
smoky
yellow
smoky
yellow
little
large
average
small
white
whitish
pale
yellow
similar
large
tiny
uniform
pale
pallid
yellow
livid
light
blue
light
blue
blue
large
small
Many
intricate
wonderful
intricate
like
average
beautiful
wonderful
amaranth
mature
ripe
yellow


In [140]:
# Parse each sentence in turn and print every word tagged 'VERB',
# one per line, in order of appearance.
for sentence in sentences:
    doc = nlp(sentence)
    for word in doc:
        if word.pos_ == 'VERB':
            print(word.text)

constricted
winged
trembles
beats
flies
sucks
sucks
comes
painted
varicolored
required
yellows
becomcs
painted
becomes
varicolored
breeds
becomes
develops
flies
flecked
varicolored
TLECOCOZPAPALOTL
called
fireyellow
glows
glistens
glows
glistens
becomes
turns
PAPALOTL
painted
sprinkled
flecked
painted
painted
comes
takes
becomes
varicolored
sought
sought
painted
called
become
named
become
