# The butterflies of the Florentine Codex

### Creating DataFrame

In [1]:
import pandas as pd
import re

### Sentences with spaCy

In [3]:
import spacy

In [4]:
# Read the whole source text into memory. The context manager closes the file
# as soon as the read finishes, so no handle is left open for the rest of the
# kernel session. The platform default text encoding is used, as before.
with open('papalotl.txt') as papalotl:
    butterflies = papalotl.read()

In [5]:
from spacy.lang.en import English

# A blank English pipeline plus the rule-based sentencizer is all this cell
# uses to split the text into sentences; no trained model is loaded here.
raw_text = butterflies
nlp = English()
# spaCy v2 call style: build the component, then add it.
# On spaCy v3 the equivalent is nlp.add_pipe('sentencizer').
nlp.add_pipe(nlp.create_pipe('sentencizer'))
doc = nlp(raw_text)
# Use Span.text rather than Span.string: the latter no longer exists in
# spaCy v3, and after .strip() both give the same sentence text.
sentences = [sent.text.strip() for sent in doc.sents]

In [6]:
# Display the list of sentence strings built from doc.sents so the split can be
# checked by eye.
sentences

['BUTTERFLY\nWhatever the kind of butterfly, it is long and straight, the abdomen is slender, the neck is constricted.',
 'lt is fuzzy, like fat; winged.',
 'Its wings are twofold.',
 'It has arms, it has legs, it has antennae.',
 'lt is a flyer, a constant flyer, a flutterer, a sucker of the different flowers, and a sucker of liquid.',
 'It is fuzzy.',
 'It trembles, it beats its wings together, it constantly flies.',
 'It sucks, it sucks liquid.',
 'It is not solid.',
 'There are many kinds of butterflies.',
 'XICALPAPALOTL OR XICALTECONPAPALOTL.',
 'OR XICALTECON\nIt is somewhat large.',
 'Its name comes from xicalli [gourd bowl] and papalotl [butterfly ], because it is yellow, it is quite yellow, it is fuzzy .',
 'And it is painted with black; it is varicolored.',
 'So it is very beautiful, coveted, desirable, constantly desirable, constantly required.',
 'It is fragile.',
 'It yellows, it becomcs painted, it becomes varicolored.',
 'It breeds, it becomes firm, it develops; it flie

In [21]:
# Flatten every sentence onto a single line by turning each newline into a space.
sentences = list(map(lambda text: text.replace('\n', " "), sentences))
print(sentences)

['BUTTERFLY Whatever the kind of butterfly, it is long and straight, the abdomen is slender, the neck is constricted.', 'lt is fuzzy, like fat; winged.', 'Its wings are twofold.', 'It has arms, it has legs, it has antennae.', 'lt is a flyer, a constant flyer, a flutterer, a sucker of the different flowers, and a sucker of liquid.', 'It is fuzzy.', 'It trembles, it beats its wings together, it constantly flies.', 'It sucks, it sucks liquid.', 'It is not solid.', 'There are many kinds of butterflies.', 'XICALPAPALOTL OR XICALTECONPAPALOTL.', 'OR XICALTECON It is somewhat large.', 'Its name comes from xicalli [gourd bowl] and papalotl [butterfly ], because it is yellow, it is quite yellow, it is fuzzy .', 'And it is painted with black; it is varicolored.', 'So it is very beautiful, coveted, desirable, constantly desirable, constantly required.', 'It is fragile.', 'It yellows, it becomcs painted, it becomes varicolored.', 'It breeds, it becomes firm, it develops; it flies constantly.', 'TL

In [8]:
# Wrap the sentence list in a single-column DataFrame (one row per sentence)
# and show it as the cell's output.
df = pd.DataFrame(data=sentences)
df

Unnamed: 0,0
0,"BUTTERFLY Whatever the kind of butterfly, it i..."
1,"lt is fuzzy, like fat; winged."
2,Its wings are twofold.
3,"It has arms, it has legs, it has antennae."
4,"lt is a flyer, a constant flyer, a flutterer, ..."
5,It is fuzzy.
6,"It trembles, it beats its wings together, it c..."
7,"It sucks, it sucks liquid."
8,It is not solid.
9,There are many kinds of butterflies.


In [9]:
# Rebind `nlp` to the `en_core_web_sm` pipeline. Until now `nlp` was the blank
# English() pipeline carrying only the sentencizer; the cells that follow read
# token.pos_ and token.dep_ from documents produced by this loaded pipeline.
nlp = spacy.load('en_core_web_sm')

In [10]:
# Run the complete text through the loaded pipeline. This rebinds `doc`, which
# previously held the sentencizer-only parse.
doc = nlp(butterflies)

In [11]:
# One line per token: surface text, coarse part-of-speech tag, dependency label.
for tok in doc:
    print(tok.text, tok.pos_, tok.dep_)

BUTTERFLY PROPN ROOT

 SPACE 
Whatever PRON dep
the DET det
kind NOUN nsubj
of ADP prep
butterfly NOUN pobj
, PUNCT punct
it PRON nsubj
is AUX ccomp
long ADJ acomp
and CCONJ cc
straight ADJ conj
, PUNCT punct
the DET det
abdomen NOUN nsubj
is AUX ccomp
slender ADJ acomp
, PUNCT punct
the DET det
neck NOUN nsubjpass
is AUX auxpass
constricted VERB ROOT
. PUNCT punct
lt PROPN nsubj
is AUX ccomp
fuzzy ADJ acomp
, PUNCT punct
like SCONJ prep
fat NOUN pobj
; PUNCT punct
winged VERB ROOT
. PUNCT punct
Its PRON poss
wings NOUN nsubj
are AUX ROOT
twofold ADJ acomp
. PUNCT punct
It PRON nsubj
has AUX ccomp
arms NOUN dobj
, PUNCT punct
it PRON nsubj
has AUX ccomp
legs NOUN dobj
, PUNCT punct
it PRON nsubj
has AUX ROOT
antennae NOUN dobj
. PUNCT punct
lt PROPN nsubj
is AUX ROOT
a DET det
flyer NOUN attr
, PUNCT punct
a DET det
constant ADJ amod
flyer NOUN appos
, PUNCT punct
a DET det
flutterer NOUN conj
, PUNCT punct
a DET det
sucker NOUN appos
of ADP prep
the DET det
different ADJ amod
flowers 

### How many terms?

In [12]:
# Count the case-sensitive, non-overlapping occurrences of 'papalotl' in the
# raw text (occurrences inside longer lowercase words are counted too).
sum(1 for match in re.finditer('papalotl', butterflies))

6

In [13]:
# Count occurrences of 'butterfl' followed by one or more word characters
# (e.g. 'butterfly', 'butterflies'); the match is case-sensitive.
sum(1 for match in re.finditer(r'butterfl\w+', butterflies))

3

### Where are they?

In [14]:
from spacy.matcher import PhraseMatcher
from spacy import displacy

In [15]:
# Locate the two search terms as token spans in the parsed text.
# PhraseMatcher compares exact token text by default, so matching is
# case-sensitive and works on whole tokens only.
matcher = PhraseMatcher(nlp.vocab)
# Note: this rebinds `papalotl`, which earlier referred to the file object.
papalotl = nlp(butterflies)
phrase_list = ['butterfly', 'papalotl']
# Patterns only need tokenizing, so make_doc skips the rest of the pipeline.
phrase_patterns = [nlp.make_doc(text) for text in phrase_list]
# Patterns are passed as a list in the second argument; the older
# add(key, None, *patterns) call is the legacy spaCy v2 signature.
matcher.add('mariposa', phrase_patterns)
# Each match is a (match_id, start_token, end_token) triple.
found_matches = matcher(papalotl)
print(found_matches)

[(4860986508236918519, 6, 7), (4860986508236918519, 134, 135), (4860986508236918519, 136, 137)]


In [16]:
# Report the token offsets and matched text of every hit.
for match_id, start, end in found_matches:
    # match_id identifies the pattern key; it is not needed for this report,
    # so the per-iteration vocab lookup that was never used has been dropped.
    span = papalotl[start:end]
    print(start, end, span.text)

6 7 butterfly
134 135 papalotl
136 137 butterfly


### Dependencies

In [17]:
# Parse just the first sentence; this rebinds `doc`, which the next cell
# hands to displaCy.
doc = nlp(sentences[0])

In [18]:
# Draw the dependency parse of `doc`. jupyter=True explicitly enables
# displaCy's notebook mode instead of relying on auto-detection.
displacy.render(doc, style='dep', jupyter=True)

### Part-of-speech tagging

In [19]:
# Print every token the tagger labels as an adjective, sentence by sentence.
for sentence in sentences:
    doc = nlp(sentence)
    for token in doc:
        if token.pos_ == 'ADJ':
            print(token.text)

long
straight
slender
fuzzy
twofold
constant
different
fuzzy
solid
many
large
yellow
yellow
fuzzy
black
beautiful
coveted
desirable
desirable
fragile
firm
similar
same
black
little
little
smoky
tawny
smoky
smoky
yellow
smoky
yellow
smoky
yellow
little
large
average
small
white
whitish
pale
yellow
similar
large
tiny
uniform
pale
pallid
yellow
livid
light
blue
light
blue
blue
large
small
Many
intricate
wonderful
intricate
like
average
beautiful
wonderful
amaranth
mature
ripe
yellow


In [20]:
# Print every token the tagger labels as a verb, sentence by sentence.
for sentence in sentences:
    doc = nlp(sentence)
    for token in doc:
        if token.pos_ == 'VERB':
            print(token.text)

constricted
winged
trembles
beats
flies
sucks
sucks
comes
painted
varicolored
required
yellows
becomcs
painted
becomes
varicolored
breeds
becomes
develops
flies
flecked
varicolored
TLECOCOZPAPALOTL
called
fireyellow
glows
glistens
glows
glistens
becomes
turns
PAPALOTL
painted
sprinkled
flecked
painted
painted
comes
takes
becomes
varicolored
sought
sought
painted
called
become
named
become
