# The butterflies of the Florentine Codex

## Importing packages

In [1]:
import pandas as pd
import re
import spacy

from spacy.matcher import PhraseMatcher
from spacy import displacy

from spacy.tokens import Span

## Sentences with spaCy

In [2]:
# Read the whole source text into memory. The context manager closes the
# file handle as soon as the read finishes instead of leaving it open for
# the lifetime of the kernel.
with open('papalotl.txt') as papalotl:
    butterflies = papalotl.read()

In [3]:
from spacy.lang.en import English

# Sentence splitting only: a bare English pipeline with the rule-based
# sentencizer added, so no statistical model is needed for this step.
raw_text = butterflies
nlp = English()
nlp.add_pipe(nlp.create_pipe('sentencizer'))  # spaCy v2-style registration
doc = nlp(raw_text)
# Span.text replaces the deprecated Span.string; after strip() the result is
# the same sentence text without surrounding whitespace.
sentences = [sent.text.strip() for sent in doc.sents]

In [4]:
# Preview the first ten sentences produced by the sentencizer.
sentences[:10]

['BUTTERFLY\nWhatever the kind of butterfly, it is long and straight, the abdomen is slender, the neck is constricted.',
 'lt is fuzzy, like fat; winged.',
 'Its wings are twofold.',
 'It has arms, it has legs, it has antennae.',
 'lt is a flyer, a constant flyer, a flutterer, a sucker of the different flowers, and a sucker of liquid.',
 'It is fuzzy.',
 'It trembles, it beats its wings together, it constantly flies.',
 'It sucks, it sucks liquid.',
 'It is not solid.',
 'There are many kinds of butterflies.']

In [5]:
# Collapse embedded line breaks (e.g. after the all-caps headings) into
# single spaces so each sentence is one line of text.
sentences = [sentence.replace('\n', " ") for sentence in sentences]

## Creating dataframe

In [6]:
# One row per sentence in a single 'BUTTERFLY' column. Naming the column at
# construction time (instead of renaming afterwards) also works when the
# sentence list is empty.
df = pd.DataFrame({'BUTTERFLY': sentences})
df.head(10)

Unnamed: 0,BUTTERFLY
0,"BUTTERFLY Whatever the kind of butterfly, it i..."
1,"lt is fuzzy, like fat; winged."
2,Its wings are twofold.
3,"It has arms, it has legs, it has antennae."
4,"lt is a flyer, a constant flyer, a flutterer, ..."
5,It is fuzzy.
6,"It trembles, it beats its wings together, it c..."
7,"It sucks, it sucks liquid."
8,It is not solid.
9,There are many kinds of butterflies.


## Tokenization

In [7]:
# Load the large English model; this rebinds `nlp`, replacing the
# sentencizer-only pipeline created earlier in the notebook.
nlp = spacy.load('en_core_web_lg')

In [8]:
# Run the full pipeline over the entire text at once.
doc = nlp(butterflies)

In [9]:
# One line per token: surface form, coarse POS tag, dependency label, lemma.
for tok in doc:
    columns = (tok, tok.pos_, tok.dep_, tok.lemma_)
    print(*columns)

BUTTERFLY PROPN ROOT BUTTERFLY

 SPACE  

Whatever DET advcl whatever
the DET det the
kind NOUN dobj kind
of ADP prep of
butterfly NOUN pobj butterfly
, PUNCT punct ,
it PRON nsubj -PRON-
is AUX ROOT be
long ADJ acomp long
and CCONJ cc and
straight ADJ conj straight
, PUNCT punct ,
the DET det the
abdomen NOUN nsubj abdomen
is AUX ccomp be
slender ADJ acomp slender
, PUNCT punct ,
the DET det the
neck NOUN nsubjpass neck
is AUX auxpass be
constricted ADJ ROOT constricted
. PUNCT punct .
lt PROPN nsubj lt
is AUX ccomp be
fuzzy ADJ acomp fuzzy
, PUNCT punct ,
like SCONJ prep like
fat NOUN pobj fat
; PUNCT punct ;
winged ADJ ROOT winged
. PUNCT punct .
Its DET poss -PRON-
wings NOUN nsubj wing
are AUX ROOT be
twofold ADJ acomp twofold
. PUNCT punct .
It PRON nsubj -PRON-
has AUX ccomp have
arms NOUN dobj arm
, PUNCT punct ,
it PRON nsubj -PRON-
has AUX ccomp have
legs NOUN dobj leg
, PUNCT punct ,
it PRON nsubj -PRON-
has AUX ROOT have
antennae NOUN dobj antennae
. PUNCT punct .
lt PROPN 

design NOUN pobj design
, PUNCT punct ,
and CCONJ cc and
truly ADV advmod truly
sought VERB conj seek
after ADP prep after
truly ADV advmod truly
wonderful ADJ amod wonderful
. PUNCT punct .
They PRON nsubj -PRON-
are AUX ROOT be
o ADV intj o
intricate ADJ amod intricate
design NOUN attr design
, PUNCT punct ,
sought VERB conj seek
after ADP prep after
, PUNCT punct ,
flower NOUN npadvmod flower
- PUNCT punct -
like ADJ conj like
. PUNCT punct .


 SPACE  


UAPPAPALOTL PROPN ROOT UAPPAPALOTL

 SPACE  

It PRON nsubj -PRON-
is AUX ccomp be
of ADP prep of
average ADJ amod average
size NOUN pobj size
; PUNCT punct ;
its DET poss -PRON-
wings NOUN nsubjpass wing
are AUX auxpass be
painted VERB ROOT paint
chili NOUN npadvmod chili
- PUNCT punct -
red NOUN oprd red
. PUNCT punct .
It PRON nsubj -PRON-
is AUX ROOT be
also ADV advmod also
beautiful ADJ acomp beautiful
, PUNCT punct ,
also ADV advmod also
wonderful ADJ acomp wonderful
. PUNCT punct .
Amaranth NOUN compound amaranth
leaves VERB

## Showing dependencies

In [10]:
# Parse only the first sentence so the dependency diagram stays readable.
doc = nlp(sentences[0])

In [11]:
# Draw the dependency parse of `doc`; jupyter=True requests inline notebook output.
displacy.render(doc, style='dep', jupyter=True)

## Part-of-speech tagging

In [12]:
# Print every token tagged as a verb, sentence by sentence.
# Iterates the sentences and tokens directly instead of indexing with
# range(len(...)) and building a throwaway list of (text, pos) tuples.
for sentence in sentences:
    doc = nlp(sentence)
    for token in doc:
        if token.pos_ == 'VERB':
            print(token.text)

trembles
beats
flies
sucks
sucks
comes
painted
varicolored
required
yellows
painted
becomes
breeds
becomes
develops
flies
flecked
varicolored
called
glows
glistens
glows
glistens
glistens
becomes
turns
painted
sprinkled
flecked
painted
painted
comes
takes
becomes
varicolored
sought
sought
painted
leaves
called
leaves
become
named
become


In [13]:
# Print every token tagged as an adjective, sentence by sentence.
# Iterates the sentences and tokens directly instead of indexing with
# range(len(...)) and building a throwaway list of (text, pos) tuples.
for sentence in sentences:
    doc = nlp(sentence)
    for token in doc:
        if token.pos_ == 'ADJ':
            print(token.text)

long
straight
slender
constricted
fuzzy
winged
twofold
constant
different
fuzzy
liquid
solid
many
large
yellow
yellow
fuzzy
black
beautiful
coveted
desirable
desirable
fragile
varicolored
firm
similar
same
black
little
little
fiery
smoky
tawny
tawny
smoky
smoky
smoky
smoky
little
large
average
small
white
whitish
pale
yellow
similar
large
tiny
uniform
texotli
light
pale
pallid
yellow
livid
light
blue
light
blue
blue
brown
large
small
Many
intricate
wonderful
intricate
like
average
beautiful
wonderful
Amaranth
mature
ripe
yellow


## How many terms?

In [14]:
# Count occurrences of the lower-case substring 'papalotl'. No re flags are
# passed, so the match is case-sensitive and all-caps headings are not counted.
len(re.findall('papalotl', butterflies))

6

In [15]:
# Count 'butterfl' followed by one or more word characters (e.g. butterfly,
# butterflies). Case-sensitive, so the all-caps 'BUTTERFLY' is not counted.
len(re.findall(r'butterfl\w+', butterflies))

3

## Where are they?

### The words "butterfly" and "papalotl"

In [16]:
# Locate the exact tokens "butterfly" and "papalotl" in the full text.
matcher = PhraseMatcher(nlp.vocab)
papalotl = nlp(butterflies)
phrase_list = ['butterfly', 'papalotl']
# Patterns are matched on their verbatim token text, so tokenizing them with
# make_doc is sufficient; running the tagger/parser/NER on them is wasted work.
phrase_patterns = [nlp.make_doc(text) for text in phrase_list]
# spaCy v2 call signature: the second positional argument is the on_match callback.
matcher.add('mariposa', None, *phrase_patterns)
found_matches = matcher(papalotl)
print(found_matches)

[(4860986508236918519, 6, 7), (4860986508236918519, 134, 135), (4860986508236918519, 136, 137)]


In [17]:
# Print each match as "start end text"; start/end are token offsets into
# `papalotl`. The match id is not needed here, so it is not resolved.
for _, start, end in found_matches:
    span = papalotl[start:end]
    print(start, end, span.text)

6 7 butterfly
134 135 papalotl
136 137 butterfly


### Looking for the parts of the butterfly

In [18]:
# Re-parse the full text: `doc` was last bound to individual sentences by the
# POS-tagging loops, and the entity spans below index into the whole document.
doc = nlp(butterflies)

In [19]:
# Locate the butterfly's body-part terms in the full text.
matcher = PhraseMatcher(nlp.vocab)
papalotl = nlp(butterflies)
phrase_list = ['abdomen', 'neck', 'wings', 'arms', 'legs', 'antennae']
# Patterns are matched on their verbatim token text, so tokenizing them with
# make_doc is sufficient; running the tagger/parser/NER on them is wasted work.
phrase_patterns = [nlp.make_doc(text) for text in phrase_list]
# spaCy v2 call signature: the second positional argument is the on_match callback.
matcher.add('mariposa', None, *phrase_patterns)
found_matches = matcher(papalotl)

In [20]:
# Print each match as "start end text"; start/end are token offsets into
# `papalotl`. The match id is not needed here, so it is not resolved.
for _, start, end in found_matches:
    span = papalotl[start:end]
    print(start, end, span.text)

15 16 abdomen
20 21 neck
34 35 wings
40 41 arms
44 45 legs
48 49 antennae
85 86 wings
234 235 wings
369 370 wings
516 517 wings


## Adding the body parts of the butterfly as new entities

In [21]:
def show_ents(doc):
    """Print one "text - LABEL" line per entity in ``doc.ents``.

    Prints 'No entity found' instead when ``doc.ents`` is empty.
    """
    if not doc.ents:
        print('No entity found')
        return
    for entity in doc.ents:
        print(' - '.join((entity.text, entity.label_)))

In [22]:
# Register the custom label in the vocabulary's StringStore and keep its hash.
# add() returns the same hash a plain lookup would, but also guarantees the
# string is stored so that `ent.label_` can be resolved back to 'ORGAN'.
ORGAN = doc.vocab.strings.add('ORGAN')

In [23]:
# One ORGAN span per body-part match. The (start, end) token offsets are the
# ones printed by the body-part matcher cell earlier in the notebook.
(
    new_ent, new_ent1, new_ent2, new_ent3, new_ent4,
    new_ent5, new_ent6, new_ent7, new_ent8, new_ent9,
) = (
    Span(doc, first, last, label=ORGAN)
    for first, last in (
        (15, 16),
        (20, 21),
        (34, 35),
        (40, 41),
        (44, 45),
        (48, 49),
        (85, 86),
        (234, 235),
        (369, 370),
        (516, 517),
    )
)

In [24]:
# Append the hand-made ORGAN spans to the entities the model already found.
organ_ents = [
    new_ent, new_ent1, new_ent2, new_ent3, new_ent4,
    new_ent5, new_ent6, new_ent7, new_ent8, new_ent9,
]
doc.ents = list(doc.ents) + organ_ents

In [25]:
# List every entity now attached to the document: the model's own plus the added ORGAN spans.
show_ents(doc)

abdomen - ORGAN
neck - ORGAN
wings - ORGAN
arms - ORGAN
legs - ORGAN
antennae - ORGAN
wings - ORGAN
XICALPAPALOTL - ORG
XICALTECONPAPALOTL - ORG
XICALTECON - PERSON
xicalli - PERSON
TLILPAPALOTL - PERSON
wings - ORGAN
TLECOCOZPAPALOTL - ORG
quappachpapalotl - PERSON
lts - PERSON
IZTAC PAPALOTL
 - PRODUCT
CHIAN PAPALOTL - PRODUCT
wings - ORGAN
TEXOPAPALOTL
 - LAW
XOCHIPAPALOTL - PERSON
UAPPAPALOTL - PERSON
wings - ORGAN


## Highlighting the body parts of the butterfly

In [26]:
# displaCy looks colours up by entity label, so the key must be exactly the
# label used for the spans ('ORGAN'); the previous key 'ORGAjN' never matched.
colors = {'ORGAN': 'purple'}
# Only ORGAN entities are drawn, using the colour mapping above.
options = {'ents': ['ORGAN'], 'colors': colors}

In [27]:
# Render the entity view of `doc`; `options` limits the display to ORGAN entities.
displacy.render(doc, style='ent', options=options)