## Vocabulary abstraction

Here we want to make semantic trees for some of our categories

packages needed here : pandas nltk print-tree2

In [2]:
import os
import pandas as pd
import nltk
wn = nltk.corpus.wordnet
from digital_manuscript import BnF

Below a simple example of what wordnet is :
We have all the different meanings of the word 'dog' and we can get their definition

In [6]:
term = 'dog'
dog = wn.synsets(term, pos=wn.NOUN)
print(dog)
for meaning in dog :
    print(meaning.definition())

[Synset('dog.n.01'), Synset('frump.n.01'), Synset('dog.n.03'), Synset('cad.n.01'), Synset('frank.n.02'), Synset('pawl.n.01'), Synset('andiron.n.01')]
a member of the genus Canis (probably descended from the common wolf) that has been domesticated by man since prehistoric times; occurs in many breeds
a dull unattractive unpleasant girl or woman
informal term for a man
someone who is morally reprehensible
a smooth-textured sausage of minced beef or pork usually smoked; often served on a bread roll
a hinged catch that fits into a notch of a ratchet to move a wheel forward or prevent it from moving backward
metal supports for logs in a fireplace


Here we create a class that will allow us to get the hypernym path either to animal or to concept

In [8]:
animal = wn.synset('animal.n.01')
artifact = wn.synset('artifact.n.01')

class r_node:
    def __init__(self, lemma, children=[]):
        self.lemma = lemma
        
#         if lemma == animal:
#             print(children + [lemma,])
        self.children = children
        self.hypernyms = lemma.hypernyms()
        self.parents = [r_node(h, children + [lemma,]) for h in self.hypernyms]
        
    def return_animal_path(self):
        if self.lemma == wn.synset('animal.n.01'):
            return [[self.lemma,],]
        else:
            parent_paths = [n.return_animal_path() for n in self.parents]
            parent_paths = [p for p in parent_paths if len(p) > 0]
            paths = []
            for p in parent_paths:
                paths += p
            for p in paths:
                p.append(self.lemma)
            return paths

    def return_concept_path(self):
        if self.lemma == wn.synset('entity.n.01'):
            return [[self.lemma,],]
        else:
            parent_paths = [n.return_concept_path() for n in self.parents]
            parent_paths = [p for p in parent_paths if len(p) > 0]
            paths = []
            for p in parent_paths:
                paths += p
            for p in paths:
                p.append(self.lemma)
            return paths

In [9]:
dog = wn.synsets('dog', pos=wn.NOUN)[0]
dn = r_node(dog)
dn.return_animal_path()
#weirdly dog has 2 path to animal

[[Synset('animal.n.01'),
  Synset('chordate.n.01'),
  Synset('vertebrate.n.01'),
  Synset('mammal.n.01'),
  Synset('placental.n.01'),
  Synset('carnivore.n.01'),
  Synset('canine.n.02'),
  Synset('dog.n.01')],
 [Synset('animal.n.01'), Synset('domestic_animal.n.01'), Synset('dog.n.01')]]

In [5]:
squirrel = wn.synsets('squirrel')[0]
sn = r_node(squirrel)
sn.return_animal_path()

[[Synset('animal.n.01'),
  Synset('chordate.n.01'),
  Synset('vertebrate.n.01'),
  Synset('mammal.n.01'),
  Synset('placental.n.01'),
  Synset('rodent.n.01'),
  Synset('squirrel.n.01')]]

In [6]:
fish = wn.synsets('fish', pos=wn.NOUN)[0]
fn = r_node(fish)
fn.return_animal_path()

[[Synset('animal.n.01'),
  Synset('chordate.n.01'),
  Synset('vertebrate.n.01'),
  Synset('aquatic_vertebrate.n.01'),
  Synset('fish.n.01')]]

In [7]:
louse = wn.synsets('louse')[0]
ln = r_node(louse)
ln.return_animal_path()

[[Synset('animal.n.01'),
  Synset('invertebrate.n.01'),
  Synset('arthropod.n.01'),
  Synset('insect.n.01'),
  Synset('louse.n.01')]]

In [8]:
oyster = wn.synsets('oyster')[0]
on = r_node(oyster)
on.return_animal_path()

[[Synset('animal.n.01'),
  Synset('invertebrate.n.01'),
  Synset('mollusk.n.01'),
  Synset('bivalve.n.01'),
  Synset('oyster.n.01')]]

In [9]:
squirrel = wn.synsets('squirrel')[0]
weasel = wn.synsets('weasel')[1] # index 0 is a sneaky person
squirrel.lowest_common_hypernyms(weasel)

snake = wn.synsets('snake')[0]
print(snake.lowest_common_hypernyms(weasel))
print(snake.lowest_common_hypernyms(squirrel))

turtle = wn.synsets('turtle')[1] # index 0 is a turtleneck (lmao)
print(turtle.lowest_common_hypernyms(snake))
print(turtle.lowest_common_hypernyms(squirrel))

fish = wn.synsets('fish', pos=wn.NOUN)[0] # pos = part of speech
swan = wn.synsets('swan')[0]
print(fish.lowest_common_hypernyms(turtle))
print(fish.lowest_common_hypernyms(snake))

louse = wn.synsets('louse')[0]
oyster = wn.synsets('oyster')[0]
print(louse.lowest_common_hypernyms(snake))
print(louse.lowest_common_hypernyms(oyster))

[Synset('vertebrate.n.01')]
[Synset('vertebrate.n.01')]
[Synset('reptile.n.01')]
[Synset('vertebrate.n.01')]
[Synset('vertebrate.n.01')]
[Synset('vertebrate.n.01')]
[Synset('animal.n.01')]
[Synset('invertebrate.n.01')]


## Animal tree

this first cell was a sort of initialization with the meaning errors I found myself

Then this was put in a csv that is accessible in my fork

In [10]:
path = os.getcwd() + '/../manuscript-object/thesaurus/animal.csv'

df = pd.read_csv(path)
terms = list(set(df['prefLabel_en']))
simple_terms = [t for t in terms if ' ' not in t]
simple_terms


for i in range(len(simple_terms)) :
    if simple_terms[i] in ['weasel','turtle','cuckoo','codfish','mussel','chicken','goldfinch','turtledofe','linnet', 'water-dog']: #first meaning is not the animal. Birds tend to have another bird as first meaning
        if simple_terms[i]=='turtledofe':
            simple_terms[i]='turtledove'
        elif simple_terms[i]=='water-dog':
            simple_terms[i]='water_dog'
        lit = [simple_terms[i],1]
    elif simple_terms[i] in ['ewe','swallow','crayfish','partridge','hart',"calendra",'calandra','hog']:#have to go to third meaning to get the animal or the right bird
        if simple_terms[i] in ["calendra",'calandra']:
            simple_terms[i]='lark'
        lit = [simple_terms[i],2]
    elif simple_terms[i] in ['dragon']:#the mythical creature does not have a link to animal. This is "any of several small tropical Asian lizards capable of gliding by spreading winglike membranes on each side of the body"
        lit = [simple_terms[i],3]
    else :
        lit = [simple_terms[i],0]
    simple_terms[i]=lit

for i in range(len(simple_terms)):
    word=simple_terms[i][0]
    liste = wn.synsets(word,pos=wn.NOUN)
    for item in liste:
        ani = r_node(item)
        if ani.return_animal_path()!=[]:
            simple_terms[i].append(item.definition())
        else : 
            simple_terms[i].append(item.definition() + " (not in tree)")

This is to reinitialize the csv, so don't

In [11]:
df = pd.DataFrame(simple_terms)
#df.to_csv('./simple_terms.csv',sep=',',index=False)

In [12]:
class Node(object):

    def __init__(self, value, parent):
        self.value = value
        self.children = []
        if parent is not None:
            parent.children.append(self)

In [13]:
from print_tree import print_tree   #actually using package print-tree2 from pypi

class print_custom_tree(print_tree):

    def get_children(self, node):
        return node.children

    def get_node_str(self, node):
        return str(node.value)

This is the cell that actually prints the tree. It uses the csv, so if someone changes the chosen meaning in the csv it changes the tree

In [14]:
animaln = Node('animal', None)
visited = []
skipped=0
path = os.getcwd() + '/../manuscript-object/simple_terms.csv'
words = pd.read_csv(path)

for animals in words.itertuples() :
    if animals[1] not in ['bombicum','og','verdaule','tellin','petit-gri','aucupio','daot','shell','mutton','barbel','pork']:   #not in wordnet or not in tree
        ani = wn.synsets(animals[1])[animals[2]]
        an = r_node(ani)
        lis = an.return_animal_path()[0]
        for i in range (1,len(lis)):
            name = lis[i].name().split('.')[0]
            if name in ['even-toed_ungulate','odd-toed_ungulate','orthopterous_insect','chordate','placental','ungulate','thrush','oscine','leporid','lagomorph','decapod_crustacean','chordate','diapsid','anapsid','chelonian','bovid','bovine',"anseriform_bird",'wading_bird','columbiform_bird','corvine_bird','cuculiform_bird','aquatic_vertebrate','bony_fish','teleost_fish','gadoid','ganoid','annelid','oligochaete','decapod','soft-finned_fish','phasianid','ambystomid','red_deer','hominid','agamid', 'sporting_dog']: # - makes everything go boom also things I don't care about or that nobody knows
                skipped+=1
                visited.append(name)
            elif name not in visited :
                name_prec = lis[i-1-skipped].name().split('.')[0]
                skipped=0
                string = name+'n'+' = Node(name,'+ name_prec+'n)'
                exec(string)
                visited.append(name)
            else :
                skipped=0

print_custom_tree(animaln) 


                                  ┌musteline_mammal─weasel
                                  │      ┌wolf
                        ┌carnivore┼canine┤
                        │         │      └dog─hunting_dog─water_dog
                        │         └feline─cat
                        │      ┌squirrel
                        │      ├dormouse
                        ├rodent┼mouse
                        │      ├rat
                        │      └porcupine
                        ├swine─hog
                        │      ┌horse
                        ├equine┤
                        │      └mule
                 ┌mammal┤
                 │      ├primate─homo
                 │      │        ┌sheep─ewe
                 │      │        │      ┌cow
                 │      │        ├cattle┼beef
                 │      │        │      └ox
                 │      ├ruminant┤
                 │      │        ├goat
                 │      │        └deer─hart
                 │      ├bat
     

<__main__.print_custom_tree at 0x2ada78fd6c8>

## Material Tree

Here it's exactly the same except we are going up to entity instead of just animal

In [24]:
path = os.getcwd() + '/../manuscript-object/thesaurus/material.csv'

df = pd.read_csv(path)
terms = list(set(df['prefLabel_en']))
terms.pop(0)
terms.pop(0)
simple_terms = [t for t in terms if ' ' not in t]


for i in range(len(simple_terms)):
    if simple_terms[i] in ['earth','soil','lead','filing','spat','ocher','perfume','mold','coral','germ','phlegm','foam','or','ebony','amber','carton','blue','egg']:
        simple_terms[i]=[simple_terms[i],1]
    elif simple_terms[i] in ['pitch','distemper','gold']:
        simple_terms[i]=[simple_terms[i],2]
    elif simple_terms[i] in ['scale']:
        simple_terms[i]=[simple_terms[i],6]
    elif simple_terms[i]=='clofe':
        simple_terms[i]='clove'
        simple_terms[i]=[simple_terms[i],3]
    elif simple_terms[i] in ['black']:
        simple_terms[i]=[simple_terms[i],4]
    else :
        if simple_terms[i] in ['canva','antho','ambergri','verdigri']:
            simple_terms[i]=simple_terms[i]+'s'
        elif simple_terms[i] == 'vernice':
            simple_terms[i]='varnish'
        elif simple_terms[i] == 'salpeter':
            simple_terms[i]='saltpeter'
        elif simple_terms[i] == 'preserf':
            simple_terms[i]='preserves'
        elif simple_terms[i]=='pumouse':
            simple_terms[i]='pumice'
        simple_terms[i]=[simple_terms[i],0]

for i in range(len(simple_terms)):
    word=simple_terms[i][0]
    liste = wn.synsets(word,pos=wn.NOUN)
    for item in liste:
        ani = r_node(item)
        if ani.return_concept_path()!=[]:
            simple_terms[i].append(item.definition())
        else : 
            simple_terms[i].append(item.definition() + " (not an entity)")

In [None]:
df = pd.DataFrame(simple_terms)
#df.to_csv('./Tree/material_terms.csv',sep=',',index=False)

This tree looks better if printed from an actual jupyter notebook browser page (you're probably already there but you know just saying)

In [25]:
entityn = Node('entity', None)
visited = []
skipped=0

path = os.getcwd() + '/../manuscript-object/Tree/material_terms.csv'
words = pd.read_csv(path)


for material in words.itertuples() :
    if material[1] not in ['☉','☾','☼','☀','glueing','coryal','incarnadine','flin','spalt','grai','aquaforti','colla','vetro', 'goumiche','arene', 'theriac', 'bituman', 'aceto', 'wooden','zedoary','stagno','basan','rowan,','anthos','☿','waxed','toadstone','glair', 'sandiver','fustet','potin','dyeing','taffetum','bullitoyre','ferro','muddying','ferlin','leaded','salted','lucertum','soldered','glued','rooftile','earthen','milla','metalline','stavesacre','bullitoire','couleur','arrabeic','charcoaled','cendrée','cristallin','porfidio','ferri','smoked','maplewood','arène','brouillamini','whitening','real','k','♀','bronzing','thunderstone','litharge','purpurina', 'salty','felin','tinned','prele','crocum','tint','silvered','fresil','verdet','dyed','oiled','horsedung','cocon.','billon','tuf','vermiculari','pulverin','unleaded','solle','felinder','coppera','quarton','batture','eau-de-vie','florey','taffer','wallwort','enameled','ferret-silk','ardide','porkfat','sanguine','magistra','vermeille','cornaline','niello','enamelling','cocon','aspalt','damascening','melli','grè','rocaille','crêpe','aurea','aspalte','solfo','chimolée','enamelled','sap-green','rutum','trwood','stuf','leady','debri','luted','galipot','vermillion','enameling','pearled','persicaire','enilanroc','oiling','otnegra','marcasite','oily','verjuice','gilded','verd','asphaltum','regulu']:   #not in wordnet or not in entity tree
        mater = wn.synsets(material[1])[material[2]]
        mat = r_node(mater)
        lis = mat.return_concept_path()[0]
        for i in range (1,len(lis)):
            name = lis[i].name().split('.')[0]
            if name in ['even-toed_ungulate','angiospermous_tree', 'high-angle_gun','copper-base_alloy','ovum',"dyer's_rocket",'edible_fat','heavier-than-air_craft','reproductive_structure','legging','consumer_goods','ceramic_ware','source_of_illumination','baseball_equipment','hospital_room','sheet_metal','vascular_plant','vertebrate','mammal','placental','plant_product','animal_tissue']: # - makes everything go boom also things I don't care about
                skipped+=1
                visited.append(name)
            elif name not in visited :
                name_prec = lis[i-1-skipped].name().split('.')[0]
                skipped=0
                string = name+'n'+' = Node(name,'+ name_prec+'n)'
                exec(string)
                visited.append(name)
            else :
                skipped=0

print_custom_tree(entityn)


                                                      ┌egg
                                                      ├flour
                                            ┌foodstuff┼concoction─dough
                                            │         │                              ┌sugar
                                            │         │                   ┌sweetening┼honey
                                            │         │                   │          └syrup
                                            │         │          ┌flavorer┼spice─clove
                                            │         │          │        └condiment─vinegar
                                            │         ├ingredient┤
                                            │         │          └egg_yolk
                                            │         │             ┌cheese
                                            │         └dairy_product┤
                                            │                  

<__main__.print_custom_tree at 0x2ada78fdec8>

## Tool tree

more of the same

In [26]:
path = os.getcwd() + '/../manuscript-object/thesaurus/tool.csv'

df = pd.read_csv(path)
terms = list(set(df['prefLabel_en']))
terms.pop(0)
simple_terms = [t for t in terms if ' ' not in t]

for i in range(len(simple_terms)):
    if simple_terms[i] in ['canva']:
        simple_terms[i] = simple_terms[i] +'s'
    elif simple_terms[i] == 'glofe':
        simple_terms[i]='glove'
    elif simple_terms[i] == 'siefe':
        simple_terms[i]='sieve'
    elif simple_terms[i] == 'ditto':
        simple_terms[i] = 'finger'
    elif simple_terms[i] == 'bellow':
        simple_terms[i] = 'bellows'
    if simple_terms[i] in ['brush','table','gimlet','ruler','beater','screw','vat','glove','frail','auger','cast']:
        simple_terms[i]=[simple_terms[i],1]
    elif simple_terms[i] in ['board','saw','rake']:
        simple_terms[i]=[simple_terms[i],2]
    elif simple_terms[i] in ['file','bore']:
        simple_terms[i]=[simple_terms[i],3]
    elif simple_terms[i] in ['stake']:
        simple_terms[i]=[simple_terms[i],4]
    elif simple_terms[i] in ['matrix','bolt']:
        simple_terms[i]=[simple_terms[i],5]
    elif simple_terms[i] in ['press']:
        simple_terms[i]=[simple_terms[i],6]
    else :
        simple_terms[i]=[simple_terms[i],0]

for i in range(len(simple_terms)):
    word=simple_terms[i][0]
    liste = wn.synsets(word,pos=wn.NOUN)
    for item in liste:
        ani = r_node(item)
        if ani.return_concept_path()!=[]:
            simple_terms[i].append(item.definition())
        else : 
            simple_terms[i].append(item.definition() + " (not an entity)")

In [None]:
df = pd.DataFrame(simple_terms)
#df.to_csv('./Tree/tool_terms.csv',sep=',',index=False)

In [27]:
entityn = Node('entity', None)
visited = []
skipped=0

path = os.getcwd() + '/../manuscript-object/Tree/tool_terms.csv'
words = pd.read_csv(path)


for tool in words.itertuples() :
    if tool[1] not in ['filed','cannule','hammered','molded','chafing-dish','felin','luted','burnisher','estamiere','pestled','pestling','bullitoire','desgrusouer','clamped','muid','lime-twig','chaple','puncheon','cochiaro','desgrusoue','quarton','cutting-punch','scratch-brush','prele','blast-pipe','esgrusouer','cendrée','fustée','brushed','wirebrush','moulet','sieved','chiseling','fournaise','arene','mattra','cushionet','chasing','fire-steel','grai','well-forged','desramonet','burnished','nailed','semal','stoppered','flin','onglet','fornaise','matrass','chameau','scratch-brushed','underfoot','scissor','trusseaulx','bedsheet','thunderstone','sift','gratteau','ditch-spade','grateau','arson']:   #not in wordnet or not in entity tree
        too = wn.synsets(tool[1])[tool[2]]
        to = r_node(too)
        lis = to.return_concept_path()[0]
        for i in range (1,len(lis)):
            name = lis[i].name().split('.')[0]
            if name in ['even-toed_ungulate','ovum',"dyer's_rocket",'heavier-than-air_craft','reproductive_structure','legging','consumer_goods','ceramic_ware','source_of_illumination','baseball_equipment','hospital_room','sheet_metal','vascular_plant','plant_product','animal_tissue','bone-ash_cup',"plumber's_snake",'high-angle_gun','show-stopper','natural_phenomenon','physical_phenomenon','fruit_tree','external_body_part','kitchen_utensil','cutting_implement','timepiece']: #- make everything go boom also things I don't care about
                skipped+=1
                visited.append(name)
            elif name not in visited :
                name_prec = lis[i-1-skipped].name().split('.')[0]
                skipped=0
                string = name+'n'+' = Node(name,'+ name_prec+'n)'
                exec(string)
                visited.append(name)
            else :
                skipped=0

print_custom_tree(entityn)


                                                      ┌drygoods─white_goods─linen─table_linen─napkin
                                                      │        ┌headdress─hat
                                            ┌commodity┼clothing┤
                                            │         │        └handwear─glove
                                            │         └durables─appliance─home_appliance─kitchen_appliance─oven
                                            │                             ┌ramrod
                                            │                         ┌rod┤
                                            │                         │   └baton
                                            │                         │    ┌rake
                                            │                         │    │                ┌chisel─burin
                                            │                         │    ├cutter─edge_tool┤
                                            

<__main__.print_custom_tree at 0x2ada7acee08>

## Body part Tree

In [28]:
path = os.getcwd() + '/../manuscript-object/thesaurus/body_part.csv'

df = pd.read_csv(path)
terms = list(set(df['prefLabel_en']))
terms.pop(0)
simple_terms = [t for t in terms if ' ' not in t]

for i in range(len(simple_terms)):
    if simple_terms[i] in ['temple','phlegm','teeth']:
        simple_terms[i]=[simple_terms[i],1]
    elif simple_terms[i] in ['matrix']:
        simple_terms[i]=[simple_terms[i],2]
    elif simple_terms[i] in []:
        simple_terms[i]=[simple_terms[i],3]
    elif simple_terms[i] in []:
        simple_terms[i]=[simple_terms[i],4]
    else :
        if simple_terms[i] == 'ditto':
            simple_terms[i]='finger'
        simple_terms[i]=[simple_terms[i],0]

for i in range(len(simple_terms)):
    word=simple_terms[i][0]
    liste = wn.synsets(word,pos=wn.NOUN)
    for item in liste:
        ani = r_node(item)
        if ani.return_concept_path()!=[]:
            simple_terms[i].append(item.definition())
        else : 
            simple_terms[i].append(item.definition() + " (not an entity)")

In [None]:
df = pd.DataFrame(simple_terms)
#df.to_csv('./Tree/body_part_terms.csv',sep=',',index=False)

In [29]:
entityn = Node('entity', None)
visited = []
skipped=0

path = os.getcwd() + '/../manuscript-object/Tree/body_part_terms.csv'
words = pd.read_csv(path)


for body in words.itertuples() :
    if body[1] not in ['digiti','underfoot','breathe','naribus','manu']:   #not in wordnet or not in entity tree
        bod = wn.synsets(body[1])[body[2]]
        bo = r_node(bod)
        lis = bo.return_concept_path()[0]
        for i in range (1,len(lis)):
            name = lis[i].name().split('.')[0]
            if name in ['part','synovial_joint']: #- make everything go boom also things I don't care about
                skipped+=1
                visited.append(name)
            elif name not in visited :
                name_prec = lis[i-1-skipped].name().split('.')[0]
                skipped=0
                string = name+'n'+' = Node(name,'+ name_prec+'n)'
                exec(string)
                visited.append(name)
            else :
                skipped=0

print_custom_tree(entityn)


                                                                                      ┌saliva
                                                                            ┌secretion┤
                                                                            │         └mucus─phlegm
                                       ┌body_substance─liquid_body_substance┤
                                       │                                    └blood
                      ┌matter─substance┤
                      │                │        ┌waste─body_waste─urine
                      │                └material┤
                      │                         └mineral─ore
                      │                                 ┌elbow
                      │               ┌joint─hinge_joint┤
                      │               │                 └knee
                      │               │                         ┌rima─mouth
                      │               │         ┌passage─orifice┤
       

<__main__.print_custom_tree at 0x2ada83c3c08>