# Vocabulary
`Vocabulary` is the input data object for `SpanTagger`, `PhraseTagger` and `RegexTagger`.

In [1]:
vocabulary_file = 'span_vocabulary.csv'

# The vocabulary file contains non-ASCII characters (e.g. 'päike'), so the
# encoding is given explicitly. Relying on the platform default garbles the
# text on systems whose default encoding is not UTF-8.
with open(vocabulary_file, encoding='utf-8') as f:
    print(f.read())

_token_,value,_priority_
string,string,callable
tundma,T,1
päike,P,2
inimene,K,2
inimene,I,3



In [2]:
from estnltk.taggers import Vocabulary

# Load the vocabulary from the CSV file shown above. The `_token_` column is
# used as the key; `default_rec` supplies a `default` field that shows up in
# every record of the resulting vocabulary (see the output below).
vocabulary = Vocabulary.read_csv(
    vocabulary_file=vocabulary_file,
    key='_token_',
    default_rec={'default': 'default_value'},
)
vocabulary

_token_,value,_priority_,default
inimene,K,2,default_value
,I,3,default_value
päike,P,2,default_value
tundma,T,1,default_value


The internal vocabulary format is a dict:

In [3]:
# NBVAL_IGNORE_OUTPUT
# Export the vocabulary as a dict: each key value maps to the list of records
# that share it (e.g. 'inimene' has two records in the output below).
vocabulary.to_dict()

{'tundma': [{'default': 'default_value',
   '_token_': 'tundma',
   'value': 'T',
   '_priority_': 1}],
 'päike': [{'default': 'default_value',
   '_token_': 'päike',
   'value': 'P',
   '_priority_': 2}],
 'inimene': [{'default': 'default_value',
   '_token_': 'inimene',
   'value': 'K',
   '_priority_': 2},
  {'default': 'default_value',
   '_token_': 'inimene',
   'value': 'I',
   '_priority_': 3}]}

In [4]:
# NBVAL_IGNORE_OUTPUT
# Flatten the vocabulary into a single list of record dicts.
records = vocabulary.to_records()
records

[{'default': 'default_value',
  '_token_': 'inimene',
  'value': 'K',
  '_priority_': 2},
 {'default': 'default_value',
  '_token_': 'inimene',
  'value': 'I',
  '_priority_': 3},
 {'default': 'default_value',
  '_token_': 'päike',
  'value': 'P',
  '_priority_': 2},
 {'default': 'default_value',
  '_token_': 'tundma',
  'value': 'T',
  '_priority_': 1}]

The following color codes are used for value types.

In [5]:
import regex as re

# Build a vocabulary directly from records, one record per value type.
# `color` is the key, so the two 'Moccasin' records are grouped under the
# same key in the resulting vocabulary.
voc = Vocabulary.from_records(
    key='color',
    attributes=('color', 'value type', 'example'),
    records=[
        {'example': 'string', 'value type': 'str', 'color': 'LightSteelBlue'},
        {'example': 12345, 'value type': 'int', 'color': 'Moccasin'},
        {'example': True, 'value type': 'int', 'color': 'Moccasin'},
        {'example': 0.123, 'value type': 'float', 'color': 'Cyan'},
        {'example': ('pi', 3, 0.14), 'value type': 'tuple', 'color': 'LightPink'},
        {'example': list('LIST'), 'value type': 'list', 'color': 'OrangeRed'},
        {'example': re.compile('pattern'), 'value type': 're.Pattern', 'color': 'Yellow'},
        {'example': set('set'), 'value type': 'other', 'color': 'White'},
    ],
)
voc

color,value type,example
Cyan,float,0.123
LightPink,tuple,"('pi', '3', '0.14')"
LightSteelBlue,str,string
Moccasin,int,12345
,int,True
OrangeRed,list,"['L', 'I', 'S', 'T']"
White,other,"{'t', 's', 'e'}"
Yellow,re.Pattern,<Regex pattern>


Internally `Vocabulary` is a `Mapping[Hashable, List[Mapping[Hashable, Any]]]`

In [6]:
# The underlying mapping holds the `example` values as Python objects
# (tuple, list, compiled pattern, set) rather than as display strings.
voc.mapping

{'LightSteelBlue': [{'example': 'string',
   'value type': 'str',
   'color': 'LightSteelBlue'}],
 'Moccasin': [{'example': 12345, 'value type': 'int', 'color': 'Moccasin'},
  {'example': True, 'value type': 'int', 'color': 'Moccasin'}],
 'Cyan': [{'example': 0.123, 'value type': 'float', 'color': 'Cyan'}],
 'LightPink': [{'example': ('pi', 3, 0.14),
   'value type': 'tuple',
   'color': 'LightPink'}],
 'OrangeRed': [{'example': ['L', 'I', 'S', 'T'],
   'value type': 'list',
   'color': 'OrangeRed'}],
 'Yellow': [{'example': regex.Regex('pattern', flags=regex.V0),
   'value type': 're.Pattern',
   'color': 'Yellow'}],
 'White': [{'example': {'e', 's', 't'},
   'value type': 'other',
   'color': 'White'}]}

The `to_lower` method creates a new `Vocabulary` with lowercase `key` values. It raises an exception if any key cannot be lowercased or if two different keys become equal after lowercasing.

In [7]:
# `to_lower` creates a new `Vocabulary`; keep it under its own name and
# display it.
voc_low = voc.to_lower()
voc_low

color,value type,example
cyan,float,0.123
lightpink,tuple,"('pi', '3', '0.14')"
lightsteelblue,str,string
moccasin,int,12345
,int,True
orangered,list,"['L', 'I', 'S', 'T']"
white,other,"{'t', 's', 'e'}"
yellow,re.Pattern,<Regex pattern>
