# Categorizing Part of Speech Tags for BDB

The categories are:
1. agent
2. object
3. place

For this task, I used [Open Scriptures' BDB text](https://github.com/openscriptures/HebrewLexicon). I wrote an interactive script that iterates through all of the pos tags, presents me with samples of the lexemes for each tag, and adds the tags I categorise to a dictionary. Many of the pos tags are simply too vague to mine any category data from (for example: `n.m.pl.`). For pos tags that contained fewer than five lexemes, I frequently looked up the definitions in BDB. Even if a pos tag was vague, I kept it if the handful of its members fell neatly into a category. The results are written to two CSV files: one contains the categorised pos tags, and the other contains the rejected pos tags (for potential processing later on).

In [48]:
from lxml import etree

# Parse the Brown-Driver-Briggs lexicon XML and keep its root element for the queries below.
tree = etree.parse("../github/textfabric_notebooks/BrownDriverBriggs.xml")
root = tree.getroot()
# Prefix map for the path queries. The prefix is the literal string 'None' (not Python's
# None object), so element names are written as 'None:entry', 'None:pos', and so on.
namespace = {'None':'http://openscriptures.github.com/morphhb/namespace'}

In [51]:
# Collect every distinct pos tag whose first dot-separated field is exactly 'n'.
pos_inventory = set()

for entry in root.findall('None:part/None:section/None:entry/None:pos', namespace):
    pos = entry.text
    # An empty <pos/> element has text None; skip it rather than crash on None.split().
    if pos is None:
        continue
    if pos.split('.')[0] == 'n':
        pos_inventory.add(pos)

# Quick check: print the collected tags that contain 'loc'.
for pos in pos_inventory:
    if 'loc' in pos:
        print(pos)

n.pr.loc


In [133]:
# Display the full set of collected pos tags.
pos_inventory

{'n',
 'n. [m.]',
 'n. abstr',
 'n. f',
 'n. gent',
 'n. m',
 'n.[f.]',
 'n.[f.]pl',
 'n.[f.pl.]',
 'n.[m',
 'n.[m.,f.]',
 'n.[m.?]',
 'n.[m.]',
 'n.[m.]coll',
 'n.[m.]du',
 'n.[m.]intens',
 'n.[m.]pl',
 'n.[m.]pl.abstr',
 'n.[m.]pl.intens',
 'n.coll',
 'n.coll.f',
 'n.f',
 'n.f.abstr',
 'n.f.abstr.pl',
 'n.f.coll',
 'n.f.cstr',
 'n.f.denom',
 'n.f.du',
 'n.f.emph',
 'n.f.pl',
 'n.f.pl.intens',
 'n.f.pr.putei',
 'n.f.unit',
 'n.f.verbal',
 'n.gent',
 'n.gent.coll',
 'n.gent.pl',
 'n.indecl',
 'n.m',
 'n.m.',
 'n.m.[pl.]',
 'n.m.coll',
 'n.m.dei',
 'n.m.denom',
 'n.m.du',
 'n.m.emph',
 'n.m.epith',
 'n.m.pers',
 'n.m.pl',
 'n.m.pl.abstr',
 'n.m.pl.emph',
 'n.pl',
 'n.pl.[m.]',
 'n.pl.[m.] abstr',
 'n.pl.abstr',
 'n.pl.f.abstr',
 'n.pl.gent',
 'n.pl.indecl',
 'n.pl.m',
 'n.pr',
 'n.pr.',
 'n.pr.[m.]',
 'n.pr.deae',
 'n.pr.dei',
 'n.pr.div',
 'n.pr.divin',
 'n.pr.f',
 'n.pr.fl',
 'n.pr.flum',
 'n.pr.font',
 'n.pr.gent',
 'n.pr.gent.',
 'n.pr.gent.coll',
 'n.pr.gent.pl',
 'n.pr.loc',
 'n.p

In [96]:
def _forms_by_pos(wanted):
    """Map each pos tag in *wanted* to the headwords of the entries carrying it.

    The lexicon is walked once, so the cost does not grow with the number of
    tags. For each matching entry the text of its first ``w`` child is
    recorded, in document order; entries without a ``w`` child are skipped.
    """
    index = {pos: [] for pos in wanted}
    for pos_elem in root.findall('None:part/None:section/None:entry/None:pos', namespace):
        # .get() returns None both for unwanted tags and for elements whose
        # text is None, so neither contributes a form.
        forms = index.get(pos_elem.text)
        if forms is None:
            continue
        headword = pos_elem.getparent().find('None:w', namespace)
        if headword is not None:
            forms.append(headword.text)
    return index


def _prompt_category(typ_long):
    """Ask for a type code and a kind until both are accepted.

    Returns a one-item dict ``{long_type_name: kind}``. Entering ``edit`` as
    the kind discards the chosen type and starts over.
    """
    while True:
        typ = input('type?')
        if typ not in typ_long:
            print('typo...')
            continue
        kind = input('kind?')
        if kind == 'edit':
            continue
        return {typ_long[typ]: kind}


def assign_pos(pos_set):
    """Interactively sort pos tags into categories.

    Each tag in *pos_set* is printed, then commands are read until the tag is
    resolved:

    * ``s``  -- print the first five lexemes carrying the tag
    * ``sa`` -- print all lexemes carrying the tag
    * ``n``  -- reject the tag (no category assigned)
    * ``a``  -- assign a category: a type code (``ag``, ``pl`` or ``ob``)
      followed by a free-text kind

    Any other input prints ``typo...`` and asks again.

    Returns a tuple ``(pos_to_cat, no_addition)``: a dict mapping each
    accepted tag to ``{type: kind}``, and the set of rejected tags.
    """
    typ_long = {'ag':'agent','pl':'place','ob':'object'}

    pos_to_cat = {}
    no_addition = set()

    # Materialise once so the same sequence drives both the index and the loop.
    tags = list(pos_set)
    forms_for = _forms_by_pos(tags)

    for pos in tags:
        print(pos)
        forms = forms_for[pos]

        while True:
            cont = input('s/n/a?')
            if cont == 's':  # sample
                print(forms[:5])
            elif cont == 'sa':  # show all
                print(forms)
            elif cont == 'n':  # no addition
                no_addition.add(pos)
                break
            elif cont == 'a':  # add
                pos_to_cat[pos] = _prompt_category(typ_long)
                break
            else:
                print('typo...')

    return (pos_to_cat, no_addition)

In [98]:
# Run the interactive session; the result is a (pos_to_cat, no_addition) tuple.
pos_assignents = assign_pos(pos_inventory)

n.pl.m
s/n/a?s
['תַּחֲלֻאִים']
s/n/a?a
type?ob
kind?abstract
n.pr.pers.m
s/n/a?s
['חָם', 'כּוּשׁ']
s/n/a?a
type?ag
kind?person
n.coll.f
s/n/a?s
['צֹאן']
s/n/a?a
type?ag
kind?animal
n. m
s/n/a?s
['עָשִׁיר', 'יְתוּר']
s/n/a?a
type?ag
kind?person
n.pr.mont
s/n/a?s
['הֹר', 'חֹרֵב', 'חֶרְמוֹן', 'יְעָרִים', 'לְבָנוֹן']
s/n/a?a
type?pl
kind?name
n.m.pl
s/n/a?s
['אֲחַשְׁדַּרְפְּנִים', 'אֱלֹהִים', 'בִּעוּתִים', 'בַּרְקָנִים', 'בַּרְבֻּרִים']
s/n/a?n
n.[m.]coll
s/n/a?s
['אָזֵן', 'עֲדִי', 'פּוֹל', 'צֹרְפִי', 'שַׁ֫יִת']
s/n/a?sa
['אָזֵן', 'עֲדִי', 'פּוֹל', 'צֹרְפִי', 'שַׁ֫יִת']
s/n/a?a
type?ob
kind?physical
n.[m.]pl.intens
s/n/a?s
['שַׁעֲשֻׁעִים', 'רַחֲמִין']
s/n/a?a
type?ob
kind?abstract
n.pr.gent.coll
s/n/a?s
['עֲרָב']
s/n/a?a
type?ag
kind?person
n.f.denom
s/n/a?s
['אַרְעִי']
s/n/a?n
n.[f.pl.]
s/n/a?s
['רָאמוֹת']
s/n/a?a
type?ob
kind?physical
n.indecl
s/n/a?s
['שִׁשִּׁים', 'שְׁלֹשִׁים', 'תִּשְׁעִים', 'שִׁתִּין', 'תְּלָתִין']
s/n/a?sa
['שִׁשִּׁים', 'שְׁלֹשִׁים', 'תִּשְׁעִים', 'שִׁתִּין', 'תְּלָתִ

In [126]:
import csv

fieldnames = ['pos','type','of kind']

# Each value in pos_assignents[0] is a one-item dict {type: kind};
# flatten it into one CSV row per pos tag.
pos_data = [
    {'pos': pos, 'type': typ, 'of kind': kind}
    for pos, category in pos_assignents[0].items()
    for typ, kind in category.items()
]

# newline='' lets the csv module control line endings itself.
with open('BDB_pos_tags.csv', 'w', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(pos_data)

In [132]:
# These tags are rejected because there is not enough information to assign a category to them.
# One tag per row; newline='' lets the csv module control line endings itself.

with open('BDB_rejected_posTags.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerows([pos] for pos in pos_assignents[1])