In [6]:
%load_ext autoreload
%autoreload 2

In [7]:
%matplotlib inline

In [8]:
import pandas as pd
import numpy as np

In [9]:
import bclm

In [10]:
dropped = [5438, 5444, 5445, 5446, 5448, 5449, 5450, 5451, 5453, 5459]
spdf = bclm.read_dataframe('spmrl')

In [11]:
global_dropped = [spdf[spdf.sent_id==d].global_sent_id.iat[0] for d in dropped]
global_dropped

[5438, 5444, 5445, 5446, 5448, 5449, 5450, 5451, 5453, 5459]

In [35]:
spdf.head().T

Unnamed: 0,0,1,2,3,4
id,1,2,3,4,5
form,עשרות,אנשים,מגיעים,מ,תאילנד
lemma,עשר,איש,הגיע,מ,תאילנד
upostag,CDT,NN,BN,PREPOSITION,NNP
xpostag,CDT,NN,BN,PREPOSITION,NNP
feats,gen=F|num=P,gen=M|num=P,gen=M|num=P|per=A|HebBinyan=HIFIL,_,_
token_id,1,2,3,4,4
sent_id,1,1,1,1,1
token_str,עשרות,אנשים,מגיעים,מתאילנד,מתאילנד
global_sent_id,1,1,1,1,1


In [12]:
uddf = bclm.read_dataframe('ud')

In [13]:
uddf = uddf[(~uddf.global_sent_id.isin(global_dropped))]

In [14]:
from collections import OrderedDict

In [15]:
uddf.head().T

Unnamed: 0,0,1,2,3,4
sent_id,1,1,1,1,1
id,1,2,3,4,5
form,עשרות,אנשים,מגיעים,מ,תאילנד
lemma,עשרות,איש,הגיע,מ,תאילנד
upostag,NUM,NOUN,VERB,ADP,PROPN
xpostag,NUM,NOUN,VERB,ADP,PROPN
feats,"OrderedDict([('Definite', 'Cons'), ('Gender', ...","OrderedDict([('Gender', 'Masc'), ('Number', 'P...","OrderedDict([('Gender', 'Masc'), ('HebBinyan',...",,
head,2,3,0,5,3
deprel,nummod,nsubj,root,case,obl
deps,,,,,


In [16]:
ud_sents = list(uddf.sent_id.unique())

## Matching sents (dev, train , test)

In [17]:
import re 
sent_id_re = re.compile('# sent_id = (\d+)')

import bz2
def get_sent_ids_lat(path):
    sent_ids = []
    with bz2.open(path, 'rt', encoding='utf-8') as f:
        for line in f:
            if line[0]=='#':
                sid = sent_id_re.match(line)
                if sid:
                    sent_id =  int(sid.group(1))
                sent_ids.append(sent_id)
    return sent_ids

dev_sent_ids = get_sent_ids_lat('../UL_Hebrew-HTB/he_htb-ud-dev.heblex.conllul.bz2')
train_sent_ids = get_sent_ids_lat('../UL_Hebrew-HTB/he_htb-ud-train.heblex.conllul.bz2')
test_sent_ids = get_sent_ids_lat('../UL_Hebrew-HTB/he_htb-ud-test.heblex.conllul.bz2')

            

In [18]:
print(f'dev: {min(dev_sent_ids)}-{max(dev_sent_ids)}')
print(f'train: {min(train_sent_ids)}-{max(train_sent_ids)}')
print(f'test: {min(test_sent_ids)}-{max(test_sent_ids)}')

dev: 1-484
train: 485-5725
test: 5726-6216


In [19]:
uddf.groupby(['set']).sent_id.agg(['min', 'max'])

Unnamed: 0_level_0,min,max
set,Unnamed: 1_level_1,Unnamed: 2_level_1
dev,1,500
test,5502,6216
train,501,5500


In [20]:
def get_set_from_sent_id(gsi):
    if gsi>=min(dev_sent_ids) and gsi<=max(dev_sent_ids):
        return 'dev'
    elif gsi>=min(train_sent_ids) and gsi<=max(train_sent_ids):
        return 'train'
    elif gsi>=min(test_sent_ids) and gsi<=max(test_sent_ids):
        return 'test'
    
uddf['ud_set'] = uddf.sent_id.apply(get_set_from_sent_id)

uddf.groupby(['ud_set']).sent_id.agg(['min', 'max'])

Unnamed: 0_level_0,min,max
ud_set,Unnamed: 1_level_1,Unnamed: 2_level_1
dev,1,484
test,5726,6216
train,485,5725


In [21]:
ud_dev = uddf[uddf.ud_set=='dev']
ud_train = uddf[uddf.ud_set=='train']
ud_test = uddf[uddf.ud_set=='test']

## Morpheme NER


In [56]:
import os
out_folder = 'data/ud_ner/'

def write_ncrf_file(sents, file_name, keep_pos=False, bioul=False, dummy_o=False):
    if keep_pos:
        file_name += '_pos'
    if dummy_o:
        file_name += '_dummy_o'
    if bioul:
        suffix = '.bioul'
    else:
        suffix = '.bmes'
    with open(os.path.join(out_folder, file_name+suffix), 'w', encoding='utf8') as of:
        for sent in sents:
            for fields in sent: 
                new_pos = '[POS]'+fields[1]
                line = fields[0]
                if keep_pos:
                    line += ' ' + new_pos
                if len(fields)==3:
                    tag = fields[2]
                    if bioul:
                        tag = tag.replace('E-', 'L-').replace('S-', 'U-')
                    if dummy_o:
                        tag = 'O'
                    line += ' ' + tag
                of.write(line+'\n')
            of.write('\n')
                    
        

In [63]:
fields = ['form', 'upostag', 'biose_layer0']
gold_dev_sents = bclm.get_sentences_list(ud_dev, fields=fields)
gold_train_sents = bclm.get_sentences_list(ud_train, fields=fields)
gold_test_sents =  bclm.get_sentences_list(ud_test, fields=fields)
gold_test_sents.head()

sent_id
5726    [[הולקומב, PROPN, S-PER], [לא, ADV, O], [הזכיר...
5727    [[ב, ADP, O], [ה_, DET, O], [דקה, NOUN, O], [ה...
5728    [[יציאה_, NOUN, O], [_של_, ADP, O], [_הוא, PRO...
5729    [[אולם, CCONJ, O], [קמבל, PROPN, S-PER], [ו, C...
5730    [[מלכתחילה, ADV, O], [היה, AUX, O], [ברור, VER...
dtype: object

In [64]:
write_ncrf_file(gold_dev_sents,   'morph_gold_dev')
write_ncrf_file(gold_train_sents, 'morph_gold_train')
write_ncrf_file(gold_test_sents, 'morph_gold_test')

In [65]:
write_ncrf_file(gold_dev_sents,   'morph_gold_dev', bioul=True)
write_ncrf_file(gold_train_sents, 'morph_gold_train', bioul=True)
write_ncrf_file(gold_test_sents, 'morph_gold_test', bioul=True)

In [66]:
write_ncrf_file(gold_dev_sents,   'morph_gold_dev', dummy_o=True)
write_ncrf_file(gold_train_sents, 'morph_gold_train', dummy_o=True)
write_ncrf_file(gold_test_sents, 'morph_gold_test', dummy_o=True)

In [67]:
write_ncrf_file(gold_dev_sents,   'morph_gold_dev', dummy_o=True, bioul=True)
write_ncrf_file(gold_train_sents, 'morph_gold_train', dummy_o=True, bioul=True)
write_ncrf_file(gold_test_sents, 'morph_gold_test', dummy_o=True, bioul=True)

## Concat BIOSE Sentences

In [29]:
uddf['set'] = uddf['ud_set']
biose_concat = bclm.get_token_df(uddf, fields = ['biose_layer0', 'upostag'])
biose_concat.head()

Unnamed: 0,sent_id,token_id,token_str,biose_layer0,upostag,set
0,1,1,עשרות,O,NUM,dev
1,1,2,אנשים,O,NOUN,dev
2,1,3,מגיעים,O,VERB,dev
3,1,4,מתאילנד,O^S-GPE,ADP^PROPN,dev
4,1,5,לישראל,O^S-GPE,ADP^PROPN,dev


In [31]:
token_gold_dev_concat  = biose_concat[biose_concat.set=='dev']
token_gold_train_concat = biose_concat[biose_concat.set=='train']
token_gold_test_concat = biose_concat[biose_concat.set=='test']

In [32]:
token_gold_dev_concat_sents    = bclm.get_sentences_list(token_gold_dev_concat  , fields=['token_str', 'upostag', 'biose_layer0'])
token_gold_train_concat_sents  = bclm.get_sentences_list(token_gold_train_concat, fields=['token_str', 'upostag', 'biose_layer0'])
token_gold_test_concat_sents   = bclm.get_sentences_list(token_gold_test_concat , fields=['token_str', 'upostag', 'biose_layer0'])
token_gold_test_concat_sents.head()

sent_id
5726    [[הולקומב, PROPN, S-PER], [לא, ADV, O], [הזכיר...
5727    [[בדקה, ADP^DET^NOUN, O^O^O], [ה, DET, O], [-,...
5728    [[יציאתו, NOUN^ADP^PRON, O^O^O], [של, ADP, O],...
5729    [[אולם, CCONJ, O], [קמבל, PROPN, S-PER], [וגור...
5730    [[מלכתחילה, ADV, O], [היה, AUX, O], [ברור, VER...
dtype: object

In [33]:
write_ncrf_file(token_gold_dev_concat_sents, 'token_gold_dev_concat')
write_ncrf_file(token_gold_train_concat_sents, 'token_gold_train_concat')
write_ncrf_file(token_gold_test_concat_sents, 'token_gold_test_concat')

In [39]:
write_ncrf_file(token_gold_dev_concat_sents, 'token_gold_dev_concat', bioul=True)
write_ncrf_file(token_gold_train_concat_sents, 'token_gold_train_concat', bioul=True)
write_ncrf_file(token_gold_test_concat_sents, 'token_gold_test_concat', bioul=True)

In [59]:
write_ncrf_file(token_gold_dev_concat_sents, 'token_gold_dev_concat', dummy_o=True)
write_ncrf_file(token_gold_train_concat_sents, 'token_gold_train_concat', dummy_o=True)
write_ncrf_file(token_gold_test_concat_sents, 'token_gold_test_concat', dummy_o=True)

In [60]:
write_ncrf_file(token_gold_dev_concat_sents, 'token_gold_dev_concat', dummy_o=True, bioul=True)
write_ncrf_file(token_gold_train_concat_sents, 'token_gold_train_concat', dummy_o=True, bioul=True)
write_ncrf_file(token_gold_test_concat_sents, 'token_gold_test_concat', dummy_o=True, bioul=True)

## Fix BIOSE Sentences

In [34]:
biose_fix = bclm.get_token_df(uddf, fields = ['upostag'], biose=['biose_layer0'])
biose_fix.head()

Unnamed: 0,sent_id,token_id,token_str,biose_layer0,upostag,set
0,1,1,עשרות,O,NUM,dev
1,1,2,אנשים,O,NOUN,dev
2,1,3,מגיעים,O,VERB,dev
3,1,4,מתאילנד,S-GPE,ADP^PROPN,dev
4,1,5,לישראל,S-GPE,ADP^PROPN,dev


In [35]:
token_gold_dev_fix  =     biose_fix[biose_fix.set=='dev']
token_gold_train_fix = biose_fix[biose_fix.set=='train']
token_gold_test_fix =  biose_fix[biose_fix.set=='test']

In [36]:
token_gold_dev_fix_sents    = bclm.get_sentences_list(token_gold_dev_fix  , fields=['token_str', 'upostag', 'biose_layer0'])
token_gold_train_fix_sents  = bclm.get_sentences_list(token_gold_train_fix, fields=['token_str', 'upostag', 'biose_layer0'])
token_gold_test_fix_sents   = bclm.get_sentences_list(token_gold_test_fix , fields=['token_str', 'upostag', 'biose_layer0'])
token_gold_test_fix_sents.head()

sent_id
5726    [[הולקומב, PROPN, S-PER], [לא, ADV, O], [הזכיר...
5727    [[בדקה, ADP^DET^NOUN, O], [ה, DET, O], [-, PUN...
5728    [[יציאתו, NOUN^ADP^PRON, O], [של, ADP, O], [לי...
5729    [[אולם, CCONJ, O], [קמבל, PROPN, S-PER], [וגור...
5730    [[מלכתחילה, ADV, O], [היה, AUX, O], [ברור, VER...
dtype: object

In [37]:
write_ncrf_file(token_gold_dev_fix_sents, 'token_gold_dev_fix')
write_ncrf_file(token_gold_train_fix_sents, 'token_gold_train_fix')
write_ncrf_file(token_gold_test_fix_sents, 'token_gold_test_fix')

In [42]:
write_ncrf_file(token_gold_dev_fix_sents, 'token_gold_dev_fix', bioul=True)
write_ncrf_file(token_gold_train_fix_sents, 'token_gold_train_fix', bioul=True)
write_ncrf_file(token_gold_test_fix_sents, 'token_gold_test_fix', bioul=True)

In [61]:
write_ncrf_file(token_gold_dev_fix_sents, 'token_gold_dev_fix', dummy_o=True)
write_ncrf_file(token_gold_train_fix_sents, 'token_gold_train_fix', dummy_o=True)
write_ncrf_file(token_gold_test_fix_sents, 'token_gold_test_fix', dummy_o=True)

In [62]:
write_ncrf_file(token_gold_dev_fix_sents, 'token_gold_dev_fix', dummy_o=True, bioul=True)
write_ncrf_file(token_gold_train_fix_sents, 'token_gold_train_fix', dummy_o=True, bioul=True)
write_ncrf_file(token_gold_test_fix_sents, 'token_gold_test_fix', dummy_o=True, bioul=True)

## TODO
1. package
1. other layers files

In [22]:
import os
out_folder = 'data/ud_ner/nested'

def write_nested_file(sents, file_name):
    suffix = '.bmes'
    with open(os.path.join(out_folder, file_name+suffix), 'w', encoding='utf8') as of:
        for sent in sents:
            for fields in sent: 
                line = ' '.join(fields)
                of.write(line+'\n')
            of.write('\n')
                    
        

In [24]:
fields = ['form', 'biose_layer0', 'biose_layer1', 'biose_layer2', 'biose_layer3']
a_gold_dev_sents =   bclm.get_sentences_list(ud_dev, fields=fields)
a_gold_train_sents = bclm.get_sentences_list(ud_train, fields=fields)
a_gold_test_sents =  bclm.get_sentences_list(ud_test, fields=fields)
a_gold_test_sents.head()

sent_id
5726    [[הולקומב, S-PER, O, O, O], [לא, O, O, O, O], ...
5727    [[ב, O, O, O, O], [ה_, O, O, O, O], [דקה, O, O...
5728    [[יציאה_, O, O, O, O], [_של_, O, O, O, O], [_ה...
5729    [[אולם, O, O, O, O], [קמבל, S-PER, O, O, O], [...
5730    [[מלכתחילה, O, O, O, O], [היה, O, O, O, O], [ב...
dtype: object

In [25]:
write_nested_file(a_gold_dev_sents,   'morph_gold_dev')
write_nested_file(a_gold_train_sents, 'morph_gold_train')
write_nested_file(a_gold_test_sents,  'morph_gold_test')

## Concat BIOSE Sentences

In [26]:
a_biose_concat = bclm.get_token_df(uddf, fields = ['biose_layer0', 'biose_layer1', 'biose_layer2', 'biose_layer3'])

In [27]:
a_token_gold_dev_concat  =  a_biose_concat[a_biose_concat.set=='dev']
a_token_gold_train_concat = a_biose_concat[a_biose_concat.set=='train']
a_token_gold_test_concat =  a_biose_concat[a_biose_concat.set=='test']

In [28]:
fields = ['token_str', 'biose_layer0', 'biose_layer1', 'biose_layer2', 'biose_layer3']
a_token_gold_dev_concat_sents    = bclm.get_sentences_list(a_token_gold_dev_concat  , fields=fields)
a_token_gold_train_concat_sents  = bclm.get_sentences_list(a_token_gold_train_concat, fields=fields)
a_token_gold_test_concat_sents   = bclm.get_sentences_list(a_token_gold_test_concat , fields=fields)
a_token_gold_test_concat_sents.head()

sent_id
5502    [[הכל, O, O, O, O], [נושאים, O, O, O, O], [עמם...
5503    [[אומר, O, O, O, O], [מזכיר, O, O, O, O], [התק...
5504    [[לא, O, O, O, O], [ייתכן, O, O, O, O], [שעולה...
5505    [[לא, O, O, O, O], [ייתכן, O, O, O, O], [שהוא,...
5506    [[לכן, O, O, O, O], [קבענו, O, O, O, O], [עיקר...
dtype: object

In [29]:
write_nested_file(a_token_gold_dev_concat_sents,   'token-multi_gold_dev')
write_nested_file(a_token_gold_train_concat_sents, 'token-multi_gold_train')
write_nested_file(a_token_gold_test_concat_sents,  'token-multi_gold_test')

## Fix BIOSE Sentences

In [30]:
a_biose_fix = bclm.get_token_df(uddf, fields = ['upostag'], biose=['biose_layer0', 'biose_layer1', 'biose_layer2', 'biose_layer3'])

In [31]:
a_biose_fix.head()

Unnamed: 0,sent_id,token_id,token_str,biose_layer0,biose_layer1,biose_layer2,biose_layer3,upostag,set
0,1,1,עשרות,O,O,O,O,NUM,dev
1,1,2,אנשים,O,O,O,O,NOUN,dev
2,1,3,מגיעים,O,O,O,O,VERB,dev
3,1,4,מתאילנד,S-GPE,O,O,O,ADP^PROPN,dev
4,1,5,לישראל,S-GPE,O,O,O,ADP^PROPN,dev


In [32]:
a_token_gold_dev_fix  =  a_biose_fix[a_biose_fix.set=='dev']
a_token_gold_train_fix = a_biose_fix[a_biose_fix.set=='train']
a_token_gold_test_fix =  a_biose_fix[a_biose_fix.set=='test']

In [33]:
fields = ['token_str', 'biose_layer0', 'biose_layer1', 'biose_layer2', 'biose_layer3']
a_token_gold_dev_fix_sents    = bclm.get_sentences_list(a_token_gold_dev_fix  , fields=fields)
a_token_gold_train_fix_sents  = bclm.get_sentences_list(a_token_gold_train_fix, fields=fields)
a_token_gold_test_fix_sents   = bclm.get_sentences_list(a_token_gold_test_fix , fields=fields)
a_token_gold_test_fix_sents.head()

sent_id
5502    [[הכל, O, O, O, O], [נושאים, O, O, O, O], [עמם...
5503    [[אומר, O, O, O, O], [מזכיר, O, O, O, O], [התק...
5504    [[לא, O, O, O, O], [ייתכן, O, O, O, O], [שעולה...
5505    [[לא, O, O, O, O], [ייתכן, O, O, O, O], [שהוא,...
5506    [[לכן, O, O, O, O], [קבענו, O, O, O, O], [עיקר...
dtype: object

In [34]:
write_nested_file(a_token_gold_dev_fix_sents,   'token-single_gold_dev')
write_nested_file(a_token_gold_train_fix_sents, 'token-single_gold_train')
write_nested_file(a_token_gold_test_fix_sents,  'token-single_gold_test')