In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%matplotlib inline

In [3]:
import pandas as pd
import numpy as np

In [45]:
import bclm

In [5]:
# sent_id values of interest; a later cell looks each one up in `spdf`
# to obtain the matching global_sent_id.
dropped = [5438, 5444, 5445, 5446, 5448, 5449, 5450, 5451, 5453, 5459]
# 'spmrl' frame as returned by bclm (schema not shown here).
spdf = bclm.read_dataframe('spmrl')

In [6]:
# Map each dropped sent_id to the global_sent_id of its first row in spdf.
# `.iat[0]` raises IndexError if an id has no rows in spdf.
global_dropped = [spdf[spdf.sent_id==d].global_sent_id.iat[0] for d in dropped]
global_dropped

[5438, 5444, 5445, 5446, 5448, 5449, 5450, 5451, 5453, 5459]

In [7]:
uddf = bclm.read_dataframe('ud')

In [8]:
from collections import OrderedDict

In [9]:
ud_sents = list(uddf.sent_id.unique())

## Matching sentences (dev, train, test)

In [10]:
import bz2
import re

# Matches a sentence-id comment line such as "# sent_id = 42".
sent_id_re = re.compile(r'# sent_id = (\d+)')


def get_sent_ids_lat(path):
    """Return the sentence ids declared in a bz2-compressed text file.

    The file is read line by line; every comment line of the form
    ``# sent_id = <int>`` contributes exactly one id, in file order.
    Other comment lines (e.g. ``# text = ...``) are ignored.

    Parameters
    ----------
    path : str or path-like
        Path to a bz2-compressed, UTF-8 encoded file.

    Returns
    -------
    list of int
        One entry per ``# sent_id`` line.
    """
    sent_ids = []
    with bz2.open(path, 'rt', encoding='utf-8') as f:
        for line in f:
            if not line.startswith('#'):
                continue
            sid = sent_id_re.match(line)
            # Only record an id when this comment line actually declares one;
            # appending on every comment line duplicated the previous id.
            if sid:
                sent_ids.append(int(sid.group(1)))
    return sent_ids

# Sentence ids declared in each split file of the UL_Hebrew-HTB lattices.
# These three lists are read as globals by get_set_from_sent_id below.
dev_sent_ids = get_sent_ids_lat('../UL_Hebrew-HTB/he_htb-ud-dev.heblex.conllul.bz2')
train_sent_ids = get_sent_ids_lat('../UL_Hebrew-HTB/he_htb-ud-train.heblex.conllul.bz2')
test_sent_ids = get_sent_ids_lat('../UL_Hebrew-HTB/he_htb-ud-test.heblex.conllul.bz2')

            

In [11]:
# Show the lowest and highest sentence id found in each split file.
print(f'dev: {min(dev_sent_ids)}-{max(dev_sent_ids)}')
print(f'train: {min(train_sent_ids)}-{max(train_sent_ids)}')
print(f'test: {min(test_sent_ids)}-{max(test_sent_ids)}')

dev: 1-484
train: 485-5725
test: 5726-6216


In [12]:
uddf.groupby(['set']).sent_id.agg(['min', 'max'])

Unnamed: 0_level_0,min,max
set,Unnamed: 1_level_1,Unnamed: 2_level_1
dev,1,500
test,5501,6216
train,501,5500


In [13]:
def get_set_from_sent_id(gsi):
    """Return the split whose sentence-id range contains ``gsi``.

    The ranges are the min..max (inclusive) of the notebook-level lists
    ``dev_sent_ids``, ``train_sent_ids`` and ``test_sent_ids``, checked in
    that order. Returns ``None`` when ``gsi`` falls in none of them.
    """
    if min(dev_sent_ids) <= gsi <= max(dev_sent_ids):
        return 'dev'
    if min(train_sent_ids) <= gsi <= max(train_sent_ids):
        return 'train'
    if min(test_sent_ids) <= gsi <= max(test_sent_ids):
        return 'test'
    return None
    
# Label every row with the split implied by the lattice-file id ranges.
# This is kept separate from the existing `set` column, whose ranges differ
# (compare the two groupby outputs).
uddf['ud_set'] = uddf.sent_id.apply(get_set_from_sent_id)

# Sanity check: min/max sent_id per derived split.
uddf.groupby(['ud_set']).sent_id.agg(['min', 'max'])

Unnamed: 0_level_0,min,max
ud_set,Unnamed: 1_level_1,Unnamed: 2_level_1
dev,1,484
test,5726,6216
train,485,5725


In [14]:
import re
def get_biose(df):
    """Convert the ``ner`` column of ``df`` into a flat list of BIOSE tags.

    Input labels are interpreted as follows:

    * ``'_'``           -> ``'O'``
    * ``'TYPE'``        -> ``'S-TYPE'`` (no ``[`` and no ``_`` in the label)
    * ``'TYPE[n]...'``  -> part of a multi-token span; consecutive rows with
      the *identical* label string belong to the same span. Only the first
      type in the label is used.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a string column ``ner``.

    Returns
    -------
    list of str
        One tag per row of ``df``, in row order.

    Raises
    ------
    ValueError
        If a label is neither ``'_'``, a plain type, nor ``TYPE[n]...``.
    """
    span_re = re.compile(r'^(?P<type>[A-Z]+)\[(?P<num>\d+)\].*$')

    ner_bio = df.ner.copy()
    # Plain labels (no span id, no empty marker) are single-token entities.
    ner_bio[~ner_bio.str.contains(r'\[|_')] = 'S-' + ner_bio
    ner_bio = ner_bio.replace('_', 'O')

    # Forward pass: first row of a span gets B-, repeats of the same label I-.
    prev = None
    forward = []
    for x in ner_bio.tolist():
        # Test for the full 'S-' prefix so a bracketed label whose type merely
        # starts with "S" is still handled as a span.
        if x.startswith('S-') or x == 'O':
            forward.append(x)
        else:
            m = span_re.match(x)
            if m is None:
                raise ValueError(f'unrecognised NER annotation: {x!r}')
            prefix = 'I-' if x == prev else 'B-'
            forward.append(prefix + m.group('type'))
        prev = x

    # Closing pass: an I- tag ends its span whenever the following tag is not
    # an I- (O, B-, S- or end of input). Previously an S- follower was missed.
    # NOTE(review): a span label that occurs on a single row stays 'B-TYPE';
    # confirm whether such rows should become 'S-TYPE'.
    biose = list(forward)
    last = len(forward) - 1
    for i, tag in enumerate(forward):
        if not tag.startswith('I-'):
            continue
        following = forward[i + 1] if i < last else 'O'
        if not following.startswith('I-'):
            biose[i] = 'E-' + tag[2:]

    return biose

In [15]:
def read_tsv3(path, add_biose=True):
    """Load a tab-separated annotation export into a DataFrame.

    The first 10 lines are skipped, lines starting with ``#`` and blank lines
    are ignored, and quoting is disabled (``quoting=3``). The 32 columns are
    named positionally. Two derived columns are appended:

    * ``ner_type`` - the part of ``ner`` before the first ``[``
    * ``is_ner``   - ``True`` where ``ner`` is not ``'_'``

    ``add_biose`` is accepted for compatibility but is currently unused.
    """
    token_cols = ['sent_tok_num', 'tok_offset', 'token']
    feat_cols = [
        'FEAT_animacy', 'FEAT_aspect', 'FEAT_case', 'FEAT_definiteness',
        'FEAT_degree', 'FEAT_gender', 'FEAT_mood', 'FEAT_negative', 'FEAT_numType',
        'FEAT_number', 'FEAT_person', 'FEAT_possessive', 'FEAT_pronType',
        'FEAT_reflex', 'FEAT_tense', 'FEAT_transitivity', 'FEAT_value',
        'FEAT_verbForm', 'FEAT_voice',
    ]
    annotation_cols = ['pos', 'pos_coarse', 'ner_id', 'ner', 'lemma', 'surface_form']
    dep_cols = ['dep_type', 'dep_flavor', 'dep_lex_morph_pos', 'dep_arc']
    names = token_cols + feat_cols + annotation_cols + dep_cols

    table = pd.read_csv(path, sep='\t', skiprows=10, header=None, names=names,
                        comment='#', skip_blank_lines=True, quoting=3)
    # Derived columns, added in the same order as before.
    table['ner_type'] = table.ner.str.split('[', expand=True).iloc[:, 0]
    table['is_ner'] = table.ner != '_'
    return table


In [16]:
from ner_transforms import *


## fixes

In [17]:
df = read_tsv3('data/ab_annotations/train_485-600.conllu/dafnaa.tsv')
df.head()

Unnamed: 0,sent_tok_num,tok_offset,token,FEAT_animacy,FEAT_aspect,FEAT_case,FEAT_definiteness,FEAT_degree,FEAT_gender,FEAT_mood,...,ner_id,ner,lemma,surface_form,dep_type,dep_flavor,dep_lex_morph_pos,dep_arc,ner_type,is_ner
0,1-1,0-1,ה,*,*,*,*,*,*,*,...,_,_,ה,הקהל[98],det:def,basic,1-2,,_,False
1,1-2,2-5,קהל,*,*,*,*,*,Masc,*,...,_,_,קהל,הקהל[98],nsubj,basic,1-3,,_,False
2,1-3,6-10,איתר,*,*,*,*,*,Masc,*,...,_,_,איתר,_,root,basic,1-3,,_,False
3,1-4,11-14,סוף,*,*,*,*,*,Masc,*,...,_,_,סוף,_,advmod,basic,1-3,,_,False
4,1-5,15-18,סוף,*,*,*,*,*,Masc,*,...,_,_,סוף,_,fixed,basic,1-4,,_,False


In [18]:
def apply_fixes(name, df):
    """Apply hard-coded manual corrections to ``df`` in place.

    Only the file named ``'train_485-600.conllu'`` has a correction: the row
    whose ``sent_tok_num`` is ``'85-15'`` gets its ``ner`` label overwritten.
    Returns ``None``; ``df`` is mutated.
    """
    if name != 'train_485-600.conllu':
        return
    target_rows = df.sent_tok_num == '85-15'
    df.loc[target_rows, 'ner'] = 'ORG[81]|GPE[82]'

## Read all sentences for annotators A and B

In [19]:
import os
from collections import defaultdict 
# Maps annotator name (file stem) -> list of per-folder frames; after the
# second loop below each value is a single concatenated, sorted frame.
tators = defaultdict(list)
for folder in os.scandir('data/ab_annotations/'):
    # Skip notebook artefacts such as .ipynb_checkpoints.
    if '.ipynb' in folder.name:
        continue
    # Folder names look like '<split>_<first>-<last>.conllu'; take <first>.
    first_sent_id = int(folder.name.split('.')[0].split('_')[1].split('-')[0])
    for file in os.scandir(folder):
        if '.ipynb' in file.name:
            continue
        print (file.path)
        # Annotator name is the file name up to the first dot.
        tat = file.name.split('.')[0]
        df = read_tsv3(file.path)
        apply_fixes(folder.name, df)
        # 'sent_tok_num' is '<sentence>-<token>'; both numbers are local to the file.
        df[['sent_id','id']] = df.sent_tok_num.str.split('-',expand=True).astype(int)
        
        # Shift the file-local sentence number to the corpus-wide sentence id.
        df['sent_id'] = df.sent_id + first_sent_id - 1
        df['file'] = folder.name
        # get_all_layers_biose is not defined in this notebook; presumably it
        # comes from the `ner_transforms` star import and returns one row per
        # (sent_id, id) with biose_layer* columns - TODO confirm.
        layers = get_all_layers_biose(df)
        df = df.merge(layers, how='left', on=['sent_id', 'id'])
        tators[tat].append(df)
        
# os.scandir gives no ordering guarantee, so sort after concatenating.
for tat in tators:
    tators[tat] = pd.concat(tators[tat], sort=False)
    tators[tat] = tators[tat].sort_values(by=['sent_id', 'id'])

data/ab_annotations/dev_1-100.conllu/dafnaa.tsv
data/ab_annotations/dev_1-100.conllu/zefs.tsv
data/ab_annotations/dev_101-200.conllu/dafnaa.tsv
data/ab_annotations/dev_101-200.conllu/zefs.tsv
data/ab_annotations/dev_201-300.conllu/dafnaa.tsv
data/ab_annotations/dev_201-300.conllu/zefs.tsv
data/ab_annotations/dev_301-400.conllu/dafnaa.tsv
data/ab_annotations/dev_301-400.conllu/zefs.tsv
data/ab_annotations/dev_401-484.conllu/dafnaa.tsv
data/ab_annotations/dev_401-484.conllu/zefs.tsv
data/ab_annotations/test_5726-5800.conllu/dafnaa.tsv
data/ab_annotations/test_5726-5800.conllu/zefs.tsv
data/ab_annotations/test_5801-5900.conllu/dafnaa.tsv
data/ab_annotations/test_5801-5900.conllu/zefs.tsv
data/ab_annotations/test_5901-6000.conllu/dafnaa.tsv
data/ab_annotations/test_5901-6000.conllu/zefs.tsv
data/ab_annotations/test_6001-6100.conllu/dafnaa.tsv
data/ab_annotations/test_6001-6100.conllu/zefs.tsv
data/ab_annotations/test_6101-6200.conllu/dafnaa.tsv
data/ab_annotations/test_6101-6200.conllu/zef

In [20]:
tators['dafnaa'].tail().T

Unnamed: 0,403,404,405,406,407
sent_tok_num,16-5,16-6,16-7,16-8,16-9
tok_offset,1640-1642,1643-1645,1646-1647,1648-1653,1654-1655
token,אך,לא,ה,תרופה,.
FEAT_animacy,_,*,*,*,_
FEAT_aspect,_,*,*,*,_
FEAT_case,_,*,*,*,_
FEAT_definiteness,_,*,*,*,_
FEAT_degree,_,*,*,*,_
FEAT_gender,_,*,*,Fem,_
FEAT_mood,_,*,*,*,_


## align with UD curation output files


In [21]:
tators['dafnaa'].shape, tators['zefs'].shape

((161417, 41), (161417, 40))

In [22]:
uddf.shape

(160379, 28)

In [23]:
# Keep only annotation rows whose sentence also exists in uddf, so that both
# annotator frames end up with the same number of rows as uddf.
tators['dafnaa'] = tators['dafnaa'][tators['dafnaa'].sent_id.isin(uddf.sent_id.unique())]
tators['zefs']   = tators['zefs'][tators['zefs'].sent_id.isin(uddf.sent_id.unique())]

tators['dafnaa'].shape, tators['zefs'].shape

((160379, 41), (160379, 40))

In [24]:
# Replace the per-file indexes with a fresh 0..n-1 index; merge_uddf joins
# the frames column-wise with pd.concat(axis=1), which aligns on the index.
tators['dafnaa'] = tators['dafnaa'].reset_index(drop=True)
tators['zefs']   = tators['zefs'].reset_index(drop=True)

In [25]:
uddf.head().T

Unnamed: 0,0,1,2,3,4
sent_id,1,1,1,1,1
id,1,2,3,4,5
form,עשרות,אנשים,מגיעים,מ,תאילנד
lemma,עשרות,איש,הגיע,מ,תאילנד
upostag,NUM,NOUN,VERB,ADP,PROPN
xpostag,NUM,NOUN,VERB,ADP,PROPN
feats,"OrderedDict([('Definite', 'Cons'), ('Gender', ...","OrderedDict([('Gender', 'Masc'), ('Number', 'P...","OrderedDict([('Gender', 'Masc'), ('HebBinyan',...",,
head,2,3,0,5,3
deprel,nummod,nsubj,root,case,obl
deps,,,,,


In [26]:
uddf.columns

Index(['sent_id', 'id', 'form', 'lemma', 'upostag', 'xpostag', 'feats', 'head',
       'deprel', 'deps', 'misc', 'global_sent_id', 'token_id', 'token_str',
       'token_morph_id', 'binyan', 'biose', 'ner', 'ner_layers', 'ner_escaped',
       'duplicate_sent_id', 'very_similar_sent_id', 'biose_layer0',
       'biose_layer1', 'biose_layer2', 'biose_layer3', 'set', 'ud_set'],
      dtype='object')

In [27]:
tators['dafnaa'].columns

Index(['sent_tok_num', 'tok_offset', 'token', 'FEAT_animacy', 'FEAT_aspect',
       'FEAT_case', 'FEAT_definiteness', 'FEAT_degree', 'FEAT_gender',
       'FEAT_mood', 'FEAT_negative', 'FEAT_numType', 'FEAT_number',
       'FEAT_person', 'FEAT_possessive', 'FEAT_pronType', 'FEAT_reflex',
       'FEAT_tense', 'FEAT_transitivity', 'FEAT_value', 'FEAT_verbForm',
       'FEAT_voice', 'pos', 'pos_coarse', 'ner_id', 'ner', 'lemma',
       'surface_form', 'dep_type', 'dep_flavor', 'dep_lex_morph_pos',
       'dep_arc', 'ner_type', 'is_ner', 'sent_id', 'id', 'file',
       'biose_layer0', 'biose_layer1', 'biose_layer2', 'biose_layer3'],
      dtype='object')

In [28]:
def merge_uddf(uddf, df):
    """Attach an annotator's NER columns to the UD token table.

    The two frames are joined column-wise on their index, so they must
    describe the same rows in the same order.

    Parameters
    ----------
    uddf : pandas.DataFrame
        UD table; the columns listed in ``keep_cols`` are taken from it.
    df : pandas.DataFrame
        Annotator table; ``ner``, ``ner_type``, ``is_ner`` and
        ``biose_layer0..2`` are taken from it, plus ``biose_layer3`` when
        that column exists.

    Returns
    -------
    pandas.DataFrame
        ``uddf[keep_cols]`` followed by the annotator columns.

    Raises
    ------
    ValueError
        If the two frames do not share the same index. Without this check
        ``pd.concat(axis=1)`` would silently pad or shift rows.
    """
    keep_cols = ['sent_id', 'id', 'form', 'lemma', 'upostag', 'xpostag', 'feats', 'head',
       'deprel', 'deps', 'misc', 'global_sent_id', 'token_id', 'token_str',
       'token_morph_id', 'binyan', 
       'duplicate_sent_id', 'very_similar_sent_id', 'set', 'ud_set']
    tat_cols = ['ner', 'ner_type', 'is_ner', 'biose_layer0', 'biose_layer1', 'biose_layer2']
    # Not every annotator has a fourth nesting layer.
    if 'biose_layer3' in df.columns:
        tat_cols.append('biose_layer3')
    if not uddf.index.equals(df.index):
        raise ValueError(
            'merge_uddf: uddf and df must have identical indexes '
            f'(got {len(uddf)} and {len(df)} rows)'
        )
    return pd.concat([uddf[keep_cols], df[tat_cols]], axis=1)

# a / b: UD token table combined with each annotator's NER columns
# (a = 'dafnaa', b = 'zefs').
a = merge_uddf(uddf, tators['dafnaa'])
b = merge_uddf(uddf, tators['zefs'])


In [29]:
a.head()

Unnamed: 0,sent_id,id,form,lemma,upostag,xpostag,feats,head,deprel,deps,...,very_similar_sent_id,set,ud_set,ner,ner_type,is_ner,biose_layer0,biose_layer1,biose_layer2,biose_layer3
0,1,1,עשרות,עשרות,NUM,NUM,"OrderedDict([('Definite', 'Cons'), ('Gender', ...",2,nummod,,...,,dev,dev,_,_,False,O,O,,
1,1,2,אנשים,איש,NOUN,NOUN,"OrderedDict([('Gender', 'Masc'), ('Number', 'P...",3,nsubj,,...,,dev,dev,_,_,False,O,O,,
2,1,3,מגיעים,הגיע,VERB,VERB,"OrderedDict([('Gender', 'Masc'), ('HebBinyan',...",0,root,,...,,dev,dev,_,_,False,O,O,,
3,1,4,מ,מ,ADP,ADP,,5,case,,...,,dev,dev,_,_,False,O,O,,
4,1,5,תאילנד,תאילנד,PROPN,PROPN,,3,obl,,...,,dev,dev,GPE,GPE,True,S-GPE,O,,


In [30]:
b.head()

Unnamed: 0,sent_id,id,form,lemma,upostag,xpostag,feats,head,deprel,deps,...,duplicate_sent_id,very_similar_sent_id,set,ud_set,ner,ner_type,is_ner,biose_layer0,biose_layer1,biose_layer2
0,1,1,עשרות,עשרות,NUM,NUM,"OrderedDict([('Definite', 'Cons'), ('Gender', ...",2,nummod,,...,,,dev,dev,_,_,False,O,O,
1,1,2,אנשים,איש,NOUN,NOUN,"OrderedDict([('Gender', 'Masc'), ('Number', 'P...",3,nsubj,,...,,,dev,dev,_,_,False,O,O,
2,1,3,מגיעים,הגיע,VERB,VERB,"OrderedDict([('Gender', 'Masc'), ('HebBinyan',...",0,root,,...,,,dev,dev,_,_,False,O,O,
3,1,4,מ,מ,ADP,ADP,,5,case,,...,,,dev,dev,_,_,False,O,O,
4,1,5,תאילנד,תאילנד,PROPN,PROPN,,3,obl,,...,,,dev,dev,GPE,GPE,True,S-GPE,O,


In [31]:
# b has no biose_layer3 column (see b.head() above); add it as all-'O' so
# both frames expose the same four layer columns.
b['biose_layer3'] = 'O'

In [32]:
# Missing layer values (NaN) mean "no entity on this layer": encode them as 'O'.
a[['biose_layer'+str(x) for x in range(4)]] = a[['biose_layer'+str(x) for x in range(4)]].fillna('O')

In [33]:
# Same NaN -> 'O' fill for the second annotator's four layer columns.
b[['biose_layer'+str(x) for x in range(4)]] = b[['biose_layer'+str(x) for x in range(4)]].fillna('O')

## Create variant files for each

In [93]:
import os
out_folder = 'data/ud_ner/ab_annotations/'

def write_ncrf_file(sents, file_name, keep_pos=False, bioul=False, dummy_o=False, out_dir=None):
    """Write sentences as a one-token-per-line, space-separated tagging file.

    Each token becomes one line (``form [POS-field] [tag]``) and each
    sentence is terminated by an empty line.

    Parameters
    ----------
    sents : iterable
        Sentences; each sentence is an iterable of field sequences. Field 0
        is the form, field 1 the POS, and - only when there are exactly
        three fields - field 2 the tag.
    file_name : str
        Base file name without suffix. ``'_pos'`` and/or ``'_dummy_o'`` are
        appended when the corresponding flags are set.
    keep_pos : bool
        Also write the POS, prefixed with ``'[POS]'``.
    bioul : bool
        Rename ``E-``/``S-`` tag prefixes to ``L-``/``U-`` and use the
        ``.bioul`` suffix instead of ``.bmes``.
    dummy_o : bool
        Replace every tag with ``'O'``.
    out_dir : str or None
        Target directory. ``None`` (default) uses the notebook-level
        ``out_folder`` as it is bound at call time; pass it explicitly to be
        independent of which cell last assigned ``out_folder``.
    """
    if out_dir is None:
        out_dir = out_folder
    if keep_pos:
        file_name += '_pos'
    if dummy_o:
        file_name += '_dummy_o'
    suffix = '.bioul' if bioul else '.bmes'

    # Create the target directory on first use instead of failing on open().
    os.makedirs(out_dir, exist_ok=True)
    with open(os.path.join(out_dir, file_name + suffix), 'w', encoding='utf8') as of:
        for sent in sents:
            for fields in sent:
                parts = [fields[0]]
                if keep_pos:
                    parts.append('[POS]' + fields[1])
                if len(fields) == 3:
                    tag = fields[2]
                    if bioul:
                        tag = tag.replace('E-', 'L-').replace('S-', 'U-')
                    # dummy_o wins over any real or converted tag.
                    if dummy_o:
                        tag = 'O'
                    parts.append(tag)
                of.write(' '.join(parts) + '\n')
            of.write('\n')
                    
        

In [46]:
# Split the first annotator's frame by the derived `ud_set` label
# (not the original `set` column).
a_ud_dev = a[a.ud_set=='dev']
a_ud_train = a[a.ud_set=='train']
a_ud_test = a[a.ud_set=='test']

In [47]:
b_ud_dev = b[b.ud_set=='dev']
b_ud_train = b[b.ud_set=='train']
b_ud_test = b[b.ud_set=='test']

In [96]:
fields = ['form', 'upostag', 'biose_layer0']
a_gold_dev_sents =   bclm.get_sentences_list(a_ud_dev, fields=fields)
a_gold_train_sents = bclm.get_sentences_list(a_ud_train, fields=fields)
a_gold_test_sents =  bclm.get_sentences_list(a_ud_test, fields=fields)
a_gold_test_sents.head()

sent_id
5726    [[הולקומב, PROPN, S-PER], [לא, ADV, O], [הזכיר...
5727    [[ב, ADP, O], [ה_, DET, O], [דקה, NOUN, O], [ה...
5728    [[יציאה_, NOUN, O], [_של_, ADP, O], [_הוא, PRO...
5729    [[אולם, CCONJ, O], [קמבל, PROPN, S-PER], [ו, C...
5730    [[מלכתחילה, ADV, O], [היה, AUX, O], [ברור, VER...
dtype: object

In [99]:
fields = ['form', 'upostag', 'biose_layer0']
b_gold_dev_sents =   bclm.get_sentences_list(b_ud_dev, fields=fields)
b_gold_train_sents = bclm.get_sentences_list(b_ud_train, fields=fields)
b_gold_test_sents =  bclm.get_sentences_list(b_ud_test, fields=fields)
b_gold_test_sents.head()

sent_id
5726    [[הולקומב, PROPN, S-PER], [לא, ADV, O], [הזכיר...
5727    [[ב, ADP, O], [ה_, DET, O], [דקה, NOUN, O], [ה...
5728    [[יציאה_, NOUN, O], [_של_, ADP, O], [_הוא, PRO...
5729    [[אולם, CCONJ, O], [קמבל, PROPN, S-PER], [ו, C...
5730    [[מלכתחילה, ADV, O], [היה, AUX, O], [ברור, VER...
dtype: object

In [100]:
write_ncrf_file(a_gold_dev_sents,   'a_morph_gold_dev')
write_ncrf_file(a_gold_train_sents, 'a_morph_gold_train')
write_ncrf_file(a_gold_test_sents,  'a_morph_gold_test')

In [101]:
write_ncrf_file(b_gold_dev_sents,   'b_morph_gold_dev')
write_ncrf_file(b_gold_train_sents, 'b_morph_gold_train')
write_ncrf_file(b_gold_test_sents,  'b_morph_gold_test')

## Concat BIOSE Sentences

In [102]:
# Overwrite `set` with the derived split so the `.set == ...` filters in the
# following cells select by `ud_set`. This mutates a and b in place.
a['set'] = a['ud_set']
b['set'] = b['ud_set']

# Token-level frames: judging by the output below, values of the rows that
# share a token are joined with '^' (e.g. 'O^S-GPE', 'ADP^PROPN').
# NOTE(review): behaviour inferred from the displayed output; confirm in bclm.
a_biose_concat = bclm.get_token_df(a, fields = ['biose_layer0', 'upostag'])
b_biose_concat = bclm.get_token_df(b, fields = ['biose_layer0', 'upostag'])

b_biose_concat.head()

Unnamed: 0,sent_id,token_id,token_str,biose_layer0,upostag,set
0,1,1,עשרות,O,NUM,dev
1,1,2,אנשים,O,NOUN,dev
2,1,3,מגיעים,O,VERB,dev
3,1,4,מתאילנד,O^S-GPE,ADP^PROPN,dev
4,1,5,לישראל,O^S-GPE,ADP^PROPN,dev


In [104]:
a_token_gold_dev_concat  =  a_biose_concat[a_biose_concat.set=='dev']
a_token_gold_train_concat = a_biose_concat[a_biose_concat.set=='train']
a_token_gold_test_concat =  a_biose_concat[a_biose_concat.set=='test']

In [103]:
b_token_gold_dev_concat  =  b_biose_concat[b_biose_concat.set=='dev']
b_token_gold_train_concat = b_biose_concat[b_biose_concat.set=='train']
b_token_gold_test_concat =  b_biose_concat[b_biose_concat.set=='test']

In [105]:
a_token_gold_dev_concat_sents    = bclm.get_sentences_list(a_token_gold_dev_concat  , fields=['token_str', 'upostag', 'biose_layer0'])
a_token_gold_train_concat_sents  = bclm.get_sentences_list(a_token_gold_train_concat, fields=['token_str', 'upostag', 'biose_layer0'])
a_token_gold_test_concat_sents   = bclm.get_sentences_list(a_token_gold_test_concat , fields=['token_str', 'upostag', 'biose_layer0'])
a_token_gold_test_concat_sents.head()

sent_id
5726    [[הולקומב, PROPN, S-PER], [לא, ADV, O], [הזכיר...
5727    [[בדקה, ADP^DET^NOUN, O^O^O], [ה, DET, O], [-,...
5728    [[יציאתו, NOUN^ADP^PRON, O^O^O], [של, ADP, O],...
5729    [[אולם, CCONJ, O], [קמבל, PROPN, S-PER], [וגור...
5730    [[מלכתחילה, ADV, O], [היה, AUX, O], [ברור, VER...
dtype: object

In [106]:
b_token_gold_dev_concat_sents    = bclm.get_sentences_list(b_token_gold_dev_concat  , fields=['token_str', 'upostag', 'biose_layer0'])
b_token_gold_train_concat_sents  = bclm.get_sentences_list(b_token_gold_train_concat, fields=['token_str', 'upostag', 'biose_layer0'])
b_token_gold_test_concat_sents   = bclm.get_sentences_list(b_token_gold_test_concat , fields=['token_str', 'upostag', 'biose_layer0'])
b_token_gold_test_concat_sents.head()

sent_id
5726    [[הולקומב, PROPN, S-PER], [לא, ADV, O], [הזכיר...
5727    [[בדקה, ADP^DET^NOUN, O^O^O], [ה, DET, O], [-,...
5728    [[יציאתו, NOUN^ADP^PRON, O^O^O], [של, ADP, O],...
5729    [[אולם, CCONJ, O], [קמבל, PROPN, S-PER], [וגור...
5730    [[מלכתחילה, ADV, O], [היה, AUX, O], [ברור, VER...
dtype: object

In [107]:
write_ncrf_file(a_token_gold_dev_concat_sents,   'a_token_gold_dev_concat')
write_ncrf_file(a_token_gold_train_concat_sents, 'a_token_gold_train_concat')
write_ncrf_file(a_token_gold_test_concat_sents,  'a_token_gold_test_concat')

In [108]:
write_ncrf_file(b_token_gold_dev_concat_sents,   'b_token_gold_dev_concat')
write_ncrf_file(b_token_gold_train_concat_sents, 'b_token_gold_train_concat')
write_ncrf_file(b_token_gold_test_concat_sents,  'b_token_gold_test_concat')

## Fix BIOSE Sentences

In [110]:
# Same token-level frames, but the tag column is passed via `biose=` instead
# of `fields=`: the output below shows a single tag per token ('S-GPE')
# rather than a '^'-joined sequence, while upostag is still joined.
# NOTE(review): behaviour inferred from the displayed output; confirm in bclm.
a_biose_fix = bclm.get_token_df(a, fields = ['upostag'], biose=['biose_layer0'])
b_biose_fix = bclm.get_token_df(b, fields = ['upostag'], biose=['biose_layer0'])

a_biose_fix.head()

Unnamed: 0,sent_id,token_id,token_str,biose_layer0,upostag,set
0,1,1,עשרות,O,NUM,dev
1,1,2,אנשים,O,NOUN,dev
2,1,3,מגיעים,O,VERB,dev
3,1,4,מתאילנד,S-GPE,ADP^PROPN,dev
4,1,5,לישראל,S-GPE,ADP^PROPN,dev


In [111]:
a_token_gold_dev_fix  =  a_biose_fix[a_biose_fix.set=='dev']
a_token_gold_train_fix = a_biose_fix[a_biose_fix.set=='train']
a_token_gold_test_fix =  a_biose_fix[a_biose_fix.set=='test']

In [112]:
b_token_gold_dev_fix  =  b_biose_fix[b_biose_fix.set=='dev']
b_token_gold_train_fix = b_biose_fix[b_biose_fix.set=='train']
b_token_gold_test_fix =  b_biose_fix[b_biose_fix.set=='test']

In [113]:
a_token_gold_dev_fix_sents    = bclm.get_sentences_list(a_token_gold_dev_fix  , fields=['token_str', 'upostag', 'biose_layer0'])
a_token_gold_train_fix_sents  = bclm.get_sentences_list(a_token_gold_train_fix, fields=['token_str', 'upostag', 'biose_layer0'])
a_token_gold_test_fix_sents   = bclm.get_sentences_list(a_token_gold_test_fix , fields=['token_str', 'upostag', 'biose_layer0'])
a_token_gold_test_fix_sents.head()

sent_id
5726    [[הולקומב, PROPN, S-PER], [לא, ADV, O], [הזכיר...
5727    [[בדקה, ADP^DET^NOUN, O], [ה, DET, O], [-, PUN...
5728    [[יציאתו, NOUN^ADP^PRON, O], [של, ADP, O], [לי...
5729    [[אולם, CCONJ, O], [קמבל, PROPN, S-PER], [וגור...
5730    [[מלכתחילה, ADV, O], [היה, AUX, O], [ברור, VER...
dtype: object

In [114]:
b_token_gold_dev_fix_sents    = bclm.get_sentences_list(b_token_gold_dev_fix  , fields=['token_str', 'upostag', 'biose_layer0'])
b_token_gold_train_fix_sents  = bclm.get_sentences_list(b_token_gold_train_fix, fields=['token_str', 'upostag', 'biose_layer0'])
b_token_gold_test_fix_sents   = bclm.get_sentences_list(b_token_gold_test_fix , fields=['token_str', 'upostag', 'biose_layer0'])
b_token_gold_test_fix_sents.head()

sent_id
5726    [[הולקומב, PROPN, S-PER], [לא, ADV, O], [הזכיר...
5727    [[בדקה, ADP^DET^NOUN, O], [ה, DET, O], [-, PUN...
5728    [[יציאתו, NOUN^ADP^PRON, O], [של, ADP, O], [לי...
5729    [[אולם, CCONJ, O], [קמבל, PROPN, S-PER], [וגור...
5730    [[מלכתחילה, ADV, O], [היה, AUX, O], [ברור, VER...
dtype: object

In [115]:
write_ncrf_file(a_token_gold_dev_fix_sents,   'a_token_gold_dev_fix')
write_ncrf_file(a_token_gold_train_fix_sents, 'a_token_gold_train_fix')
write_ncrf_file(a_token_gold_test_fix_sents,  'a_token_gold_test_fix')

In [116]:
write_ncrf_file(b_token_gold_dev_fix_sents,   'b_token_gold_dev_fix')
write_ncrf_file(b_token_gold_train_fix_sents, 'b_token_gold_train_fix')
write_ncrf_file(b_token_gold_test_fix_sents,  'b_token_gold_test_fix')

## TODO
1. package
1. other layers files

In [35]:
import os
out_folder = 'data/ud_ner/ab_annotations/nested'

def write_nested_file(sents, file_name, out_dir=None):
    """Write sentences with all their fields, one token per line.

    Every token's fields are joined with a single space; each sentence is
    followed by an empty line. The file is named ``file_name + '.bmes'``.

    Parameters
    ----------
    sents : iterable
        Sentences; each sentence is an iterable of sequences of strings.
    file_name : str
        Base file name without suffix.
    out_dir : str or None
        Target directory. ``None`` (default) uses the notebook-level
        ``out_folder`` as it is bound at call time; pass it explicitly to be
        independent of which cell last assigned ``out_folder``.
    """
    if out_dir is None:
        out_dir = out_folder
    suffix = '.bmes'
    # Create the target directory on first use instead of failing on open().
    os.makedirs(out_dir, exist_ok=True)
    with open(os.path.join(out_dir, file_name + suffix), 'w', encoding='utf8') as of:
        for sent in sents:
            of.writelines(' '.join(fields) + '\n' for fields in sent)
            of.write('\n')
                    
        

In [48]:
fields = ['form', 'biose_layer0', 'biose_layer1', 'biose_layer2', 'biose_layer3']
a_gold_dev_sents =   bclm.get_sentences_list(a_ud_dev, fields=fields)
a_gold_train_sents = bclm.get_sentences_list(a_ud_train, fields=fields)
a_gold_test_sents =  bclm.get_sentences_list(a_ud_test, fields=fields)
a_gold_test_sents.head()

sent_id
5726    [[הולקומב, S-PER, O, O, O], [לא, O, O, O, O], ...
5727    [[ב, O, O, O, O], [ה_, O, O, O, O], [דקה, O, O...
5728    [[יציאה_, O, O, O, O], [_של_, O, O, O, O], [_ה...
5729    [[אולם, O, O, O, O], [קמבל, S-PER, O, O, O], [...
5730    [[מלכתחילה, O, O, O, O], [היה, O, O, O, O], [ב...
dtype: object

In [49]:
fields = ['form', 'biose_layer0', 'biose_layer1', 'biose_layer2', 'biose_layer3']
b_gold_dev_sents =   bclm.get_sentences_list(b_ud_dev, fields=fields)
b_gold_train_sents = bclm.get_sentences_list(b_ud_train, fields=fields)
b_gold_test_sents =  bclm.get_sentences_list(b_ud_test, fields=fields)
b_gold_test_sents.head()

sent_id
5726    [[הולקומב, S-PER, O, O, O], [לא, O, O, O, O], ...
5727    [[ב, O, O, O, O], [ה_, O, O, O, O], [דקה, O, O...
5728    [[יציאה_, O, O, O, O], [_של_, O, O, O, O], [_ה...
5729    [[אולם, O, O, O, O], [קמבל, S-PER, O, O, O], [...
5730    [[מלכתחילה, O, O, O, O], [היה, O, O, O, O], [ב...
dtype: object

In [51]:
write_nested_file(a_gold_dev_sents,   'a_morph_gold_dev')
write_nested_file(a_gold_train_sents, 'a_morph_gold_train')
write_nested_file(a_gold_test_sents,  'a_morph_gold_test')

In [52]:
write_nested_file(b_gold_dev_sents,   'b_morph_gold_dev')
write_nested_file(b_gold_train_sents, 'b_morph_gold_train')
write_nested_file(b_gold_test_sents,  'b_morph_gold_test')

## Concat BIOSE Sentences

In [34]:
a['set'] = a['ud_set']
b['set'] = b['ud_set']

In [53]:
a_biose_concat = bclm.get_token_df(a, fields = ['biose_layer0', 'biose_layer1', 'biose_layer2', 'biose_layer3'])
b_biose_concat = bclm.get_token_df(b, fields = ['biose_layer0', 'biose_layer1', 'biose_layer2', 'biose_layer3'])

b_biose_concat.head()

Unnamed: 0,sent_id,token_id,token_str,biose_layer0,biose_layer1,biose_layer2,biose_layer3,set
0,1,1,עשרות,O,O,O,O,dev
1,1,2,אנשים,O,O,O,O,dev
2,1,3,מגיעים,O,O,O,O,dev
3,1,4,מתאילנד,O^S-GPE,O^O,O^O,O^O,dev
4,1,5,לישראל,O^S-GPE,O^O,O^O,O^O,dev


In [54]:
a_token_gold_dev_concat  =  a_biose_concat[a_biose_concat.set=='dev']
a_token_gold_train_concat = a_biose_concat[a_biose_concat.set=='train']
a_token_gold_test_concat =  a_biose_concat[a_biose_concat.set=='test']

In [55]:
b_token_gold_dev_concat  =  b_biose_concat[b_biose_concat.set=='dev']
b_token_gold_train_concat = b_biose_concat[b_biose_concat.set=='train']
b_token_gold_test_concat =  b_biose_concat[b_biose_concat.set=='test']

In [56]:
fields = ['token_str', 'biose_layer0', 'biose_layer1', 'biose_layer2', 'biose_layer3']
a_token_gold_dev_concat_sents    = bclm.get_sentences_list(a_token_gold_dev_concat  , fields=fields)
a_token_gold_train_concat_sents  = bclm.get_sentences_list(a_token_gold_train_concat, fields=fields)
a_token_gold_test_concat_sents   = bclm.get_sentences_list(a_token_gold_test_concat , fields=fields)
a_token_gold_test_concat_sents.head()

sent_id
5726    [[הולקומב, S-PER, O, O, O], [לא, O, O, O, O], ...
5727    [[בדקה, O^O^O, O^O^O, O^O^O, O^O^O], [ה, O, O,...
5728    [[יציאתו, O^O^O, O^O^O, O^O^O, O^O^O], [של, O,...
5729    [[אולם, O, O, O, O], [קמבל, S-PER, O, O, O], [...
5730    [[מלכתחילה, O, O, O, O], [היה, O, O, O, O], [ב...
dtype: object

In [57]:
fields = ['token_str', 'biose_layer0', 'biose_layer1', 'biose_layer2', 'biose_layer3']
b_token_gold_dev_concat_sents    = bclm.get_sentences_list(b_token_gold_dev_concat  , fields=fields)
b_token_gold_train_concat_sents  = bclm.get_sentences_list(b_token_gold_train_concat, fields=fields)
b_token_gold_test_concat_sents   = bclm.get_sentences_list(b_token_gold_test_concat , fields=fields)
b_token_gold_test_concat_sents.head()

sent_id
5726    [[הולקומב, S-PER, O, O, O], [לא, O, O, O, O], ...
5727    [[בדקה, O^O^O, O^O^O, O^O^O, O^O^O], [ה, O, O,...
5728    [[יציאתו, O^O^O, O^O^O, O^O^O, O^O^O], [של, O,...
5729    [[אולם, O, O, O, O], [קמבל, S-PER, O, O, O], [...
5730    [[מלכתחילה, O, O, O, O], [היה, O, O, O, O], [ב...
dtype: object

In [58]:
write_nested_file(a_token_gold_dev_concat_sents,   'a_token-multi_gold_dev')
write_nested_file(a_token_gold_train_concat_sents, 'a_token-multi_gold_train')
write_nested_file(a_token_gold_test_concat_sents,  'a_token-multi_gold_test')

In [59]:
write_nested_file(b_token_gold_dev_concat_sents,   'b_token-multi_gold_dev')
write_nested_file(b_token_gold_train_concat_sents, 'b_token-multi_gold_train')
write_nested_file(b_token_gold_test_concat_sents,  'b_token-multi_gold_test')

## Fix BIOSE Sentences

In [48]:
a_biose_fix = bclm.get_token_df(a, fields = ['upostag'], biose=['biose_layer0', 'biose_layer1', 'biose_layer2', 'biose_layer3'])

In [46]:
b_biose_fix = bclm.get_token_df(b, fields = ['upostag'], biose=['biose_layer0', 'biose_layer1', 'biose_layer2', 'biose_layer3'])

In [49]:
b_biose_fix.head()

Unnamed: 0,sent_id,token_id,token_str,biose_layer0,biose_layer1,biose_layer2,biose_layer3,upostag,set
0,1,1,עשרות,O,O,O,O,NUM,dev
1,1,2,אנשים,O,O,O,O,NOUN,dev
2,1,3,מגיעים,O,O,O,O,VERB,dev
3,1,4,מתאילנד,S-GPE,O,O,O,ADP^PROPN,dev
4,1,5,לישראל,S-GPE,O,O,O,ADP^PROPN,dev


In [50]:
a_token_gold_dev_fix  =  a_biose_fix[a_biose_fix.set=='dev']
a_token_gold_train_fix = a_biose_fix[a_biose_fix.set=='train']
a_token_gold_test_fix =  a_biose_fix[a_biose_fix.set=='test']

In [51]:
b_token_gold_dev_fix  =  b_biose_fix[b_biose_fix.set=='dev']
b_token_gold_train_fix = b_biose_fix[b_biose_fix.set=='train']
b_token_gold_test_fix =  b_biose_fix[b_biose_fix.set=='test']

In [52]:
fields = ['token_str', 'biose_layer0', 'biose_layer1', 'biose_layer2', 'biose_layer3']
a_token_gold_dev_fix_sents    = bclm.get_sentences_list(a_token_gold_dev_fix  , fields=fields)
a_token_gold_train_fix_sents  = bclm.get_sentences_list(a_token_gold_train_fix, fields=fields)
a_token_gold_test_fix_sents   = bclm.get_sentences_list(a_token_gold_test_fix , fields=fields)
a_token_gold_test_fix_sents.head()

sent_id
5726    [[הולקומב, S-PER, O, O, O], [לא, O, O, O, O], ...
5727    [[בדקה, O, O, O, O], [ה, O, O, O, O], [-, O, O...
5728    [[יציאתו, O, O, O, O], [של, O, O, O, O], [ליף,...
5729    [[אולם, O, O, O, O], [קמבל, S-PER, O, O, O], [...
5730    [[מלכתחילה, O, O, O, O], [היה, O, O, O, O], [ב...
dtype: object

In [53]:
fields = ['token_str', 'biose_layer0', 'biose_layer1', 'biose_layer2', 'biose_layer3']
b_token_gold_dev_fix_sents    = bclm.get_sentences_list(b_token_gold_dev_fix  , fields=fields)
b_token_gold_train_fix_sents  = bclm.get_sentences_list(b_token_gold_train_fix, fields=fields)
b_token_gold_test_fix_sents   = bclm.get_sentences_list(b_token_gold_test_fix , fields=fields)
b_token_gold_test_fix_sents.head()

sent_id
5726    [[הולקומב, S-PER, O, O, O], [לא, O, O, O, O], ...
5727    [[בדקה, O, O, O, O], [ה, O, O, O, O], [-, O, O...
5728    [[יציאתו, O, O, O, O], [של, O, O, O, O], [ליף,...
5729    [[אולם, O, O, O, O], [קמבל, S-PER, O, O, O], [...
5730    [[מלכתחילה, O, O, O, O], [היה, O, O, O, O], [ב...
dtype: object

In [54]:
write_nested_file(a_token_gold_dev_fix_sents,   'a_token-single_gold_dev')
write_nested_file(a_token_gold_train_fix_sents, 'a_token-single_gold_train')
write_nested_file(a_token_gold_test_fix_sents,  'a_token-single_gold_test')

In [55]:
write_nested_file(b_token_gold_dev_fix_sents,   'b_token-single_gold_dev')
write_nested_file(b_token_gold_train_fix_sents, 'b_token-single_gold_train')
write_nested_file(b_token_gold_test_fix_sents,  'b_token-single_gold_test')