In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%matplotlib inline

In [3]:
import pandas as pd
import numpy as np

In [4]:
import bclm

In [5]:
dropped = [5438, 5444, 5445, 5446, 5448, 5449, 5450, 5451, 5453, 5459]
spdf = bclm.read_dataframe('spmrl', subset='dev')

In [7]:
uddf = bclm.read_dataframe('ud', subset='dev')

In [8]:
from collections import OrderedDict

In [9]:
ud_sents = list(uddf.sent_id.unique())

## Matching sentences (dev, train, test)

In [10]:
import re
import bz2

# Matches sentence-header comment lines of the form "# sent_id = 12".
sent_id_re = re.compile(r'# sent_id = (\d+)')


def get_sent_ids_lat(path):
    """Return the sentence ids found in a bz2-compressed lattice file.

    The file is read as UTF-8 text. Every line starting with '#' is checked
    against ``sent_id_re``; each match contributes one integer id, in file
    order.

    Parameters
    ----------
    path : str or path-like
        Location of the ``.bz2`` file.

    Returns
    -------
    list of int
        One entry per "# sent_id = N" line.
    """
    sent_ids = []
    with bz2.open(path, 'rt', encoding='utf-8') as f:
        for line in f:
            if not line.startswith('#'):
                continue
            sid = sent_id_re.match(line)
            # Only record lines that really carry a sentence id; other comment
            # lines must not re-append the previously seen id.
            if sid:
                sent_ids.append(int(sid.group(1)))
    return sent_ids

dev_sent_ids = get_sent_ids_lat('../UL_Hebrew-HTB/he_htb-ud-dev.heblex.conllul.bz2')
            

In [11]:
print(f'dev: {min(dev_sent_ids)}-{max(dev_sent_ids)}')

dev: 1-484


In [12]:
uddf.groupby(['set']).sent_id.agg(['min', 'max'])

Unnamed: 0_level_0,min,max
set,Unnamed: 1_level_1,Unnamed: 2_level_1
dev,1,500


In [14]:
def get_set_from_sent_id(gsi):
    """Return 'dev' when ``gsi`` lies within the dev sentence-id range.

    The range is taken from the notebook-level ``dev_sent_ids`` list
    (inclusive on both ends). Ids outside that range yield ``None``.
    """
    lowest, highest = min(dev_sent_ids), max(dev_sent_ids)
    in_dev = lowest <= gsi <= highest
    return 'dev' if in_dev else None
    
uddf['ud_set'] = uddf.sent_id.apply(get_set_from_sent_id)

uddf.groupby(['ud_set']).sent_id.agg(['min', 'max'])

Unnamed: 0_level_0,min,max
ud_set,Unnamed: 1_level_1,Unnamed: 2_level_1
dev,1,484


In [15]:
import re
def get_biose(df):
    """Convert the ``ner`` column of ``df`` into a flat list of BIOSE tags.

    Values of ``df.ner`` are interpreted as follows:

    * ``'_'``                    -> ``'O'``
    * no ``'['`` and no ``'_'``  -> single-token entity, ``'S-<value>'``
    * ``'<TYPE>[<num>]...'``     -> part of a multi-token span; consecutive
      rows with the identical value belong to the same span.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a string column ``ner``.

    Returns
    -------
    list of str
        One tag per row of ``df``, in row order.

    Raises
    ------
    ValueError
        If a value is neither ``'_'``, a plain type, nor ``TYPE[num]``.

    Notes
    -----
    A bracketed value that occupies a single row is emitted as ``B-<TYPE>``
    (not ``S-``), as in the original implementation.
    """
    ner = df.ner
    # Anything without a span id ('[') and without '_' is a one-token entity.
    is_single = ~ner.str.contains(r'\[|_')
    ner_bio = ner.mask(is_single, 'S-' + ner).replace('_', 'O')

    span_re = re.compile(r'^(?P<type>[A-Z]+)\[(?P<num>\d+)\].*$')

    # Forward pass: first row of a span gets B-, repeated rows get I-.
    forward = []
    prev = None
    for x in ner_bio.tolist():
        # Test for the full 'S-' prefix so that bracketed types that merely
        # begin with the letter S are still parsed as spans.
        if x == 'O' or x.startswith('S-'):
            forward.append(x)
        else:
            m = span_re.match(x)
            if m is None:
                raise ValueError('cannot parse ner value: %r' % (x,))
            typ = m.group('type')
            forward.append(('I-' if x == prev else 'B-') + typ)
        prev = x

    # Second pass: an I- tag closes its span (becomes E-) whenever the next
    # tag is not another I-. That covers O, B-, S- and the end of the input.
    biose = list(forward)
    for i, tag in enumerate(forward):
        nxt = forward[i + 1] if i + 1 < len(forward) else 'O'
        if tag.startswith('I-') and not nxt.startswith('I-'):
            biose[i] = 'E-' + tag[2:]

    return biose

In [19]:
def read_tsv3(path, add_biose=True):
    """Load one tab-separated annotation export into a DataFrame.

    The first 10 lines of the file are skipped, '#' starts a comment, blank
    lines are ignored and quoting is disabled (``quoting=3``). The 24 columns
    are named positionally; two derived columns are appended:

    * ``ner_type`` - the part of ``ner`` before the first '['
    * ``is_ner``   - True where ``ner`` is not '_'

    ``add_biose`` is accepted for interface compatibility but is currently
    unused.
    """
    id_cols = ['sent_tok_num', 'tok_offset', 'token']
    feat_cols = [
        'FEAT_gender', 'FEAT_number', 'FEAT_case', 'FEAT_degree', 'FEAT_transitivity',
        'FEAT_tense', 'FEAT_mood', 'FEAT_voice', 'FEAT_definiteness', 'FEAT_value',
        'FEAT_person', 'FEAT_aspect',
    ]
    anno_cols = ['pos', 'ner', 'lemma', 'surface_form']
    dep_cols = ['dep_type', 'dep_flavor', 'dep_lex_morph_pos', 'dep_arc', 'EXTRA']
    names = id_cols + feat_cols + anno_cols + dep_cols

    raw = pd.read_csv(
        path,
        sep='\t',
        skiprows=10,
        header=None,
        names=names,
        comment='#',
        skip_blank_lines=True,
        quoting=3,
    )
    ner_type = raw.ner.str.split('[', expand=True).iloc[:, 0]
    is_ner = raw.ner != '_'
    return raw.assign(ner_type=ner_type, is_ner=is_ner)


In [20]:
from ner_transforms import *


## fixes

In [21]:
df = read_tsv3('data/pilot_annotations/group1/anatb/dev_1-100.tsv')
df.head()

Unnamed: 0,sent_tok_num,tok_offset,token,FEAT_gender,FEAT_number,FEAT_case,FEAT_degree,FEAT_transitivity,FEAT_tense,FEAT_mood,...,ner,lemma,surface_form,dep_type,dep_flavor,dep_lex_morph_pos,dep_arc,EXTRA,ner_type,is_ner
0,1-1,0-5,עשרות,Fem,Plur,*,*,*,*,*,...,_,עשרות,_,nummod,basic,1-2,,,_,False
1,1-2,6-11,אנשים,Masc,Plur,*,*,*,*,*,...,_,איש,_,nsubj,basic,1-3,,,_,False
2,1-3,12-18,מגיעים,Masc,Plur,*,*,*,*,*,...,_,הגיע,_,root,basic,1-3,,,_,False
3,1-4,19-20,מ,_,_,_,_,_,_,_,...,_,מ,מתאילנד[65],case,basic,1-5,,,_,False
4,1-5,21-27,תאילנד,_,_,_,_,_,_,_,...,GPE,תאילנד,מתאילנד[65],obl,basic,1-3,,,GPE,True


## Read all sents for A,B

In [170]:
import os
from collections import defaultdict 
# Load every annotation file under data/pilot_annotations/<group>/<annotator>/.
# Result: groups[group][annotator] -> one DataFrame with all of that
# annotator's files, sorted by (sent_id, id).
groups = {}
for folder in os.scandir('data/pilot_annotations/'):
    # Skip entries whose name contains '.ipynb' (presumably Jupyter checkpoint folders).
    if '.ipynb' in folder.name:
        continue
    group = folder.name
    groups[group] = defaultdict(list)
    for tat_folder in os.scandir(folder):
        if '.ipynb' in tat_folder.name:
            continue
        tat = tat_folder.name
        for file in os.scandir(tat_folder):
            if '.ipynb' in file.name:
                continue
            print (file.path)
            # File names look like 'dev_101-200.tsv'; take the first sentence number (101).
            first_sent_id = int(file.name.split('.')[0].split('_')[1].split('-')[0])
            df = read_tsv3(file.path)
            #apply_fixes(folder.name, df)
            # 'sent_tok_num' is '<sentence>-<token>'; split it into two integer columns.
            df[['sent_id','id']] = df.sent_tok_num.str.split('-',expand=True).astype(int)

            # Sentence numbers restart at 1 in every file; shift them by the file's first id.
            df['sent_id'] = df.sent_id + first_sent_id - 1
            # NOTE(review): this stores the *group* folder name, not the file name - confirm intended.
            df['file'] = folder.name
            # get_all_layers_biose comes from the ner_transforms star import; presumably it
            # returns per-token biose_layerN columns keyed by (sent_id, id) - TODO confirm.
            layers = get_all_layers_biose(df)
            df = df.merge(layers, how='left', on=['sent_id', 'id'])
            groups[group][tat].append(df)
        
# Collapse each annotator's list of per-file frames into a single sorted frame.
for group in groups:
    for tat in groups[group]:
        groups[group][tat] = pd.concat(groups[group][tat], sort=False)
        groups[group][tat] = groups[group][tat].sort_values(by=['sent_id', 'id'])

data/pilot_annotations/group1/anatb/dev_1-100.tsv
data/pilot_annotations/group1/anatb/dev_101-200.tsv
data/pilot_annotations/group1/anatb/dev_201-300.tsv
data/pilot_annotations/group1/anatb/dev_301-400.tsv
data/pilot_annotations/group1/dafnaa/dev_1-100.tsv
data/pilot_annotations/group1/dafnaa/dev_101-200.tsv
data/pilot_annotations/group1/dafnaa/dev_201-300.tsv
data/pilot_annotations/group1/dafnaa/dev_301-400.tsv
data/pilot_annotations/group1/shayp/dev_1-100.tsv
data/pilot_annotations/group1/shayp/dev_101-200.tsv
data/pilot_annotations/group1/shayp/dev_201-300.tsv
data/pilot_annotations/group1/shayp/dev_301-400.tsv
data/pilot_annotations/group1/sinair/dev_1-100.tsv
data/pilot_annotations/group1/sinair/dev_101-200.tsv
data/pilot_annotations/group1/sinair/dev_201-300.tsv
data/pilot_annotations/group1/sinair/dev_301-400.tsv
data/pilot_annotations/group1/tzufa/dev_1-100.tsv
data/pilot_annotations/group1/tzufa/dev_101-200.tsv
data/pilot_annotations/group1/tzufa/dev_201-300.tsv
data/pilot_ann

In [171]:
groups['group1']['anatb'].tail().T

Unnamed: 0,2244,2245,2246,2247,2248
sent_tok_num,100-27,100-28,100-29,100-30,100-31
tok_offset,9555-9559,9560-9564,9565-9566,9567-9570,9571-9572
token,_של_,_הוא,ל,דין,.
FEAT_gender,_,Masc,_,Masc,_
FEAT_number,_,Sing,_,Sing,_
FEAT_case,_,Gen,_,*,_
FEAT_degree,_,*,_,*,_
FEAT_transitivity,_,*,_,*,_
FEAT_tense,_,*,_,*,_
FEAT_mood,_,*,_,*,_


## align with UD curation output files


In [172]:
for group in groups:
    for tat in groups[group]:
        print(group, tat, groups[group][tat].shape)

group1 anatb (9904, 31)
group1 dafnaa (9904, 30)
group1 shayp (9904, 32)
group1 sinair (9904, 31)
group1 tzufa (9904, 30)
group1 vikab (9904, 31)
group2 nuritg (9904, 30)
group2 shovals (9904, 30)
group2 tzipyl (9904, 32)
group2 vereds (9904, 31)
group2 yohayg (9904, 31)
group2 zefs (9904, 31)


In [173]:
uddf = uddf[uddf.sent_id<=400]
uddf.shape

(9908, 28)

In [174]:
sent_diff = groups['group1']['anatb'].groupby('sent_id').size()[groups['group1']['anatb'].groupby('sent_id').size()!=uddf.groupby('sent_id').size()]
sent_diff

sent_id
147     8
228    52
291     9
311    13
dtype: int64

In [175]:
for group, tators in groups.items():
    for tat, annos in tators.items():
        print(tat)
        #print(annos.loc[(annos.sent_id.isin(sent_diff.index)) & (annos.ner!='_'), ['sent_id', 'id', 'token', 'ner']])
        print(annos.loc[(annos.sent_id.isin(sent_diff.index)) & (annos.ner!='_'), :].sent_tok_num.unique())

anatb
['28-13' '28-14' '28-16' '28-17' '28-19' '28-20' '28-22' '28-23' '11-10'
 '11-11' '11-12']
dafnaa
['28-13' '28-14' '28-16' '28-17' '28-19' '28-20' '28-22' '28-23' '11-9'
 '11-10' '11-11' '11-12']
shayp
['28-13' '28-14' '28-16' '28-17' '28-19' '28-20' '28-22' '28-23']
sinair
['28-13' '28-14' '28-16' '28-17' '28-19' '28-20' '28-22' '28-23' '11-9'
 '11-10' '11-11' '11-12']
tzufa
['28-13' '28-14' '28-16' '28-17' '28-19' '28-20' '28-22' '28-23' '11-9'
 '11-10' '11-11' '11-12']
vikab
['28-13' '28-14' '28-16' '28-17' '28-19' '28-20' '28-22' '28-23' '11-9'
 '11-10' '11-11' '11-12']
nuritg
['28-13' '28-14' '28-16' '28-17' '28-19' '28-20' '28-22' '28-23' '11-9'
 '11-10' '11-11' '11-12']
shovals
['28-13' '28-14' '28-16' '28-17' '28-19' '28-20' '28-22' '28-23' '11-9'
 '11-10' '11-11' '11-12']
tzipyl
['28-13' '28-14' '28-16' '28-17' '28-19' '28-20' '28-22' '28-23' '11-9'
 '11-10' '11-11' '11-12']
vereds
['28-13' '28-14' '28-16' '28-17' '28-19' '28-20' '28-22' '28-23' '11-9'
 '11-10' '11-11' '

In [176]:
uddf[uddf.sent_id.isin(sent_diff.index[3:4])]

Unnamed: 0,sent_id,id,form,lemma,upostag,xpostag,feats,head,deprel,deps,...,ner_layers,ner_escaped,duplicate_sent_id,very_similar_sent_id,biose_layer0,biose_layer1,biose_layer2,biose_layer3,set,ud_set
7983,311,1,הוא,הוא,PRON,PRON,"OrderedDict([('Gender', 'Masc'), ('Number', 'S...",4,nsubj,,...,0,_,,,O,O,O,O,dev,dev
7984,311,2,היה,היה,AUX,AUX,"OrderedDict([('Gender', 'Masc'), ('Number', 'S...",4,cop,,...,0,_,,,O,O,O,O,dev,dev
7985,311,3,אפילו,אפילו,ADV,ADV,,4,advmod,,...,0,_,,,O,O,O,O,dev,dev
7986,311,4,מסוגל,מסוגל,AUX,AUX,"OrderedDict([('Gender', 'Masc'), ('Number', 'S...",5,aux,,...,0,_,,,O,O,O,O,dev,dev
7987,311,5,לראותו,ראה,VERB,VERB,"OrderedDict([('HebBinyan', 'PAAL'), ('VerbForm...",0,root,,...,0,_,,,O,O,O,O,dev,dev
7988,311,6,את,את,ADP,ADP,"OrderedDict([('Case', 'Acc')])",7,case,,...,0,_,,,O,O,O,O,dev,dev
7989,311,7,_הוא,הוא,PRON,PRON,"OrderedDict([('Case', 'Acc'), ('Gender', 'Masc...",5,obj,,...,0,_,,,O,O,O,O,dev,dev
7990,311,8,מכהן,כיהן,VERB,VERB,"OrderedDict([('Gender', 'Masc'), ('HebBinyan',...",7,acl,,...,0,_,,,O,O,O,O,dev,dev
7991,311,9,ב,ב,ADP,ADP,,11,case,,...,0,_,,,O,O,O,O,dev,dev
7992,311,10,ה_,ה,DET,DET,"OrderedDict([('PronType', 'Art')])",11,det:def,,...,1,FAC[9],,,B-FAC,O,O,O,dev,dev


In [177]:
groups['group1']['anatb'][groups['group1']['anatb'].sent_id.isin(sent_diff.index[3:4])]

Unnamed: 0,sent_tok_num,tok_offset,token,FEAT_gender,FEAT_number,FEAT_case,FEAT_degree,FEAT_transitivity,FEAT_tense,FEAT_mood,...,dep_lex_morph_pos,dep_arc,EXTRA,ner_type,is_ner,sent_id,id,file,biose_layer0,biose_layer1
325,11-1,1403-1406,הוא,Masc,Sing,*,*,*,*,*,...,11-4,,,_,False,311,1,group1,O,O
326,11-2,1407-1410,היה,Masc,Sing,*,*,*,Past,*,...,11-4,,,_,False,311,2,group1,O,O
327,11-3,1411-1416,אפילו,_,_,_,_,_,_,_,...,11-4,,,_,False,311,3,group1,O,O
328,11-4,1417-1422,מסוגל,Masc,Sing,*,*,*,*,*,...,11-4,,,_,False,311,4,group1,O,O
329,11-5,1423-1429,לראותו,*,*,*,*,*,*,*,...,11-4,,,_,False,311,5,group1,O,O
330,11-6,1430-1434,_הוא,Masc,Sing,Acc,*,*,*,*,...,11-5,,,_,False,311,6,group1,O,O
331,11-7,1435-1439,מכהן,Masc,Sing,*,*,*,*,*,...,11-6,,,_,False,311,7,group1,O,O
332,11-8,1440-1441,ב,_,_,_,_,_,_,_,...,11-10,,,_,False,311,8,group1,O,O
333,11-9,1442-1444,ה_,*,*,*,*,*,*,*,...,11-10,,,_,False,311,9,group1,O,O
334,11-10,1445-1448,בית,Masc,Sing,*,*,*,*,*,...,11-7,,,ORG,True,311,10,group1,B-ORG,O


In [178]:
for group in groups:
    for tat in groups[group]:
         groups[group][tat] =  groups[group][tat].reset_index(drop=True)

In [179]:
uddf.columns

Index(['sent_id', 'id', 'form', 'lemma', 'upostag', 'xpostag', 'feats', 'head',
       'deprel', 'deps', 'misc', 'global_sent_id', 'token_id', 'token_str',
       'token_morph_id', 'binyan', 'biose', 'ner', 'ner_layers', 'ner_escaped',
       'duplicate_sent_id', 'very_similar_sent_id', 'biose_layer0',
       'biose_layer1', 'biose_layer2', 'biose_layer3', 'set', 'ud_set'],
      dtype='object')

In [180]:
uddf = uddf[uddf.sent_id<=400]
uddf.shape

(9908, 28)

In [181]:
keep_cols = ['sent_id', 'id', 'form', 'lemma', 'upostag', 'xpostag', 'feats', 'head',
       'deprel', 'deps', 'misc', 'global_sent_id', 'token_id', 'token_str',
       'token_morph_id', 'binyan', 
       'duplicate_sent_id', 'very_similar_sent_id', 'set', 'ud_set']
def merge_uddf(uddf, df):
    """Place annotator columns next to the UD columns, row by row.

    The two frames are concatenated along the column axis, so rows are paired
    by index label; callers are expected to pass frames with matching indexes.

    Parameters
    ----------
    uddf : pandas.DataFrame
        UD frame; only the fixed list of UD columns below is kept.
    df : pandas.DataFrame
        Annotation frame providing 'ner', 'ner_type', 'is_ner',
        'biose_layer0' and, when present, 'biose_layer1' .. 'biose_layer4'.

    Returns
    -------
    (pandas.DataFrame, list of str)
        The combined frame and the ordered list of its column names.
    """
    keep_cols = ['sent_id', 'id', 'form', 'lemma', 'upostag', 'xpostag', 'feats', 'head',
       'deprel', 'deps', 'misc', 'global_sent_id', 'token_id', 'token_str',
       'token_morph_id', 'binyan', 
       'duplicate_sent_id', 'very_similar_sent_id', 'set', 'ud_set']
    # Layer 0 is mandatory; deeper layers are included only if the annotator has them.
    extra_layers = ['biose_layer' + str(level) for level in range(1, 5)]
    tat_cols = ['ner', 'ner_type', 'is_ner', 'biose_layer0']
    tat_cols += [col for col in extra_layers if col in df.columns]
    combined = pd.concat([uddf[keep_cols], df[tat_cols]], axis=1)
    return combined, keep_cols + tat_cols

In [182]:
uddf_tofix = uddf[(uddf.sent_id.isin(sent_diff.index))][keep_cols]

In [183]:
uddf_tofix.shape

(86, 20)

In [184]:
uddf_ok = uddf[~(uddf.sent_id.isin(sent_diff.index))].reset_index(drop=True)

In [186]:
# Build merged[group][annotator]: the UD dev frame with each annotator's NER columns attached.
# Sentences whose token counts differ between the annotation files and UD (sent_diff)
# cannot be paired row by row, so they are handled separately via a join on the surface form.
merged = {}
for group, tators in groups.items():
    print(group)
    merged[group] = {}
    for tat, annos in tators.items():
        print(tat)
        # Sentences with matching token counts: pair rows positionally with uddf_ok.
        annos_ok = annos[~(annos.sent_id.isin(sent_diff.index))].reset_index(drop=True)
        a, all_cols = merge_uddf(uddf_ok, annos_ok)
        biose_cols = [c for c in all_cols if 'biose_layer' in c]
        a = a[all_cols]
        # Mismatching sentences: keep only the annotated tokens and rename 'token' -> 'form'
        # so they can be joined onto the UD rows.
        annos_tofix = annos[(annos.sent_id.isin(sent_diff.index)) 
                         & (annos.ner!='_')][['sent_id', 'token', 'ner', 'ner_type', 'is_ner'] +biose_cols].rename(columns={'token': 'form'})
        # NOTE(review): the join key is (sent_id, form); a form that is annotated more than once
        # in the same sentence would multiply rows - confirm this cannot happen in the data.
        a_missing = uddf_tofix.merge(annos_tofix, how='left', on=['sent_id', 'form'])[all_cols]
        a = pd.concat([a, a_missing])
        a = a.sort_values(['sent_id', 'id']).reset_index(drop=True)
        # Drop rows that have no UD sentence id (presumably rows produced when the two
        # positionally concatenated frames differ in length - TODO confirm).
        a = a[a.sent_id.notna()]
        # UD tokens left without an annotation get the "no entity" defaults.
        a[['ner', 'ner_type']] = a[['ner', 'ner_type']].fillna('_')
        a['is_ner'] = a['is_ner'].fillna(False)
        # Make sure all four biose layers exist and contain no missing values.
        for i in range(4):
            if 'biose_layer'+str(i) in a.columns:
                a['biose_layer'+str(i)] = a['biose_layer'+str(i)].fillna('O')
            else:
                a['biose_layer'+str(i)] = 'O'
        merged[group][tat] = a
        

group1
anatb
dafnaa
shayp
sinair
tzufa
vikab
group2
nuritg
shovals
tzipyl
vereds
yohayg
zefs


In [187]:
merged['group1']['anatb'].shape

(9908, 27)

## Create variant files for each annotator

In [189]:
# Map each annotator name to a one-letter alias used in the output file names.
# Two annotators have fixed aliases ('a' and 'b'); everyone else gets the next
# free letter starting at 'c', in the iteration order of `groups`.
aliases = {'dafnaa': 'a', 
           'zefs': 'b'}
i = 0
for group in groups:
    for tat in groups[group]:
        if tat not in aliases:
            aliases[tat] = chr(ord('c')+i)
            i+=1
aliases

{'dafnaa': 'a',
 'zefs': 'b',
 'anatb': 'c',
 'shayp': 'd',
 'sinair': 'e',
 'tzufa': 'f',
 'vikab': 'g',
 'nuritg': 'h',
 'shovals': 'i',
 'tzipyl': 'j',
 'vereds': 'k',
 'yohayg': 'l'}

In [190]:
import os
out_folder = 'data/ud_ner/pilot_annotations/'

def write_ncrf_file(sents, file_name, keep_pos=False, bioul=False, dummy_o=False):
    """Write sentences as a space-separated, one-token-per-line tag file.

    Each sentence is written as consecutive lines followed by one empty line.
    A line is ``<form>[ [POS]<pos>][ <tag>]``.

    Parameters
    ----------
    sents : iterable of sentences
        Each sentence is an iterable of rows; a row is a sequence
        ``(form, pos, tag)``. The tag is written only for 3-element rows.
    file_name : str
        Base name (without suffix), joined onto ``out_folder``.
    keep_pos : bool
        Append ``_pos`` to the file name and write the ``[POS]``-prefixed
        POS column (requires rows to have a second element).
    bioul : bool
        Use the ``.bioul`` suffix and rename ``E-``/``S-`` tags to
        ``L-``/``U-``; otherwise the suffix is ``.bmes``.
    dummy_o : bool
        Append ``_dummy_o`` to the file name and write ``O`` for every tag.
    """
    if keep_pos:
        file_name += '_pos'
    if dummy_o:
        file_name += '_dummy_o'
    suffix = '.bioul' if bioul else '.bmes'

    out_path = os.path.join(out_folder, file_name + suffix)
    # Create the target directory on demand so a fresh checkout can run the export.
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(out_path, 'w', encoding='utf8') as of:
        for sent in sents:
            for fields in sent:
                line = fields[0]
                # Only touch the POS column when it is actually written, so
                # rows without a POS element work when keep_pos is False.
                if keep_pos:
                    line += ' ' + '[POS]' + fields[1]
                if len(fields) == 3:
                    if dummy_o:
                        tag = 'O'
                    elif bioul:
                        tag = fields[2].replace('E-', 'L-').replace('S-', 'U-')
                    else:
                        tag = fields[2]
                    line += ' ' + tag
                of.write(line + '\n')
            of.write('\n')
                    
        

In [191]:
import os
nested_out_folder = 'data/ud_ner/pilot_annotations/nested'

def write_nested_file(sents, file_name):
    """Write sentences with all their tag layers to a ``.bmes`` file.

    Every row of a sentence is written as its fields joined by single spaces;
    each sentence is terminated by one empty line. The file is created under
    ``nested_out_folder`` as ``<file_name>.bmes`` (UTF-8).
    """
    target = os.path.join(nested_out_folder, file_name + '.bmes')
    with open(target, 'w', encoding='utf8') as of:
        for sent in sents:
            of.writelines(' '.join(fields) + '\n' for fields in sent)
            of.write('\n')
            

In [192]:
1==1

True

In [193]:
# For every annotator, export six files: {morph, token-multi, token-single} x {flat, nested}.
# Flat files go through write_ncrf_file (first biose layer only); nested files go through
# write_nested_file (all four biose layers). Both writers are called with the same base name
# and write to their own output folder.
for group in merged:
    for name, a in merged[group].items():
        alias = aliases[name]
        
        #morph
        # 'upostag' is passed along but is not written, because keep_pos defaults to False.
        sents = bclm.get_sentences_list(a, fields=['form', 'upostag', 'biose_layer0'])
        write_ncrf_file(sents, f'{group}_{alias}_morph_dev_1-400')
        
        #token-multi
        # NOTE(review): bclm.get_token_df presumably aggregates morpheme rows into token rows,
        # concatenating the listed fields - confirm against the bclm documentation.
        a_biose_concat = bclm.get_token_df(a, fields = ['biose_layer0', 'upostag'])
        sents = bclm.get_sentences_list(a_biose_concat  , fields=['token_str', 'upostag', 'biose_layer0'])
        write_ncrf_file(sents, f'{group}_{alias}_token-multi_dev_1-400')

        #token-single
        # NOTE(review): passing the layer via `biose=` presumably yields a single tag per token
        # instead of a concatenation - confirm against the bclm documentation.
        a_biose_fix = bclm.get_token_df(a, fields = ['upostag'], biose=['biose_layer0'])
        sents = bclm.get_sentences_list(a_biose_fix  , fields=['token_str', 'upostag', 'biose_layer0'])
        write_ncrf_file(sents, f'{group}_{alias}_token-single_dev_1-400')
        
        # === nested ===
        #morph
        fields = ['form', 'biose_layer0', 'biose_layer1', 'biose_layer2', 'biose_layer3']
        sents =   bclm.get_sentences_list(a, fields=fields)
        write_nested_file(sents, f'{group}_{alias}_morph_dev_1-400')
        
        #token-multi
        # fields[0] is the text column; fields[1:] are the four biose layers.
        fields = ['token_str', 'biose_layer0', 'biose_layer1', 'biose_layer2', 'biose_layer3']
        a_biose_concat = bclm.get_token_df(a, fields = fields[1:])
        sents = bclm.get_sentences_list(a_biose_concat  , fields=fields)
        write_nested_file(sents, f'{group}_{alias}_token-multi_dev_1-400')
        
        #token-single
        a_biose_fix = bclm.get_token_df(a, fields = ['upostag'], biose=fields[1:])
        sents = bclm.get_sentences_list(a_biose_fix  , fields=fields)
        write_nested_file(sents, f'{group}_{alias}_token-single_dev_1-400')
