In [1]:
import pandas as pd
from conllu import parse


In [2]:
from collections import OrderedDict

def get_conllu_df(path, remove_duplicates=False, remove_very_similar=False):
    with open(path, 'r', encoding='utf8') as f:
        sp_conllu = parse(f.read())
    fixed = []
    dup_to_remove = set()
    very_sim_to_remove = set()
    for tl in sp_conllu:
        if (remove_duplicates and int(tl.metadata['sent_id']) in dup_to_remove 
            or remove_very_similar and int(tl.metadata['sent_id']) in very_sim_to_remove):
            print ('skipped', tl.metadata['sent_id'])
            continue
        for tok in tl:
            t = OrderedDict(tok)
            if type(t['id']) is not tuple:
                if t['feats'] is not None:
                    t.update({'feats_'+f: v for f, v in t['feats'].items()})
                del(t['feats'])
                if t['misc'] is not None:
                    t.update({'misc_'+f: v for f, v in t['misc'].items()})
                del(t['misc'])
                t.update(tl.metadata)
                fixed.append(t)
            if remove_duplicates:
                dup_to_remove = dup_to_remove | set(eval(tl.metadata['duplicate_sent_id']))
            if remove_very_similar:
                very_sim_to_remove = dup_to_remove | set(eval(tl.metadata['very_similar_sent_id']))

    df = (pd.DataFrame(fixed)
          .assign(sent_id = lambda x: x.sent_id.astype(int))
          .assign(global_sent_id = lambda x: x.global_sent_id.astype(int))
          .assign(misc_token_id = lambda x: x.misc_token_id.astype(int))

         )
    return df

        
spdf = get_conllu_df('align/spmrl_fixed.conllu', remove_duplicates=True)
    

skipped 5438
skipped 5444
skipped 5445
skipped 5446
skipped 5448
skipped 5449
skipped 5450
skipped 5451
skipped 5453
skipped 5459


In [3]:
spdf[['biose_only', 'ner_type']] = spdf.misc_biose.str.split('-', expand=True)

In [4]:
spdf.head().T

Unnamed: 0,0,1,2,3,4
id,1,2,3,4,5
form,עשרות,אנשים,מגיעים,מ,תאילנד
lemma,עשר,איש,הגיע,מ,תאילנד
upostag,CDT,NN,BN,PREPOSITION,NNP
xpostag,CDT,NN,BN,PREPOSITION,NNP
head,2,3,0,3,4
deprel,num,subj,ROOT,prepmod,pobj
deps,,,,,
feats_gen,F,M,M,,
feats_num,P,P,P,,


In [5]:
spdf.groupby('set').sent_id.nunique()

set
dev       500
test      706
train    4937
Name: sent_id, dtype: int64

In [6]:
spdf.dtypes

id                       int64
form                    object
lemma                   object
upostag                 object
xpostag                 object
head                     int64
deprel                  object
deps                    object
feats_gen               object
feats_num               object
misc_biose              object
misc_ner_escaped        object
misc_token_id            int64
misc_token_str          object
sent_id                  int64
global_sent_id           int64
text_from_ud            object
very_similar_sent_id    object
duplicate_sent_id       object
tokens                  object
set                     object
feats_per               object
feats_HebBinyan         object
feats_tense             object
feats_suf_gen           object
feats_suf_num           object
feats_suf_per           object
misc_MISC               object
feats_polar             object
biose_only              object
ner_type                object
dtype: object

In [7]:
spdf.ner_type.value_counts()

ORG    6370
PER    3937
GPE    2248
LOC     910
FAC     503
WOA     405
EVE     209
DUC      69
ANG      43
Name: ner_type, dtype: int64

In [8]:
(spdf
          .groupby(['sent_id', 'misc_token_id', 'misc_token_str']).apply(lambda x: ','.join(x.biose_only.tolist())))

sent_id  misc_token_id  misc_token_str
1        1              עשרות                 O
         2              אנשים                 O
         3              מגיעים                O
         4              מתאילנד             O,S
         5              לישראל              O,S
         6              כשהם                O,O
         7              נרשמים                O
         8              כמתנדבים            O,O
         9              ,                     O
         10             אך                    O
         11             למעשה                 O
         12             משמשים                O
         13             עובדים                O
         14             שכירים                O
         15             זולים                 O
         16             .                     O
2        1              תופעה                 O
         2              זו                    O
         3              התבררה                O
         4              אתמול                 O
 

In [9]:
def get_token_biose(df):
    def _single_token_conversion(tok):
        all_bio = tok.biose_only.tolist()
        all_typ = set(tok.ner_type.dropna().tolist())
        if len(all_typ)>1:
            return 'O'
        if 'S' in all_bio:
            new_bio = 'S'
        elif 'B' in all_bio and 'E' in all_bio:
            new_bio = 'S'
        elif 'B' in all_bio:
            new_bio = 'B'
        elif 'E' in all_bio:
            new_bio = 'E'
        elif 'I' in all_bio:
            new_bio = 'I'
        else:
            return 'O'
        return new_bio+'-'+all_typ.pop()
    
    df = (spdf
          .groupby(['sent_id', 'misc_token_id', 'misc_token_str'])
          .apply(_single_token_conversion)
          .reset_index().rename(columns={0:'biose'})
         )
    return df
    
tok_ner = get_token_biose(spdf)

In [10]:
tok_ner.head()

Unnamed: 0,sent_id,misc_token_id,misc_token_str,biose
0,1,1,עשרות,O
1,1,2,אנשים,O
2,1,3,מגיעים,O
3,1,4,מתאילנד,S-GPE
4,1,5,לישראל,S-GPE


In [11]:
tok_ner.head(30)

Unnamed: 0,sent_id,misc_token_id,misc_token_str,biose
0,1,1,עשרות,O
1,1,2,אנשים,O
2,1,3,מגיעים,O
3,1,4,מתאילנד,S-GPE
4,1,5,לישראל,S-GPE
5,1,6,כשהם,O
6,1,7,נרשמים,O
7,1,8,כמתנדבים,O
8,1,9,",",O
9,1,10,אך,O


In [12]:
spdf.columns[spdf.columns.str.startswith('feats')]

Index(['feats_gen', 'feats_num', 'feats_per', 'feats_HebBinyan', 'feats_tense',
       'feats_suf_gen', 'feats_suf_num', 'feats_suf_per', 'feats_polar'],
      dtype='object')

In [13]:
from tqdm import tqdm
# Create and register a new `tqdm` instance with `pandas`
# (can use tqdm_gui, optional kwargs, etc.)
tqdm.pandas()

In [14]:
#**{'upostag': ','.join(x.upostag.tolist())}
features = (spdf
            .groupby(['sent_id', 'misc_token_id', 'misc_token_str'])
            .progress_apply(lambda x: pd.Series({f: ','.join(x[f].fillna('').tolist()) for f in spdf.columns[spdf.columns.str.startswith('feats')]}
            ))
           )

100%|██████████| 114654/114654 [07:48<00:00, 244.90it/s]


In [15]:
features.head(50)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,feats_gen,feats_num,feats_per,feats_HebBinyan,feats_tense,feats_suf_gen,feats_suf_num,feats_suf_per,feats_polar
sent_id,misc_token_id,misc_token_str,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,1,עשרות,F,P,,,,,,,
1,2,אנשים,M,P,,,,,,,
1,3,מגיעים,M,P,A,HIFIL,,,,,
1,4,מתאילנד,",",",",",",",",",",",",",",",",","
1,5,לישראל,",",",",",",",",",",",",",",",",","
1,6,כשהם,",M",",P",",3",",",",",",",",",",",","
1,7,נרשמים,M,P,A,NIFAL,,,,,
1,8,כמתנדבים,",M",",P",",",",",",",",",",",",",","
1,9,",",,,,,,,,,
1,10,אך,,,,,,,,,


In [16]:
upos_form_lemma_deprel = (spdf
            .groupby(['sent_id', 'misc_token_id', 'misc_token_str'])
            .progress_apply(lambda x: pd.Series({'upostag': ','.join(x.upostag.tolist()),
                                                 'form': ','.join(x.form.tolist()),
                                                 'lemma': ','.join(x.lemma.tolist()),
                                                 'deprel': ','.join(x.deprel.tolist()),}
            ))
           )

100%|██████████| 114654/114654 [01:24<00:00, 1350.86it/s]


In [17]:
tokens_ner_with_upos_feats = pd.concat([tok_ner.set_index(['sent_id', 'misc_token_id', 'misc_token_str']), upos_form_lemma_deprel, features], axis=1)
tokens_ner_with_upos_feats = (tokens_ner_with_upos_feats
                              .reset_index()
                              .set_index(['sent_id', 'misc_token_id', 'misc_token_str'])
                              .sort_index()
                              .assign(set = lambda x: x.index.get_level_values('sent_id').map(spdf[['sent_id', 'set']].drop_duplicates().set_index('sent_id')['set']))
                             )
tokens_ner_with_upos_feats.head()


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,biose,upostag,form,lemma,deprel,feats_gen,feats_num,feats_per,feats_HebBinyan,feats_tense,feats_suf_gen,feats_suf_num,feats_suf_per,feats_polar,set
sent_id,misc_token_id,misc_token_str,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,1,עשרות,O,CDT,עשרות,עשר,num,F,P,,,,,,,,dev
1,2,אנשים,O,NN,אנשים,איש,subj,M,P,,,,,,,,dev
1,3,מגיעים,O,BN,מגיעים,הגיע,ROOT,M,P,A,HIFIL,,,,,,dev
1,4,מתאילנד,S-GPE,"PREPOSITION,NNP","מ,תאילנד","מ,תאילנד","prepmod,pobj",",",",",",",",",",",",",",",",",",",dev
1,5,לישראל,S-GPE,"PREPOSITION,NNP","ל,ישראל","ל,ישראל","prepmod,pobj",",",",",",",",",",",",",",",",",",",dev


In [18]:
tokens_ner_with_upos_feats.deprel.value_counts()

punct                      17683
prepmod,pobj                8542
gobj                        6106
subj                        5416
conj                        5159
ROOT                        5037
prepmod                     4245
def,gobj                    4235
amod                        3912
advmod                      3658
pobj                        3600
nn                          2873
def,amod                    2859
obj                         2827
prepmod,def,pobj            2586
def,subj                    1935
rcmod,relcomp               1856
xcomp                       1664
acc                         1604
posspmod                    1432
def,pobj                    1242
ccomp                       1182
hd                          1128
comp                        1100
neg                         1053
det                         1024
dep                          915
cop                          821
appos                        807
relcomp                      792
          

In [19]:
tn = tokens_ner_with_upos_feats

In [20]:
import os
yap_output_dir = 'new_yap_outputs'

os.mkdir(yap_output_dir)

FileExistsError: [Errno 17] File exists: 'new_yap_outputs'

In [21]:
def get_token_output(s):
    return '\n'.join(s.misc_token_str.tolist())+'\n\n'

def write_tokens_to_file(tok_df, corp_set, output_path):
    with open(output_path, 'w') as of:
        for sent in tok_df[tok_df.set==corp_set].reset_index().groupby('sent_id').apply(get_token_output):
            of.write(sent)


write_tokens_to_file(tn, 'dev', os.path.join(yap_output_dir, 'spmrl_dev_tokens.txt'))
write_tokens_to_file(tn, 'test', os.path.join(yap_output_dir, 'spmrl_test_tokens.txt'))
write_tokens_to_file(tn, 'train', os.path.join(yap_output_dir, 'spmrl_train_tokens.txt'))

In [22]:
yap_path = '/home/dan/yapproj/src/yap/yap'

In [23]:
!export GOPATH=/home/dan/yapproj

In [24]:
!{yap_path}

/home/dan/yapproj/src/yap/yap - invoke yap as a standalone app or as an api server

Commands:

    api         start api server
    dep         runs dependency training/parsing
    hebma       run lexicon-based morphological analyzer on raw input
    joint       runs joint morpho-syntactic training and parsing
    ma          run data-driven morphological analyzer on raw input
    md          runs standalone morphological disambiguation training and parsing

Use "/home/dan/yapproj/src/yap/yap help <command>" for more information about a command.



In [25]:
!{yap_path} hebma -raw new_yap_outputs/spmrl_dev_tokens.txt -out new_yap_outputs/spmrl_dev.lattices

2019/07/16 15:36:33.927689 GOMAXPROCS:	40
2019/07/16 15:36:33.927743 
2019/07/16 15:36:33.927787 Configuration
2019/07/16 15:36:33.927795 Heb Lexicon:		/home/dan/yapproj/src/yap/data/bgulex/bgupreflex_withdef.utf8.hr
2019/07/16 15:36:33.927800 Heb Prefix:		/home/dan/yapproj/src/yap/data/bgulex/bgulex.utf8.hr
2019/07/16 15:36:33.927803 OOV Strategy:	Const:NNP
2019/07/16 15:36:33.927807 xliter8 out:		false
2019/07/16 15:36:33.927810 
2019/07/16 15:36:33.927813 Raw Input:		new_yap_outputs/spmrl_dev_tokens.txt
2019/07/16 15:36:33.927817 Output:		new_yap_outputs/spmrl_dev.lattices
2019/07/16 15:36:33.927821 
2019/07/16 15:36:33.927825 Reading Morphological Analyzer BGU Prefixes
2019/07/16 15:36:33.928370 Found 102 tokens in lexicon file: /home/dan/yapproj/src/yap/data/bgulex/bgupreflex_withdef.utf8.hr
2019/07/16 15:36:33.928398 Loaded 101 prefixes from lexicon
2019/07/16 15:36:33.928401 Reading Morphological Analyzer BGU Lexicon
2019/07/16 15:36:37.798498 Found 514852 tokens in lexicon file

In [26]:
!{yap_path} joint -in new_yap_outputs/spmrl_dev.lattices -os new_yap_outputs/spmrl_dev_seg.conll -om new_yap_outputs/spmrl_dev_map.conll -oc new_yap_outputs/spmrl_dev_dep.conll

2019/07/16 15:36:38.514859 GOMAXPROCS:	40
2019/07/16 15:36:38.514912 
2019/07/16 15:36:38.514968 *** CONFIGURATION ***
2019/07/16 15:36:38.514978 Beam:             	Standard Beam [Not Aligned & Not Averaged]
2019/07/16 15:36:38.515001 Transition System:	Joint Morpho-Syntactic [MD:Morpheme-Based Morphological Disambiguator, ArcSys:Arc Zeager (zpar acl '11) [a.k.a. ArcZEager]] - Strategy: ArcGreedy
2019/07/16 15:36:38.515012 Transition Oracle:	Joint Morpho-Syntactic - Strategy: ArcGreedy
2019/07/16 15:36:38.515018 Iterations:		1
2019/07/16 15:36:38.515022 Beam Size:		64
2019/07/16 15:36:38.515026 Beam Concurrent:	true
2019/07/16 15:36:38.515029 Parameter Func:	Funcs_Main_POS_Both_Prop
2019/07/16 15:36:38.515033 Use Lemmas:		false
2019/07/16 15:36:38.515037 Use POP:		true
2019/07/16 15:36:38.515040 Infuse Gold Dev:	false
2019/07/16 15:36:38.515044 Limit (thousands):	0
2019/07/16 15:36:38.515047 Use CoNLL-U:		false
2019/07/16 15:36:38.515051 
2019/07/16 15:36:38.515054 Features File:	joint

In [27]:
!{yap_path} hebma -raw new_yap_outputs/spmrl_test_tokens.txt -out new_yap_outputs/spmrl_test.lattices

2019/07/16 15:38:34.816859 GOMAXPROCS:	40
2019/07/16 15:38:34.816936 
2019/07/16 15:38:34.816994 Configuration
2019/07/16 15:38:34.817002 Heb Lexicon:		/home/dan/yapproj/src/yap/data/bgulex/bgupreflex_withdef.utf8.hr
2019/07/16 15:38:34.817010 Heb Prefix:		/home/dan/yapproj/src/yap/data/bgulex/bgulex.utf8.hr
2019/07/16 15:38:34.817017 OOV Strategy:	Const:NNP
2019/07/16 15:38:34.817023 xliter8 out:		false
2019/07/16 15:38:34.817029 
2019/07/16 15:38:34.817035 Raw Input:		new_yap_outputs/spmrl_test_tokens.txt
2019/07/16 15:38:34.817042 Output:		new_yap_outputs/spmrl_test.lattices
2019/07/16 15:38:34.817048 
2019/07/16 15:38:34.817054 Reading Morphological Analyzer BGU Prefixes
2019/07/16 15:38:34.818071 Found 102 tokens in lexicon file: /home/dan/yapproj/src/yap/data/bgulex/bgupreflex_withdef.utf8.hr
2019/07/16 15:38:34.818128 Loaded 101 prefixes from lexicon
2019/07/16 15:38:34.818137 Reading Morphological Analyzer BGU Lexicon
2019/07/16 15:38:39.103918 Found 514852 tokens in lexicon fi

In [28]:
!{yap_path} joint -in new_yap_outputs/spmrl_test.lattices -os new_yap_outputs/spmrl_test_seg.conll -om new_yap_outputs/spmrl_test_map.conll -oc new_yap_outputs/spmrl_test_dep.conll

2019/07/16 15:38:39.994705 GOMAXPROCS:	40
2019/07/16 15:38:39.994751 
2019/07/16 15:38:39.994776 *** CONFIGURATION ***
2019/07/16 15:38:39.994784 Beam:             	Standard Beam [Not Aligned & Not Averaged]
2019/07/16 15:38:39.994799 Transition System:	Joint Morpho-Syntactic [MD:Morpheme-Based Morphological Disambiguator, ArcSys:Arc Zeager (zpar acl '11) [a.k.a. ArcZEager]] - Strategy: ArcGreedy
2019/07/16 15:38:39.994806 Transition Oracle:	Joint Morpho-Syntactic - Strategy: ArcGreedy
2019/07/16 15:38:39.994811 Iterations:		1
2019/07/16 15:38:39.994817 Beam Size:		64
2019/07/16 15:38:39.994822 Beam Concurrent:	true
2019/07/16 15:38:39.994827 Parameter Func:	Funcs_Main_POS_Both_Prop
2019/07/16 15:38:39.994832 Use Lemmas:		false
2019/07/16 15:38:39.994838 Use POP:		true
2019/07/16 15:38:39.994843 Infuse Gold Dev:	false
2019/07/16 15:38:39.994848 Limit (thousands):	0
2019/07/16 15:38:39.994852 Use CoNLL-U:		false
2019/07/16 15:38:39.994857 
2019/07/16 15:38:39.994861 Features File:	joint

In [29]:
tn[tn.set=='test'].reset_index().sent_id.unique()

array([5439, 5440, 5441, 5442, 5443, 5447, 5452, 5454, 5455, 5456, 5457,
       5458, 5460, 5461, 5462, 5463, 5464, 5465, 5466, 5467, 5468, 5469,
       5470, 5471, 5472, 5473, 5474, 5475, 5476, 5477, 5478, 5479, 5480,
       5481, 5482, 5483, 5484, 5485, 5486, 5487, 5488, 5489, 5490, 5491,
       5492, 5493, 5494, 5495, 5496, 5497, 5498, 5499, 5500, 5501, 5502,
       5503, 5504, 5505, 5506, 5507, 5508, 5509, 5510, 5511, 5512, 5513,
       5514, 5515, 5516, 5517, 5518, 5519, 5520, 5521, 5522, 5523, 5524,
       5525, 5526, 5527, 5528, 5529, 5530, 5531, 5532, 5533, 5534, 5535,
       5536, 5537, 5538, 5539, 5540, 5541, 5542, 5543, 5544, 5545, 5546,
       5547, 5548, 5549, 5550, 5551, 5552, 5553, 5554, 5555, 5556, 5557,
       5558, 5559, 5560, 5561, 5562, 5563, 5564, 5565, 5566, 5567, 5568,
       5569, 5570, 5571, 5572, 5573, 5574, 5575, 5576, 5577, 5578, 5579,
       5580, 5581, 5582, 5583, 5584, 5585, 5586, 5587, 5588, 5589, 5590,
       5591, 5592, 5593, 5594, 5595, 5596, 5597, 55

In [60]:
def make_conll_df(path, add_head_stuff=False):
    # CoNLL file is tab delimeted with no quoting
    # quoting=3 is csv.QUOTE_NONE
    df = (pd.read_csv(path, sep='\t', header=None, quoting=3, comment='#',
                names = ['id', 'form', 'lemma', 'upostag', 'xpostag', 'feats', 'head', 'deprel', 'deps', 'misc'])
                # add sentence labels
                .assign(sent = lambda x: (x.id==1).cumsum())
                # replace bad root dependency tags
                .replace({'DEPREL': {'prd': 'ROOT'}})
               )
    
    if add_head_stuff:
        df = df.merge(df[['ID', 'FORM', 'sent', 'UPOS']].rename(index=str, columns={'FORM': 'head_form', 'UPOS': 'head_upos'}).set_index(['sent', 'ID']),
               left_on=['sent', 'HEAD'], right_index=True, how='left')
    return df

def read_lattices(path):
    df = (pd.read_csv(path, sep='\t', header=None, quoting=3, 
                names = ['ID1', 'ID2', 'form', 'lemma', 'upostag', 'xpostag', 'feats', 'token_id'])
                # add sentence labels
                .assign(sent = lambda x: (x.ID1==0).cumsum())
               )
    return df

flatten = lambda l: [item for sublist in l for item in sublist]

def get_yap_output_df(tokens_path, dep_path, map_path):
    tokens = dict(flatten([[(str(j+1)+'_'+str(i+1), tok) for i, tok in enumerate(sent.split('\n'))]
              for j, sent in 
              enumerate(open(os.path.join(yap_output_dir, tokens_path), 'r').read().split('\n\n'))]))
    lattices = read_lattices(map_path)
    dep = make_conll_df(dep_path)
    df = (pd.concat([dep, lattices.token_id], axis=1)
          .assign(sent_tok = lambda x: x.sent.astype(str) + '_' + x.token_id.astype(str))
          .assign(token_str = lambda x: x.sent_tok.map(tokens))
          .drop('sent_tok', axis=1))
    return df


yap_dev = get_yap_output_df('spmrl_dev_tokens.txt', 'new_yap_outputs/spmrl_dev_dep.conll', 'new_yap_outputs/spmrl_dev_map.conll')
yap_test = get_yap_output_df('spmrl_test_tokens.txt', 'new_yap_outputs/spmrl_test_dep.conll', 'new_yap_outputs/spmrl_test_map.conll')


In [62]:
yap_dev_tok = (yap_dev
            .groupby(['sent', 'token_id', 'token_str'])
            .apply(lambda x: pd.Series({ 'upostag': ','.join(x.upostag.fillna('').tolist()),
                                         'form': ','.join(x.form.fillna('').tolist()),
                                         'lemma': ','.join(x.lemma.fillna('').tolist()),
                                         'feats': ','.join(x.feats.fillna('').tolist()),
                                         'deprel': ','.join(x.deprel.fillna('').tolist()),}
            ))
           )

In [64]:
yap_test_tok = (yap_test
            .groupby(['sent', 'token_id', 'token_str'])
            .apply(lambda x: pd.Series({ 'upostag': ','.join(x.upostag.fillna('').tolist()),
                                         'form': ','.join(x.form.fillna('').tolist()),
                                         'lemma': ','.join(x.lemma.fillna('').tolist()),
                                         'feats': ','.join(x.feats.fillna('').tolist()),
                                         'deprel': ','.join(x.deprel.fillna('').tolist()),}
            ))
           )

In [53]:
import numpy as np

In [73]:
dev_sent_id_map = tn[tn.set=='dev'].reset_index().sent_id.drop_duplicates().reset_index().drop('index', axis=1)['sent_id']
dev_sent_id_map.index = dev_sent_id_map.index+1
test_sent_id_map = tn[tn.set=='test'].reset_index().sent_id.drop_duplicates().reset_index().drop('index', axis=1)['sent_id']
test_sent_id_map.index = test_sent_id_map.index+1
yap_dev_tok = (yap_dev_tok
               .reset_index()
               .assign(sent_id = lambda x: x['sent'].map(dev_sent_id_map))
               .set_index(['sent', 'token_id', 'token_str']))
yap_test_tok = (yap_test_tok.reset_index()
               .assign(sent_id = lambda x: x.sent.map(test_sent_id_map))
               .set_index(['sent', 'token_id', 'token_str']))

In [74]:
yap_dev_tok

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,upostag,form,lemma,feats,deprel,sent_id
sent,token_id,token_str,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,1,עשרות,CDT,עשרות,עשר,gen=F|num=P,num,1
1,2,אנשים,NN,אנשים,איש,gen=M|num=P,subj,1
1,3,מגיעים,BN,מגיעים,הגיע,gen=M|num=P|per=A,conj,1
1,4,מתאילנד,"PREPOSITION,NNP","מ,תאילנד","מ,תאילנד",",gen=F|num=S","comp,pobj",1
1,5,לישראל,NNP,לישראל,לישראל,gen=F|gen=M|num=S,nn,1
1,6,כשהם,"TEMP,PRP","כש,הם","כש,הוא",",gen=M|num=P|per=3","ccomp,subj",1
1,7,נרשמים,BN,נרשמים,נרשם,gen=M|num=P|per=A,ccomp,1
1,8,כמתנדבים,"PREPOSITION,DEF,NN","כ,ה,מתנדבים","כ,ה,מתנדב",",,gen=M|num=P","prepmod,def,pobj",1
1,9,",",yyCM,",",,,punct,1
1,10,אך,CC,אך,אך,,ROOT,1


In [75]:
spdf_fixed = pd.read_csv('align/spdf_fixed.csv.gz', compression='gzip')
spdf_fixed.feats.value_counts()

_                                                             84836
gen=M|num=S                                                   20013
gen=F|num=S                                                   13239
gen=M|num=P                                                    7805
gen=F|num=P                                                    3754
gen=M|num=S|per=3                                              2498
gen=F|gen=M|num=S                                              1719
gen=F|num=S|per=3                                              1384
gen=M|num=P|per=3                                               934
tense=TOINFINITIVE|HebBinyan=PAAL                               801
gen=M|num=S|per=3|tense=PAST|HebBinyan=PAAL                     777
gen=M|num=S|per=A|HebBinyan=PAAL                                734
tense=TOINFINITIVE|HebBinyan=HIFIL                              675
tense=TOINFINITIVE|HebBinyan=PIEL                               576
gen=M|num=S|suf_gen=M|suf_num=S|suf_per=3       

In [76]:
uddf_fixed = pd.read_csv('align/uddf_fixed.csv.gz', compression='gzip')
uddf_fixed.feats.value_counts()

OrderedDict([('PronType', 'Art')])                                                                                                                                               16431
OrderedDict([('Gender', 'Masc'), ('Number', 'Sing')])                                                                                                                            14701
OrderedDict([('Gender', 'Fem'), ('Number', 'Sing')])                                                                                                                              9957
OrderedDict([('Gender', 'Masc'), ('Number', 'Plur')])                                                                                                                             5736
OrderedDict([('Definite', 'Cons'), ('Gender', 'Masc'), ('Number', 'Sing')])                                                                                                       4051
OrderedDict([('Gender', 'Fem'), ('Number', 'Plur')])                                 