In [1]:
import pandas as pd
from conllu import parse


In [2]:
from collections import OrderedDict

def get_conllu_df(path, remove_duplicates=False, remove_very_similar=False):
    with open(path, 'r', encoding='utf8') as f:
        sp_conllu = parse(f.read())
    fixed = []
    dup_to_remove = set()
    very_sim_to_remove = set()
    for tl in sp_conllu:
        if (remove_duplicates and int(tl.metadata['sent_id']) in dup_to_remove 
            or remove_very_similar and int(tl.metadata['sent_id']) in very_sim_to_remove):
            print ('skipped', tl.metadata['sent_id'])
            continue
        for tok in tl:
            t = OrderedDict(tok)
            if type(t['id']) is not tuple:
                if t['feats'] is not None:
                    t.update({'feats_'+f: v for f, v in t['feats'].items()})
                del(t['feats'])
                if t['misc'] is not None:
                    t.update({'misc_'+f: v for f, v in t['misc'].items()})
                del(t['misc'])
                t.update(tl.metadata)
                fixed.append(t)
            if remove_duplicates:
                dup_to_remove = dup_to_remove | set(eval(tl.metadata['duplicate_sent_id']))
            if remove_very_similar:
                very_sim_to_remove = dup_to_remove | set(eval(tl.metadata['very_similar_sent_id']))

    df = (pd.DataFrame(fixed)
          .assign(sent_id = lambda x: x.sent_id.astype(int))
          .assign(global_sent_id = lambda x: x.global_sent_id.astype(int))
          .assign(misc_token_id = lambda x: x.misc_token_id.astype(int))

         )
    return df

        
spdf = get_conllu_df('align/spmrl_fixed.conllu', remove_duplicates=True)
    

skipped 5438
skipped 5444
skipped 5445
skipped 5446
skipped 5448
skipped 5449
skipped 5450
skipped 5451
skipped 5453
skipped 5459


In [3]:
spdf[['biose_only', 'ner_type']] = spdf.misc_biose.str.split('-', expand=True)

In [4]:
spdf.head().T

Unnamed: 0,0,1,2,3,4
id,1,2,3,4,5
form,עשרות,אנשים,מגיעים,מ,תאילנד
lemma,עשר,איש,הגיע,מ,תאילנד
upostag,CDT,NN,BN,PREPOSITION,NNP
xpostag,CDT,NN,BN,PREPOSITION,NNP
head,2,3,0,3,4
deprel,num,subj,ROOT,prepmod,pobj
deps,,,,,
feats_gen,F,M,M,,
feats_num,P,P,P,,


In [5]:
spdf.groupby('set').sent_id.nunique()

set
dev       500
test      706
train    4937
Name: sent_id, dtype: int64

In [6]:
spdf.dtypes

id                       int64
form                    object
lemma                   object
upostag                 object
xpostag                 object
head                     int64
deprel                  object
deps                    object
feats_gen               object
feats_num               object
misc_biose              object
misc_ner_escaped        object
misc_token_id            int64
misc_token_str          object
sent_id                  int64
global_sent_id           int64
text_from_ud            object
very_similar_sent_id    object
duplicate_sent_id       object
set                     object
feats_per               object
feats_HebBinyan         object
feats_tense             object
feats_suf_gen           object
feats_suf_num           object
feats_suf_per           object
misc_MISC               object
feats_polar             object
biose_only              object
ner_type                object
dtype: object

In [7]:
spdf.ner_type.value_counts()

ORG    6370
PER    3937
GPE    2248
LOC     910
FAC     503
WOA     405
EVE     209
DUC      69
ANG      43
Name: ner_type, dtype: int64

In [8]:
(spdf
          .groupby(['sent_id', 'misc_token_id', 'misc_token_str']).apply(lambda x: ','.join(x.biose_only.tolist())))

sent_id  misc_token_id  misc_token_str
1        1              עשרות                 O
         2              אנשים                 O
         3              מגיעים                O
         4              מתאילנד             O,S
         5              לישראל              O,S
         6              כשהם                O,O
         7              נרשמים                O
         8              כמתנדבים            O,O
         9              ,                     O
         10             אך                    O
         11             למעשה                 O
         12             משמשים                O
         13             עובדים                O
         14             שכירים                O
         15             זולים                 O
         16             .                     O
2        1              תופעה                 O
         2              זו                    O
         3              התבררה                O
         4              אתמול                 O
 

In [9]:
def get_token_biose(df):
    def _single_token_conversion(tok):
        all_bio = tok.biose_only.tolist()
        all_typ = set(tok.ner_type.dropna().tolist())
        if len(all_typ)>1:
            return 'O'
        if 'S' in all_bio:
            new_bio = 'S'
        elif 'B' in all_bio and 'E' in all_bio:
            new_bio = 'S'
        elif 'B' in all_bio:
            new_bio = 'B'
        elif 'E' in all_bio:
            new_bio = 'E'
        elif 'I' in all_bio:
            new_bio = 'I'
        else:
            return 'O'
        return new_bio+'-'+all_typ.pop()
    
    df = (spdf
          .groupby(['sent_id', 'misc_token_id', 'misc_token_str'])
          .apply(_single_token_conversion)
          .reset_index().rename(columns={0:'biose'})
         )
    return df
    
tok_ner = get_token_biose(spdf)

In [10]:
tok_ner.head()

Unnamed: 0,sent_id,misc_token_id,misc_token_str,biose
0,1,1,עשרות,O
1,1,2,אנשים,O
2,1,3,מגיעים,O
3,1,4,מתאילנד,S-GPE
4,1,5,לישראל,S-GPE


In [11]:
tok_ner.head(30)

Unnamed: 0,sent_id,misc_token_id,misc_token_str,biose
0,1,1,עשרות,O
1,1,2,אנשים,O
2,1,3,מגיעים,O
3,1,4,מתאילנד,S-GPE
4,1,5,לישראל,S-GPE
5,1,6,כשהם,O
6,1,7,נרשמים,O
7,1,8,כמתנדבים,O
8,1,9,",",O
9,1,10,אך,O


In [12]:
spdf.columns[spdf.columns.str.startswith('feats')]

Index(['feats_gen', 'feats_num', 'feats_per', 'feats_HebBinyan', 'feats_tense',
       'feats_suf_gen', 'feats_suf_num', 'feats_suf_per', 'feats_polar'],
      dtype='object')

In [13]:
from tqdm import tqdm
# Create and register a new `tqdm` instance with `pandas`
# (can use tqdm_gui, optional kwargs, etc.)
tqdm.pandas()

In [14]:
#**{'upostag': ','.join(x.upostag.tolist())}
features = (spdf
            .groupby(['sent_id', 'misc_token_id', 'misc_token_str'])
            .progress_apply(lambda x: pd.Series({f: '^'.join(x[f].fillna('').tolist()) for f in spdf.columns[spdf.columns.str.startswith('feats')]}
            ))
           )

100%|██████████| 114654/114654 [07:46<00:00, 245.74it/s]


In [15]:
features.head(50)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,feats_gen,feats_num,feats_per,feats_HebBinyan,feats_tense,feats_suf_gen,feats_suf_num,feats_suf_per,feats_polar
sent_id,misc_token_id,misc_token_str,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,1,עשרות,F,P,,,,,,,
1,2,אנשים,M,P,,,,,,,
1,3,מגיעים,M,P,A,HIFIL,,,,,
1,4,מתאילנד,^,^,^,^,^,^,^,^,^
1,5,לישראל,^,^,^,^,^,^,^,^,^
1,6,כשהם,^M,^P,^3,^,^,^,^,^,^
1,7,נרשמים,M,P,A,NIFAL,,,,,
1,8,כמתנדבים,^M,^P,^,^,^,^,^,^,^
1,9,",",,,,,,,,,
1,10,אך,,,,,,,,,


In [16]:
upos_form_lemma_deprel = (spdf
            .groupby(['sent_id', 'misc_token_id', 'misc_token_str'])
            .progress_apply(lambda x: pd.Series({'upostag': '^'.join(x.upostag.tolist()),
                                                 'form': '^'.join(x.form.tolist()),
                                                 'lemma': '^'.join(x.lemma.tolist()),
                                                 'deprel': '^'.join(x.deprel.tolist()),}
            ))
           )

100%|██████████| 114654/114654 [01:22<00:00, 1385.64it/s]


In [17]:
tokens_ner_with_upos_feats = pd.concat([tok_ner.set_index(['sent_id', 'misc_token_id', 'misc_token_str']), upos_form_lemma_deprel, features], axis=1)
tokens_ner_with_upos_feats = (tokens_ner_with_upos_feats
                              .reset_index()
                              .set_index(['sent_id', 'misc_token_id', 'misc_token_str'])
                              .sort_index()
                              .assign(set = lambda x: x.index.get_level_values('sent_id').map(spdf[['sent_id', 'set']].drop_duplicates().set_index('sent_id')['set']))
                             )
tokens_ner_with_upos_feats.head()


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,biose,upostag,form,lemma,deprel,feats_gen,feats_num,feats_per,feats_HebBinyan,feats_tense,feats_suf_gen,feats_suf_num,feats_suf_per,feats_polar,set
sent_id,misc_token_id,misc_token_str,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,1,עשרות,O,CDT,עשרות,עשר,num,F,P,,,,,,,,dev
1,2,אנשים,O,NN,אנשים,איש,subj,M,P,,,,,,,,dev
1,3,מגיעים,O,BN,מגיעים,הגיע,ROOT,M,P,A,HIFIL,,,,,,dev
1,4,מתאילנד,S-GPE,PREPOSITION^NNP,מ^תאילנד,מ^תאילנד,prepmod^pobj,^,^,^,^,^,^,^,^,^,dev
1,5,לישראל,S-GPE,PREPOSITION^NNP,ל^ישראל,ל^ישראל,prepmod^pobj,^,^,^,^,^,^,^,^,^,dev


In [18]:
tokens_ner_with_upos_feats.deprel.value_counts()

punct                    17683
prepmod^pobj              8542
gobj                      6106
subj                      5416
conj                      5159
ROOT                      5037
prepmod                   4245
def^gobj                  4235
amod                      3912
advmod                    3658
pobj                      3600
nn                        2873
def^amod                  2859
obj                       2827
prepmod^def^pobj          2586
def^subj                  1935
rcmod^relcomp             1856
xcomp                     1664
acc                       1604
posspmod                  1432
def^pobj                  1242
ccomp                     1182
hd                        1128
comp                      1100
neg                       1053
det                       1024
dep                        915
cop                        821
appos                      807
relcomp                    792
                         ...  
subj^acc^obj                 1
hd^dep^g

In [32]:
tn = tokens_ner_with_upos_feats

In [22]:
import os
yap_output_dir = 'new_yap_outputs'

In [24]:
import os
import numpy as np

In [25]:
def make_conll_df(path, add_head_stuff=False):
    # CoNLL file is tab delimeted with no quoting
    # quoting=3 is csv.QUOTE_NONE
    df = (pd.read_csv(path, sep='\t', header=None, quoting=3, comment='#',
                names = ['id', 'form', 'lemma', 'upostag', 'xpostag', 'feats', 'head', 'deprel', 'deps', 'misc'])
                # add sentence labels
                .assign(sent = lambda x: (x.id==1).cumsum())
                # replace bad root dependency tags
                .replace({'DEPREL': {'prd': 'ROOT'}})
               )
    
    if add_head_stuff:
        df = df.merge(df[['ID', 'FORM', 'sent', 'UPOS']].rename(index=str, columns={'FORM': 'head_form', 'UPOS': 'head_upos'}).set_index(['sent', 'ID']),
               left_on=['sent', 'HEAD'], right_index=True, how='left')
    return df

def read_lattices(path):
    df = (pd.read_csv(path, sep='\t', header=None, quoting=3, 
                names = ['ID1', 'ID2', 'form', 'lemma', 'upostag', 'xpostag', 'feats', 'misc_token_id'])
                # add sentence labels
                .assign(sent = lambda x: (x.ID1==0).cumsum())
               )
    return df

flatten = lambda l: [item for sublist in l for item in sublist]

def get_feats(s):
    if s!='_' and s is not None and s is not np.nan:
        feats = OrderedDict()
        for f in s.split('|'):
            k,v = f.split('=')
            k='feats_'+k
            if k not in feats:
                feats[k] = v
            else:
                feats[k] = feats[k]+','+v
        return pd.Series(feats)
    else:
        return pd.Series()

def get_yap_output_df(tokens_path, dep_path, map_path):
    tokens = dict(flatten([[(str(j+1)+'_'+str(i+1), tok) for i, tok in enumerate(sent.split('\n'))]
              for j, sent in 
              enumerate(open(os.path.join(yap_output_dir, tokens_path), 'r').read().split('\n\n'))]))
    lattices = read_lattices(map_path)
    dep = make_conll_df(dep_path)
    df = (pd.concat([dep, lattices.misc_token_id], axis=1)
          .assign(sent_tok = lambda x: x.sent.astype(str) + '_' + x.misc_token_id.astype(str))
          .assign(misc_token_str = lambda x: x.sent_tok.map(tokens))
          .drop('sent_tok', axis=1)
          )
    df = pd.concat([df, df.feats.apply(get_feats)], axis=1).drop('feats', axis=1)
    return df


yap_dev = get_yap_output_df('spmrl_dev_tokens.txt', 'new_yap_outputs/spmrl_dev_dep.conll', 'new_yap_outputs/spmrl_dev_map.conll')
yap_test = get_yap_output_df('spmrl_test_tokens.txt', 'new_yap_outputs/spmrl_test_dep.conll', 'new_yap_outputs/spmrl_test_map.conll')


In [26]:
yap_dev.head()

Unnamed: 0,id,form,lemma,upostag,xpostag,head,deprel,deps,misc,sent,misc_token_id,misc_token_str,feats_gen,feats_num,feats_per,feats_tense,feats_suf_gen,feats_suf_num,feats_suf_per
0,1,עשרות,עשר,CDT,CDT,2,num,_,_,1,1,עשרות,F,P,,,,,
1,2,אנשים,איש,NN,NN,3,subj,_,_,1,2,אנשים,M,P,,,,,
2,3,מגיעים,הגיע,BN,BN,14,conj,_,_,1,3,מגיעים,M,P,A,,,,
3,4,מ,מ,PREPOSITION,PREPOSITION,3,comp,_,_,1,4,מתאילנד,,,,,,,
4,5,תאילנד,תאילנד,NNP,NNP,4,pobj,_,_,1,4,מתאילנד,F,S,,,,,


In [27]:
#**{'upostag': ','.join(x.upostag.tolist())}
yap_dev_feats = (yap_dev
            .groupby(['sent', 'misc_token_id', 'misc_token_str'])
            .apply(lambda x: pd.Series({f: '^'.join(x[f].fillna('').tolist()) for f in yap_dev.columns[yap_dev.columns.str.startswith('feats')]}
            ))
           )
yap_dev_tok = (yap_dev
            .groupby(['sent', 'misc_token_id', 'misc_token_str'])
            .apply(lambda x: pd.Series({ 'upostag': '^'.join(x.upostag.fillna('').tolist()),
                                         'form': '^'.join(x.form.fillna('').tolist()),
                                         'lemma': '^'.join(x.lemma.fillna('').tolist()),
                                         'deprel': '^'.join(x.deprel.fillna('').tolist()),}
            ))
           )
yap_dev_tok = pd.concat([yap_dev_tok,yap_dev_feats], axis=1)

In [28]:
yap_test_feats = (yap_test
            .groupby(['sent', 'misc_token_id', 'misc_token_str'])
            .apply(lambda x: pd.Series({f: ','.join(x[f].fillna('').tolist()) for f in yap_test.columns[yap_test.columns.str.startswith('feats')]}
            ))
           )
yap_test_tok = (yap_test
            .groupby(['sent', 'misc_token_id', 'misc_token_str'])
            .apply(lambda x: pd.Series({ 'upostag': '^'.join(x.upostag.fillna('').tolist()),
                                         'form': '^'.join(x.form.fillna('').tolist()),
                                         'lemma': '^'.join(x.lemma.fillna('').tolist()),
                                         'deprel': '^'.join(x.deprel.fillna('').tolist()),}
            ))
           )
yap_test_tok = pd.concat([yap_test_tok, yap_test_feats], axis=1)

In [33]:
dev_sent_id_map = tn[tn.set=='dev'].reset_index().sent_id.drop_duplicates().reset_index().drop('index', axis=1)['sent_id']
dev_sent_id_map.index = dev_sent_id_map.index+1
test_sent_id_map = tn[tn.set=='test'].reset_index().sent_id.drop_duplicates().reset_index().drop('index', axis=1)['sent_id']
test_sent_id_map.index = test_sent_id_map.index+1
yap_dev_tok = (yap_dev_tok
               .reset_index()
               .assign(sent_id = lambda x: x['sent'].map(dev_sent_id_map))
               .set_index(['sent', 'misc_token_id', 'misc_token_str']))
yap_test_tok = (yap_test_tok.reset_index()
               .assign(sent_id = lambda x: x.sent.map(test_sent_id_map))
               .set_index(['sent', 'misc_token_id', 'misc_token_str']))

In [34]:
yap_dev_tok.feats_gen.value_counts()

        3097
M       1800
^M      1112
F        882
^F       722
F,M      247
^^M      245
^        226
^^F      117
^F,M      64
^^         9
^^^F       6
^^^M       3
M^^M       1
Name: feats_gen, dtype: int64

In [36]:
tokens_ner_with_upos_feats.feats_gen.value_counts()

           43119
M          21339
^M         15952
F          11157
^F          9562
^           3936
F,M         3590
^^M         2781
^^F         1629
^F,M        1132
^^           225
^^^M          94
^^^F          78
^^F,M         40
^^^            7
^M^^M          4
^F,M^^M        2
M^^M           2
^^^^M          2
M^^F,M         1
F,M^^M         1
^^^F,M         1
Name: feats_gen, dtype: int64

In [None]:
spdf_fixed = pd.read_csv('align/spdf_fixed.csv.gz', compression='gzip')
spdf_fixed.feats.value_counts()

In [None]:
uddf_fixed = pd.read_csv('align/uddf_fixed.csv.gz', compression='gzip')
uddf_fixed.feats.value_counts()