In [2]:
import pandas as pd
from conllu import parse


In [38]:
from collections import OrderedDict

def get_conllu_df(path):
    """Load a CoNLL-U file into a flat DataFrame with one row per token.

    The nested ``feats`` and ``misc`` mappings of every token are expanded
    into ``feats_<key>`` / ``misc_<key>`` columns, and the metadata of the
    enclosing sentence is copied onto each of its token rows.

    Parameters
    ----------
    path : str
        Path of a UTF-8 encoded CoNLL-U file.

    Returns
    -------
    pandas.DataFrame
        One row per token whose ``id`` is not a tuple.
    """
    with open(path, 'r', encoding='utf8') as f:
        sentences = parse(f.read())

    rows = []
    for sentence in sentences:
        for token in sentence:
            # Tokens whose id was parsed into a tuple are skipped
            # (presumably multiword-token ranges / empty nodes).
            if isinstance(token['id'], tuple):
                continue
            row = OrderedDict(token)
            for column in ('feats', 'misc'):
                # pop() with a default tolerates tokens that lack the column
                # entirely (e.g. input with fewer than ten fields) instead of
                # raising KeyError.
                nested = row.pop(column, None)
                if nested is not None:
                    row.update({column + '_' + key: value
                                for key, value in nested.items()})
            row.update(sentence.metadata)
            rows.append(row)
    return pd.DataFrame(rows)

        
# Flatten the CoNLL-U file into one row per token.
spdf = get_conllu_df(path='align/spmrl_fixed.conllu')
    

In [43]:
# Split each tag such as 'B-ORG' into its position prefix and entity type.
# n=1 splits on the first hyphen only, so an entity type that itself contains
# '-' still yields exactly two columns; tags without a hyphen (e.g. 'O') get a
# missing ner_type.
spdf[['biose_only', 'ner_type']] = spdf.misc_biose.str.split('-', n=1, expand=True)

In [45]:
# Frequency of each entity type (rows with a missing ner_type are not counted).
spdf['ner_type'].value_counts()

ORG    6380
PER    3940
GPE    2249
LOC     910
FAC     503
WOA     405
EVE     209
DUC      69
ANG      43
Name: ner_type, dtype: int64

In [44]:
# For every (sentence, token) group, show the comma-joined BIOSE prefixes of
# its rows, in row order.
(
    spdf
    .groupby(['sent_id', 'misc_token_id', 'misc_token_str'])
    .apply(lambda rows: ','.join(rows['biose_only'].tolist()))
)

sent_id  misc_token_id  misc_token_str
1        1              עשרות                 O
         10             אך                    O
         11             למעשה                 O
         12             משמשים                O
         13             עובדים                O
         14             שכירים                O
         15             זולים                 O
         16             .                     O
         2              אנשים                 O
         3              מגיעים                O
         4              מתאילנד             O,S
         5              לישראל              O,S
         6              כשהם                O,O
         7              נרשמים                O
         8              כמתנדבים            O,O
         9              ,                     O
10       1              ישראל                 B
         10             ממלא                  O
         11             מקום                  O
         12             שר                    O
 

In [47]:
# Scratch check: set.pop() removes and returns one element of the set.
x = {1, 2, 3}
x.pop()

1

In [55]:
def get_token_biose(df):
    """Collapse row-level BIOSE tags into a single tag per token group.

    Rows of ``df`` are grouped by ``sent_id``, ``misc_token_id`` and
    ``misc_token_str``; the ``biose_only`` / ``ner_type`` values of each
    group are merged into one tag.

    Merge rules, in priority order:

    * more than one distinct entity type in the group -> ``'O'``
    * any ``S``, or both ``B`` and ``E``              -> ``'S-<type>'``
    * ``B``                                           -> ``'B-<type>'``
    * ``E``                                           -> ``'E-<type>'``
    * ``I``                                           -> ``'I-<type>'``
    * otherwise                                       -> ``'O'``

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``sent_id``, ``misc_token_id``,
        ``misc_token_str``, ``biose_only`` and ``ner_type``.

    Returns
    -------
    pandas.DataFrame
        One row per group, with the three key columns plus ``biose``.
    """
    key_cols = ['sent_id', 'misc_token_id', 'misc_token_str']

    def _single_token_conversion(tok):
        # Merge the tags of all rows belonging to one token group.
        all_bio = tok.biose_only.tolist()
        all_typ = set(tok.ner_type.dropna().tolist())
        if len(all_typ) > 1:
            # Conflicting entity types inside one token: give up on tagging it.
            return 'O'
        if 'S' in all_bio:
            new_bio = 'S'
        elif 'B' in all_bio and 'E' in all_bio:
            # Entity both starts and ends inside this token.
            new_bio = 'S'
        elif 'B' in all_bio:
            new_bio = 'B'
        elif 'E' in all_bio:
            new_bio = 'E'
        elif 'I' in all_bio:
            new_bio = 'I'
        else:
            return 'O'
        return new_bio + '-' + all_typ.pop()

    # Group the frame that was passed in. The previous version read the
    # notebook-level ``spdf`` here and silently ignored ``df``.
    # NOTE(review): groupby sorts the keys; if the id columns are strings the
    # result is in lexicographic order ('1', '10', ..., '2') - confirm whether
    # downstream code needs sentence order.
    token_tags = (
        df
        .groupby(key_cols)[['biose_only', 'ner_type']]
        .apply(_single_token_conversion)
        .reset_index()
        .rename(columns={0: 'biose'})
    )
    return token_tags
    
# One merged BIOSE tag per token group.
tok_ner = get_token_biose(df=spdf)

In [56]:
# Preview the first five token-level rows.
tok_ner.head(5)

Unnamed: 0,sent_id,misc_token_id,misc_token_str,biose
0,1,1,עשרות,O
1,1,10,אך,O
2,1,11,למעשה,O
3,1,12,משמשים,O
4,1,13,עובדים,O


In [58]:
# Preview the first thirty token-level rows.
tok_ner.head(n=30)

Unnamed: 0,sent_id,misc_token_id,misc_token_str,biose
0,1,1,עשרות,O
1,1,10,אך,O
2,1,11,למעשה,O
3,1,12,משמשים,O
4,1,13,עובדים,O
5,1,14,שכירים,O
6,1,15,זולים,O
7,1,16,.,O
8,1,2,אנשים,O
9,1,3,מגיעים,O
