In [1]:
import pickle
import os
from transformers import AutoTokenizer

In [2]:
bert_type = 'distilbert-base-multilingual-cased'

In [3]:
tokenizer = AutoTokenizer.from_pretrained(bert_type)

In [4]:
features = {
    '*': 0,
    'IAV': 1,
    'IRV': 2,
    'LVC.cause': 3,
    'LVC.full': 4,
    'MVC': 5,
    'VID': 6,
    'VPC.full': 7,
    'VPC.semi': 8,
#     '<unlabeled>': 9
}
len(features.keys())

10

In [5]:
'IAV_' in features.keys()

False

In [6]:
features.get('VID', 0)

6

In [7]:
'1:LVC.full;2:LVC.full'.split(';')

['1:LVC.full', '2:LVC.full']

In [8]:
def _extract_labels(feature, current_codes):
    feats = feature.split(';')
    if len(feats) == 1:
        feats = feats[0].split(':')
        if len(feats) == 1:
            label = current_codes.get(feats[0], features['*'])
            label = features.get(label)
        else:
            label = features[feats[1]]
            current_codes[feats[0]] = feats[1]
    elif len(feats) > 1:
        _feats = feats[0].split(':')
        if len(_feats) == 1:
            label = current_codes.get(_feats[0], features['*'])
            label = features.get(label)
        else:
            label = features[_feats[1]]
            current_codes[_feats[0]] = _feats[1]
        for f in feats[1:]:
            n = f.split(':')
            if len(n) > 1:
                current_codes[n[0]] = n[1]
    return label, current_codes

In [9]:
_extract_labels('1:VID', {})

(6, {'1': 'VID'})

In [10]:
_extract_labels('1:LVC.full;2:LVC.full', {})

(4, {'1': 'LVC.full', '2': 'LVC.full'})

In [11]:
_extract_labels('1;2:LVC.full', {'1': 'LVC.full'})

(4, {'1': 'LVC.full', '2': 'LVC.full'})

In [12]:
_extract_labels('1:LVC.full', {'1': 'LVC.full'})

(4, {'1': 'LVC.full'})

In [13]:
_extract_labels('1;2', {'1': 'LVC.full'})

(4, {'1': 'LVC.full'})

In [14]:
_extract_labels('2:LVC.full', {'1': 'LVC.full'})

(4, {'1': 'LVC.full', '2': 'LVC.full'})

In [15]:
_extract_labels('1', {'1': 'LVC.full'})

(4, {'1': 'LVC.full'})

In [16]:
_extract_labels('2;3:VID', {'1': 'LVC.full', '2': 'VID'}) 

(6, {'1': 'LVC.full', '2': 'VID', '3': 'VID'})

In [17]:
_extract_labels('*', {'1': 'LVC.full', '2': 'VID'}) 

(None, {'1': 'LVC.full', '2': 'VID'})

In [17]:
def load_and_tokenize_dataset(train_files, tokenizer, train=True, binary=True):

    if len(train_files) == 0:
        files = []
        for root, _, files in os.walk('data/'):
            for _file in files:
                if train:
                    if _file == 'train.cupt':
                        files.append(os.path.join(root, _file))
                else:
                    if _file == 'dev.cupt':
                        files.append(os.path.join(root, _file))

    else:
        files = train_files

    for _file in files:
        print(_file)
    cls = tokenizer.encode('[CLS]')[1]
    sep = tokenizer.encode('[SEP]')[1]

    sentences, labels = [], []
    for _file in files:
        with open(_file) as text:
            tmp_line = []
            tmp_label = []
            current_codes = {}
            for i, line in enumerate(text):
#                 print(line)
                if line == '\n':
                    sentences.append([cls] + tmp_line + [sep])
                    labels.append([0] + tmp_label + [0])
                    tmp_line = []
                    tmp_label = []
                    current_codes = {}
                elif not line.startswith('#'):
                    feats = line.split()
                    if not '-' in feats[0]:                        
                        _label = [0] if feats[10] == '*' else ([1] if binary else _extract_labels(feats[10], current_codes))
                        if len(_label) > 1:
                            _label, current_codes = _label
                        else:
                            _label = _label[0]
                        tokens = tokenizer.encode(feats[1])
                        tokens = tokens[1:-1]
                        _label = [_label] * len(tokens)
                        tmp_line += tokens
                        tmp_label += _label
    return sentences, labels

In [18]:
code = 'DE'
x_train, y_train = load_and_tokenize_dataset(['data/{}/train.cupt'.format(code)], tokenizer, binary=False)
x_dev, y_dev = load_and_tokenize_dataset(['data/{}/dev.cupt'.format(code)], tokenizer, binary=False)
de = {
    'x_train': x_train, 
    'y_train': y_train,
    'x_dev': x_dev, 
    'y_dev': y_dev
}

data/DE/train.cupt
data/DE/dev.cupt


In [19]:
code = 'GA'
x_train, y_train = load_and_tokenize_dataset(['data/{}/train.cupt'.format(code)], tokenizer, binary=False)
x_dev, y_dev = load_and_tokenize_dataset(['data/{}/dev.cupt'.format(code)], tokenizer, binary=False)
ga = {
    'x_train': x_train, 
    'y_train': y_train,
    'x_dev': x_dev, 
    'y_dev': y_dev
}

data/GA/train.cupt
data/GA/dev.cupt


In [20]:
code = 'HI'
x_train, y_train = load_and_tokenize_dataset(['data/{}/train.cupt'.format(code)], tokenizer, binary=False)
x_dev, y_dev = load_and_tokenize_dataset(['data/{}/dev.cupt'.format(code)], tokenizer, binary=False)
hi = {
    'x_train': x_train, 
    'y_train': y_train,
    'x_dev': x_dev, 
    'y_dev': y_dev
}

data/HI/train.cupt
data/HI/dev.cupt


In [21]:
code = 'PT'
x_train, y_train = load_and_tokenize_dataset(['data/{}/train.cupt'.format(code)], tokenizer, binary=False)
x_dev, y_dev = load_and_tokenize_dataset(['data/{}/dev.cupt'.format(code)], tokenizer, binary=False)
pt = {
    'x_train': x_train, 
    'y_train': y_train,
    'x_dev': x_dev, 
    'y_dev': y_dev
}

data/PT/train.cupt
data/PT/dev.cupt


In [22]:
code = 'ZH'
x_train, y_train = load_and_tokenize_dataset(['data/{}/train.cupt'.format(code)], tokenizer, binary=False)
x_dev, y_dev = load_and_tokenize_dataset(['data/{}/dev.cupt'.format(code)], tokenizer, binary=False)
zh = {
    'x_train': x_train, 
    'y_train': y_train,
    'x_dev': x_dev, 
    'y_dev': y_dev
}

data/ZH/train.cupt
data/ZH/dev.cupt


In [23]:
data = {
    'DE': de,
    'GA': ga,
    'HI': hi,
    'PT': pt,
    'ZH': zh,
}

In [24]:
with open('data/{}.multilabel.tokenized.pkl'.format(bert_type), 'wb') as f:
    pickle.dump(data, f)

In [25]:
for x, y, in zip(de['x_train'], de['y_train']):
    if None in y:
        print(len(x), tokenizer.decode(x))
        print(len(y), y)
        for i, j in zip(x, y):
            print(i, j)
        break

In [17]:
text = """# source_sent_id = . . newscrawl-1345
# text = Dass eben diese Einstellung beim Pokalsieg gegen Langenfeld da war, hat der Coach erfreut zur Kenntnis genommen: "Das hat auch als Trainer richtig Spaß gemacht.
1	Dass	daß	SCONJ	KOUS	_	6	mark	_	_	*
2	eben	eben	ADV	ADV	_	6	advmod	_	_	*
3	diese	dies	PRON	PDAT	Case=Nom|Gender=Fem|Number=Sing|PronType=Dem	4	det	_	_	*
4	Einstellung	Einstellung	NOUN	NN	Case=Nom|Gender=Fem|Number=Sing	6	nsubj	_	_	*
5	beim	beim	ADJ	NE	Case=Nom|Gender=Masc|Number=Sing	6	amod	_	_	*
6	Pokalsieg	Pokalsieg	NOUN	NN	Case=Dat|Gender=Masc|Number=Sing	18	advcl	_	_	*
7	gegen	gegen	ADP	APPR	_	8	case	_	_	*
8	Langenfeld	Langenfeld	PROPN	NE	Case=Dat|Gender=Neut|Number=Sing	6	nmod	_	_	*
9	da	da	PROPN	ADV	_	8	appos	_	_	*
10	war	sein	AUX	ADV	_	6	cop	_	SpaceAfter=No	*
11	,	,	PUNCT	$,	_	6	punct	_	_	*
12	hat	haben	AUX	VAFIN	Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin	18	aux	_	_	*
13	der	der	DET	ART	Case=Nom|Definite=Def|Gender=Masc|Number=Sing|PronType=Art	14	det	_	_	*
14	Coach	Coach	PROPN	NE	Case=Nom|Gender=Masc|Number=Sing	18	nsubj	_	_	*
15	erfreut	erfreut	ADJ	ADJD	_	18	advmod	_	_	*
16	zur	zur	NOUN	NN	Case=Nom|Gender=Fem|Number=Sing	18	nsubj	_	_	1:LVC.full
17	Kenntnis	Kenntnis	NOUN	NN	Case=Nom|Gender=Fem|Number=Sing	18	obj	_	_	1
18	genommen	nehmen	VERB	VVPP	VerbForm=Part	0	root	_	SpaceAfter=No	1
19	:	:	PUNCT	$.	_	28	punct	_	_	*
20	"	"	PUNCT	$(	_	28	punct	_	SpaceAfter=No	*
21	Das	der	PRON	PDS	Case=Nom|Gender=Neut|Number=Sing|PronType=Dem	28	nsubj	_	_	*
22	hat	haben	AUX	VAFIN	Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin	28	aux	_	_	2:VID
23	auch	auch	ADV	ADV	_	25	advmod	_	_	*
24	als	als	ADP	KOKOM	_	25	case	_	_	*
25	Trainer	Trainer	NOUN	NN	Case=Nom|Gender=Masc|Number=Sing	28	obl	_	_	*
26	richtig	richtig	ADJ	ADJD	_	27	amod	_	_	*
27	Spaß	Spaß	NOUN	NN	Case=Acc|Gender=Masc|Number=Sing	28	obj	_	_	2;3:VID
28	gemacht	machen	VERB	VVPP	VerbForm=Part	18	parataxis	_	SpaceAfter=No	3
29	.	.	PUNCT	$.	_	18	punct	_	_	*""".split('\n') + ['\n']

In [22]:
binary = False
tmp_line = []
tmp_label = []
current_codes = {}
cls = tokenizer.encode('[CLS]')[1]
sep = tokenizer.encode('[SEP]')[1]

sentences, labels = [], []
for i, line in enumerate(text):
#                 print(line)
    if line == '\n':
        sentences.append([cls] + tmp_line + [sep])
        labels.append([0] + tmp_label + [0])
        tmp_line = []
        tmp_label = []
        current_codes = {}
    elif not line.startswith('#'):
        feats = line.split()
        if not '-' in feats[0]:                        
            _label = [0] if feats[10] == '*' else ([1] if binary else _extract_labels(feats[10], current_codes))
            if len(_label) > 1:
                _label, current_codes = _label
            else:
                _label = _label[0]
            tokens = tokenizer.encode(feats[1])
            print(tokens)
            tokens = tokens[1:-1]
            _label = [_label] * len(tokens)
            print(_label)
            tmp_line += tokens
            tmp_label += _label

[101, 100146, 102]
[0]
[101, 173, 10965, 102]
[0, 0]
[101, 12750, 102]
[0]
[101, 104731, 102]
[0]
[101, 12632, 102]
[0]
[101, 47934, 88610, 102]
[0, 0]
[101, 11930, 102]
[0]
[101, 32384, 10115, 14457, 102]
[0, 0, 0]
[101, 10143, 102]
[0]
[101, 10338, 102]
[0]
[101, 117, 102]
[0]
[101, 11250, 102]
[0]
[101, 10118, 102]
[0]
[101, 38345, 102]
[0]
[101, 10163, 37135, 11159, 102]
[0, 0, 0]
[101, 10736, 102]
[4]
[101, 80669, 102]
[4]
[101, 38023, 102]
[4]
[101, 131, 102]
[0]
[101, 107, 102]
[0]
[101, 10672, 102]
[0]
[101, 11250, 102]
[6]
[101, 10515, 102]
[0]
[101, 10223, 102]
[0]
[101, 31964, 102]
[0]
[101, 33250, 18735, 102]
[0, 0]
[101, 64766, 17422, 102]
[6, 6]
[101, 37310, 102]
[6]
[101, 119, 102]
[0]


In [28]:
sentences

[[101,
  100146,
  173,
  10965,
  12750,
  104731,
  12632,
  47934,
  88610,
  11930,
  32384,
  10115,
  14457,
  10143,
  10338,
  117,
  11250,
  10118,
  38345,
  10163,
  37135,
  11159,
  10736,
  80669,
  38023,
  131,
  107,
  10672,
  11250,
  10515,
  10223,
  31964,
  33250,
  18735,
  64766,
  17422,
  37310,
  119,
  102]]

In [19]:
features['<unlabeled>']

9

In [24]:
for code in data:
    for y in data[code]['y_train']:
        if 9 in y:
            print(y)
    for y in data[code]['y_dev']:
        if 9 in y:
            print(y)

In [25]:
import pickle

In [20]:
with open('data/bert-base-multilingual-cased.multilabel.tokenized.pkl', 'rb') as f:
    data = pickle.load(f)

In [21]:
data['GA']['y_train']

[[0,
  6,
  6,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
 