In [1]:
import pickle
import os
from transformers import AutoTokenizer

In [2]:
# Name of the pretrained checkpoint: used below to load the tokenizer and to
# name the output pickle.
bert_type = 'bert-base-multilingual-cased'

In [3]:
# Instantiate the tokenizer for the checkpoint named by `bert_type`.
tokenizer = AutoTokenizer.from_pretrained(bert_type)

In [4]:
# Category name -> integer class id. Ids follow the order of the list below,
# with '*' (no category) mapped to 0.
features = {
    category: class_id
    for class_id, category in enumerate([
        '*',
        'IAV',
        'IRV',
        'LVC.cause',
        'LVC.full',
        'LS.ICV',
        'MVC',
        'VID',
        'VPC.full',
        'VPC.semi',
    ])
}
len(features)

10

In [5]:
# Sanity check: a name that is not a category is not a key of `features`.
'IAV_' in features

False

In [6]:
# Sanity check: look up the class id of a known category, defaulting to 0.
features.get('VID', 0)

7

In [7]:
# A label string with several entries is split on ';' into one entry each.
'1:LVC.full;2:LVC.full'.split(';')

['1:LVC.full', '2:LVC.full']

In [8]:
def _extract_labels(feature, current_codes):
    feats = feature.split(';')
    if len(feats) == 1:
        feats = feats[0].split(':')
        if len(feats) == 1:
            label = current_codes.get(feats[0], features['*'])
            label = features.get(label)
        else:
            label = features[feats[1]]
            current_codes[feats[0]] = feats[1]
    elif len(feats) > 1:
        _feats = feats[0].split(':')
        if len(_feats) == 1:
            label = current_codes.get(_feats[0], features['*'])
            label = features.get(label)
        else:
            label = features[_feats[1]]
            current_codes[_feats[0]] = _feats[1]
        for f in feats[1:]:
            n = f.split(':')
            if len(n) > 1:
                current_codes[n[0]] = n[1]
    return label, current_codes

In [9]:
# Single 'code:CATEGORY' entry with no codes registered yet.
_extract_labels('1:VID', {})

(7, {'1': 'VID'})

In [10]:
# Two 'code:CATEGORY' entries on one token: both codes get registered.
_extract_labels('1:LVC.full;2:LVC.full', {})

(4, {'1': 'LVC.full', '2': 'LVC.full'})

In [11]:
# Bare code '1' (already registered) followed by a new declaration for '2'.
_extract_labels('1;2:LVC.full', {'1': 'LVC.full'})

(4, {'1': 'LVC.full', '2': 'LVC.full'})

In [12]:
# Declaring a code that is already registered with the same category.
_extract_labels('1:LVC.full', {'1': 'LVC.full'})

(4, {'1': 'LVC.full'})

In [13]:
# Two bare codes: the label is resolved from the first entry only.
_extract_labels('1;2', {'1': 'LVC.full'})

(4, {'1': 'LVC.full'})

In [14]:
# Declaring a new code '2' while code '1' is already registered.
_extract_labels('2:LVC.full', {'1': 'LVC.full'})

(4, {'1': 'LVC.full', '2': 'LVC.full'})

In [15]:
# Single bare code that is already registered.
_extract_labels('1', {'1': 'LVC.full'})

(4, {'1': 'LVC.full'})

In [16]:
# Bare registered code '2' first, then a declaration for a third code.
_extract_labels('2;3:VID', {'1': 'LVC.full', '2': 'VID'}) 

(7, {'1': 'LVC.full', '2': 'VID', '3': 'VID'})

In [17]:
# '*' is not a key of current_codes, so this exercises the fallback path
# taken for a bare entry whose code has not been registered.
_extract_labels('*', {'1': 'LVC.full', '2': 'VID'}) 

(None, {'1': 'LVC.full', '2': 'VID'})

In [18]:
def load_and_tokenize_dataset(train_files, tokenizer, train=True, binary=True, data_dir='data/'):
    """Read tab-separated .cupt files into token-id sentences and labels.

    Each non-comment, non-blank line is split on tabs. Column 1 is the word
    form that gets tokenized and column 10 is the label string. A blank line
    ends the current sentence. Every sub-token produced for a word receives
    that word's label, and each sentence is wrapped in the [CLS]/[SEP] ids
    (both labelled 0).

    Args:
        train_files: paths of the files to read. When empty (or None), the
            files are discovered by walking `data_dir`.
        tokenizer: object exposing `encode(text) -> list[int]`. The first and
            last ids of every encoding are dropped.
            NOTE(review): this presumes `encode` always wraps its input in
            exactly one leading and one trailing special token -- confirm for
            the tokenizer in use.
        train: only used for discovery; selects 'train.cupt' when True and
            'dev.cupt' otherwise.
        binary: when True, every word whose label column is not '*' gets
            label 1. When False, the label is resolved by `_extract_labels`.
        data_dir: root directory walked during discovery.

    Returns:
        (sentences, labels): two parallel lists of equal-length int lists.
    """
    if not train_files:
        # Use distinct names for the accumulator and the os.walk listing: the
        # previous version reused `files` for both, so the collected paths
        # were lost and discovery never returned the intended files.
        wanted = 'train.cupt' if train else 'dev.cupt'
        files = sorted(
            os.path.join(root, name)
            for root, _, names in os.walk(data_dir)
            for name in names
            if name == wanted
        )
    else:
        files = train_files

    for _file in files:
        print(_file)
    cls = tokenizer.encode('[CLS]')[1]
    sep = tokenizer.encode('[SEP]')[1]

    mwe_column = 10  # index of the label column in a tab-split line

    sentences, labels = [], []
    for _file in files:
        # Read as UTF-8 explicitly instead of the platform default encoding.
        with open(_file, encoding='utf-8') as text:
            tmp_line = []
            tmp_label = []
            current_codes = {}
            for line in text:
                if not line.strip():
                    # A blank line closes the current sentence.
                    sentences.append([cls] + tmp_line + [sep])
                    labels.append([0] + tmp_label + [0])
                    tmp_line = []
                    tmp_label = []
                    current_codes = {}
                    continue
                if line.startswith('#'):
                    continue

                feats = line.rstrip('\n').split('\t')
                if '-' in feats[0]:
                    # Ids containing '-' (ranges) are skipped.
                    continue

                raw_label = feats[mwe_column]
                if raw_label == '*':
                    _label = 0
                elif binary:
                    _label = 1
                else:
                    _label, current_codes = _extract_labels(raw_label, current_codes)
                    if _label is None:
                        # Unresolvable code: treat the word as unlabelled
                        # rather than leaking None into the label list.
                        _label = 0

                tokens = tokenizer.encode(feats[1])[1:-1]
                tmp_line += tokens
                tmp_label += [_label] * len(tokens)

            # Flush a final sentence that is not followed by a blank line;
            # it used to be silently dropped.
            if tmp_line:
                sentences.append([cls] + tmp_line + [sep])
                labels.append([0] + tmp_label + [0])
    return sentences, labels

In [19]:
# DE corpus: tokenize the train and dev splits with multi-class labels.
code = 'DE'
x_train, y_train = load_and_tokenize_dataset(
    train_files=['data/{}/train.cupt'.format(code)],
    tokenizer=tokenizer,
    binary=False,
)
x_dev, y_dev = load_and_tokenize_dataset(
    train_files=['data/{}/dev.cupt'.format(code)],
    tokenizer=tokenizer,
    binary=False,
)
de = dict(zip(
    ('x_train', 'y_train', 'x_dev', 'y_dev'),
    (x_train, y_train, x_dev, y_dev),
))

data/DE/train.cupt
data/DE/dev.cupt


In [20]:
# EL corpus: tokenize the train and dev splits with multi-class labels.
code = 'EL'
x_train, y_train = load_and_tokenize_dataset(
    train_files=['data/{}/train.cupt'.format(code)],
    tokenizer=tokenizer,
    binary=False,
)
x_dev, y_dev = load_and_tokenize_dataset(
    train_files=['data/{}/dev.cupt'.format(code)],
    tokenizer=tokenizer,
    binary=False,
)
el = dict(zip(
    ('x_train', 'y_train', 'x_dev', 'y_dev'),
    (x_train, y_train, x_dev, y_dev),
))

data/EL/train.cupt
data/EL/dev.cupt


In [21]:
# EU corpus: tokenize the train and dev splits with multi-class labels.
code = 'EU'
x_train, y_train = load_and_tokenize_dataset(
    train_files=['data/{}/train.cupt'.format(code)],
    tokenizer=tokenizer,
    binary=False,
)
x_dev, y_dev = load_and_tokenize_dataset(
    train_files=['data/{}/dev.cupt'.format(code)],
    tokenizer=tokenizer,
    binary=False,
)
eu = dict(zip(
    ('x_train', 'y_train', 'x_dev', 'y_dev'),
    (x_train, y_train, x_dev, y_dev),
))

data/EU/train.cupt
data/EU/dev.cupt


In [22]:
# FR corpus: tokenize the train and dev splits with multi-class labels.
code = 'FR'
x_train, y_train = load_and_tokenize_dataset(
    train_files=['data/{}/train.cupt'.format(code)],
    tokenizer=tokenizer,
    binary=False,
)
x_dev, y_dev = load_and_tokenize_dataset(
    train_files=['data/{}/dev.cupt'.format(code)],
    tokenizer=tokenizer,
    binary=False,
)
fr = dict(zip(
    ('x_train', 'y_train', 'x_dev', 'y_dev'),
    (x_train, y_train, x_dev, y_dev),
))

data/FR/train.cupt
data/FR/dev.cupt


In [23]:
# GA corpus: tokenize the train and dev splits with multi-class labels.
code = 'GA'
x_train, y_train = load_and_tokenize_dataset(
    train_files=['data/{}/train.cupt'.format(code)],
    tokenizer=tokenizer,
    binary=False,
)
x_dev, y_dev = load_and_tokenize_dataset(
    train_files=['data/{}/dev.cupt'.format(code)],
    tokenizer=tokenizer,
    binary=False,
)
ga = dict(zip(
    ('x_train', 'y_train', 'x_dev', 'y_dev'),
    (x_train, y_train, x_dev, y_dev),
))

data/GA/train.cupt
data/GA/dev.cupt


In [24]:
# HE corpus: tokenize the train and dev splits with multi-class labels.
code = 'HE'
x_train, y_train = load_and_tokenize_dataset(
    train_files=['data/{}/train.cupt'.format(code)],
    tokenizer=tokenizer,
    binary=False,
)
x_dev, y_dev = load_and_tokenize_dataset(
    train_files=['data/{}/dev.cupt'.format(code)],
    tokenizer=tokenizer,
    binary=False,
)
he = dict(zip(
    ('x_train', 'y_train', 'x_dev', 'y_dev'),
    (x_train, y_train, x_dev, y_dev),
))

data/HE/train.cupt
data/HE/dev.cupt


In [25]:
# HI corpus: tokenize the train and dev splits with multi-class labels.
code = 'HI'
x_train, y_train = load_and_tokenize_dataset(
    train_files=['data/{}/train.cupt'.format(code)],
    tokenizer=tokenizer,
    binary=False,
)
x_dev, y_dev = load_and_tokenize_dataset(
    train_files=['data/{}/dev.cupt'.format(code)],
    tokenizer=tokenizer,
    binary=False,
)
hi = dict(zip(
    ('x_train', 'y_train', 'x_dev', 'y_dev'),
    (x_train, y_train, x_dev, y_dev),
))

data/HI/train.cupt
data/HI/dev.cupt


In [26]:
# IT corpus: tokenize the train and dev splits with multi-class labels.
code = 'IT'
x_train, y_train = load_and_tokenize_dataset(
    train_files=['data/{}/train.cupt'.format(code)],
    tokenizer=tokenizer,
    binary=False,
)
x_dev, y_dev = load_and_tokenize_dataset(
    train_files=['data/{}/dev.cupt'.format(code)],
    tokenizer=tokenizer,
    binary=False,
)
it = dict(zip(
    ('x_train', 'y_train', 'x_dev', 'y_dev'),
    (x_train, y_train, x_dev, y_dev),
))

data/IT/train.cupt
data/IT/dev.cupt


In [27]:
# PL corpus: tokenize the train and dev splits with multi-class labels.
code = 'PL'
x_train, y_train = load_and_tokenize_dataset(
    train_files=['data/{}/train.cupt'.format(code)],
    tokenizer=tokenizer,
    binary=False,
)
x_dev, y_dev = load_and_tokenize_dataset(
    train_files=['data/{}/dev.cupt'.format(code)],
    tokenizer=tokenizer,
    binary=False,
)
pl = dict(zip(
    ('x_train', 'y_train', 'x_dev', 'y_dev'),
    (x_train, y_train, x_dev, y_dev),
))

data/PL/train.cupt
data/PL/dev.cupt


In [28]:
# PT corpus: tokenize the train and dev splits with multi-class labels.
code = 'PT'
x_train, y_train = load_and_tokenize_dataset(
    train_files=['data/{}/train.cupt'.format(code)],
    tokenizer=tokenizer,
    binary=False,
)
x_dev, y_dev = load_and_tokenize_dataset(
    train_files=['data/{}/dev.cupt'.format(code)],
    tokenizer=tokenizer,
    binary=False,
)
pt = dict(zip(
    ('x_train', 'y_train', 'x_dev', 'y_dev'),
    (x_train, y_train, x_dev, y_dev),
))

data/PT/train.cupt
data/PT/dev.cupt


In [29]:
# RO corpus: tokenize the train and dev splits with multi-class labels.
code = 'RO'
x_train, y_train = load_and_tokenize_dataset(
    train_files=['data/{}/train.cupt'.format(code)],
    tokenizer=tokenizer,
    binary=False,
)
x_dev, y_dev = load_and_tokenize_dataset(
    train_files=['data/{}/dev.cupt'.format(code)],
    tokenizer=tokenizer,
    binary=False,
)
ro = dict(zip(
    ('x_train', 'y_train', 'x_dev', 'y_dev'),
    (x_train, y_train, x_dev, y_dev),
))

data/RO/train.cupt
data/RO/dev.cupt


In [30]:
# SV corpus: tokenize the train and dev splits with multi-class labels.
code = 'SV'
x_train, y_train = load_and_tokenize_dataset(
    train_files=['data/{}/train.cupt'.format(code)],
    tokenizer=tokenizer,
    binary=False,
)
x_dev, y_dev = load_and_tokenize_dataset(
    train_files=['data/{}/dev.cupt'.format(code)],
    tokenizer=tokenizer,
    binary=False,
)
sv = dict(zip(
    ('x_train', 'y_train', 'x_dev', 'y_dev'),
    (x_train, y_train, x_dev, y_dev),
))

data/SV/train.cupt
data/SV/dev.cupt


In [31]:
# TR corpus: tokenize the train and dev splits with multi-class labels.
code = 'TR'
x_train, y_train = load_and_tokenize_dataset(
    train_files=['data/{}/train.cupt'.format(code)],
    tokenizer=tokenizer,
    binary=False,
)
x_dev, y_dev = load_and_tokenize_dataset(
    train_files=['data/{}/dev.cupt'.format(code)],
    tokenizer=tokenizer,
    binary=False,
)
tr = dict(zip(
    ('x_train', 'y_train', 'x_dev', 'y_dev'),
    (x_train, y_train, x_dev, y_dev),
))

data/TR/train.cupt
data/TR/dev.cupt


In [32]:
# ZH corpus: tokenize the train and dev splits with multi-class labels.
code = 'ZH'
x_train, y_train = load_and_tokenize_dataset(
    train_files=['data/{}/train.cupt'.format(code)],
    tokenizer=tokenizer,
    binary=False,
)
x_dev, y_dev = load_and_tokenize_dataset(
    train_files=['data/{}/dev.cupt'.format(code)],
    tokenizer=tokenizer,
    binary=False,
)
zh = dict(zip(
    ('x_train', 'y_train', 'x_dev', 'y_dev'),
    (x_train, y_train, x_dev, y_dev),
))

data/ZH/train.cupt
data/ZH/dev.cupt


In [33]:
# Gather every per-corpus split dict under its corpus code.
data = dict(
    DE=de,
    EL=el,
    EU=eu,
    FR=fr,
    GA=ga,
    HE=he,
    HI=hi,
    IT=it,
    PL=pl,
    PT=pt,
    RO=ro,
    SV=sv,
    TR=tr,
    ZH=zh,
)

In [34]:
# Persist the tokenized corpora as one pickle named after the checkpoint.
with open(
    'data/{}.multilabel.tokenized.all.pkl'.format(bert_type),
    mode='wb',
) as f:
    pickle.dump(data, f)