In [1]:
import pandas as pd
from nltk.tokenize import TreebankWordTokenizer as twt

In [2]:
import jsonlines

annotated_data = []

with jsonlines.open('../data/annotated-ner.json1') as reader:
    for obj in reader:
        # Getting only the documents with annotations
        if len(obj['labels']) > 0:
            annotated_data.append(obj)

In [3]:
def merge_adjacent_labels(labels):
    merged_labels = []
    prev_label = None
    new_label = None
    
    for label in labels:
        if prev_label == None:
            new_label = [label[0], label[1], label[2]]
            prev_label = label
            continue
        
        if prev_label[1] + 1 == label[0] and prev_label[2] == label[2]:
            new_label[1] = label[1]
        else:
            merged_labels.append(new_label)
            new_label = [label[0], label[1], label[2]]
        prev_label = label
    return merged_labels

with jsonlines.open('../data/annotated-ner-semi-supervised-v2.json1') as reader:
    for obj in reader:
        # Getting only the documents with annotations
        if len(obj['labels']) > 0:
            obj['labels'] = merge_adjacent_labels(obj['labels'])
            annotated_data.append(obj)

In [4]:
annotated_data[0]

{'id': 1,
 'text': 'РЕШЕНИЕ\n\n№53\n\nгр. София, 26.07.2018 год. В ИМЕТО НА НА Р О Д А \n\n\n\nВЪРХОВНИЯТ КАСАЦИОНЕН СЪД на Република България, ІІ гражданско отделение, в открито съдебно заседание на шестнадесети април две хиляди и осемнадесета година, в състав:\n \n\nПРЕДСЕДАТЕЛ: ЕМАНУЕЛА БАЛЕВСКА\n\nЧЛЕНОВЕ: СНЕЖАНКА НИКОЛОВА\n\nГЕРГАНА НИКОВА\n \nпри участието на секретаря Т. Иванова, като разгледа докладваното от съдията Николова гр. д. №53 по описа на Върховния касационен съд за 2017 год. на ІІ г. о. и за да се произнесе, взе предвид следното:\n\n\nПроизводството е по чл. 290 и сл. от ГПК. Образувано е по касационната жалба на Местно поделение Мюсюлманско настоятелство, [населено място], Велинградска община, представлявано от председателя С.',
 'meta': {},
 'annotation_approver': None,
 'labels': [[0, 12, 'REF-DOC'],
  [14, 23, 'LOC-CITY'],
  [25, 35, 'DATE'],
  [62, 92, 'ORG-COURT'],
  [96, 114, 'LOC-CTRY'],
  [172, 223, 'DATE'],
  [392, 400, 'PER-JDG'],
  [424, 448, 'ORG-COURT']

In [5]:
# This will be used to determine whether token span matches annotation span
def are_intersecting(b1, b2):
    if b1[0] >= b2[0] and b1[1] <= b2[1]:
        return True
    return False

# Getting labels as list of pairs (span_x, span_y, label)
def get_labels(raw_labels):
    return [((l[0], l[1]), l[2]) for l in raw_labels]

def map_tokens_with_labels(annotated_doc):
    # Sorting labels for idempotency 
    annotated_doc['labels'].sort(key=lambda x: x[0], reverse=False)
    labels = get_labels(annotated_doc['labels'])
    current_label = 0
    
    token_spans = list(twt().span_tokenize(annotated_doc['text']))
    tokens = twt().tokenize(annotated_doc['text'])
    
    mapped_tokens = []
    was_entity = False

    for token_span in zip(tokens, token_spans):
        if current_label >= len(labels):
            mapped_tokens.append((token_span, 'O'))
            continue
            
        if are_intersecting(token_span[1], labels[current_label][0]):
            mapped_tokens.append((token_span, labels[current_label][1]))
            was_entity = True
        else:
            mapped_tokens.append((token_span, 'O'))
            if was_entity:
                was_entity = False
                current_label += 1
    return mapped_tokens

In [6]:
map_tokens_with_labels(annotated_data[5])

[(('Ако', (0, 3)), 'O'),
 (('местното', (4, 12)), 'O'),
 (('поделение', (13, 22)), 'O'),
 (('на', (23, 25)), 'O'),
 (('вероизповеданието', (26, 43)), 'O'),
 (('е', (44, 45)), 'O'),
 (('заварено', (46, 54)), 'O'),
 (('юридическо', (55, 65)), 'O'),
 (('лице', (66, 70)), 'O'),
 (('по', (71, 73)), 'O'),
 (('смисъла', (74, 81)), 'O'),
 (('на', (82, 84)), 'O'),
 (('цитираната', (85, 95)), 'O'),
 (('норма', (96, 101)), 'O'),
 ((',', (101, 102)), 'O'),
 (('то', (103, 105)), 'O'),
 (('не', (106, 108)), 'O'),
 (('изгубва', (109, 116)), 'O'),
 (('своята', (117, 123)), 'O'),
 (('правосубектност', (124, 139)), 'O'),
 (('по', (140, 142)), 'O'),
 (('силата', (143, 149)), 'O'),
 (('ЗВ', (150, 152)), 'O'),
 ((',', (152, 153)), 'O'),
 (('дори', (154, 158)), 'O'),
 (('при', (159, 162)), 'O'),
 (('неизвършване', (163, 175)), 'O'),
 (('на', (176, 178)), 'O'),
 (('служебното', (179, 189)), 'O'),
 (('вписване.', (190, 199)), 'O'),
 (('Предвиденият', (200, 212)), 'O'),
 (('в', (213, 214)), 'O'),
 (('новия', (

In [7]:
df = pd.DataFrame([], columns=['token', 'span', 'sentence', 'tag'])

for i, doc in enumerate(annotated_data):
    mapped_tokens = map_tokens_with_labels(doc)
    for mapped_token in mapped_tokens: 
        df = df.append({
            'token': mapped_token[0][0], 
            'span': mapped_token[0][1], 
            'sentence': i, 
            'tag': mapped_token[1]
        }, ignore_index=True)

In [8]:
df

Unnamed: 0,token,span,sentence,tag
0,РЕШЕНИЕ,"(0, 7)",0,REF-DOC
1,№53,"(9, 12)",0,REF-DOC
2,гр.,"(14, 17)",0,O
3,София,"(18, 23)",0,LOC-CITY
4,",","(23, 24)",0,O
...,...,...,...,...
74158,на,"(576, 578)",552,O
74159,доводите,"(579, 587)",552,O
74160,на,"(588, 590)",552,O
74161,защитата,"(591, 599)",552,O


In [9]:
words = list(set(df["token"].values))
len(words)

8592

In [10]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 0
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, t) for w, t in zip(s["token"].values.tolist(), s["tag"].values.tolist())]
        self.grouped = self.data.groupby("sentence").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped[self.n_sent]
            self.n_sent += 1
            return s
        except:
            return None

In [11]:
getter = SentenceGetter(df)
print(getter.get_next())

[('РЕШЕНИЕ', 'REF-DOC'), ('№53', 'REF-DOC'), ('гр.', 'O'), ('София', 'LOC-CITY'), (',', 'O'), ('26.07.2018', 'DATE'), ('год.', 'O'), ('В', 'O'), ('ИМЕТО', 'O'), ('НА', 'O'), ('НА', 'O'), ('Р', 'O'), ('О', 'O'), ('Д', 'O'), ('А', 'O'), ('ВЪРХОВНИЯТ', 'ORG-COURT'), ('КАСАЦИОНЕН', 'ORG-COURT'), ('СЪД', 'ORG-COURT'), ('на', 'O'), ('Република', 'LOC-CTRY'), ('България', 'LOC-CTRY'), (',', 'O'), ('ІІ', 'O'), ('гражданско', 'O'), ('отделение', 'O'), (',', 'O'), ('в', 'O'), ('открито', 'O'), ('съдебно', 'O'), ('заседание', 'O'), ('на', 'O'), ('шестнадесети', 'DATE'), ('април', 'DATE'), ('две', 'DATE'), ('хиляди', 'DATE'), ('и', 'DATE'), ('осемнадесета', 'DATE'), ('година', 'DATE'), (',', 'O'), ('в', 'O'), ('състав', 'O'), (':', 'O'), ('ПРЕДСЕДАТЕЛ', 'O'), (':', 'O'), ('ЕМАНУЕЛА', 'PER-OTH'), ('БАЛЕВСКА', 'PER-OTH'), ('ЧЛЕНОВЕ', 'O'), (':', 'O'), ('СНЕЖАНКА', 'PER-OTH'), ('НИКОЛОВА', 'PER-OTH'), ('ГЕРГАНА', 'O'), ('НИКОВА', 'PER-OTH'), ('при', 'O'), ('участието', 'O'), ('на', 'O'), ('секретаря'

In [12]:
import re

def matchesRegEx(regex, word):
    res = re.match(regex, word)
    if res:
        return True
    return False

def gazeteerMoney(word):
    moneyDictionary = ['лв', 'евро', 'долара', 'лева', '$', 'щ. д.', 'щ.д.']
    for w in moneyDictionary:
        res = re.match(w, word)
        if res:
            return True
    return False

def gazeteerDate(word):
    moneyDictionary = ['месец', 'януари', 'февруари', 'март', 'април', 'май', 'юни', 'юли', 'август', 'септември', 'октомври', 'ноември', 'декември', 'година', 'год.', 'г.']
    for w in moneyDictionary:
        res = re.match(w, word)
        if res:
            return True
    return False

def gazeteerCourt(word):
    dictionary = ["PC", "ОС", "СГС", "АДМС", "АС", "ВКС", "ВАС", 'СЪД', 'ЕСПЧ']
    for w in dictionary:
        res = re.match(w, word.upper())
        if res:
            return True
    return False
    
def gazeteerJudge(word):
    moneyDictionary = ['съд.', 'съдия']
    for w in moneyDictionary:
        res = re.match(w, word)
        if res:
            return True
    return False

def gazeteerInstitution(word):
    moneyDictionary = ['БНТ', 'съдия', 'ЕС', 'КЗК', 'МВР', 'НАП', ]
    for w in moneyDictionary:
        res = re.match(w, word)
        if res:
            return True
    return False

def gazeteerDocRef(word):
    moneyDictionary = ['бел', 'бр', 'ДВ', "к.д.", ]
    for w in moneyDictionary:
        res = re.match(w, word)
        if res:
            return True
    return False

def gazeteerJudge(word):
    moneyDictionary = ['съд.', 'съдия']
    for w in moneyDictionary:
        res = re.match(w, word)
        if res:
            return True
    return False

def gazeteerCountry(word):
    moneyDictionary = ['РБ', 'ГДР', 'КРБ', 'България', 'Сърбия', 'Румъния', 'САЩ']
    for w in moneyDictionary:
        res = re.match(w, word)
        if res:
            return True
    return False

def gazeteerLaw(word):
    moneyDictionary = ['чл', 'ал', 'алинея', 'т', "НК", "НПК", "ГПК", "АПК", "СК", "ЗАНН", "ЗГС", "ЗЗД", "ЗЗК", "ЗЗП", "ЗИДАПК", "ЗИД", "ЗНА", "ЗРТ", "ЗХр", "ЗСВ"]
    for w in moneyDictionary:
        res = re.match(w, word)
        if res:
            return True
    return False

def word2features(sent, i):
    word = sent[i][0]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[:-2]': word[:-2],
        'word[:-1]': word[:-1],
        'word[-1:]': word[-1:],
        'word[-2:]': word[-2:],
        'ispunctuation': matchesRegEx("[,\.;:\?!-+\"]", word),
        'containsDash': matchesRegEx(".*-.*", word),
        'containsDot': matchesRegEx(".*\..*", word),
        'containsDigit': matchesRegEx(".*[0-9].*", word),
        'lonleyInitial': matchesRegEx("[А-Я]\.", word),
        'singleChar': matchesRegEx("[А-Яа-я]", word),
        'singleDigit': matchesRegEx("[0-9]", word),
        'hasQuote': matchesRegEx("[\"']", word),
        'endsWithDot': matchesRegEx(".*\.", word),
        'fourDigitsYear': matchesRegEx("\d{4}", word),
        'gazeteerMoney': gazeteerMoney(word),
        'gazeteerJudge': gazeteerJudge(word),
        'gazeteerCountry': gazeteerCountry(word),
        'gazeteerInstitution': gazeteerInstitution(word),
        'gazeteerDocRef': gazeteerDocRef(word),
        'gazeteerDate': gazeteerDate(word),
        'gazeteerCourt': gazeteerCourt(word),
        'gazeteerLaw': gazeteerLaw(word),
        'word.isupper()': word.isupper(),
        'word.islower()': word.islower(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
    }
    if i > 0:
        word1 = sent[i-1][0]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:ispunctuation': matchesRegEx("[,\.;:\?!-+\"]", word1),
            '-1:containsDash': matchesRegEx(".*-.*", word1),
            '-1:containsDot': matchesRegEx(".*\..*", word1),
            '-1:containsDigit': matchesRegEx(".*[0-9].*", word1),
            '-1:singleDigit': matchesRegEx("[0-9]", word1),
            '-1:endsWithDot': matchesRegEx(".*\.", word1),
            '-1:gazeteerMoney': gazeteerMoney(word),
            '-1:gazeteerJudge': gazeteerJudge(word),
            '-1:gazeteerCountry': gazeteerCountry(word),
            '-1:gazeteerInstitution': gazeteerInstitution(word),
            '-1:gazeteerDocRef': gazeteerDocRef(word),
            '-1:lonleyInitial': matchesRegEx("[А-Я]\.", word1),
            '-1:singleChar': matchesRegEx("[А-Яа-я]", word1),
            '-1:fourDigitsYear': matchesRegEx("\d{4}", word1),
            '-1:gazeteerMoney': gazeteerMoney(word1),
            '-1:gazeteerDate': gazeteerDate(word1),
            '-1:gazeteerCourt': gazeteerCourt(word1),
            '-1:gazeteerLaw': gazeteerLaw(word1),
            '-1:hasQuote': matchesRegEx("[\"']", word1),
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:ispunctuation': matchesRegEx("[,\.;:\?!-+\"]", word1),
            '+1:containsDash': matchesRegEx(".*-.*", word1),
            '+1:containsDot': matchesRegEx(".*\..*", word1),
            '+1:containsDigit': matchesRegEx(".*[0-9].*", word1),
            '+1:singleDigit': matchesRegEx("[0-9]", word1),
            '+1:endsWithDot': matchesRegEx(".*\.", word1),
            '+1:lonleyInitial': matchesRegEx("[А-Я]\.", word1),
            '+1:singleChar': matchesRegEx("[А-Яа-я]", word1),
            '+1:fourDigitsYear': matchesRegEx("\d{4}", word1),
            '+1:gazeteerMoney': gazeteerMoney(word),
            '+1:gazeteerJudge': gazeteerJudge(word),
            '+1:gazeteerCountry': gazeteerCountry(word),
            '+1:gazeteerInstitution': gazeteerInstitution(word),
            '+1:gazeteerDocRef': gazeteerDocRef(word),
            '+1:gazeteerMoney': gazeteerMoney(word1),
            '+1:gazeteerDate': gazeteerDate(word1),
            '+1:gazeteerCourt': gazeteerCourt(word1),
            '+1:gazeteerLaw': gazeteerLaw(word1),
            '+1:hasQuote': matchesRegEx("[\"']", word1)
        })
    else:
        features['EOS'] = True

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, label in sent]

def sent2tokens(sent):
    return [token for token, label in sent]

In [13]:
sent = getter.get_next()
sent2features(sent)

[{'bias': 1.0,
  'word.lower()': 'и.',
  'word[:-2]': '',
  'word[:-1]': 'И',
  'word[-1:]': '.',
  'word[-2:]': 'И.',
  'ispunctuation': False,
  'containsDash': False,
  'containsDot': True,
  'containsDigit': False,
  'lonleyInitial': True,
  'singleChar': True,
  'singleDigit': False,
  'hasQuote': False,
  'endsWithDot': True,
  'fourDigitsYear': False,
  'gazeteerMoney': False,
  'gazeteerJudge': False,
  'gazeteerCountry': False,
  'gazeteerInstitution': False,
  'gazeteerDocRef': False,
  'gazeteerDate': False,
  'gazeteerCourt': False,
  'gazeteerLaw': False,
  'word.isupper()': True,
  'word.islower()': False,
  'word.istitle()': True,
  'word.isdigit()': False,
  'BOS': True,
  '+1:word.lower()': 'с.',
  '+1:word.istitle()': True,
  '+1:word.isupper()': True,
  '+1:ispunctuation': False,
  '+1:containsDash': False,
  '+1:containsDot': True,
  '+1:containsDigit': False,
  '+1:singleDigit': False,
  '+1:endsWithDot': True,
  '+1:lonleyInitial': True,
  '+1:singleChar': True,
 

In [14]:
docs = getter.sentences
X = [sent2features(d) for d in docs]
y = [sent2labels(d) for d in docs]

In [37]:
from sklearn_crfsuite import CRF

crf = CRF(algorithm='lbfgs',
          c1=1.9,
          c2=0.1,
          max_iterations=100,
          all_possible_transitions=False)

In [16]:
from sklearn.model_selection import cross_val_predict
from sklearn_crfsuite.metrics import flat_classification_report

In [38]:
pred = cross_val_predict(estimator=crf, X=X, y=y, cv=5)

In [39]:
report = flat_classification_report(y_pred=pred, y_true=y)
print(report)

              precision    recall  f1-score   support

        DATE       0.67      0.60      0.64       630
    LOC-ADDR       0.00      0.00      0.00       229
    LOC-CITY       0.67      0.40      0.50       207
    LOC-CTRY       0.72      0.77      0.74        86
     LOC-OTH       0.00      0.00      0.00        56
      METRIC       0.00      0.00      0.00        19
       MONEY       0.84      0.80      0.82       382
           O       0.95      0.97      0.96     66001
   ORG-CMPNY       0.61      0.42      0.50       340
   ORG-COURT       0.77      0.79      0.78       987
    ORG-INST       0.50      0.04      0.07        26
     PER-JDG       0.00      0.00      0.00        12
     PER-LWR       0.00      0.00      0.00        18
     PER-OTH       0.67      0.55      0.60       756
PER-PLANTIFF       0.00      0.00      0.00        23
     REF-DOC       0.74      0.55      0.63      1273
     REF-LAW       0.71      0.62      0.66      3108
        TIME       0.00    

In [40]:
crf.fit(X, y)

CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=False,
    averaging=None, c=None, c1=1.9, c2=0.1, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

In [20]:
test_text = "В обжалваното в настоящото производство въззивно решение съдът се е произнесъл по сходен казус. Уговорената в предварителния договор продажна цена на имота е 8 000лв., от които при подписването му са заплатени 500лв., а остатъкът е следвало да бъде заплатен „при оформяне на продажбата в нотариална форма, но не по-късно от 28.02.05г.”. При това продавачката е поела задължението да прехвърли собствеността в нотариална форма след като и бъде предложен остатъкът от цената „не по-късно от 28.02.05г.”. Уговорено е също, че ако купувачът не изпълни паричното си задължение от 7500лв. в уговорения срок, той губи правото да иска връщане на платените от него 500лв. / чл.VІ от договора/, а ако продавачката откаже да изпълни задължението си за оформяне на продажбата по нотариален ред, тя дължи на купувача 1000лв., представляващи получената сума по този договор и обезщетение за неизпълнение на това задължение, като купувачът има право да иска обявяване на договора за окончателен по реда на чл.19, ал.3 от ЗЗД /чл.VІІ от договора/."

In [41]:
test_text_features = sent2features([(t, None) for t in twt().tokenize(test_text)])
res = crf.predict([test_text_features])

In [36]:
list(zip(twt().tokenize(test_text), res[0]))

[('В', 'O'),
 ('обжалваното', 'O'),
 ('в', 'O'),
 ('настоящото', 'O'),
 ('производство', 'O'),
 ('въззивно', 'O'),
 ('решение', 'O'),
 ('съдът', 'O'),
 ('се', 'O'),
 ('е', 'O'),
 ('произнесъл', 'O'),
 ('по', 'O'),
 ('сходен', 'O'),
 ('казус.', 'O'),
 ('Уговорената', 'O'),
 ('в', 'O'),
 ('предварителния', 'O'),
 ('договор', 'O'),
 ('продажна', 'O'),
 ('цена', 'O'),
 ('на', 'O'),
 ('имота', 'O'),
 ('е', 'O'),
 ('8', 'O'),
 ('000лв.', 'O'),
 (',', 'O'),
 ('от', 'O'),
 ('които', 'O'),
 ('при', 'O'),
 ('подписването', 'O'),
 ('му', 'O'),
 ('са', 'O'),
 ('заплатени', 'O'),
 ('500лв.', 'O'),
 (',', 'O'),
 ('а', 'O'),
 ('остатъкът', 'O'),
 ('е', 'O'),
 ('следвало', 'O'),
 ('да', 'O'),
 ('бъде', 'O'),
 ('заплатен', 'O'),
 ('„', 'O'),
 ('при', 'O'),
 ('оформяне', 'O'),
 ('на', 'O'),
 ('продажбата', 'O'),
 ('в', 'O'),
 ('нотариална', 'O'),
 ('форма', 'O'),
 (',', 'O'),
 ('но', 'O'),
 ('не', 'O'),
 ('по-късно', 'O'),
 ('от', 'O'),
 ('28.02.05г.', 'O'),
 ('”', 'O'),
 ('.', 'O'),
 ('При', 'O'),
 ('т

In [23]:
# unannotated_data = []

# with jsonlines.open('test-annotate.jsonl') as reader:
#     for obj in reader:
#         unannotated_data.append(obj)

In [24]:
# import csv
# with open('predicted_annotations.csv', mode='w', encoding='utf-8') as results_file:
#     employee_writer = csv.writer(results_file, delimiter='\t', quotechar='"', quoting=csv.QUOTE_MINIMAL)
#     for k, doc in enumerate(unannotated_data):
#         if k == 500:
#             break
#         text = doc['text']
#         ann_text_features = sent2features([(t, None) for t in twt().tokenize(text)])
#         res = crf.predict([ann_text_features])

#         for token in list(zip(twt().tokenize(text), res[0])):
#             if token[1] != 'O':
#                 employee_writer.writerow([token[0], token[1]])
#             else:
#                 employee_writer.writerow([token[0], ""])
#         employee_writer.writerow([])

In [43]:
import pickle

pickle.dump( crf, open( "crf_model-with-reg.pkl", "wb" ) )

In [42]:
import eli5
eli5.show_weights(crf, top=30)

From \ To,DATE,LOC-ADDR,LOC-CITY,LOC-CTRY,LOC-OTH,METRIC,MONEY,O,ORG-CMPNY,ORG-COURT,ORG-INST,PER-JDG,PER-LWR,PER-OTH,PER-PLANTIFF,REF-DOC,REF-LAW,TIME
DATE,5.949,0.0,0.0,0.0,0.0,0.0,0.0,-0.101,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
LOC-ADDR,0.0,5.2,0.0,0.0,0.0,0.0,0.0,-1.349,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
LOC-CITY,0.0,0.0,5.204,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
LOC-CTRY,0.0,0.0,0.0,2.699,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
LOC-OTH,0.0,0.0,0.0,0.0,5.844,0.0,0.0,-0.208,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
METRIC,0.0,0.0,0.0,0.0,0.0,5.365,0.0,-0.248,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MONEY,0.0,0.0,0.0,0.0,0.0,0.0,5.144,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
O,0.21,-0.835,0.157,0.0,0.0,-0.099,0.0,3.095,-0.288,0.383,0.0,-0.0,-0.002,0.215,0.0,0.197,-0.822,-0.159
ORG-CMPNY,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.329,4.911,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ORG-COURT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.875,0.0,5.744,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0,Unnamed: 13_level_0,Unnamed: 14_level_0,Unnamed: 15_level_0,Unnamed: 16_level_0,Unnamed: 17_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4,Unnamed: 17_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5,Unnamed: 10_level_5,Unnamed: 11_level_5,Unnamed: 12_level_5,Unnamed: 13_level_5,Unnamed: 14_level_5,Unnamed: 15_level_5,Unnamed: 16_level_5,Unnamed: 17_level_5
Weight?,Feature,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6,Unnamed: 8_level_6,Unnamed: 9_level_6,Unnamed: 10_level_6,Unnamed: 11_level_6,Unnamed: 12_level_6,Unnamed: 13_level_6,Unnamed: 14_level_6,Unnamed: 15_level_6,Unnamed: 16_level_6,Unnamed: 17_level_6
Weight?,Feature,Unnamed: 2_level_7,Unnamed: 3_level_7,Unnamed: 4_level_7,Unnamed: 5_level_7,Unnamed: 6_level_7,Unnamed: 7_level_7,Unnamed: 8_level_7,Unnamed: 9_level_7,Unnamed: 10_level_7,Unnamed: 11_level_7,Unnamed: 12_level_7,Unnamed: 13_level_7,Unnamed: 14_level_7,Unnamed: 15_level_7,Unnamed: 16_level_7,Unnamed: 17_level_7
Weight?,Feature,Unnamed: 2_level_8,Unnamed: 3_level_8,Unnamed: 4_level_8,Unnamed: 5_level_8,Unnamed: 6_level_8,Unnamed: 7_level_8,Unnamed: 8_level_8,Unnamed: 9_level_8,Unnamed: 10_level_8,Unnamed: 11_level_8,Unnamed: 12_level_8,Unnamed: 13_level_8,Unnamed: 14_level_8,Unnamed: 15_level_8,Unnamed: 16_level_8,Unnamed: 17_level_8
Weight?,Feature,Unnamed: 2_level_9,Unnamed: 3_level_9,Unnamed: 4_level_9,Unnamed: 5_level_9,Unnamed: 6_level_9,Unnamed: 7_level_9,Unnamed: 8_level_9,Unnamed: 9_level_9,Unnamed: 10_level_9,Unnamed: 11_level_9,Unnamed: 12_level_9,Unnamed: 13_level_9,Unnamed: 14_level_9,Unnamed: 15_level_9,Unnamed: 16_level_9,Unnamed: 17_level_9
Weight?,Feature,Unnamed: 2_level_10,Unnamed: 3_level_10,Unnamed: 4_level_10,Unnamed: 5_level_10,Unnamed: 6_level_10,Unnamed: 7_level_10,Unnamed: 8_level_10,Unnamed: 9_level_10,Unnamed: 10_level_10,Unnamed: 11_level_10,Unnamed: 12_level_10,Unnamed: 13_level_10,Unnamed: 14_level_10,Unnamed: 15_level_10,Unnamed: 16_level_10,Unnamed: 17_level_10
Weight?,Feature,Unnamed: 2_level_11,Unnamed: 3_level_11,Unnamed: 4_level_11,Unnamed: 5_level_11,Unnamed: 6_level_11,Unnamed: 7_level_11,Unnamed: 8_level_11,Unnamed: 9_level_11,Unnamed: 10_level_11,Unnamed: 11_level_11,Unnamed: 12_level_11,Unnamed: 13_level_11,Unnamed: 14_level_11,Unnamed: 15_level_11,Unnamed: 16_level_11,Unnamed: 17_level_11
Weight?,Feature,Unnamed: 2_level_12,Unnamed: 3_level_12,Unnamed: 4_level_12,Unnamed: 5_level_12,Unnamed: 6_level_12,Unnamed: 7_level_12,Unnamed: 8_level_12,Unnamed: 9_level_12,Unnamed: 10_level_12,Unnamed: 11_level_12,Unnamed: 12_level_12,Unnamed: 13_level_12,Unnamed: 14_level_12,Unnamed: 15_level_12,Unnamed: 16_level_12,Unnamed: 17_level_12
Weight?,Feature,Unnamed: 2_level_13,Unnamed: 3_level_13,Unnamed: 4_level_13,Unnamed: 5_level_13,Unnamed: 6_level_13,Unnamed: 7_level_13,Unnamed: 8_level_13,Unnamed: 9_level_13,Unnamed: 10_level_13,Unnamed: 11_level_13,Unnamed: 12_level_13,Unnamed: 13_level_13,Unnamed: 14_level_13,Unnamed: 15_level_13,Unnamed: 16_level_13,Unnamed: 17_level_13
Weight?,Feature,Unnamed: 2_level_14,Unnamed: 3_level_14,Unnamed: 4_level_14,Unnamed: 5_level_14,Unnamed: 6_level_14,Unnamed: 7_level_14,Unnamed: 8_level_14,Unnamed: 9_level_14,Unnamed: 10_level_14,Unnamed: 11_level_14,Unnamed: 12_level_14,Unnamed: 13_level_14,Unnamed: 14_level_14,Unnamed: 15_level_14,Unnamed: 16_level_14,Unnamed: 17_level_14
Weight?,Feature,Unnamed: 2_level_15,Unnamed: 3_level_15,Unnamed: 4_level_15,Unnamed: 5_level_15,Unnamed: 6_level_15,Unnamed: 7_level_15,Unnamed: 8_level_15,Unnamed: 9_level_15,Unnamed: 10_level_15,Unnamed: 11_level_15,Unnamed: 12_level_15,Unnamed: 13_level_15,Unnamed: 14_level_15,Unnamed: 15_level_15,Unnamed: 16_level_15,Unnamed: 17_level_15
Weight?,Feature,Unnamed: 2_level_16,Unnamed: 3_level_16,Unnamed: 4_level_16,Unnamed: 5_level_16,Unnamed: 6_level_16,Unnamed: 7_level_16,Unnamed: 8_level_16,Unnamed: 9_level_16,Unnamed: 10_level_16,Unnamed: 11_level_16,Unnamed: 12_level_16,Unnamed: 13_level_16,Unnamed: 14_level_16,Unnamed: 15_level_16,Unnamed: 16_level_16,Unnamed: 17_level_16
Weight?,Feature,Unnamed: 2_level_17,Unnamed: 3_level_17,Unnamed: 4_level_17,Unnamed: 5_level_17,Unnamed: 6_level_17,Unnamed: 7_level_17,Unnamed: 8_level_17,Unnamed: 9_level_17,Unnamed: 10_level_17,Unnamed: 11_level_17,Unnamed: 12_level_17,Unnamed: 13_level_17,Unnamed: 14_level_17,Unnamed: 15_level_17,Unnamed: 16_level_17,Unnamed: 17_level_17
+3.824,singleDigit,,,,,,,,,,,,,,,,
+1.632,+1:fourDigitsYear,,,,,,,,,,,,,,,,
+1.542,+1:gazeteerDate,,,,,,,,,,,,,,,,
+1.142,word[-2:]:г.,,,,,,,,,,,,,,,,
+1.130,-1:word.lower():м.,,,,,,,,,,,,,,,,
+1.002,word[-2:]:10,,,,,,,,,,,,,,,,
+0.859,word[-2:]:04,,,,,,,,,,,,,,,,
+0.831,word[-1:]:и,,,,,,,,,,,,,,,,
+0.798,gazeteerDate,,,,,,,,,,,,,,,,
+0.754,word[:-2]:20,,,,,,,,,,,,,,,,

Weight?,Feature
+3.824,singleDigit
+1.632,+1:fourDigitsYear
+1.542,+1:gazeteerDate
+1.142,word[-2:]:г.
+1.130,-1:word.lower():м.
+1.002,word[-2:]:10
+0.859,word[-2:]:04
+0.831,word[-1:]:и
+0.798,gazeteerDate
+0.754,word[:-2]:20

Weight?,Feature
+1.222,word[-1:]:/
+1.003,-1:word.lower():адрес
+0.935,containsDigit
+0.649,-1:gazeteerDate
+0.637,word[-1:]:7
+0.610,word[:-1]:
+0.512,-1:ispunctuation
+0.426,word.lower():град
+0.426,word[:-1]:гра
+0.406,word[:-2]:

Weight?,Feature
+2.133,-1:word.lower():място
+1.956,-1:word.lower():гр.
+1.748,-1:word.lower():от
+1.704,word[:-2]:Соф
+1.704,word[:-1]:Софи
+1.703,word.lower():софия
+1.623,+1:word.lower():населено
+1.604,-1:gazeteerDate
+0.865,+1:word.lower():.
+0.553,word.lower():]

Weight?,Feature
1.255,word[:-2]:Република
1.255,word[:-1]:Републикат
1.255,word.lower():републиката
1.067,+1:gazeteerCountry
1.067,-1:gazeteerCountry
1.067,gazeteerCountry
1.002,-1:word.lower():в
0.935,word[:-1]:Р
0.602,word[:-2]:Републи
0.602,word[:-1]:Републик

Weight?,Feature
+3.005,-1:word.lower():идентификатор
+1.414,-1:word.lower():„
+0.987,fourDigitsYear
+0.932,+1:word.lower():''
+0.932,+1:hasQuote
+0.792,+1:ispunctuation
+0.621,+1:word.lower():кккр
+0.560,-1:fourDigitsYear
+0.498,word.lower():кккр
+0.498,word[:-2]:КК

Weight?,Feature
1.487,-1:singleDigit
0.922,+1:word.lower():кв.
0.752,word[:-1]:кв
0.752,word.lower():кв.
0.713,word[:-2]:к
0.704,singleDigit
0.561,containsDot
0.561,endsWithDot
0.522,+1:word.lower():м.
0.366,+1:containsDot

Weight?,Feature
+3.514,+1:gazeteerMoney
+1.749,singleDigit
+1.140,+1:word.lower():.
+1.062,word[-1:]:/
+1.050,-1:word.lower():4788
+1.031,-1:fourDigitsYear
+0.982,+1:word.lower():лв.
+0.921,word[-2:]:42
+0.828,gazeteerMoney
+0.746,word.lower():лева

Weight?,Feature
+2.279,BOS
+2.266,+1:word.lower():решение
+2.260,word[-1:]:у
+2.199,+1:word.lower():[
+2.181,-1:word.lower():гпк
+1.900,word[-2:]:..
+1.850,+1:word.lower():„
+1.827,+1:word.lower():чл.
+1.729,word.lower():с
+1.718,-1:word.lower():съд

Weight?,Feature
+1.535,-1:word.lower():фирма
+1.156,-1:word.lower():с.
+1.104,+1:word.lower():фирма
+0.984,-1:word.lower():”
+0.901,-1:lonleyInitial
+0.799,+1:word.lower():на
+0.769,word[-1:]:о
+0.602,word.lower():„
+0.602,word[-2:]:„
+0.602,word[-1:]:„

Weight?,Feature
+2.153,word[-1:]:С
+1.965,-1:word.lower():съда
+1.874,gazeteerCourt
+1.582,-1:word.lower():ос
+1.360,containsDash
+1.237,word.lower():–
+1.237,word[-2:]:–
+1.192,word[:-2]:С
+1.145,word[-1:]:–
+1.136,-1:gazeteerCourt

Weight?,Feature
3.04,word[:-2]:Н
1.261,+1:gazeteerInstitution
1.261,gazeteerInstitution
1.261,-1:gazeteerInstitution
0.992,word[-1:]:о
0.825,-1:word.istitle()
0.669,-1:word.lower():на
0.104,-1:word.lower():европейския
0.07,singleChar
0.061,word[:-2]:Европейск

Weight?,Feature
2.6,-1:gazeteerCourt
1.94,-1:word.lower():съдията
0.927,+1:gazeteerDate
0.885,word[-2:]:ва
0.682,-1:word.istitle()
0.282,-1:word.lower():с.
0.254,word.istitle()
0.097,word[-1:]:.
0.092,-1:word.isupper()
0.039,word.isupper()

Weight?,Feature
3.782,-1:word.lower():адвокат
0.861,lonleyInitial
0.318,+1:lonleyInitial
0.236,word[-1:]:.
0.202,word[:-2]:
0.198,word.istitle()
0.185,word.isupper()
0.159,endsWithDot
0.159,containsDot
0.155,-1:endsWithDot

Weight?,Feature
+3.161,word[-2:]:ля
+1.723,+1:word.lower():при
+1.623,-1:word.lower():и
+1.270,+1:word.lower():членове
+1.255,-1:word.lower():секретаря
+1.227,-1:word.lower()::
+1.071,word[-2:]:Д
+1.067,word.lower():д
+1.033,-1:word.lower():а.
+0.958,word[-1:]:В

Weight?,Feature
2.569,word.lower():ищецът
1.272,word[-2:]:ът
1.195,word[-1:]:о
0.845,+1:word.lower():с.
0.805,word.lower():ищеца
0.805,word[:-2]:ище
0.805,word[:-1]:ищец
0.659,+1:lonleyInitial
0.631,word[-2:]:ца
0.614,word[:-1]:ищецъ

Weight?,Feature
+1.810,containsDigit
+1.737,-1:word.lower():по
+1.600,+1:containsDigit
+1.587,BOS
+1.583,word[-2:]:.о
+1.559,-1:word.lower():№215/2016
+1.229,word[:-1]:З
+1.066,-1:word.isupper()
+1.039,-1:word.lower():с
+1.012,+1:word.lower():акт

Weight?,Feature
+2.680,+1:word.lower():гпк.
+1.458,gazeteerLaw
+1.297,-1:word.lower():по
+1.249,word[:-1]:З
+1.193,-1:word.lower():с
+1.090,-1:word.lower():основание
+1.017,word[-2:]:24
+0.816,-1:word.lower():т.
+0.789,word[:-2]:чл.4
+0.715,+1:word.isupper()

Weight?,Feature
1.868,word[-2:]:ни
1.42,+1:gazeteerDate
0.683,gazeteerDate
0.432,word[-1:]:т
0.225,singleDigit
0.216,word.islower()
0.109,-1:word.lower():от
-0.01,endsWithDot
-0.01,containsDot
-0.026,singleChar
