In [39]:
import matplotlib.pyplot as plt
from nltk.parse import DependencyGraph
import networkx as nx
from itertools import chain

def generate_featuresets(annotations):
    def get_parent(word, sentence):
        for item in sentence:
            if item[0] == word[8]:
                return item
        return []
        
    def get_children(word, sentence):
         for item in sentence:
            if item[8] == word[0]:
                yield item
    
    def get_predicates(sentence):
        for word in sentence:
            if len(word)>1:
                if word[12] != "_":
                    yield word
                    
    def get_arguments(idx, predicate, sentence):
        arg = ""
        argdict = {}

        for word in sentence:
            try:
                arg = word[13+idx]
                if arg not in argdict:
                    argdict[arg] = [word]
                else:
                    argdict[arg].append(word)
                if arg != "":
                    if word[0] > predicate[0] and argdict[arg][0][0] < predicate[0]:
                        arg = ""
            except IndexError as e:
                print ("INFO: Predicate {} problem missing argument, skipping...".format(predicate[1]))
        return argdict
    
    def sentence_to_dict (sentence):
        dictionary = {}
        for word in sentence:
            dictionary[word[0]] = word
        return dictionary
    
    def sentence_to_graph (sentence):
        G=nx.Graph()
        for word in sentence:
            G.add_edge(word[0], word[9])
        return G
    
    def sentence_to_digraph (sentence):
        G=nx.DiGraph()
        for word in sentence:
            G.add_edge(word[9], word[0])
        return G
    
    
    def path_to_deprel (path, sentence_dict):
        predicate_idx = path[0][0]
        deprel = []
        for idx, item in enumerate(path[:-1], 1):
            first = item
            second = path[idx]
            direction = edge_direction((first, second), sentence_dict)
            label = sentence_dict[second][10]
            deprel.append ("{} {}".format(label, direction))
        return " ".join(deprel)
    
    def path_to_posrel (path, sentence_dict):
        predicate_idx = path[0][0]
        deprel = []
        for idx, item in enumerate(path[:-1], 1):
            first = item
            second = path[idx]
            direction = edge_direction((first, second), sentence_dict)
            label = sentence_dict[second][4]
            deprel.append ("{} {}".format(label, direction))
        return " ".join(deprel)
            
    def edge_direction (edge, sentence_dict):
        if sentence_dict[edge[0]][9] == edge[1]:
            return 'up'
        else:
            return 'down'
    def sentence_to_treebank(sentence):
        treebank = []
        for word in sentence:
            treebank.append("\t".join([word[1], word[4], word[9], word[10]]))
        return '\n'.join(treebank)
    
    def pprint_sentence (sentence):
        return ' '.join([word[1] for word in sentence]).replace(" ,", ",").replace(" .", ".").replace(" ?", "?").replace(" !", "?")
    for sentence in annotations:
        sentence_dict = sentence_to_dict(sentence)
        sentence_graph = sentence_to_graph(sentence)
        sentence_digraph = sentence_to_digraph(sentence)

        for idx, predicate in enumerate(get_predicates(sentence), 1):
            featuresets = []
            labels = []
            argdict = {}
            for argname, words in get_arguments(idx, predicate, sentence).items():
                for word in words:
                    argdict[word[0]] = argname
            for argument in sentence:
                if argument[0] != predicate[0]:
                    path = list(nx.shortest_path(sentence_graph, predicate[0], argument[0]))
                    featureset = {
                            'PredWord': predicate[1],
                            'PredPos': predicate[4],
                            'PredDeprel': predicate[10],
                            'PredFets': predicate[5],
                            'ArgWord':argument[1],
                            'ArgPos': argument[4],
                            'ArgDeprel': argument[10],
                            'ArgFets': argument[5],
                            'DeprelPath':path_to_deprel (path, sentence_dict),
                            'PosPath': path_to_posrel (path, sentence_dict),
                            'Position': int(argument[0]) > int(predicate[0]),
                    }

                    parent_word = get_parent(predicate, sentence)
                    if len(parent_word) > 0:
                        featureset['PredParentWord'] = parent_word[1]
                        featureset['PredParentPos'] = parent_word[4]
                        featureset['PredParentFeats'] = parent_word[6]

                    children_words = list(get_children(predicate, sentence))
                    if len(children_words)>0:
                        DepSubChildren = [predicate]
                        DepSubChildren.extend(children_words)

                        DepSubChildren = sorted(DepSubChildren, key=lambda x: x[0])
                        DepSubCat = ','.join([word[10] for word in DepSubChildren])
                        ChildDepSet = ','.join([word[10] for word in children_words])
                        ChildWordSet = ','.join([word[1] for word in children_words])
                        ChildPOSSet = ','.join([word[4] for word in children_words])

                        featureset['DepSubCat'] = DepSubCat
                        featureset['ChildDepSet'] = ChildDepSet
                        featureset['ChildWordSet'] = ChildWordSet
                        featureset['ChildPOSSet'] = ChildPOSSet

                    arg_parent = get_parent(argument, sentence)
                    if len(arg_parent) > 0:
                        children = list(get_children(arg_parent, sentence))
                        arg_children = list(get_children(argument, sentence))
                        
                        argument_index = children.index(argument)
                        if argument_index-1 > 0:
                            left = children[argument_index-1]
                            featureset['LeftSiblingWord'] = left[1]
                            featureset['LeftSiblingPos'] = left[4]
                            featureset['LeftSiblingFeats'] = left[5]
                        if argument_index+1 < len(children):
                            right = children[argument_index+1]
                            featureset['RightSiblingWord'] = right[1]
                            featureset['RightSiblingPos'] = right[4]
                            featureset['RightSiblingFeats'] = right[5]
                        if len(arg_children) > 1:
                            leftmost = arg_children[0]
                            featureset['LeftWord'] = leftmost[1]
                            featureset['LeftPos'] = leftmost[4]
                            featureset['LeftFeats'] = leftmost[5]

                            rightmost = arg_children[-1]
                            featureset['RightWord'] = rightmost[1]
                            featureset['RightPos'] = rightmost[4]
                            featureset['RightFeats'] = rightmost[3]
                    try:
                        labels.append(argument[13+idx] if argument[13+idx] != "_"  else "_")
                        featuresets.append (featureset)
                    except:
                        print ("INFO: Predicate {} has no arhuments in sentence {}".format(predicate[1], pprint_sentence(sentence)))
            yield featuresets, labels

In [40]:
def preprocess (filename):
    EOL = "\n"
    SEP = "\t"
    ETY = ""
    file = open(filename,"r")
    annotations = []
    annotation = []
    for line in file:
        parts = line.replace(EOL, ETY).split(SEP)
        if len(parts) > 1:
            annotation.append(parts)
        else:
            annotations.append(annotation)
            annotation = []
    print(annotations[1])
    print(list(generate_featuresets([annotations[1]])))
    return zip(*list(generate_featuresets(annotations)))

In [41]:
X_train, y_train = preprocess('data/hr.train.plus')

[['1', 'Kosovo', 'Kosovo', 'Kosovo', 'N', 'N', 'Type=proper|Gender=neuter|Number=singular|Case=nominative', 'Type=proper|Gender=neuter|Number=singular|Case=nominative', '3', '3', 'NSUBJ', 'NSUBJ', '_', '_', 'ACT'], ['2', 'ozbiljno', 'ozbiljno', 'ozbiljno', 'R', 'R', 'Type=general|Degree=positive', 'Type=general|Degree=positive', '3', '3', 'ADVMOD', 'ADVMOD', '_', '_', 'MANN'], ['3', 'analizira', 'analizirati', 'analizirati', 'V', 'V', 'Type=main|VForm=present|Person=third|Number=singular', 'Type=main|VForm=present|Person=third|Number=singular', '0', '0', 'ROOT', 'ROOT', 'Y', 'analizirati', '_'], ['4', 'proces', 'proces', 'proces', 'N', 'N', 'Type=common|Gender=masculine|Number=singular|Case=accusative|Animate=no', 'Type=common|Gender=masculine|Number=singular|Case=accusative|Animate=no', '3', '3', 'DOBJ', 'DOBJ', '_', '_', 'PAT'], ['5', 'privatizacije', 'privatizacija', 'privatizacija', 'N', 'N', 'Type=common|Gender=feminine|Number=singular|Case=genitive', 'Type=common|Gender=feminine|

In [24]:
crf.predict([X_train[0]])

[['ACT', 'MANN', 'PAT', '_', '_', 'REG', '_', '_', '_']]

In [45]:
X_test, y_test = preprocess('data/hr.test.plus')

[['1', 'Vjeruje', 'vjerovati', 'vjerovati', 'V', 'V', 'Type=main|VForm=present|Person=third|Number=singular', 'Type=main|VForm=present|Person=third|Number=singular', '0', '0', 'ROOT', 'ROOT', 'Y', 'vjerovati', '_', '_'], ['2', 'se', 'sebe', 'sebe', 'P', 'P', 'Type=reflexive|Number=singular|Case=accusative', 'Type=reflexive|Number=singular|Case=accusative', '1', '1', 'COMPOUND', 'COMPOUND', '_', '_', '_', '_'], ['3', 'da', 'da', 'da', 'C', 'C', 'Type=subordinating', 'Type=subordinating', '7', '7', 'MARK', 'MARK', '_', '_', '_', '_'], ['4', 'se', 'sebe', 'sebe', 'P', 'P', 'Type=reflexive|Number=singular|Case=accusative', 'Type=reflexive|Number=singular|Case=accusative', '7', '7', 'COMPOUND', 'COMPOUND', '_', '_', '_', '_'], ['5', 'u', 'u', 'u', 'S', 'S', 'Case=locative', 'Case=locative', '6', '6', 'CASE', 'CASE', '_', '_', '_', '_'], ['6', 'Poljskoj', 'Poljska', 'Poljska', 'N', 'N', 'Type=proper|Gender=feminine|Number=singular|Case=locative', 'Type=proper|Gender=feminine|Number=singular|

In [46]:
import sklearn_crfsuite
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.2009,
    c2=0.0284,
    max_iterations=100,
    all_possible_transitions=True
)

In [47]:
model = crf.fit(X_train, y_train)

In [48]:
y_pred = crf.predict(X_test)

In [50]:
import joblib
joblib.dump(crf, "ac.pkl") 

['ac.pkl']

In [49]:
from sklearn_crfsuite import metrics
labels = crf.classes_
labels.remove("_")
print(metrics.flat_classification_report(y_test, y_pred, digits=2, labels=labels))

              precision    recall  f1-score   support

         ACT       0.90      0.94      0.92       962
        MANN       0.57      0.53      0.55       100
         PAT       0.80      0.81      0.80      1052
         REG       0.50      0.32      0.39        47
       PHRAS       0.17      0.08      0.11        12
         REC       0.84      0.62      0.72       117
       RESLT       0.82      0.82      0.82       562
         LOC       0.45      0.61      0.52        97
       MODAL       0.81      0.98      0.89       115
        TIME       0.77      0.82      0.80       238
        ORIG       0.64      0.58      0.61        64
         DUR       0.78      0.49      0.60        88
       CAUSE       0.82      0.26      0.40        53
        FREQ       1.00      0.67      0.80        15
      SOURCE       1.00      0.18      0.31        11
         AIM       0.67      0.49      0.56        45
       QUANT       0.79      0.53      0.64        43
        COND       0.67    

In [9]:
from collections import Counter
def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-8s %s" % (weight, label, attr))

print("Top positive:")
print_state_features(Counter(crf.state_features_).most_common(300))

print("\nTop negative:")
print_state_features(Counter(crf.state_features_).most_common()[-300:])

Top positive:
7.657479 TIME     LeftWord:nakon
7.425624 _        ArgFets:Type=participle
7.030543 LOC      ArgFets:Type=proper|Gender=feminine|Number=singular|Case=locative
7.011836 CAUSE    LeftWord:zbog
6.914578 TIME     ArgWord:sada
6.817562 TIME     LeftWord:prije
6.699139 DUR      ArgWord:uvijek
6.684276 DUR      ArgWord:dalje
6.537859 _        RightSiblingWord:suverenitet
6.337272 RESTR    LeftWord:Osim
6.190760 PAT      DeprelPath:PARATAXIS up
6.184445 _        DeprelPath:AUXPASS down
6.180242 MEANS    LeftWord:kroz
6.142283 MANN     ArgWord:način
6.127979 PAT      DeprelPath:CSUBJPASS down COP down
5.952270 TIME     ArgWord:srijedu
5.932529 QUANT    ArgWord:Toliko
5.889607 RESLT    ArgWord:NATO-a
5.757063 TIME     ArgWord:ponedjeljak
5.747483 _        ArgWord:također
5.697885 _        PredPos:N
5.695911 DUR      ArgWord:dugo
5.673933 RESLT    DeprelPath:CSUBJPASS down COP down
5.641713 PAT      ArgWord:gubitaka
5.615423 TIME     ArgWord:utorak
5.591982 RESLT    ArgWord:Chevron


In [9]:
def custom_flatten (X, y):
    data = [feature for sentence in X for feature in sentence]
    labels = [label for sentence in y for label in sentence]
    return data, labels

train_data, train_labels = custom_flatten (X_train, y_train)
test_data, test_labels = custom_flatten (X_test, y_test)

In [10]:
from sklearn.feature_extraction import DictVectorizer
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
#from sklearn.svm import LinearSVC

vec = DictVectorizer(sparse=True)
enc = preprocessing.LabelEncoder()

clf = LogisticRegression(solver='liblinear')

X = vec.fit_transform(train_data)
y = enc.fit_transform(train_labels)
clf.fit(X,y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [11]:
test_data = vec.transform(test_data)
pred_labels = clf.predict(test_data)
pred_labels = enc.inverse_transform(pred_labels)

  if diff:


In [15]:
from sklearn.metrics import classification_report
labels = list(enc.classes_)
labels.remove("_")

filtered = []
filtered_pred = []

for idx, label in enumerate(test_labels):
    if label != "_":
        filtered.append (label)
        filtered_pred.append(pred_labels[idx])
print(classification_report(filtered, filtered_pred, target_names=labels, digits=2))

             precision    recall  f1-score   support

       ACMP       0.73      0.48      0.58        23
        ACT       0.91      0.95      0.93       962
        AIM       0.64      0.20      0.31        45
      CAUSE       0.40      0.08      0.13        53
       COND       0.60      0.23      0.33        13
      CONTR       0.67      0.14      0.24        14
        DUR       0.71      0.27      0.39        88
      EVENT       0.59      0.50      0.54        26
       FREQ       1.00      0.33      0.50        15
       GOAL       0.62      0.19      0.29        43
        LOC       0.52      0.62      0.57        97
       MANN       0.60      0.43      0.50       100
      MEANS       0.44      0.57      0.50        21
      MODAL       0.80      0.98      0.88       115
     MWPRED       0.91      0.29      0.43        35
       ORIG       0.75      0.42      0.54        64
        PAT       0.82      0.79      0.80      1052
      PHRAS       0.00      0.00      0.00   

  .format(len(labels), len(target_names))
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
