In [1]:
EOL = "\n"
SEP = "\t"
ETY = ""

In [2]:
def generate_featuresets (annotations):
    def get_parent(word, sentence):
        for item in sentence:
            if item[0] == word[8]:
                return item
        return []
        
    def get_children(word, sentence):
         for item in sentence:
            if item[8] == word[0]:
                yield item
    
    def feats_to_dict (feature):
        dict = {
            "Type": "",
            "Degree" : "",
            "Gender": "",
            "Number": "",
            "Case": "",
            "Definiteness": "", 
            "VForm": "",
            "Person": ""
        }
        for item in feature.split("|"):
            if "=" in item:
                label, value = item.split("=")
                dict[label] = value
        return dict
    
    for sentence in annotations:
        sentence_length = len(sentence)
        featuresets = []
        labels = []
        for word in sentence:
            word_feats = feats_to_dict(word[6])
            
            featureset = {
                'BOS': False,
                'EOS': False,
                'PredWord': word[2].lower(),
                'PredPos': word[4],
                'PredDeprel': word[10],
                'PredType': word_feats['Type'],
                'PredDegree': word_feats['Degree'],
                'PredGender': word_feats['Gender'],
                'PredNumber': word_feats['Number'],
                'PredCase': word_feats['Case'],
                'PredDefiniteness': word_feats['Definiteness'],
                'PredVForm': word_feats['VForm'],
                'PredPerson': word_feats['Person'],
                #'PredFeats': word[6],
                #'PredParentWord': "",
                #'PredParentPos': "",
                #'PredParentFeats': "",
                #'DepSubCat': "",
                #'ChildDepSet': "",
                #'ChildWordSet': "",
                #'ChildPOSSet': ""
            }
            
            if int(word[0]) == 1:
                featureset['BOS'] = True
            if int(word[0]) == sentence_length:
                featureset['EOS'] = True
            parent_word = get_parent(word, sentence)
            
            if len(parent_word) > 0:
                
                parent_feats = feats_to_dict(parent_word[6])
                
                featureset['PredParentWord'] = parent_word[2]
                featureset['PredParentPos'] = parent_word[4]
                featureset['PredParentFeats'] = parent_word[6]
                
                featureset['PredParentType']= parent_feats['Type']
                featureset['PredParentDegree'] =parent_feats['Degree']
                featureset['PredParentGender'] = parent_feats['Gender']
                featureset['PredParentNumber'] = parent_feats['Number']
                featureset['PredParentCase'] = parent_feats['Case']
                featureset['PredParentDefiniteness'] = parent_feats['Definiteness']
                featureset['PredParentVForm'] = parent_feats['VForm']
                featureset['PredParentPerson'] = parent_feats['Person']
                featureset['PredParentDeprel'] = parent_word[10] 
                
                
            children_words = list(get_children(word, sentence))
            if len(children_words)>0:
                DepSubChildren = [word]
                DepSubChildren.extend(children_words)
                
                DepSubChildren = sorted(DepSubChildren, key=lambda x: x[0], reverse=True)
                DepSubCat = ','.join([word[10] for word in DepSubChildren])
                ChildDepSet = ','.join([word[10] for word in children_words])
                ChildWordSet = ','.join([word[2] for word in children_words])
                ChildPOSSet = ','.join([word[4] for word in children_words])
                ChildCaseSet = ','.join([feats_to_dict(word[6])['Case'] for word in children_words])
                ChildTypeSet = ','.join([feats_to_dict(word[6])['Type'] for word in children_words])
                
                featureset['DepSubCat'] = DepSubCat
                featureset['ChildDepSet'] = ChildDepSet
                featureset['ChildWordSet'] = ChildWordSet
                featureset['ChildPOSSet'] = ChildPOSSet
                featureset['ChildCaseSet'] = ChildCaseSet
                featureset['ChildTypeSet'] = ChildTypeSet
            featuresets.append (featureset)
            labels.append("Y" if word[13] != "_"  else "_")
        yield featuresets, labels

In [3]:
file = open("data/hr.train.plus","r")
annotations = []
annotation = []
i = 0
for line in file:
    parts = line.replace(EOL, ETY).split(SEP)
    if len(parts) > 1:
        annotation.append(parts)
    else:
        annotations.append(annotation)
        annotation = []
    i = i + 1 
print ((i))
X_train, y_train = zip(*list(generate_featuresets(annotations)))

69998


In [4]:
file = open("data/hr.test.plus","r")
test_annotations = []
test_annotation = []
j = 0
for line in file:
    parts = line.replace(EOL, ETY).split(SEP)
    if len(parts) > 1:
        test_annotation.append(parts)
    else:
        test_annotations.append(test_annotation)
        test_annotation = []
    j = j + 1
print (j)
X_test, y_test = zip(*list(generate_featuresets(test_annotations)))

17389


In [5]:
import sklearn_crfsuite

crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.2009,
    c2=0.0284,
    max_iterations=100,
    all_possible_transitions=True,
    all_possible_states = True
)

model = crf.fit(X_train, y_train)

In [6]:
y_pred = model.predict(X_test)

In [7]:
from sklearn_crfsuite import metrics
labels = list(model.classes_)
print(metrics.flat_classification_report(y_test, y_pred, digits=4, labels=labels))

              precision    recall  f1-score   support

           _     0.9958    0.9948    0.9953     14614
           Y     0.9627    0.9698    0.9662      2021

    accuracy                         0.9918     16635
   macro avg     0.9792    0.9823    0.9808     16635
weighted avg     0.9918    0.9918    0.9918     16635



In [9]:
from collections import Counter
def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-8s %s" % (weight, label, attr))

print("Top positive:")
print_state_features(Counter(crf.state_features_).most_common(300))

print("\nTop negative:")
print_state_features(Counter(crf.state_features_).most_common()[-300:])

Top positive:
4.289948 Y        PredWord:podrijetlo
3.282619 _        ChildDepSet:MARK,COMPOUND
3.248193 Y        PredWord:spreman
3.178922 Y        PredWord:zainteresiran
3.055432 Y        PredWord:zaključivanje
2.923477 _        ChildDepSet:MARK
2.865765 _        PredParentWord:dirnuti
2.841478 Y        PredWord:rumunjska
2.794446 Y        PredWord:povezan
2.779790 Y        PredWord:bukurešt
2.761413 Y        ChildWordSet:zbog,spor,plan
2.701377 _        PredParentWord:uprti
2.638183 _        PredWord:htjeti
2.608619 _        DepSubCat:DOBJ,CSUBJ
2.585690 Y        PredParentWord:nastati
2.568885 _        ChildWordSet:što,pristati
2.468990 Y        PredType:participle
2.393732 Y        ChildWordSet:oštro
2.369380 _        PredParentWord:odan
2.354833 _        ChildDepSet:MARK,AUX
2.345443 _        PredPos:P
2.314798 _        PredParentWord:razasuti
2.299645 _        PredDeprel:AUX
2.254010 Y        PredWord:voljan
2.231269 _        ChildWordSet:,,kako,sebe,,
2.210735 _        PredPare

In [383]:
from sklearn.externals import joblib
joblib.dump(model, 'pi.pkl') 

['pi.pkl']

In [359]:
def custom_flatten (X, y):
    data = [feature for sentence in X for feature in sentence]
    labels = [label for sentence in y for label in sentence]
    return data, labels
X_train, y_train = custom_flatten (X_train, y_train)
X_test, y_test = custom_flatten (X_test, y_test)

In [360]:
from sklearn.feature_extraction import DictVectorizer
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
#from sklearn.svm import LinearSVC

vec = DictVectorizer(sparse=True)
enc = preprocessing.LabelEncoder()

clf = LogisticRegression(solver='lbfgs')

X = vec.fit_transform(X_train)
y = enc.fit_transform(y_train)
clf.fit(X,y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='lbfgs', tol=0.0001,
          verbose=0, warm_start=False)

In [361]:
X_test = vec.transform(X_test)
y_pred = clf.predict(X_test)
y_pred = enc.inverse_transform(y_pred)

  if diff:


In [362]:
from sklearn.metrics import classification_report
labels = list(enc.classes_)
print(classification_report(y_test, y_pred, target_names=labels, digits=4))

             precision    recall  f1-score   support

          Y       0.96      0.97      0.96      2021
          _       1.00      0.99      1.00     14614

avg / total       0.99      0.99      0.99     16635

