In [1]:
import glob
import string
import fileinput
import pandas as pd
import numpy as np
import sklearn
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

## Define features

In [2]:
def word2features(sent, i):
    word = sent[i][1]
    vector = sent[i][3:102]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[5:]': word[5:],
        'word[-5:]': word[-5:],
        'word.isdigit()': word.isdigit(),
        'vector': vector,
    }
    if i > 0:
        word1 = sent[i-1][1]
        vector1 = sent[i-1][3:102]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word[5:]': word1[5:],
            '-1:word[-5:]': word1[-5:],
            '-1:word.isdigit()': word1.isdigit(),
            '-1vector': vector1,
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][1]
        vector1 = sent[i+1][3:102]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word[5:]': word1[5:],
            '+1:word[-5:]': word1[-5:],
            '+1:word.isdigit()': word1.isdigit(),
            '+1vector': vector1,
        })
    else:
        features['EOS'] = True

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent): #use version of sent without vectors
    return [label for nr, token, label in sent]

def sent2tokens(sent): #use version of sent without vectors
    return [token for nr, token, label in sent]

## Prepare train set

In [2]:
features_train = pd.read_csv('../output/alle_Brug_Wall/alle_Brug_Wall_train.csv')
features_train.head()

Unnamed: 0,token,label,attribute,vector
0,merckenweerdigste,,,[ 0.1705316 -0.0306047 -0.00223772 0.159865...
1,voorvallen,,,[ 0.21780282 0.66156584 -0.3282248 0.618386...
2,en,,,[-0.12152633 -0.34552127 0.6551932 0.300286...
3,daegelijcksche,,,[ 1.8886597e-01 -8.9977644e-02 -2.6038742e-01 ...
4,gevallen,,,[ 0.07864238 0.2405637 -0.41845563 0.336337...


In [3]:
features_train['label'] = features_train['label'].fillna('O')

In [4]:
features_train.head()

Unnamed: 0,token,label,attribute,vector
0,merckenweerdigste,O,,[ 0.1705316 -0.0306047 -0.00223772 0.159865...
1,voorvallen,O,,[ 0.21780282 0.66156584 -0.3282248 0.618386...
2,en,O,,[-0.12152633 -0.34552127 0.6551932 0.300286...
3,daegelijcksche,O,,[ 1.8886597e-01 -8.9977644e-02 -2.6038742e-01 ...
4,gevallen,O,,[ 0.07864238 0.2405637 -0.41845563 0.336337...


In [10]:
features_train = features_train.drop('attribute', 1)

In [22]:
features_train['vector'] = features_train['vector'].str.replace('[', '')
features_train['vector'] = features_train['vector'].str.replace(']', '')
features_train['vector'] = features_train['vector'].str.strip()
features_train['vector'] = features_train['vector'].str.replace('\n', ' ')
features_train.vector = features_train.vector.replace('\s+', ' ', regex=True)

In [23]:
features_train = pd.concat([features_train[['token', 'label']], features_train['vector'].str.split(' ', expand=True)], axis=1)

In [24]:
features_train.head()

Unnamed: 0,token,label,0,1,2,3,4,5,6,7,...,90,91,92,93,94,95,96,97,98,99
0,merckenweerdigste,O,0.1705316,-0.0306047,-0.00223772,0.15986566,0.25695056,0.7164696,-0.2849065,-0.28343752,...,0.5931331,0.50308883,0.56332666,0.3110606,-0.3533712,-0.16419436,-0.5641507,0.26314676,0.6142619,-1.2206461
1,voorvallen,O,0.21780282,0.66156584,-0.3282248,0.61838645,0.44451284,0.27286738,-0.42026988,-0.39359006,...,0.32938528,0.2798662,0.779968,0.26112333,-0.28769144,-0.22493325,-0.33288112,0.36485076,0.20955795,-0.8348344
2,en,O,-0.12152633,-0.34552127,0.6551932,0.30028614,0.28003985,-0.4232091,-0.17500727,-0.10477021,...,0.44797343,-0.11305293,-0.36007732,-0.15637235,-0.48208672,-0.06710143,0.57227874,-0.21997964,0.10103169,-0.37744886
3,daegelijcksche,O,0.18886597,-0.089977644,-0.26038742,0.33972451,0.26794219,0.7117458,-0.53982091,0.15207386,...,0.35904276,0.092466295,0.5689435,0.36448163,-0.068343125,-0.037124392,-0.42610043,0.24748258,0.27695256,-1.1017174
4,gevallen,O,0.07864238,0.2405637,-0.41845563,0.33633724,0.5995671,0.08893785,-0.20334555,-0.52136946,...,0.3153123,0.05954508,0.2579233,-0.05005457,-0.55560845,-0.42802623,0.25179183,-0.12036338,0.13740036,-0.9038651


In [25]:
features_train.to_csv('../output/alle_Brug_Wall/alle_Brug_Wall_features_train.csv')

In [3]:
from csv import reader

with open('../output/alle_Brug_Wall/alle_Brug_Wall_features_train.csv', 'r') as read_obj:
    csv_reader = reader(read_obj)
    sent = list(csv_reader)

In [4]:
sent.pop(0)

['',
 'token',
 'label',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 '10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '20',
 '21',
 '22',
 '23',
 '24',
 '25',
 '26',
 '27',
 '28',
 '29',
 '30',
 '31',
 '32',
 '33',
 '34',
 '35',
 '36',
 '37',
 '38',
 '39',
 '40',
 '41',
 '42',
 '43',
 '44',
 '45',
 '46',
 '47',
 '48',
 '49',
 '50',
 '51',
 '52',
 '53',
 '54',
 '55',
 '56',
 '57',
 '58',
 '59',
 '60',
 '61',
 '62',
 '63',
 '64',
 '65',
 '66',
 '67',
 '68',
 '69',
 '70',
 '71',
 '72',
 '73',
 '74',
 '75',
 '76',
 '77',
 '78',
 '79',
 '80',
 '81',
 '82',
 '83',
 '84',
 '85',
 '86',
 '87',
 '88',
 '89',
 '90',
 '91',
 '92',
 '93',
 '94',
 '95',
 '96',
 '97',
 '98',
 '99']

In [5]:
train_sents = [sent]

In [6]:
sent2 = [item[0:3] for item in sent]

In [7]:
train_sents2 = [sent2]

In [66]:
X_train = [sent2features(s) for s in train_sents]
Y_train = [sent2labels(s) for s in train_sents2]

### Split vectors in 100 features

In [67]:
for item in X_train:
    for word in item:
        word_vector = word['vector']
        for i, element in enumerate(word_vector):
            word['vector{}'.format(i+1)] = element
        word.pop('vector')
        if '-1vector' in word:
            word_vector_prev = word['-1vector']
            for i, element in enumerate(word_vector_prev):
                word['-1vector{}'.format(i+1)] = element
            word.pop('-1vector')
        else:
            continue
            
        if '+1vector' in word:
            word_vector_next = word['+1vector']
            for i, element in enumerate(word_vector_next):
                word['+1vector{}'.format(i+1)] = element
            word.pop('+1vector')
        else:
            continue

## Prepare test set

In [117]:
features_test = pd.read_csv('../output/alle_Brug_Wall/alle_Brug_Wall_test.csv')
features_test.head()

Unnamed: 0,token,label,attribute,vector
0,zijnde,,,[ 0.17457809 -0.06926309 -0.12756872 0.019270...
1,verders,,,[ 0.14125384 0.06924433 0.18649764 0.143644...
2,vrugteloos,,,[-0.0497899 0.2280807 0.04021712 -0.037615...
3,der,,,[ 9.36951023e-04 -1.80363223e-01 -8.54692310e-...
4,verciercelen,,,[ 0.32109752 0.01169947 0.23957802 -0.057510...


In [118]:
features_test['label'] = features_test['label'].fillna('O')

In [119]:
features_test.head()

Unnamed: 0,token,label,attribute,vector
0,zijnde,O,,[ 0.17457809 -0.06926309 -0.12756872 0.019270...
1,verders,O,,[ 0.14125384 0.06924433 0.18649764 0.143644...
2,vrugteloos,O,,[-0.0497899 0.2280807 0.04021712 -0.037615...
3,der,O,,[ 9.36951023e-04 -1.80363223e-01 -8.54692310e-...
4,verciercelen,O,,[ 0.32109752 0.01169947 0.23957802 -0.057510...


In [120]:
features_test = features_test.drop('attribute', 1)

In [121]:
features_test['vector'] = features_test['vector'].str.replace('[', '')
features_test['vector'] = features_test['vector'].str.replace(']', '')
features_test['vector'] = features_test['vector'].str.strip()
features_test['vector'] = features_test['vector'].str.replace('\n', ' ')
features_test.vector = features_test.vector.replace('\s+', ' ', regex=True)

In [122]:
features_test = pd.concat([features_test[['token', 'label']], features_test['vector'].str.split(' ', expand=True)], axis=1)

In [124]:
features_test.to_csv('../output/alle_Brug_Wall/alle_Brug_Wall_features_test.csv')

In [9]:
with open('../output/alle_Brug_Wall/alle_Brug_Wall_features_test.csv', 'r') as read_obj:
    csv_reader = reader(read_obj)
    sent3 = list(csv_reader)

In [10]:
sent3.pop(0)

['',
 'token',
 'label',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 '10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '20',
 '21',
 '22',
 '23',
 '24',
 '25',
 '26',
 '27',
 '28',
 '29',
 '30',
 '31',
 '32',
 '33',
 '34',
 '35',
 '36',
 '37',
 '38',
 '39',
 '40',
 '41',
 '42',
 '43',
 '44',
 '45',
 '46',
 '47',
 '48',
 '49',
 '50',
 '51',
 '52',
 '53',
 '54',
 '55',
 '56',
 '57',
 '58',
 '59',
 '60',
 '61',
 '62',
 '63',
 '64',
 '65',
 '66',
 '67',
 '68',
 '69',
 '70',
 '71',
 '72',
 '73',
 '74',
 '75',
 '76',
 '77',
 '78',
 '79',
 '80',
 '81',
 '82',
 '83',
 '84',
 '85',
 '86',
 '87',
 '88',
 '89',
 '90',
 '91',
 '92',
 '93',
 '94',
 '95',
 '96',
 '97',
 '98',
 '99']

In [11]:
test_sents = [sent3]

In [12]:
sent4 = [item[0:3] for item in sent3]

In [13]:
test_sents2 = [sent4]

In [73]:
X_test = [sent2features(s) for s in test_sents]
Y_test = [sent2labels(s) for s in test_sents2]

### Split vectors into 100 features

In [74]:
for item in X_test:
    for word in item:
        word_vector = word['vector']
        for i, element in enumerate(word_vector):
            word['vector{}'.format(i+1)] = element
        word.pop('vector')
        if '-1vector' in word:
            word_vector_prev = word['-1vector']
            for i, element in enumerate(word_vector_prev):
                word['-1vector{}'.format(i+1)] = element
            word.pop('-1vector')
        else:
            continue
            
        if '+1vector' in word:
            word_vector_next = word['+1vector']
            for i, element in enumerate(word_vector_next):
                word['+1vector{}'.format(i+1)] = element
            word.pop('+1vector')
        else:
            continue

## Train CRF-model

In [76]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, Y_train)

CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=True,
    averaging=None, c=None, c1=0.1, c2=0.1, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

## Evaluate the model

In [77]:
labels = list(crf.classes_)
labels.remove('O')

In [78]:
Y_pred = crf.predict(X_test)
metrics.flat_f1_score(Y_test, Y_pred,
                      average='weighted', labels=labels)

0.5554392098430706

In [79]:
print(metrics.flat_classification_report(
    Y_test, Y_pred, labels=labels, digits=3
))

                  precision    recall  f1-score   support

    waarneming-B      0.643     0.552     0.594       134
     ontvanger-B      0.828     0.578     0.681        83
informatiebron-B      0.513     0.488     0.500       121
informatiebron-I      0.670     0.448     0.537       145
    waarneming-I      0.500     0.143     0.222         7
     ontvanger-I      0.125     0.111     0.118         9

       micro avg      0.628     0.497     0.555       499
       macro avg      0.547     0.387     0.442       499
    weighted avg      0.639     0.497     0.555       499



In [190]:
%%time #Not working if n_samples = 1
# define fixed parameters and parameters to search
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels)

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)
rs.fit(X_train, Y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


ValueError: Cannot have number of splits n_splits=3 greater than the number of samples: n_samples=1.

In [80]:
from collections import Counter

def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

print("Top likely transitions:")
print_transitions(Counter(crf.transition_features_).most_common(20))

print("\nTop unlikely transitions:")
print_transitions(Counter(crf.transition_features_).most_common()[-20:])

Top likely transitions:
O      -> O       4.163518
ontvanger-I -> ontvanger-I 4.029864
ontvanger-B -> ontvanger-I 3.232955
informatiebron-I -> informatiebron-I 2.888268
waarneming-I -> waarneming-I 2.374591
informatiebron-B -> informatiebron-I 1.727776
waarneming-B -> waarneming-I 1.091248
waarneming-B -> ontvanger-B 0.940000
O      -> informatiebron-B 0.650229
ontvanger-B -> waarneming-B 0.183771
informatiebron-B -> waarneming-B 0.168950
waarneming-I -> O       0.033862
ontvanger-I -> ontvanger-B -0.228920
waarneming-I -> waarneming-B -0.267929
waarneming-B -> O       -0.269331
O      -> waarneming-B -0.335387
ontvanger-I -> waarneming-B -0.439106
informatiebron-I -> waarneming-I -0.443954
informatiebron-I -> waarneming-B -0.472384
ontvanger-B -> ontvanger-B -0.483306

Top unlikely transitions:
O      -> ontvanger-B -0.816157
waarneming-I -> informatiebron-I -0.817049
informatiebron-I -> ontvanger-B -0.943036
informatiebron-B -> ontvanger-I -0.977838
informatiebron-I -> O       -0.982

In [81]:
def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-8s %s" % (weight, label, attr))

print("Top positive:")
print_state_features(Counter(crf.state_features_).most_common(30))

print("\nTop negative:")
print_state_features(Counter(crf.state_features_).most_common()[-30:])

Top positive:
1.988054 informatiebron-I +1:word.isdigit()
1.744118 waarneming-B word[-5:]:lesen
1.472060 waarneming-B word[-5:]:ndigt
1.466354 O        bias
1.145304 informatiebron-B -1:word[5:]:nde
0.997835 O        -1:word[5:]:r
0.985641 informatiebron-B word[5:]:he
0.965363 waarneming-B word[5:]:men
0.886319 informatiebron-B word[-5:]:bulle
0.853099 informatiebron-B word[-5:]:ntiën
0.840574 informatiebron-B -1:word[-5:]:enden
0.834066 O        +1:word[5:]:den
0.808228 O        word[-5:]:heden
0.805295 waarneming-B word[-5:]:reijt
0.791764 informatiebron-I -1:word[-5:]:ijcke
0.786695 informatiebron-B word[-5:]:chten
0.763428 waarneming-B word[-5:]:hoort
0.732903 O        word[5:]:
0.726139 O        -1:word[-5:]:aegen
0.695841 O        word[-5:]:huijs
0.694459 O        word[-5:]:orden
0.692423 O        word[-5:]:elijk
0.675154 informatiebron-B word[-5:]:iften
0.670968 waarneming-B -1:word[-5:]:erael
0.667475 informatiebron-I -1:word[-5:]:ebode
0.647987 informatiebron-B -1:word[-5:]:ee

## Merge label and predicted label

In [82]:
from itertools import chain

new_Y_pred = list(chain.from_iterable(Y_pred))

In [83]:
result = [a+[x] for a,x in zip(sent4,new_Y_pred)]

In [84]:
df = pd.DataFrame(result, columns=['nr', 'token', 'label', 'pred']).drop('nr', 1)

In [85]:
df.to_csv('test_pred.csv')