In [None]:
import glob
import string
import fileinput
import pandas as pd
import numpy as np
import sklearn
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

## Define features

In [None]:
def word2features(sent, i, vector_dim=100):
    """Build the CRF feature dict for the token at position ``i`` of ``sent``.

    Every row of ``sent`` is indexed positionally: the token is read from
    index 1 and the embedding components from index 3 onwards.

    The previous implementation sliced ``[3:102]``, which yields only 99
    components even though the notebook splits the vector into 100 features;
    the slice is now ``[3:3 + vector_dim]``.

    Args:
        sent: Sequence of rows (indexable, e.g. lists) forming one sequence.
        i: Position of the current token inside ``sent``.
        vector_dim: Number of embedding components taken per token
            (default 100).

    Returns:
        dict with a bias term, word-shape features and the raw embedding
        slice for the current token, the same features for the previous and
        next token when they exist (keys prefixed with ``-1``/``+1``), and
        ``BOS``/``EOS`` flags at the sequence boundaries.

    NOTE(review): ``word[5:]`` is everything *after* the first five
    characters; if a five-character prefix was intended it should be
    ``word[:5]`` - confirm before changing, since it alters the feature set.
    NOTE(review): embedding components are passed through exactly as stored
    in the row (no numeric conversion is done here).
    """
    vector_end = 3 + vector_dim

    def describe(row, offset_tag):
        # offset_tag is '' for the current token, '-1' / '+1' for neighbours.
        # Word keys use "<tag>:" as prefix, the vector key uses the bare tag.
        token = row[1]
        word_prefix = offset_tag + ':' if offset_tag else ''
        return {
            word_prefix + 'word.lower()': token.lower(),
            word_prefix + 'word[5:]': token[5:],
            word_prefix + 'word[-5:]': token[-5:],
            word_prefix + 'word.isdigit()': token.isdigit(),
            offset_tag + 'vector': row[3:vector_end],
        }

    features = {'bias': 1.0}
    features.update(describe(sent[i], ''))

    if i > 0:
        features.update(describe(sent[i - 1], '-1'))
    else:
        # First token of the sequence.
        features['BOS'] = True

    if i < len(sent) - 1:
        features.update(describe(sent[i + 1], '+1'))
    else:
        # Last token of the sequence.
        features['EOS'] = True

    return features


def sent2features(sent):
    """Return one feature dict per row of ``sent``, in row order."""
    feature_dicts = []
    for position in range(len(sent)):
        feature_dicts.append(word2features(sent, position))
    return feature_dicts

def sent2labels(sent): #use version of sent without vectors
    """Return the label of every ``(nr, token, label)`` row in ``sent``.

    Rows must have exactly three items, so pass the version of the sequence
    without the vector columns.
    """
    labels = []
    for _nr, _token, label in sent:
        labels.append(label)
    return labels

def sent2tokens(sent): #use version of sent without vectors
    """Return the token of every ``(nr, token, label)`` row in ``sent``.

    Rows must have exactly three items, so pass the version of the sequence
    without the vector columns.
    """
    tokens = []
    for _nr, token, _label in sent:
        tokens.append(token)
    return tokens

## Prepare train set

In [None]:
# Load the merged token/label/vector table used for training and preview it.
features_train = pd.read_csv('MERGED_TOKEN-LABEL-VECTOR-FILE_TRAIN.csv')
features_train.head()

In [None]:
# Rows with a missing label get the tag 'O' (the tag that is later excluded
# from the evaluation labels).
features_train['label'] = features_train['label'].fillna('O')

In [None]:
# Preview the table after filling the missing labels.
features_train.head()

In [None]:
# Remove the 'attribute' column, which is not used as a feature.
# `columns=` replaces the positional axis argument (`drop('attribute', 1)`),
# which pandas deprecated in 1.3 and no longer accepts from 2.0 on.
features_train = features_train.drop(columns='attribute')

In [None]:
# Normalise the textual vector representation to single-space-separated
# components: drop the surrounding brackets, trim, and collapse line breaks
# and runs of whitespace.
# `regex=False` makes the bracket/newline replacements literal regardless of
# the pandas version's default, and the whitespace pattern is a raw string
# because '\s' is not a valid Python string escape.
features_train['vector'] = (
    features_train['vector']
    .str.replace('[', '', regex=False)
    .str.replace(']', '', regex=False)
    .str.strip()
    .str.replace('\n', ' ', regex=False)
    .replace(r'\s+', ' ', regex=True)
)

In [None]:
# Keep token and label, and spread the space-separated vector text over one
# column per component.
train_token_label = features_train[['token', 'label']]
train_components = features_train['vector'].str.split(' ', expand=True)
features_train = pd.concat([train_token_label, train_components], axis=1)

In [None]:
# Preview the table with one column per vector component.
features_train.head()

In [None]:
# Persist the feature table. The DataFrame index is written as the first CSV
# column (pandas default), so each row reads back as [nr, token, label, ...].
features_train.to_csv('MERGED_TOKEN-LABEL-VECTOR-FILE_features_train.csv')

In [None]:
from csv import reader

# Read the feature CSV back as plain lists of strings (header row included).
# The file was written by pandas as UTF-8, so the encoding is stated
# explicitly instead of relying on the platform default, and newline='' is
# used as the csv module requires for correct handling of quoted fields.
with open('MERGED_TOKEN-LABEL-VECTOR-FILE_features_train.csv', 'r',
          encoding='utf-8', newline='') as read_obj:
    csv_reader = reader(read_obj)
    sent = list(csv_reader)

In [None]:
# Remove the CSV header row. Guarded so that re-running the cell does not
# silently discard the first data row: the header is recognised by the
# 'token' / 'label' column names written by to_csv.
if sent and sent[0][1:3] == ['token', 'label']:
    sent.pop(0)

In [None]:
# All rows of the file are wrapped as ONE sequence: the CRF sees the whole
# training file as a single "sentence".
train_sents = [sent]

In [None]:
# Vector-free copy of the rows: only [nr, token, label] is kept.
sent2 = [row[:3] for row in sent]

In [None]:
# Single-sequence wrapper around the vector-free rows (used for the labels).
train_sents2 = [sent2]

In [None]:
# Features come from the rows with vectors, labels from the vector-free rows.
X_train = list(map(sent2features, train_sents))
Y_train = list(map(sent2labels, train_sents2))

### Split vectors into 100 features

In [None]:
# Replace each list-valued vector feature by one scalar feature per
# component: 'vector' -> 'vector1', 'vector2', ...; likewise for the
# '-1vector' / '+1vector' context vectors.
#
# Every key is handled independently. The earlier version used
# `else: continue` after the '-1vector' check, so the first token of a
# sequence (which has no previous token) never had its '+1vector' expanded.
# Because missing keys are skipped, re-running the cell is a no-op instead
# of a KeyError.
for item in X_train:
    for word in item:
        for vector_key in ('vector', '-1vector', '+1vector'):
            if vector_key not in word:
                continue
            components = word.pop(vector_key)
            for position, element in enumerate(components, start=1):
                word['{}{}'.format(vector_key, position)] = element

## Prepare test set

In [None]:
# Load the table used as the test set and preview it.
# NOTE(review): this is the same '..._TRAIN.csv' path that the training set
# is read from, so the evaluation below would run on the training data.
# Presumably a separate test file was intended - confirm and fix the path.
features_test = pd.read_csv('MERGED_TOKEN-LABEL-VECTOR-FILE_TRAIN.csv')
features_test.head()

In [None]:
# Rows with a missing label get the tag 'O'.
features_test['label'] = features_test['label'].fillna('O')

In [None]:
# Preview the table after filling the missing labels.
features_test.head()

In [None]:
# Remove the 'attribute' column, which is not used as a feature.
# `columns=` replaces the positional axis argument (`drop('attribute', 1)`),
# which pandas deprecated in 1.3 and no longer accepts from 2.0 on.
features_test = features_test.drop(columns='attribute')

In [None]:
# Normalise the textual vector representation to single-space-separated
# components: drop the surrounding brackets, trim, and collapse line breaks
# and runs of whitespace.
# `regex=False` makes the bracket/newline replacements literal regardless of
# the pandas version's default, and the whitespace pattern is a raw string
# because '\s' is not a valid Python string escape.
features_test['vector'] = (
    features_test['vector']
    .str.replace('[', '', regex=False)
    .str.replace(']', '', regex=False)
    .str.strip()
    .str.replace('\n', ' ', regex=False)
    .replace(r'\s+', ' ', regex=True)
)

In [None]:
# Keep token and label, and spread the space-separated vector text over one
# column per component.
test_token_label = features_test[['token', 'label']]
test_components = features_test['vector'].str.split(' ', expand=True)
features_test = pd.concat([test_token_label, test_components], axis=1)

In [None]:
# Persist the feature table. The DataFrame index is written as the first CSV
# column (pandas default), so each row reads back as [nr, token, label, ...].
features_test.to_csv('MERGED_TOKEN-LABEL-VECTOR-FILE_features_test.csv')

In [None]:
# Read the feature CSV back as plain lists of strings (header row included).
# The file was written by pandas as UTF-8, so the encoding is stated
# explicitly instead of relying on the platform default, and newline='' is
# used as the csv module requires for correct handling of quoted fields.
with open('MERGED_TOKEN-LABEL-VECTOR-FILE_features_test.csv', 'r',
          encoding='utf-8', newline='') as read_obj:
    csv_reader = reader(read_obj)
    sent3 = list(csv_reader)

In [None]:
# Remove the CSV header row. Guarded so that re-running the cell does not
# silently discard the first data row: the header is recognised by the
# 'token' / 'label' column names written by to_csv.
if sent3 and sent3[0][1:3] == ['token', 'label']:
    sent3.pop(0)

In [None]:
# All rows of the file are wrapped as ONE sequence for prediction.
test_sents = [sent3]

In [None]:
# Vector-free copy of the rows: only [nr, token, label] is kept.
sent4 = [row[:3] for row in sent3]

In [None]:
# Single-sequence wrapper around the vector-free rows (used for the labels).
test_sents2 = [sent4]

In [None]:
# Features come from the rows with vectors, labels from the vector-free rows.
X_test = list(map(sent2features, test_sents))
Y_test = list(map(sent2labels, test_sents2))

### Split vectors into 100 features

In [None]:
# Replace each list-valued vector feature by one scalar feature per
# component: 'vector' -> 'vector1', 'vector2', ...; likewise for the
# '-1vector' / '+1vector' context vectors.
#
# Every key is handled independently. The earlier version used
# `else: continue` after the '-1vector' check, so the first token of a
# sequence (which has no previous token) never had its '+1vector' expanded.
# Because missing keys are skipped, re-running the cell is a no-op instead
# of a KeyError.
for item in X_test:
    for word in item:
        for vector_key in ('vector', '-1vector', '+1vector'):
            if vector_key not in word:
                continue
            components = word.pop(vector_key)
            for position, element in enumerate(components, start=1):
                word['{}{}'.format(vector_key, position)] = element

## Train CRF-model

In [None]:
# Baseline CRF trained with L-BFGS and fixed c1/c2 regularisation values,
# capped at 100 iterations.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, Y_train)

## Evaluate the model

In [None]:
# Evaluate on every label the model learned except 'O', so the scores are
# not dominated by that tag. `remove` raises ValueError if 'O' is absent.
labels = list(crf.classes_)
labels.remove('O')

In [None]:
# Predict the test sequences and show the weighted F1 over the non-'O' labels.
Y_pred = crf.predict(X_test)
metrics.flat_f1_score(Y_test, Y_pred,
                      average='weighted', labels=labels)

In [None]:
# Per-label precision / recall / F1 for the non-'O' labels, 3 decimals.
print(metrics.flat_classification_report(
    Y_test, Y_pred, labels=labels, digits=3
))

In [None]:
%%time #Not working if n_samples = 1
# define fixed parameters and parameters to search
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels)

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)
rs.fit(X_train, Y_train)

In [None]:
from collections import Counter

def print_transitions(trans_features):
    """Print one aligned ``from -> to weight`` line per transition.

    ``trans_features`` is an iterable of ``((label_from, label_to), weight)``
    pairs.
    """
    for transition, weight in trans_features:
        source_label, target_label = transition
        line = "%-6s -> %-7s %0.6f" % (source_label, target_label, weight)
        print(line)

# Counter.most_common orders the (from, to) -> weight mapping by weight,
# highest first: the head holds the most likely transitions, the tail the
# most strongly penalised ones.
print("Top likely transitions:")
print_transitions(Counter(crf.transition_features_).most_common(20))

print("\nTop unlikely transitions:")
print_transitions(Counter(crf.transition_features_).most_common()[-20:])

In [None]:
def print_state_features(state_features):
    """Print one ``weight label attribute`` line per state feature.

    ``state_features`` is an iterable of ``((attr, label), weight)`` pairs.
    """
    for feature_key, weight in state_features:
        attribute, tag = feature_key
        line = "%0.6f %-8s %s" % (weight, tag, attribute)
        print(line)

# Counter.most_common orders the (attribute, label) -> weight mapping by
# weight, highest first: the head holds the strongest positive state
# features, the tail the strongest negative ones.
print("Top positive:")
print_state_features(Counter(crf.state_features_).most_common(30))

print("\nTop negative:")
print_state_features(Counter(crf.state_features_).most_common()[-30:])

## Merge label and predicted label

In [None]:
from itertools import chain

# Flatten the per-sequence predictions into one list of tags, in order.
new_Y_pred = [tag for sequence in Y_pred for tag in sequence]

In [None]:
# Append the predicted tag to each [nr, token, label] row; zip stops at the
# shorter of the two inputs.
result = [row + [predicted] for row, predicted in zip(sent4, new_Y_pred)]

In [None]:
# Tabulate token, gold label and predicted label; the row number column is
# dropped. `columns=` replaces the positional axis argument (`drop('nr', 1)`),
# which pandas deprecated in 1.3 and no longer accepts from 2.0 on.
df = pd.DataFrame(result, columns=['nr', 'token', 'label', 'pred']).drop(columns='nr')

In [None]:
# Write gold and predicted labels side by side (the index is included as the
# first column, pandas default).
df.to_csv('TEST_PRED.csv')