In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')

# Eventually, for Anaconda warnings.
# Can be commented out.
import warnings
warnings.filterwarnings("ignore")

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [4]:
# Load basic libraries
import seaborn; seaborn.set()
import pickle, copy, json
import numpy as np
import scipy.stats
from sklearn.metrics import make_scorer, confusion_matrix
from sklearn.model_selection import cross_val_score, RandomizedSearchCV, train_test_split
from sklearn.externals import joblib
from sklearn_crfsuite import scorers, metrics
import sklearn_crfsuite
from multiprocessing import Pool

In [None]:
# Remember to save this dataset beforehand!
# NOTE: unpickling can execute arbitrary code; only load files produced by a trusted source.
# The context manager makes sure the file handle is closed once the data is read.
with open("dataset/data.p", "rb") as dataset_file:
    data = pickle.load(dataset_file)
print(len(data))

In [None]:
# Generic Tagged BE Tag consolidation
# Maps every b-/i-/e- prefixed specific tag onto its consolidated tag: the "-full" and
# "-partial" variants collapse into one class, the other tags map to themselves.
correspondances = {
    '%s-%s' % (position, specific): '%s-%s' % (position, generic)
    for specific, generic in (
        ('primary-full', 'primary'),
        ('primary-partial', 'primary'),
        ('meta-annotation', 'meta-annotation'),
        ('secondary-full', 'secondary'),
        ('secondary-partial', 'secondary'),
    )
    for position in ('b', 'i', 'e')
}
correspondances['o'] = 'o'
# define supporting functions
window = 2
from code.feature_extraction_words import word2features, generate_featuresLight
def text2features(text):
    """Return one feature set per token of ``text``, in token order.

    Every position of ``text`` is handed to ``word2features`` together with the
    module-level ``window`` setting.
    """
    features = []
    for position in range(len(text)):
        features.append(word2features(text, position, window=window))
    return features
def text2featuresL(text):
    """Return one feature set per token of ``text`` using ``generate_featuresLight``.

    Same as ``text2features`` except that ``word2features`` is told to use
    ``generate_featuresLight`` as its ``feature_function``.
    """
    features = []
    for position in range(len(text)):
        token_features = word2features(
            text,
            position,
            window=window,
            feature_function=generate_featuresLight,
        )
        features.append(token_features)
    return features
# With extra specific tags. Adding specific tags improves performance.
def text2featuresEX(text, extra_labels):
    """Return one feature set per token of ``text``, also passing ``extra_labels``.

    ``extra_labels`` is forwarded unchanged as the third positional argument of
    ``word2features`` for every token position.
    """
    features = []
    for position in range(len(text)):
        features.append(word2features(text, position, extra_labels, window=window))
    return features
def text2featuresLEX(text, extra_labels):
    """Return per-token feature sets using ``extra_labels`` and ``generate_featuresLight``.

    Combines ``text2featuresEX`` and ``text2featuresL``: ``extra_labels`` is passed
    positionally and ``generate_featuresLight`` is used as the ``feature_function``.
    """
    features = []
    for position in range(len(text)):
        token_features = word2features(
            text,
            position,
            extra_labels,
            window=window,
            feature_function=generate_featuresLight,
        )
        features.append(token_features)
    return features

# create generic tags Y
def text2labelsG(text):
    """Return, for each token, ``correspondances`` applied to the tag at ``token[2][0]``."""
    labels = []
    for token in text:
        raw_tag = token[2][0]
        labels.append(correspondances[raw_tag])
    return labels

# create beginend tags Y
def text2labelsBE(text):
    """Return, for each token, the tag stored at ``token[2][2]`` (no consolidation applied)."""
    labels = []
    for token in text:
        labels.append(token[2][2])
    return labels

# create tagged-beginend tags Y
def text2labelsTBE(text):
    """Return, for each token, ``correspondances`` applied to the tag at ``token[2][3]``."""
    labels = []
    for token in text:
        raw_tag = token[2][3]
        labels.append(correspondances[raw_tag])
    return labels

# create specific tags Y
def text2labelsS(text):
    """Return, for each token, ``correspondances`` applied to the tag at ``token[2][1]``."""
    labels = []
    for token in text:
        raw_tag = token[2][1]
        labels.append(correspondances[raw_tag])
    return labels

In [None]:
# prepare data for CRF
# For every document, concatenate the "offsets" and "specific_tags" of its annotated
# pages; documents that contribute no annotated token are left out of both lists.
annotated_data = []
annotated_labels = []
for doc in data:
    annotated_pages = [page for page in doc["pages"].values() if page["is_annotated"]]
    doc_tokens = [token for page in annotated_pages for token in page["offsets"]]
    doc_tags = [tag for page in annotated_pages for tag in page["specific_tags"]]
    if doc_tokens:
        annotated_data.append(doc_tokens)
        annotated_labels.append(doc_tags)
print(len(annotated_data))
print(len(data))

In [None]:
# Define train and test sets for experiments
%%time
d = [text2featuresEX(text, lab) for text, lab in zip(annotated_data, annotated_labels)]
l = [text2labelsTBE(text) for text in annotated_data]
# Clean tag space
labels_to_keep = sorted(list(set([x for y in l for x in y])))
# VALIDATION set
X_rest, X_valid, y_rest, y_valid = train_test_split(d, l, test_size=0.1)
# TRAIN/TEST
X_train, X_test, y_train, y_test = train_test_split(X_rest, y_rest, test_size=0.25)

In [None]:
# Count labels
# One pass over all label sequences instead of re-scanning the corpus once per label.
counts = {label: 0 for label in labels_to_keep}
for sequence in l:
    for label in sequence:
        # Labels outside labels_to_keep are ignored, as before.
        if label in counts:
            counts[label] += 1
print(counts)

In [None]:
# An example use of CRFs
%%time
crf = sklearn_crfsuite.CRF( 
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=False
)
crf.fit(X_train, y_train)
y_pred = crf.predict(X_test)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=labels_to_keep, digits=3
))

In [None]:
# Parameters search
%%time
crf = sklearn_crfsuite.CRF( 
    max_iterations=100,
    algorithm = 'lbfgs',
    all_possible_transitions=False
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05)
}

scorer = make_scorer(metrics.flat_f1_score, 
                        average='weighted', labels=labels_to_keep)

# search
rs = RandomizedSearchCV(crf, params_space, 
                        cv=3, 
                        verbose=1, 
                        n_jobs=-15, 
                        n_iter=5, 
                        scoring=scorer)
rs.fit(X_train, y_train)

In [None]:
# Report the best hyper-parameters found by the randomized search (rs) and the
# corresponding cross-validation score.
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)

In [None]:
# classification report
# Evaluate the best estimator found by the randomized search on the test split.
crf = rs.best_estimator_
y_pred = crf.predict(X_test)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=labels_to_keep, digits=3
))
# Confusion matrices
from sklearn.metrics import confusion_matrix
from code.support_functions import flatten_predictions

# Build the matrix once and reuse it for both the printout and the heat map.
conf_matrix = confusion_matrix(flatten_predictions(y_test), flatten_predictions(y_pred), labels=labels_to_keep)
print(conf_matrix)
# Plotted on a log scale; cells with a zero count evaluate to -inf under np.log.
plt.imshow(np.log(conf_matrix), cmap='Blues', interpolation='nearest')
plt.grid(False)
plt.ylabel('Ground truth', fontsize=16)
plt.xlabel('Predicted', fontsize=16)
# One tick per label; tick i corresponds to labels_to_keep[i].
plt.xticks(np.arange(0, len(labels_to_keep), 1))
plt.yticks(np.arange(0, len(labels_to_keep), 1))
plt.title("Confusion Matrix Model 2", fontsize=16)

In [None]:
# K-fold validation
# Cross-validate a CRF with fixed hyper-parameters on everything except the
# held-out validation split, scoring each fold with weighted flat F1.
k = 5
scorer = make_scorer(
    metrics.flat_f1_score,
    average='weighted',
    labels=labels_to_keep,
)
# Fixed hyper-parameters (OR rs.best_params_)
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.44740028179508301,
    c2=0.093645710804034776,
    max_iterations=200,
    all_possible_transitions=True,
)

cv = cross_val_score(
    crf,
    X_rest,
    y_rest,
    cv=k,
    scoring=scorer,
    n_jobs=-2,
)
print("%d-fold validation mean: " % k, cv.mean())

In [None]:
# Learning curves
# ShuffleSplit is imported directly: the notebook only imports individual names from
# sklearn.model_selection, so the bare name `model_selection` is undefined.
from sklearn.model_selection import ShuffleSplit
from code.support_functions import plot_learning_curve

# Slices of data for learning curves
train_sizes=np.linspace(0.1, 1.0, 10)
title = "Learning Curves for Model 2"
message = "M2"
# Cross validation scheme with shuffled 80-20 splits, repeated for every train data size
# (to evaluate variance).
# NOTE(review): n_splits is not passed, so ShuffleSplit uses its library default rather
# than an explicit 5 iterations; confirm the intended number of repetitions.
cv = ShuffleSplit(test_size=0.2, random_state=0)
estimator = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c2= 0.093645710804034776, c1= 0.44740028179508301,
    max_iterations=200,
    all_possible_transitions=True
)
plot_learning_curve(estimator, title, X_rest, y_rest, labels_to_keep, cv=cv, train_sizes=train_sizes, n_jobs=-2, message=message)

In [None]:
# VALIDATION
%%time

crf = sklearn_crfsuite.CRF( 
    algorithm='lbfgs',
    c2= 0.093645710804034776, c1= 0.44740028179508301,
    max_iterations=500,
    all_possible_transitions=True
)
crf.fit(X_rest, y_rest)
y_pred = crf.predict(X_valid)
print(metrics.flat_classification_report(
    y_valid, y_pred, labels=labels_to_keep, digits=3
))

In [None]:
# Train final models for task 1
# The final model is fitted on every annotated document (d, l), with no held-out data.
final_model_params = {
    'algorithm': 'lbfgs',
    'c1': 0.44740028179508301,
    'c2': 0.093645710804034776,
    'max_iterations': 500,
    'all_possible_transitions': True,
}
crf = sklearn_crfsuite.CRF(**final_model_params)
crf.fit(d, l)

# save model
#joblib.dump(crf,'models/modelM2_ALL_L.pkl')

In [None]:
# load model
# NOTE(review): the matching joblib.dump call in the training cell is commented out, so
# this file must already exist on disk. `crf1` is not referenced by any other visible
# cell; process_document reads the module-level `crf` instead, so confirm which model is
# intended.
crf1 = joblib.load('models/modelM2_ALL_L.pkl')

In [None]:
def process_document(doc):
    """Attach a ``"BET_tags"`` entry to every page of ``doc`` and return ``doc``.

    Annotated pages get their tags from ``text2labelsTBE``; all other pages are
    tagged by the module-level ``crf`` model, using features built from the
    page's ``"offsets"`` and ``"specific_tags"``. Pages are updated in place.
    """
    for page in doc["pages"].values():
        if page["is_annotated"]:
            tags = text2labelsTBE(page["offsets"])
        else:
            # The model expects a batch of sequences, so wrap the single page in a list.
            page_features = text2featuresEX(page["offsets"], page["specific_tags"])
            predictions = crf.predict([page_features])
            # Exactly one predicted tag per token is required.
            assert len(predictions[0]) == len(page["offsets"])
            tags = predictions[0]
        page.update({"BET_tags": tags})
    return doc
            
# Pool of 45 worker processes (despite the name, these are processes, not threads).
threads = Pool(processes=45)

In [None]:
# parse all
# Tag every document in the worker pool. Results are collected in completion order,
# so data2 is not guaranteed to follow the order of data.
data2 = []
data2.extend(threads.imap_unordered(process_document, data))

In [None]:
#pickle.dump(data2, open("data/data.p", "wb"))

In [None]:
# parse the references in a more json-like format
# json_outputter returns three values; only the second one is kept (presumably the
# parsed references; verify against code.support_functions).
# NOTE(review): the meaning of the second argument (40) is not visible here; confirm it.
from code.support_functions import json_outputter
_, refs, _ = json_outputter(data2, 40)

In [None]:
# Print a single entry of refs as a spot check.
print(refs[10])