In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')

# Eventually, for Anaconda warnings.
# Can be commented out.
import warnings
warnings.filterwarnings("ignore")

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
# Load basic libraries
import pickle, copy, json
from collections import Counter
from multiprocessing import Pool

import numpy as np
import scipy.stats
import seaborn; seaborn.set()
import sklearn_crfsuite
from sklearn.metrics import make_scorer, confusion_matrix
from sklearn.model_selection import cross_val_score, RandomizedSearchCV, train_test_split
from sklearn.model_selection import ShuffleSplit
from sklearn_crfsuite import scorers, metrics

# Newer scikit-learn releases no longer ship sklearn.externals.joblib:
# prefer the standalone package and fall back only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [4]:
# Load dataset
# Context managers close both files as soon as they have been read.
with open('dataset/annotated_dataset.json', encoding='utf-8') as dataset_file:  # remember to unzip it!
    data = json.load(dataset_file)
# NOTE: unpickling can execute arbitrary code; only load a report file you trust.
with open("dataset/report.p", "rb") as report_file:
    report = pickle.load(report_file)

In [5]:
# number of documents (either monographs or journal issues), with annotations
print(len(data))

340


In [7]:
# Print every entry of the dataset report as "<key> : <value>".
for key, value in report.items():
    print(" : ".join((key, str(value))))

total_general_m : 11360
avg annotated docs over total docs : 0.2476397966594045
total_specific : 157768
total_general : 41071
total_annotations : 198839
overall_documents : 1377
total_general_per_class_nofilter : {'meta-annotation': 6587, 'secondary-partial': 17306, 'primary-full': 4055, 'primary-partial': 3591, 'secondary-full': 16447}
avg annotated pages per annotated doc : 16.994134897360702
avg annotated pages over total pages : 0.029432229688206284
total_specific_per_class : defaultdict(<class 'int'>, {'note': 710, 'volume': 4740, 'archivalfond': 440, 'abbreviation': 4713, 'century': 10, 'conjunction': 3131, 'curator': 1, 'topicdate': 23, 'editor': 3263, 'folder': 598, 'foliation': 1549, 'parte': 231, 'publicationnumber-year': 2583, 'series': 577, 'year': 316, 'cedola': 9, 'box': 1782, 'table': 75, 'codex': 908, 'date': 3449, 'column': 412, 'period': 27, 'other': 344, 'mazzo': 23, 'publicationyear': 14477, 'archivalunit': 442, 'fascicolo': 251, 'pagination': 18055, 'numbering': 40

In [6]:
# Example of the contents of the annotated dataset: display the record at index 10
data[10]

{'bid': 'CFI0056764',
 'doc_number': '',
 'doc_type': 'monograph',
 'pages': {'2': {'is_annotated': True,
   'offsets': [[['L.', 20, 21, 3, 2, 'CFI0056764'],
     [False, False, ''],
     ['secondary-full', 'author', 'b', 'b-secondary-full']],
    [['Balsamo,', 23, 30, 4, 2, 'CFI0056764'],
     [False, False, ''],
     ['secondary-full', 'author', 'i', 'i-secondary-full']],
    [['Giovanti’Angelo', 32, 46, 5, 2, 'CFI0056764'],
     [False, False, ''],
     ['secondary-full', 'title', 'i', 'i-secondary-full']],
    [['Scinzenzeler', 48, 59, 6, 2, 'CFI0056764'],
     [False, False, ''],
     ['secondary-full', 'title', 'i', 'i-secondary-full']],
    [['tipografo', 62, 70, 7, 3, 'CFI0056764'],
     [False, False, ''],
     ['secondary-full', 'title', 'i', 'i-secondary-full']],
    [['in', 72, 73, 8, 3, 'CFI0056764'],
     [False, False, ''],
     ['secondary-full', 'title', 'i', 'i-secondary-full']],
    [['Milano', 75, 80, 9, 3, 'CFI0056764'],
     [False, False, ''],
     ['secondary-fu

Tag consolidation to remove infrequently used tags

In [None]:
# Specific Tag consolidation
# Maps every specific tag found in the annotations to the consolidated tag used as the
# training label. Entries marked with a trailing "#" are the ones that change the tag.
correspondances = {
    'abbreviatedtitle': 'title', #
    'abbreviation': 'abbreviation',
    'appendix': 'ref', #
    'archivalfond': 'archivalreference', #
    'archivalreference': 'archivalreference',
    'archivalseries': 'archivalreference', #
    'archivalunit': 'archivalreference', #
    'archive': 'archive_lib', #
    'attachment': 'attachment',
    'author': 'author',
    'box': 'box',
    'cartulation': 'cartulation',
    'cedola': 'ref', #
    'century': 'date', #
    'chapter': 'ref', #
    'citation': 'ref', #
    'codex': 'archivalreference', #
    'column': 'column',
    'conjunction': 'conjunction',
    'curator': 'author', #
    'date': 'date',
    'editor': 'author', #
    'fascicolo': 'folder', #
    'filza': 'filza',
    'folder': 'folder',
    'foliation': 'foliation',
    'fond': 'archivalreference', #
    'implicit': '',
    'library': 'archive_lib', #
    'mazzo': 'ref', #
    'notary': 'archivalreference', #
    'note': 'numbered_ref', #
    'numbering': 'numbered_ref', #
    'other': '', #
    'pagination': 'pagination',
    'parchment': 'ref', #
    'period': 'date', #
    'protocollo': 'ref', #
    'parte': 'ref', #
    'publicationnumber': 'publicationnumber-year', #
    'publicationnumber-year': 'publicationnumber-year',
    'publicationplace': 'publicationplace',
    'publicationspecifications': 'publicationspecifications',
    'publicationyear': 'year', #
    'publisher': 'publisher',
    'registry': 'registry',
    'responsible': 'author', #
    'series': 'series',
    'table': 'ref', #
    'title': 'title',
    'tomo': 'tomo',
    # NOTE(review): this used to map to 'publicationplace'; a date-like tag belongs with
    # 'century' and 'period' under 'date'. Confirm against the published tag scheme.
    'topicdate': 'date', #
    'voce': 'ref', #
    'volume': 'volume',
    'website': 'ref', #
    'year': 'year',
    '': ''
}

# define supporting functions
# The window of dependence for the CRFs: use plus and minus that number of tokens as context.
window = 2
from code.feature_extraction_words import word2features, generate_featuresLight


def text2features(text):
    """Return the word2features output for every position of `text`, using the module-level `window`."""
    features = []
    for position in range(len(text)):
        features.append(word2features(text, position, window = window))
    return features


def text2featuresL(text):
    """Like text2features, but passes generate_featuresLight as the feature function."""
    features = []
    for position in range(len(text)):
        features.append(
            word2features(text, position, window = window, feature_function=generate_featuresLight)
        )
    return features

# create beginend tags Y
def text2labelsBE(text):
    """Collect, for every token of `text`, the value stored at token[2][2]."""
    labels = []
    for token in text:
        tags = token[2]
        labels.append(tags[2])
    return labels

# create tagged-beginend tags Y
def text2labelsTBE(text):
    """Collect, for every token of `text`, the value stored at token[2][3]."""
    labels = []
    for token in text:
        tags = token[2]
        labels.append(tags[3])
    return labels

# create specific tags Y
def text2labelsS(text):
    """Translate each token's specific tag (token[2][1]) through the `correspondances` table."""
    consolidated = []
    for token in text:
        specific_tag = token[2][1]
        consolidated.append(correspondances[specific_tag])
    return consolidated

In [None]:
# prepare annotated data for CRF
# One token list per document, concatenating the offsets of its annotated pages;
# documents without any annotated token are left out.
annotated_data = []
for doc in data:
    annotated_tokens = [
        token
        for page in doc["pages"].values()
        if page["is_annotated"]
        for token in page["offsets"]
    ]
    if annotated_tokens:
        annotated_data.append(annotated_tokens)
print(len(annotated_data))
print(len(data))

In [None]:
%%time
# Define train and test sets
d = [text2features(text) for text in annotated_data]
l = [text2labelsS(text) for text in annotated_data]
l_tbe = [text2labelsTBE(text) for text in annotated_data]
# Clean tag space
labels_to_keep = list(set([x for y in l for x in y]))
labels_to_keep_tbe = list(set([x for y in l_tbe for x in y]))

random_state = np.random.RandomState(0)
    
# VALIDATION set
X_rest, X_valid, y_rest, y_valid = train_test_split(d, l, test_size=0.1, random_state=random_state)
# TRAIN/TEST
X_train, X_test, y_train, y_test = train_test_split(X_rest, y_rest, test_size=0.25, random_state=random_state)

In [None]:
# Count labels
# Tally all tokens in a single pass instead of rescanning every token once per label.
label_tally = Counter(x for y in l for x in y)
counts = {label: label_tally[label] for label in labels_to_keep}
print(counts)
label_tally_tbe = Counter(x for y in l_tbe for x in y)
counts = {label: label_tally_tbe[label] for label in labels_to_keep_tbe}
print(counts)

In [None]:
# An example use of CRFs
# Fit on the training split with fixed regularisation, then report per-label scores on the test split.
example_params = dict(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=False,
)
crf = sklearn_crfsuite.CRF(**example_params)
crf.fit(X_train, y_train)
y_pred = crf.predict(X_test)
example_report = metrics.flat_classification_report(y_test, y_pred, labels=labels_to_keep, digits=3)
print(example_report)

In [None]:
%%time
# Parameters search
crf = sklearn_crfsuite.CRF( 
    max_iterations=100,
    algorithm = 'lbfgs',
    all_possible_transitions=False
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05)
}

from code.support_functions import BER
scorer = make_scorer(metrics.flat_f1_score, 
                        average='weighted', labels=labels_to_keep)
#scorer = make_scorer(BER, greater_is_better=True)
    
# search
rs = RandomizedSearchCV(crf, params_space, 
                        cv=3, 
                        verbose=1, 
                        n_jobs=-10, 
                        n_iter=5, 
                        scoring=scorer)
rs.fit(X_train, y_train)

In [None]:
# Show the winning hyper-parameters of the randomized search and their cross-validated score.
for caption, value in (('best params:', rs.best_params_), ('best CV score:', rs.best_score_)):
    print(caption, value)

In [None]:
# classification report
crf = rs.best_estimator_
y_pred = crf.predict(X_test)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=labels_to_keep, digits=3
))

# Confusion matrices
from code.support_functions import flatten_predictions

# Build the matrix once and reuse it for both the printout and the heat map.
conf_matrix = confusion_matrix(flatten_predictions(y_test), flatten_predictions(y_pred), labels=labels_to_keep)
print(conf_matrix)
# Colours are on a log scale; cells with a zero count become -inf.
plt.imshow(np.log(conf_matrix), cmap='Blues', interpolation='nearest')
plt.grid(False)
plt.ylabel('Ground truth', fontsize=16)
plt.xlabel('Predicted', fontsize=16)
# One tick per label, in the order of labels_to_keep.
plt.xticks(np.arange(0, len(labels_to_keep), 1))
plt.yticks(np.arange(0, len(labels_to_keep), 1))
plt.title("Confusion Matrix Model 1", fontsize=16);

In [None]:
# K-fold validation
# Import from the `code` package, like every other cell that uses support_functions.
from code.support_functions import BER
# Two scorers: weighted F1 over the kept labels, and the project's BER function.
scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels_to_keep)
scorer2 = make_scorer(BER, greater_is_better=True)

# plug here your params, or use rs.best_params_
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    # the parameters used for the results reported in the publication
    c2= 0.067592883332507694, c1= 0.37774963191685329,
    max_iterations=200,
    all_possible_transitions=True
)
k = 5

# First line printed: mean of `scorer` over the k folds; second line: mean of `scorer2`.
cv = cross_val_score(crf, X_rest, y_rest, cv=k, scoring=scorer, n_jobs=-2)
print("%d-fold validation mean: "%k,cv.mean())
cv = cross_val_score(crf, X_rest, y_rest, cv=k, scoring=scorer2, n_jobs=-2)
print("%d-fold validation mean: "%k,cv.mean())

In [None]:
# Learning curves
# Watch out, this takes up a lot of primary memory
from code.support_functions import plot_learning_curve

# Slices of data for learning curves
train_sizes=np.linspace(0.1, 1.0, 10)
title = "Learning Curves for Model 1"
message = "M1"
# Cross validation scheme with 80-20 splits.
# ShuffleSplit is imported by name: the bare `model_selection` module was never imported,
# so the previous `model_selection.ShuffleSplit(...)` raised a NameError.
# NOTE(review): this comment used to promise 3 iterations per train data size, but n_splits
# is not passed, so ShuffleSplit's default applies - pass n_splits=3 if that was the intent.
cv = ShuffleSplit(test_size=0.2, random_state=0)
estimator = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c2= 0.067592883332507694, c1= 0.37774963191685329,
    max_iterations=100,
    all_possible_transitions=False
)
plot_learning_curve(estimator, title, X_rest, y_rest, labels_to_keep, cv=cv, train_sizes=train_sizes, n_jobs=20, message=message)

In [None]:
# VALIDATION
%%time

crf = sklearn_crfsuite.CRF( 
    algorithm='lbfgs',
    c2= 0.067592883332507694, c1= 0.37774963191685329,
    max_iterations=500,
    all_possible_transitions=True
)
crf.fit(X_rest, y_rest)
y_pred = crf.predict(X_valid)
print(metrics.flat_classification_report(
    y_valid, y_pred, labels=labels_to_keep, digits=3
))

In [None]:
# Train final models for task 1
# Fit on the complete annotated data set (d, l).
final_params = {
    'algorithm': 'lbfgs',
    'c2': 0.067592883332507694,
    'c1': 0.37774963191685329,
    'max_iterations': 500,
    'all_possible_transitions': True,
}
crf = sklearn_crfsuite.CRF(**final_params)
crf.fit(d, l)

# save model
# Left disabled; uncomment to (over)write the file that the "load model" cell reads.
#joblib.dump(crf,'models/modelM1_ALL_L.pkl')

Apply model to new data. 
This is needed if you want to use the results from the first parser as features for the second one.

In [None]:
# load model
# The trained CRF is read back from disk under the name crf1.
model_path = 'models/modelM1_ALL_L.pkl'
crf1 = joblib.load(model_path)

In [None]:
def process_document(doc):
    """Attach a "specific_tags" list to every page of `doc` and return `doc`.

    Pages flagged as annotated get their tags from text2labelsS; every other page is
    tagged by the loaded model crf1. The page dicts are updated in place.
    """
    for page in doc["pages"].values():
        tokens = page["offsets"]
        if page["is_annotated"]:
            tags = text2labelsS(tokens)
        else:
            # predict works on a list of sequences, so wrap the page and unwrap the result
            predictions = crf1.predict([text2features(tokens)])
            tags = predictions[0]
            assert len(tags) == len(tokens)
        page.update({"specific_tags": tags})
    return doc
            
# Pool of 45 worker processes (multiprocessing.Pool, so processes rather than threads,
# despite the variable name); the "parse all" cell maps process_document over it.
threads = Pool(45)

In [None]:
# parse all
# imap_unordered hands documents back as the workers finish them, so data2 is not
# guaranteed to follow the order of data.
data2 = list(threads.imap_unordered(process_document, data))

In [None]:
# Persist the tagged documents; the context manager flushes and closes the file when done.
with open("dataset/data.p", "wb") as tagged_file:
    pickle.dump(data2, tagged_file)