In [1]:

# NOTE: %pprint toggles IPython's pretty printing rather than setting it, so
# re-running this cell in the same kernel flips the setting back again.
%pprint

Pretty printing has been turned OFF


In [4]:

%%time
%run ../py/sql_utlis.py

# SqlUtilities is not imported anywhere in this notebook; presumably it is
# defined by the script executed with %run above — TODO confirm.
su = SqlUtilities()
# Only `cursor` is used in this cell; `conn` is kept in the notebook namespace.
conn, cursor = su.get_jh_conn_cursor()
# The two builders below are slow: the recorded wall time for this whole cell
# is 8min 25s. verbose=False keeps them from flooding the cell output.
su.build_child_strs_list_dictionary(cursor, verbose=False)
# NOTE(review): presumably this call persists the 'HEADER_PATTERN_DICT' pickle
# that a later cell asserts on and loads — confirm against sql_utlis.py.
su.create_header_pattern_dictionary(cursor, verbose=False)

Wall time: 8min 25s



## Let's use our labeled data to build an NER system

In [6]:

# Fail fast with an AssertionError if the pickle is missing, instead of
# erroring inside the loader.
assert su.s.pickle_exists('HEADER_PATTERN_DICT')
# Consumed later through .items() as (file_name, feature_dict_list) pairs, so
# this is presumably a mapping from file name to a list of feature dicts.
HEADER_PATTERN_DICT = su.s.load_object('HEADER_PATTERN_DICT')

In [7]:

%%time
%run ../py/html_analysis.py
# HeaderCategories is not imported in this notebook; presumably it is defined
# by the script executed with %run above — TODO confirm.
# `hc` is used further down via hc.get_feature_tuple(...) to derive labels.
hc = HeaderCategories()

Wall time: 130 ms


In [8]:

from sklearn.model_selection import train_test_split

# One sequence per entry of HEADER_PATTERN_DICT: X holds the feature-dict lists
# themselves, y holds the matching label sequences.
# NOTE(review): each label is derived, via hc.get_feature_tuple(...)[2], from the
# very same feature dict that is handed to the model as input — confirm the
# label is not recoverable from the features, otherwise the scores are inflated.
X = list(HEADER_PATTERN_DICT.values())
y = [
    [hc.get_feature_tuple(feature_dict)[2] for feature_dict in sequence]
    for sequence in X
]

# Train on 75% of the sequences; the fixed random_state keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.75, random_state=101
)

## Training

To see all possible CRF parameters, check its docstring. Here we use the L-BFGS training algorithm (the default) with Elastic Net (L1 + L2) regularization.

In [9]:

%%time
import sklearn_crfsuite

# Hyperparameters gathered in one place so they are easy to read and tweak.
crf_params = {
    'algorithm': 'lbfgs',              # L-BFGS gradient-based training
    'c1': 0.1,                         # L1 regularization coefficient
    'c2': 0.1,                         # L2 regularization coefficient
    'max_iterations': 100,
    'all_possible_transitions': True,
}
crf = sklearn_crfsuite.CRF(**crf_params)

# Kept as the last expression so the fitted estimator's repr is displayed.
crf.fit(X_train, y_train)

Wall time: 4.34 s




CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.1, c2=0.1,
    keep_tempfiles=None, max_iterations=100)

## Evaluation

There are many more O entities in the data set, but we're more interested in the other entities. To account for this, we'll use an averaged F1 score computed for all labels except O. The ``sklearn_crfsuite.metrics`` package provides some useful metrics for sequence classification tasks, including this one.

In [10]:

# Every class the fitted CRF knows about, copied into a plain list.
# NOTE(review): the markdown above says the averaged F1 is computed for all
# labels except 'O', but nothing is removed here, so 'O' stays in `labels` and
# in the scores computed from it — confirm which of the two is intended.
labels = list(crf.classes_)
labels

['H', 'H-RQ', 'O', 'H-OL', 'H-TS', 'O-RQ', 'O-LN', 'H-SP', 'O-SP', 'H-JD', 'O-OL', 'H-ER', 'O-ER', 'H-CS', 'O-CS', 'H-PQ', 'H-IP', 'O-TS', 'H-LN', 'H-JT', 'O-IP', 'H-PD', 'H-O', 'O-PQ', 'O-O']

In [11]:

from sklearn_crfsuite import metrics

# Label sequences predicted for the held-out split.
y_pred = crf.predict(X_test)

# Support-weighted F1 over every entry of `labels`. Labels that never occur in
# y_test or y_pred make scikit-learn emit an undefined-metric warning.
f1_options = dict(average='weighted', labels=labels)
metrics.flat_f1_score(y_test, y_pred, **f1_options)

  average, "true nor predicted", 'F-score is', len(true_sum)


0.991907836016953

Inspect per-class results in more detail:

In [12]:

def _label_sort_key(name):
    """Order labels by everything after the first character, then by that character."""
    return name[1:], name[0]


# Group results by type: labels sharing a suffix (e.g. H-CS / O-CS) end up next
# to each other, with the bare H and O labels first.
sorted_labels = sorted(labels, key=_label_sort_key)
report = metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
)
print(report)

              precision    recall  f1-score   support

           H      0.956     1.000     0.977        65
           O      0.996     1.000     0.998       724
        H-CS      1.000     1.000     1.000        11
        O-CS      1.000     0.500     0.667         2
        H-ER      1.000     1.000     1.000        11
        O-ER      0.000     0.000     0.000         0
        H-IP      1.000     1.000     1.000         4
        O-IP      0.000     0.000     0.000         0
        H-JD      1.000     1.000     1.000        17
        H-JT      1.000     0.667     0.800         3
        H-LN      0.000     0.000     0.000         2
        O-LN      0.000     0.000     0.000         0
         H-O      0.000     0.000     0.000         0
         O-O      0.000     0.000     0.000         0
        H-OL      1.000     1.000     1.000        30
        O-OL      1.000     1.000     1.000        15
        H-PD      0.000     0.000     0.000         0
        H-PQ      1.000    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
