In [1]:

# %pprint is a toggle: this run switched IPython pretty printing OFF; running the cell again flips it back on.
%pprint

Pretty printing has been turned OFF


In [2]:

import sys

# Make the project's ../py modules importable from this notebook.
# Insert at 1, 0 is the script path (or '' in REPL).
# The membership check keeps the cell idempotent: re-running it no longer
# stacks duplicate '../py' entries onto sys.path.
if '../py' not in sys.path:
    sys.path.insert(1, '../py')

In [3]:

# Build the helper objects (all sharing one Storage instance) and then the
# CypherUtilities wrapper whose `driver` attribute is the Neo4j driver
from storage import Storage
s = Storage()

from ha_utils import HeaderAnalysis
ha = HeaderAnalysis(s=s, verbose=False)

from scrape_utils import WebScrapingUtilities
wsu = WebScrapingUtilities(s=s)

# Connection settings are read from the 'neo4j' entry of wsu.secrets_json
uri, user, password = (
    wsu.secrets_json['neo4j'][key]
    for key in ('connect_url', 'username', 'password')
)

from cypher_utils import CypherUtilities
cu = CypherUtilities(
    uri=uri,
    user=user,
    password=password,
    driver=None,
    s=s,
    ha=ha,
)

In [4]:

from neo4j.exceptions import ServiceUnavailable

try:
    # Check that the database is reachable before building anything on top of it
    version_str = cu.driver.verify_connectivity()
    
    from hc_utils import HeaderCategories
    hc = HeaderCategories(cu=cu, verbose=False)
    
    import warnings
    warnings.filterwarnings('ignore')
except ServiceUnavailable as e:
    # Re-raise with an actionable hint, chaining the original error so the
    # driver's own diagnostic detail stays in the traceback
    raise ServiceUnavailable('You need to start Neo4j as a console') from e
except Exception as e:
    # Best-effort: report the failure and let the notebook continue, showing
    # the message as well as the exception type.
    # NOTE: if this branch runs before `hc` is assigned, later cells that use
    # `hc` will fail with a NameError.
    print(f'{type(e).__name__}: {str(e).strip()}')


## Let's use our labeled data to build an NER system

In [24]:

%%time
# Slow step: the recorded run took about 3 min 48 s of wall time for roughly
# 10 s of CPU time (presumably most of it spent waiting on Neo4j - TODO confirm).
# The first call is invoked only for its effect on `cu`; its return value is discarded.
cu.build_child_strs_list_dictionary(verbose=False)
# HEADER_PATTERN_DICT is iterated later as {file_name: list of feature dicts}.
HEADER_PATTERN_DICT = cu.create_header_pattern_dictionary(verbose=False)

CPU times: total: 10.3 s
Wall time: 3min 48s


In [6]:

from sklearn.model_selection import train_test_split

# One sequence per file: X holds each file's list of feature dicts, in dictionary order
X = list(HEADER_PATTERN_DICT.values())

# The label for each feature dict is element [2] of the tuple returned by
# hc.get_feature_tuple (called without any of the optional predictors)
y = [
    [
        hc.get_feature_tuple(
            feature_dict,
            pos_lr_predict_single=None,
            pos_crf_predict_single=None,
            pos_sgd_predict_single=None,
        )[2]
        for feature_dict in sequence
    ]
    for sequence in X
]

# Hold out 25% of the sequences for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.75,
    random_state=101,
)

## Training

To see all possible CRF parameters, check its docstring. Here we are using the L-BFGS training algorithm (the default) with Elastic Net (L1 + L2) regularization.

In [12]:

%%time
import sklearn_crfsuite

# L-BFGS training with elastic-net regularization (c1 weights the L1 term, c2 the L2 term)
crf = sklearn_crfsuite.CRF(algorithm='lbfgs', c1=0.1, c2=0.1, max_iterations=100, all_possible_transitions=True)

try:
    crf.fit(X_train, y_train)
except AttributeError as fit_error:
    # Report the message and carry on rather than aborting the cell
    # (presumably a workaround for a library incompatibility - TODO confirm)
    print(str(fit_error).strip())

CPU times: total: 656 ms
Wall time: 719 ms


## Evaluation

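There are many more `O` entities in the data set than any other label, but we're more interested in the other entities. To account for this, one can use an averaged F1 score computed for all labels except `O`; note that the cells below currently keep every label from `crf.classes_` (including `O`) and report a weighted-average F1 score. The ``sklearn_crfsuite.metrics`` package provides some useful metrics for sequence classification tasks, including this one.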

In [13]:

# Every label the fitted CRF knows about, as a plain list; shown as the cell output
labels = [*crf.classes_]
labels

['O-RQ', 'H', 'O', 'O-TS', 'H-RQ', 'O-JD', 'O-SP', 'H-SP', 'H-JD', 'H-ER', 'O-PQ', 'H-OL', 'O-OL', 'H-LN', 'H-TS', 'O-O', 'H-PQ', 'H-JT', 'O-CS', 'O-LN', 'H-CS', 'H-IP', 'O-ER', 'O-JT', 'O-IP', 'H-O', 'O-PD']

In [23]:

from sklearn_crfsuite.metrics import flat_f1_score

# Predict a label sequence for each held-out sequence, then score them with a
# weighted-average F1 over all entries of `labels`
y_pred = crf.predict(X_test)
flat_f1_score(
    y_test,
    y_pred,
    labels=labels,
    average='weighted',
)

0.9950714267339033

Inspect per-class results in more detail:

In [22]:

from sklearn.metrics import classification_report
from sklearn_crfsuite.utils import flatten

# Order labels by everything after the first character, then by the first
# character, so that e.g. H-CS and O-CS sit on adjacent rows of the report
sorted_labels = sorted(labels, key=lambda label: (label[1:], label[0]))

print(
    classification_report(
        y_true=flatten(y_test),
        y_pred=flatten(y_pred),
        labels=sorted_labels,
    )
)

              precision    recall  f1-score   support

           H       0.97      1.00      0.99        34
           O       1.00      1.00      1.00       303
        H-CS       1.00      1.00      1.00         1
        O-CS       1.00      1.00      1.00         3
        H-ER       0.90      0.90      0.90        10
        O-ER       1.00      0.75      0.86         4
        H-IP       1.00      1.00      1.00         3
        O-IP       0.00      0.00      0.00         0
        H-JD       1.00      1.00      1.00        11
        O-JD       1.00      1.00      1.00        28
        H-JT       0.00      0.00      0.00         0
        O-JT       0.00      0.00      0.00         1
        H-LN       1.00      1.00      1.00         4
        O-LN       1.00      1.00      1.00         2
         H-O       1.00      1.00      1.00         2
         O-O       1.00      1.00      1.00         1
        H-OL       1.00      1.00      1.00        20
        O-OL       1.00    