In [1]:
import pandas as pd
import sklearn_crfsuite
from sklearn_crfsuite.metrics import flat_f1_score,flat_classification_report
from scipy.stats import expon
import warnings

from sklearn.exceptions import DataConversionWarning
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split,RandomizedSearchCV
from SentenceGetter import SentenceGetter
from crf_feature_functions import *

In [2]:
# Load the token-level training annotations and show how many tokens carry each tag.
df_train = pd.read_csv(
    'data/new-anem-train-v2.csv'
)
df_train['label'].value_counts()

O                      32517
B-ANATOMICAL-ENTITY     1856
I-ANATOMICAL-ENTITY      973
Name: label, dtype: int64

In [9]:
# Fixed seed so the train/hold-out partition is the same on every run.
seed = 48910412

# Group the token rows of the training frame into sentences.
getter = SentenceGetter(df_train)
sentences = getter.sentences

# One feature sequence and one tag sequence per sentence.
X = [sent2features(sentence) for sentence in sentences]
y = [sent2labels(sentence) for sentence in sentences]

# Keep 20% of the training sentences aside as a hold-out ("test") split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=seed,
)

# Every distinct tag in the training file, in order of first appearance.
labels = df_train['label'].unique().tolist()

In [11]:
# The separate evaluation file; the rest of the notebook calls it the "validation" set
# even though it is read from the *-test-* CSV.
df_validation = pd.read_csv(
    'data/new-anem-test-v2.csv'
)

# Group its token rows into sentences, exactly as for the training data.
getter2 = SentenceGetter(df_validation)
sentences2 = getter2.sentences

# Feature / tag sequences for every validation sentence.
X_validation = [sent2features(sentence) for sentence in sentences2]
y_validation = [sent2labels(sentence) for sentence in sentences2]

### CRF using LBFGS algorithm

In [12]:
# Search space for the L-BFGS trainer. c1 and c2 (the L1 / L2 regularisation
# coefficients in sklearn-crfsuite) are drawn from exponential distributions;
# the remaining options are sampled from the listed values.
params_space_lbfgs = {
    'min_freq':[0.0,1.0,2.0,3.0],
    'c1': expon(scale=0.05),
    'c2': expon(scale=0.05),
    'all_possible_states':[True,False],
    'all_possible_transitions':[True,False],
    'max_iterations':[10,100,1000]
}

# Weighted F1 over every tag in `labels`.
# NOTE(review): `labels` is built from all tags in the training file, so it
# includes the majority 'O' tag, which dominates a weighted average — consider
# scoring entity tags only.
f1_scorer_lbfgs = make_scorer(flat_f1_score,average='weighted', labels=labels)

crf_lbfgs  = sklearn_crfsuite.CRF(algorithm='lbfgs',max_iterations=1000)

# random_state pins which candidates are sampled, so a re-run of the notebook
# explores the same configurations and selects the same best parameters.
gs_lbfgs = RandomizedSearchCV(
    crf_lbfgs,
    params_space_lbfgs,
    cv=5,
    verbose=1,
    scoring=f1_scorer_lbfgs,
    random_state=seed,
)

with warnings.catch_warnings():
    # Suppress warnings raised during the cross-validated fits.
    warnings.simplefilter("ignore")
    gs_lbfgs.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed: 11.1min finished


In [13]:
# Report the winning configuration of the L-BFGS search and its mean CV score.
for caption, value in (
    ('best params:', gs_lbfgs.best_params_),
    ('best CV score:', gs_lbfgs.best_score_),
):
    print(caption, value)

best params: {'all_possible_states': True, 'all_possible_transitions': False, 'c1': 0.04203733119847909, 'c2': 0.032226638122811714, 'max_iterations': 1000, 'min_freq': 0.0}
best CV score: 0.9745789646873275


In [14]:
# Tag the 20% hold-out split with the fitted L-BFGS search object.
lbfgs_preds1 = gs_lbfgs.predict(X_test)

with warnings.catch_warnings():
    # Suppress any warnings emitted while the metrics are computed.
    warnings.simplefilter("ignore")
    holdout_f1 = flat_f1_score(y_test, lbfgs_preds1, average="weighted")
    holdout_report = flat_classification_report(y_test, lbfgs_preds1, labels=labels)
    # Weighted F1, a divider line, then the per-tag breakdown.
    print(
        holdout_f1,
        "---------------------------------------------------------------",
        holdout_report,
        sep="\n",
    )

0.9815196672076048
---------------------------------------------------------------
                     precision    recall  f1-score   support

                  O       0.99      1.00      0.99      6688
B-ANATOMICAL-ENTITY       0.92      0.85      0.88       386
I-ANATOMICAL-ENTITY       0.90      0.79      0.84       191

           accuracy                           0.98      7265
          macro avg       0.93      0.88      0.90      7265
       weighted avg       0.98      0.98      0.98      7265



In [15]:
# Tag the separate validation file with the fitted L-BFGS search object.
lbfgs_validation_predictions = gs_lbfgs.predict(X_validation)

with warnings.catch_warnings():
    # Suppress any warnings emitted while the metrics are computed.
    warnings.simplefilter("ignore")
    validation_f1 = flat_f1_score(
        y_validation, lbfgs_validation_predictions, average="weighted"
    )
    validation_report = flat_classification_report(
        y_validation, lbfgs_validation_predictions, labels=labels
    )
    # Weighted F1, a divider line, then the per-tag breakdown.
    print(
        validation_f1,
        "---------------------------------------------------------------",
        validation_report,
        sep="\n",
    )

0.9689920373471913
---------------------------------------------------------------
                     precision    recall  f1-score   support

                  O       0.98      0.99      0.99     21561
B-ANATOMICAL-ENTITY       0.87      0.71      0.78      1187
I-ANATOMICAL-ENTITY       0.81      0.63      0.71       522

           accuracy                           0.97     23270
          macro avg       0.89      0.78      0.83     23270
       weighted avg       0.97      0.97      0.97     23270



In [16]:
# Refit an L-BFGS CRF on the complete training file (X, y), i.e. including the
# 20% hold-out split, using the hyper-parameters chosen by the randomized search.
# Reading them from `gs_lbfgs.best_params_` keeps this cell in sync with the
# search on every re-run instead of relying on values pasted from an old output.
# In the recorded run the selection was: c1=0.04203733119847909,
# c2=0.032226638122811714, min_freq=0.0, max_iterations=1000,
# all_possible_states=True, all_possible_transitions=False.
lbfgs_crf_model = sklearn_crfsuite.CRF(algorithm='lbfgs', **gs_lbfgs.best_params_)

with warnings.catch_warnings():
    # Suppress warnings raised while fitting.
    warnings.simplefilter("ignore")
    lbfgs_crf_model.fit(X, y)

In [17]:
# Tag the validation file with the L-BFGS model that was refit on all training data.
lbfgs_validation_predictions2 = lbfgs_crf_model.predict(X_validation)

with warnings.catch_warnings():
    # Suppress any warnings emitted while the metrics are computed.
    warnings.simplefilter("ignore")
    validation_f1 = flat_f1_score(
        y_validation, lbfgs_validation_predictions2, average="weighted"
    )
    validation_report = flat_classification_report(
        y_validation, lbfgs_validation_predictions2, labels=labels
    )
    # Weighted F1, a divider line, then the per-tag breakdown.
    print(
        validation_f1,
        "---------------------------------------------------------------",
        validation_report,
        sep="\n",
    )

0.9694836118242657
---------------------------------------------------------------
                     precision    recall  f1-score   support

                  O       0.98      0.99      0.99     21561
B-ANATOMICAL-ENTITY       0.87      0.72      0.79      1187
I-ANATOMICAL-ENTITY       0.84      0.61      0.71       522

           accuracy                           0.97     23270
          macro avg       0.89      0.77      0.83     23270
       weighted avg       0.97      0.97      0.97     23270



### CRF using L2SGD

In [18]:
# Search space for the L2-SGD trainer. Only c2 (the L2 regularisation
# coefficient in sklearn-crfsuite) is sampled from a distribution; the
# remaining options are sampled from the listed values.
params_space_l2sgd = {
    'min_freq':[0.0,1.0,2.0,3.0],
    'c2': expon(scale=0.05),
    'all_possible_states':[True,False],
    'all_possible_transitions':[True,False],
    'max_iterations':[10,100,1000]
}

# Weighted F1 over every tag in `labels`.
# NOTE(review): `labels` is built from all tags in the training file, so it
# includes the majority 'O' tag, which dominates a weighted average — consider
# scoring entity tags only.
f1_scorer_l2sgd = make_scorer(flat_f1_score,average='weighted', labels=labels)

crf_l2sgd  = sklearn_crfsuite.CRF(algorithm='l2sgd',max_iterations=1000)

# random_state pins which candidates are sampled, so a re-run of the notebook
# explores the same configurations and selects the same best parameters.
gs_l2sgd = RandomizedSearchCV(
    crf_l2sgd,
    params_space_l2sgd,
    cv=5,
    verbose=1,
    scoring=f1_scorer_l2sgd,
    random_state=seed,
)

with warnings.catch_warnings():
    # Suppress warnings raised during the cross-validated fits.
    warnings.simplefilter("ignore")
    gs_l2sgd.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  3.0min finished


In [19]:
# Report the winning configuration of the L2-SGD search and its mean CV score.
for caption, value in (
    ('best params:', gs_l2sgd.best_params_),
    ('best CV score:', gs_l2sgd.best_score_),
):
    print(caption, value)

best params: {'all_possible_states': True, 'all_possible_transitions': False, 'c2': 0.0016380793655586846, 'max_iterations': 1000, 'min_freq': 0.0}
best CV score: 0.9750586586104447


In [20]:
# Tag the 20% hold-out split with the fitted L2-SGD search object.
l2sgd_preds1 = gs_l2sgd.predict(X_test)

with warnings.catch_warnings():
    # Suppress any warnings emitted while the metrics are computed.
    warnings.simplefilter("ignore")
    holdout_f1 = flat_f1_score(y_test, l2sgd_preds1, average="weighted")
    holdout_report = flat_classification_report(y_test, l2sgd_preds1, labels=labels)
    # Weighted F1, a divider line, then the per-tag breakdown.
    print(
        holdout_f1,
        "---------------------------------------------------------------",
        holdout_report,
        sep="\n",
    )

0.9824427258546494
---------------------------------------------------------------
                     precision    recall  f1-score   support

                  O       0.99      0.99      0.99      6688
B-ANATOMICAL-ENTITY       0.92      0.86      0.89       386
I-ANATOMICAL-ENTITY       0.90      0.81      0.85       191

           accuracy                           0.98      7265
          macro avg       0.94      0.89      0.91      7265
       weighted avg       0.98      0.98      0.98      7265



In [21]:
# Tag the separate validation file with the fitted L2-SGD search object.
l2sgd_validation_predictions = gs_l2sgd.predict(X_validation)

with warnings.catch_warnings():
    # Suppress any warnings emitted while the metrics are computed.
    warnings.simplefilter("ignore")
    validation_f1 = flat_f1_score(
        y_validation, l2sgd_validation_predictions, average="weighted"
    )
    validation_report = flat_classification_report(
        y_validation, l2sgd_validation_predictions, labels=labels
    )
    # Weighted F1, a divider line, then the per-tag breakdown.
    print(
        validation_f1,
        "---------------------------------------------------------------",
        validation_report,
        sep="\n",
    )

0.9692449840762487
---------------------------------------------------------------
                     precision    recall  f1-score   support

                  O       0.98      0.99      0.99     21561
B-ANATOMICAL-ENTITY       0.86      0.72      0.79      1187
I-ANATOMICAL-ENTITY       0.80      0.64      0.71       522

           accuracy                           0.97     23270
          macro avg       0.88      0.78      0.83     23270
       weighted avg       0.97      0.97      0.97     23270



In [24]:
# Refit an L2-SGD CRF on the complete training file (X, y), i.e. including the
# 20% hold-out split, using the hyper-parameters chosen by the randomized search.
# Reading them from `gs_l2sgd.best_params_` keeps this cell in sync with the
# search on every re-run instead of relying on values pasted from an old output.
# In the recorded run the selection was: c2=0.0016380793655586846,
# min_freq=0.0, max_iterations=1000, all_possible_states=True,
# all_possible_transitions=False.
l2sgd_crf_model = sklearn_crfsuite.CRF(algorithm='l2sgd', **gs_l2sgd.best_params_)

with warnings.catch_warnings():
    # Suppress warnings raised while fitting.
    warnings.simplefilter("ignore")
    l2sgd_crf_model.fit(X, y)

In [25]:
# Tag the validation file with the L2-SGD model that was refit on all training data.
l2sgd_validation_predictions2 = l2sgd_crf_model.predict(X_validation)

with warnings.catch_warnings():
    # Suppress any warnings emitted while the metrics are computed.
    warnings.simplefilter("ignore")
    validation_f1 = flat_f1_score(
        y_validation, l2sgd_validation_predictions2, average="weighted"
    )
    validation_report = flat_classification_report(
        y_validation, l2sgd_validation_predictions2, labels=labels
    )
    # Weighted F1, a divider line, then the per-tag breakdown.
    print(
        validation_f1,
        "---------------------------------------------------------------",
        validation_report,
        sep="\n",
    )

0.968058342881421
---------------------------------------------------------------
                     precision    recall  f1-score   support

                  O       0.98      0.99      0.99     21561
B-ANATOMICAL-ENTITY       0.87      0.70      0.78      1187
I-ANATOMICAL-ENTITY       0.83      0.60      0.69       522

           accuracy                           0.97     23270
          macro avg       0.89      0.77      0.82     23270
       weighted avg       0.97      0.97      0.97     23270



### CRF using Averaged Perceptron (AP) algorithm

In [26]:
# Search space for the averaged-perceptron trainer. No c1 / c2 entries are
# searched here; every option is sampled from the listed values.
params_space_ap = {
    'min_freq':[0.0,1.0,2.0,3.0],
    'all_possible_states':[True,False],
    'all_possible_transitions':[True,False],
    'max_iterations':[10,100,1000]
}

# Weighted F1 over every tag in `labels`.
# NOTE(review): `labels` is built from all tags in the training file, so it
# includes the majority 'O' tag, which dominates a weighted average — consider
# scoring entity tags only.
f1_scorer_ap = make_scorer(flat_f1_score,average='weighted', labels=labels)

crf_ap = sklearn_crfsuite.CRF(algorithm='ap',max_iterations=1000)

# random_state pins which candidates are sampled, so a re-run of the notebook
# explores the same configurations and selects the same best parameters.
gs_ap = RandomizedSearchCV(
    crf_ap,
    params_space_ap,
    cv=5,
    verbose=1,
    scoring=f1_scorer_ap,
    random_state=seed,
)

with warnings.catch_warnings():
    # Suppress warnings raised during the cross-validated fits.
    warnings.simplefilter("ignore")
    gs_ap.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  4.8min finished


In [27]:
# Report the winning configuration of the AP search and its mean CV score.
for caption, value in (
    ('best params:', gs_ap.best_params_),
    ('best CV score:', gs_ap.best_score_),
):
    print(caption, value)

best params: {'min_freq': 0.0, 'max_iterations': 1000, 'all_possible_transitions': True, 'all_possible_states': True}
best CV score: 0.9739323851775911


In [28]:
# Tag the 20% hold-out split with the fitted AP search object.
ap_preds1 = gs_ap.predict(X_test)

with warnings.catch_warnings():
    # Suppress any warnings emitted while the metrics are computed.
    warnings.simplefilter("ignore")
    holdout_f1 = flat_f1_score(y_test, ap_preds1, average="weighted")
    holdout_report = flat_classification_report(y_test, ap_preds1, labels=labels)
    # Weighted F1, a divider line, then the per-tag breakdown.
    print(
        holdout_f1,
        "---------------------------------------------------------------",
        holdout_report,
        sep="\n",
    )

0.9808707389938891
---------------------------------------------------------------
                     precision    recall  f1-score   support

                  O       0.99      0.99      0.99      6688
B-ANATOMICAL-ENTITY       0.89      0.86      0.87       386
I-ANATOMICAL-ENTITY       0.89      0.81      0.85       191

           accuracy                           0.98      7265
          macro avg       0.92      0.89      0.90      7265
       weighted avg       0.98      0.98      0.98      7265



In [29]:
# Tag the separate validation file with the fitted AP search object.
ap_validation_predictions = gs_ap.predict(X_validation)

with warnings.catch_warnings():
    # Suppress any warnings emitted while the metrics are computed.
    warnings.simplefilter("ignore")
    validation_f1 = flat_f1_score(
        y_validation, ap_validation_predictions, average="weighted"
    )
    validation_report = flat_classification_report(
        y_validation, ap_validation_predictions, labels=labels
    )
    # Weighted F1, a divider line, then the per-tag breakdown.
    print(
        validation_f1,
        "---------------------------------------------------------------",
        validation_report,
        sep="\n",
    )

0.9661680096443059
---------------------------------------------------------------
                     precision    recall  f1-score   support

                  O       0.98      0.99      0.98     21561
B-ANATOMICAL-ENTITY       0.81      0.71      0.76      1187
I-ANATOMICAL-ENTITY       0.74      0.63      0.68       522

           accuracy                           0.97     23270
          macro avg       0.84      0.78      0.81     23270
       weighted avg       0.97      0.97      0.97     23270



In [30]:
# Refit an averaged-perceptron CRF on the complete training file (X, y), i.e.
# including the 20% hold-out split, using the hyper-parameters chosen by the
# randomized search. Reading them from `gs_ap.best_params_` keeps this cell in
# sync with the search on every re-run instead of relying on values pasted from
# an old output.
# In the recorded run the selection was: min_freq=0.0, max_iterations=1000,
# all_possible_states=True, all_possible_transitions=True.
ap_crf_model = sklearn_crfsuite.CRF(algorithm='ap', **gs_ap.best_params_)

with warnings.catch_warnings():
    # Suppress warnings raised while fitting.
    warnings.simplefilter("ignore")
    ap_crf_model.fit(X, y)

In [31]:
# Tag the validation file with the AP model that was refit on all training data.
ap_validation_predictions2 = ap_crf_model.predict(X_validation)

with warnings.catch_warnings():
    # Suppress any warnings emitted while the metrics are computed.
    warnings.simplefilter("ignore")
    validation_f1 = flat_f1_score(
        y_validation, ap_validation_predictions2, average="weighted"
    )
    validation_report = flat_classification_report(
        y_validation, ap_validation_predictions2, labels=labels
    )
    # Weighted F1, a divider line, then the per-tag breakdown.
    print(
        validation_f1,
        "---------------------------------------------------------------",
        validation_report,
        sep="\n",
    )

0.9669731755050099
---------------------------------------------------------------
                     precision    recall  f1-score   support

                  O       0.98      0.99      0.98     21561
B-ANATOMICAL-ENTITY       0.81      0.72      0.77      1187
I-ANATOMICAL-ENTITY       0.76      0.63      0.69       522

           accuracy                           0.97     23270
          macro avg       0.85      0.78      0.81     23270
       weighted avg       0.97      0.97      0.97     23270



### Save and reload the final model

In [32]:
import joblib
import pickle
import os

In [33]:
# Persist the L-BFGS CRF that was refit on the full training data.
model_file = os.path.join(os.getcwd(),'models','anatomy-1','anatomy1-ner.sav')

# Create the target directory on a fresh checkout so the dump does not fail
# with FileNotFoundError.
os.makedirs(os.path.dirname(model_file), exist_ok=True)

# The context manager guarantees the file is flushed and closed even if
# pickling raises part-way through.
with open(model_file, 'wb') as model_out:
    pickle.dump(lbfgs_crf_model, model_out)

In [34]:
# Read the model back from disk to confirm the saved file is usable.
# NOTE: pickle.load can execute arbitrary code contained in the file; only
# load model files that were produced by this notebook or another trusted source.
with open(model_file, 'rb') as model_in:
    loaded_model = pickle.load(model_in)

In [35]:
# Sanity-check the unpickled model by scoring it on the validation data; the
# returned value is displayed as the cell output.
# NOTE(review): per the sklearn-crfsuite docs `CRF.score` should be token-level
# accuracy, not the weighted F1 printed in the cells above — confirm.
loaded_model.score(X_validation,y_validation)

0.9712075633863343