In [1]:
import pandas as pd
import sklearn_crfsuite
from sklearn_crfsuite.metrics import flat_f1_score,flat_classification_report
from scipy.stats import expon
import warnings
import joblib

from sklearn.exceptions import DataConversionWarning
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split,RandomizedSearchCV
from SentenceGetter import SentenceGetter
from crf_feature_functions import *

In [2]:
df_train = pd.read_csv('data/gc5cdr-ncbi-disease-train.csv')
df_train.label.value_counts()

O            235444
B-DISEASE      9312
I-DISEASE      9029
Name: label, dtype: int64

In [3]:
getter = SentenceGetter(df_train)
sentences = getter.sentences

X = [sent2features(s) for s in sentences]
y = [sent2labels(s) for s in sentences]

seed = 48910412

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20,random_state=seed)

labels = df_train.label.unique().tolist()

In [4]:
df_validation = pd.read_csv('data/gc5cdr-ncbi-disease-test.csv')
#df_validation.label.value_counts()

getter2 = SentenceGetter(df_validation)
sentences2 = getter2.sentences

X_validation = [sent2features(s) for s in sentences2]
y_validation = [sent2labels(s) for s in sentences2]

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


### CRF using LBFGS algorithm

In [5]:
params_space_lbfgs = {
    'min_freq':[0.0,1.0,2.0,3.0],
    'c1': expon(scale=0.05),
    'c2': expon(scale=0.05),
    'all_possible_states':[True,False],
    'all_possible_transitions':[True,False],
    'max_iterations':[10,100,1000]
}

f1_scorer_lbfgs = make_scorer(flat_f1_score,average='weighted', labels=labels)

crf_lbfgs  = sklearn_crfsuite.CRF(algorithm='lbfgs',max_iterations=1000)

gs_lbfgs = RandomizedSearchCV(crf_lbfgs, params_space_lbfgs,cv=5,verbose=1,scoring=f1_scorer_lbfgs)

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    gs_lbfgs.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed: 65.4min finished


In [6]:
print('best params:', gs_lbfgs.best_params_)
print('best CV score:', gs_lbfgs.best_score_)

best params: {'all_possible_states': True, 'all_possible_transitions': True, 'c1': 0.04991068081941039, 'c2': 0.06691356342155838, 'max_iterations': 100, 'min_freq': 0.0}
best CV score: 0.9797925718230948


In [7]:
lbfgs_preds1 = gs_lbfgs.predict(X_test)

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    print(flat_f1_score(y_test,lbfgs_preds1,average="weighted"))
    print("---------------------------------------------------------------")
    print(flat_classification_report(y_test,lbfgs_preds1,labels=labels[1:]))

0.9805928672011779
---------------------------------------------------------------
              precision    recall  f1-score   support

   B-DISEASE       0.89      0.85      0.87      1857
   I-DISEASE       0.87      0.77      0.82      1802

   micro avg       0.88      0.81      0.84      3659
   macro avg       0.88      0.81      0.84      3659
weighted avg       0.88      0.81      0.84      3659



In [8]:
lbfgs_validation_predictions = gs_lbfgs.predict(X_validation)

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    print(flat_f1_score(y_validation,lbfgs_validation_predictions,average="weighted"))
    print("---------------------------------------------------------------")
    print(flat_classification_report(y_validation,lbfgs_validation_predictions,labels=labels[1:]))

0.9732180407294377
---------------------------------------------------------------
              precision    recall  f1-score   support

   B-DISEASE       0.86      0.78      0.81       960
   I-DISEASE       0.85      0.78      0.81      1087

   micro avg       0.85      0.78      0.81      2047
   macro avg       0.85      0.78      0.81      2047
weighted avg       0.85      0.78      0.81      2047



In [17]:
lbfgs_crf_model = sklearn_crfsuite.CRF(algorithm ='lbfgs',
                                max_iterations =100,
                                c1 = 0.04991068081941039,
                                c2 = 0.06691356342155838,
                                min_freq = 0.0,
                                all_possible_states = True, 
                                all_possible_transitions = True
                               )
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    lbfgs_crf_model.fit(X,y)

In [20]:
lbfgs_validation_predictions2 = lbfgs_crf_model.predict(X_validation)

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    print(flat_f1_score(y_validation,lbfgs_validation_predictions2,average="weighted"))
    print("---------------------------------------------------------------")
    print(flat_classification_report(y_validation,lbfgs_validation_predictions2,labels=labels[1:]))

0.9714783831245982
---------------------------------------------------------------
              precision    recall  f1-score   support

   B-DISEASE       0.84      0.76      0.80       960
   I-DISEASE       0.84      0.76      0.80      1087

   micro avg       0.84      0.76      0.80      2047
   macro avg       0.84      0.76      0.80      2047
weighted avg       0.84      0.76      0.80      2047



### CRF using L2SGD

In [9]:
params_space_l2sgd = {
    'min_freq':[0.0,1.0,2.0,3.0],
    'c2': expon(scale=0.05),
    'all_possible_states':[True,False],
    'all_possible_transitions':[True,False],
    'max_iterations':[10,100,1000]
}

f1_scorer_l2sgd = make_scorer(flat_f1_score,average='weighted', labels=labels)

crf_l2sgd  = sklearn_crfsuite.CRF(algorithm='l2sgd',max_iterations=1000)

gs_l2sgd = RandomizedSearchCV(crf_l2sgd, params_space_l2sgd,cv=5,verbose=1,scoring=f1_scorer_l2sgd)

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    gs_l2sgd.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed: 12.2min finished


In [10]:
print('best params:', gs_l2sgd.best_params_)
print('best CV score:', gs_l2sgd.best_score_)

best params: {'all_possible_states': True, 'all_possible_transitions': True, 'c2': 0.08073422801181261, 'max_iterations': 1000, 'min_freq': 0.0}
best CV score: 0.9799432693812055


In [11]:
l2sgd_preds1 = gs_l2sgd.predict(X_test)

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    print(flat_f1_score(y_test,l2sgd_preds1,average="weighted"))
    print("---------------------------------------------------------------")
    print(flat_classification_report(y_test,l2sgd_preds1,labels=labels[1:]))

0.9804848400992554
---------------------------------------------------------------
              precision    recall  f1-score   support

   B-DISEASE       0.90      0.84      0.87      1857
   I-DISEASE       0.89      0.76      0.82      1802

   micro avg       0.89      0.80      0.84      3659
   macro avg       0.89      0.80      0.84      3659
weighted avg       0.89      0.80      0.84      3659



In [12]:
l2sgd_validation_predictions = gs_l2sgd.predict(X_validation)

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    print(flat_f1_score(y_validation,l2sgd_validation_predictions,average="weighted"))
    print("---------------------------------------------------------------")
    print(flat_classification_report(y_validation,l2sgd_validation_predictions,labels=labels[1:]))

0.9714168804967231
---------------------------------------------------------------
              precision    recall  f1-score   support

   B-DISEASE       0.86      0.76      0.81       960
   I-DISEASE       0.86      0.74      0.80      1087

   micro avg       0.86      0.75      0.80      2047
   macro avg       0.86      0.75      0.80      2047
weighted avg       0.86      0.75      0.80      2047



In [21]:
l2sgd_crf_model = sklearn_crfsuite.CRF(algorithm ='l2sgd',
                                max_iterations =1000,
                                c2 = 0.08073422801181261,
                                min_freq = 0.0,
                                all_possible_states = True, 
                                all_possible_transitions = True
                               )
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    l2sgd_crf_model.fit(X,y)

In [22]:
l2sgd_validation_predictions2 = l2sgd_crf_model.predict(X_validation)

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    print(flat_f1_score(y_validation,l2sgd_validation_predictions2,average="weighted"))
    print("---------------------------------------------------------------")
    print(flat_classification_report(y_validation,l2sgd_validation_predictions2,labels=labels[1:]))

0.9709921726026618
---------------------------------------------------------------
              precision    recall  f1-score   support

   B-DISEASE       0.85      0.77      0.81       960
   I-DISEASE       0.85      0.74      0.79      1087

   micro avg       0.85      0.75      0.80      2047
   macro avg       0.85      0.75      0.80      2047
weighted avg       0.85      0.75      0.80      2047



### CRF using Average Perceptron (AP) algorithm

In [13]:
params_space_ap = {
    'min_freq':[0.0,1.0,2.0,3.0],
    'all_possible_states':[True,False],
    'all_possible_transitions':[True,False],
    'max_iterations':[10,100,1000]
}

f1_scorer_ap = make_scorer(flat_f1_score,average='weighted', labels=labels)

crf_ap = sklearn_crfsuite.CRF(algorithm='ap',max_iterations=1000)

gs_ap = RandomizedSearchCV(crf_ap, params_space_ap,cv=5,verbose=1,scoring=f1_scorer_ap)

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    gs_ap.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed: 18.3min finished


In [14]:
print('best params:', gs_ap.best_params_)
print('best CV score:', gs_ap.best_score_)

best params: {'min_freq': 2.0, 'max_iterations': 100, 'all_possible_transitions': True, 'all_possible_states': True}
best CV score: 0.9786478301549139


In [15]:
ap_preds1 = gs_ap.predict(X_test)

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    print(flat_f1_score(y_test,ap_preds1,average="weighted"))
    print("---------------------------------------------------------------")
    print(flat_classification_report(y_test,ap_preds1,labels=labels[1:]))

0.9799889073370782
---------------------------------------------------------------
              precision    recall  f1-score   support

   B-DISEASE       0.87      0.84      0.86      1857
   I-DISEASE       0.90      0.75      0.82      1802

   micro avg       0.88      0.80      0.84      3659
   macro avg       0.88      0.80      0.84      3659
weighted avg       0.88      0.80      0.84      3659



In [16]:
ap_validation_predictions = gs_ap.predict(X_validation)

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    print(flat_f1_score(y_validation,ap_validation_predictions,average="weighted"))
    print("---------------------------------------------------------------")
    print(flat_classification_report(y_validation,ap_validation_predictions,labels=labels[1:]))

0.9726788008734242
---------------------------------------------------------------
              precision    recall  f1-score   support

   B-DISEASE       0.84      0.79      0.81       960
   I-DISEASE       0.90      0.73      0.81      1087

   micro avg       0.87      0.76      0.81      2047
   macro avg       0.87      0.76      0.81      2047
weighted avg       0.87      0.76      0.81      2047



In [7]:
#((0.79 * 1187) + (0.71 * 522))/
(1187 + 522)

1709

In [23]:
ap_crf_model = sklearn_crfsuite.CRF(algorithm ='ap',
                                max_iterations =100,
                                min_freq = 2.0,
                                all_possible_states = True, 
                                all_possible_transitions = True
                               )
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    ap_crf_model.fit(X,y)

In [24]:
ap_validation_predictions2 = ap_crf_model.predict(X_validation)

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    print(flat_f1_score(y_validation,ap_validation_predictions2,average="weighted"))
    print("---------------------------------------------------------------")
    print(flat_classification_report(y_validation,ap_validation_predictions2,labels=labels[1:]))

0.9725042699419075
---------------------------------------------------------------
              precision    recall  f1-score   support

   B-DISEASE       0.83      0.78      0.81       960
   I-DISEASE       0.90      0.74      0.81      1087

   micro avg       0.86      0.76      0.81      2047
   macro avg       0.86      0.76      0.81      2047
weighted avg       0.86      0.76      0.81      2047



In [25]:
import joblib
import pickle
import os

In [26]:
model_file = os.path.join(os.getcwd(),'models','disease','disease-ner.sav')
pickle.dump(gs_ap,open(model_file,'wb'))