In [1]:
import json
from crf_feature_engineering import note2features, note2labels, note2features_withlemma, note2labels_withlemma
from sklearn.model_selection import train_test_split
import sklearn_crfsuite
import sklearn
import pandas as pd
from sklearn_crfsuite import metrics
import eli5
import imgkit



In [2]:
# NOTE: this is a Jupyter notebook so that the ELI5 results can be presented visually

# Print the installed scikit-learn version; it is expected to be below 0.24.
# NOTE(review): the reason for the <0.24 requirement is not stated in this
# notebook - presumably compatibility with sklearn_crfsuite; confirm.
print(sklearn.__version__)

0.22.2


In [3]:
# Read the pre-processed notes (with and without lemmas) from JSON.
# Every top-level JSON entry is converted to a list.
# The encoding is passed explicitly: the notes contain non-ASCII (Norwegian)
# characters and open() otherwise falls back to a platform-dependent default,
# which can mis-decode the text or raise UnicodeDecodeError on some systems.
with open('crfsuite_data/crfsuite_data_lemma_both_sessions.json', encoding='utf-8') as f:
    crfsuite_data_lemma_both_sessions = [list(x) for x in json.load(f)]

with open('crfsuite_data/crfsuite_data_no_lemma_both_sessions.json', encoding='utf-8') as f:
    crfsuite_data_no_lemma_both_sessions = [list(x) for x in json.load(f)]

# NOTE(review): the count is taken from the no-lemma set, while the cells
# visible below only use the lemma set - confirm both have the same length.
print("Number of notes: ", len(crfsuite_data_no_lemma_both_sessions))

Notater session 1:  62
Notater session 5:  183
Notater begge:  249


In [4]:
# One feature entry (X) and one label entry (y) per note in the lemma data set
X = list(map(note2features_withlemma, crfsuite_data_lemma_both_sessions))
y = list(map(note2labels_withlemma, crfsuite_data_lemma_both_sessions))

In [5]:
# Hold out 20% of the notes as the test set.
# The fixed random_state makes the split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=20,
)

In [6]:
# Set up the sklearn_crfsuite CRF model.
# Training uses L-BFGS (the library default) combined with Elastic Net
# regularization, i.e. both an L1 term (c1) and an L2 term (c2).
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    # regularization coefficients
    c1=0.1,
    c2=0.1,
    all_possible_transitions=True,
)

In [7]:
# Train the CRF model on the training split only (X_test / y_test stay unseen).
# As the last expression of the cell, the call's return value is echoed as the cell output.
crf.fit(X_train, y_train)



CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=True,
    averaging=None, c=None, c1=0.1, c2=0.1, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

In [8]:
### Evaluation ###

# 'O' is heavily overrepresented, so it is left out when calculating metrics
labels = [*crf.classes_]
labels.remove('O')

y_pred = crf.predict(X_test)

# Alternative single-number summary (disabled):
#metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels)

# Sort on the entity part of the tag first and on the B/I prefix second,
# so the B- and I- rows of the same entity end up next to each other
sorted_labels = sorted(labels, key=lambda tag: (tag[1:], tag[0]))

print("\nReport:\n")

# Plain-text variant of the report (disabled):
#print(metrics.flat_classification_report(y_test, y_pred, labels=sorted_labels))

report_dict = metrics.flat_classification_report(
    y_test,
    y_pred,
    labels=sorted_labels,
    output_dict=True,
)

# One row per report entry; scores rounded to two decimals, support as integers
report_df = (
    pd.DataFrame(report_dict)
    .T
    .round(decimals=2)
    .assign(support=lambda frame: frame["support"].round().astype(int))
)

display(report_df)



Report:

                        precision    recall  f1-score   support

         B-IV_generelt       0.75      0.62      0.68        24
         I-IV_generelt       0.27      0.44      0.33         9
             B-Pasient       0.73      0.79      0.76        94
             I-Pasient       0.50      0.25      0.33         4
B-Perifert_venekateter       0.85      0.79      0.81        28
I-Perifert_venekateter       0.78      0.88      0.82         8
        B-Tegn_Symptom       0.83      0.75      0.79        57
        I-Tegn_Symptom       0.17      0.12      0.14         8
         B-Vitalt_Tegn       0.83      1.00      0.91         5
         I-Vitalt_Tegn       0.67      1.00      0.80         2

             micro avg       0.72      0.73      0.73       239
             macro avg       0.64      0.66      0.64       239
          weighted avg       0.73      0.73      0.73       239



Unnamed: 0,precision,recall,f1-score,support
B-IV_generelt,0.75,0.62,0.68,24
I-IV_generelt,0.27,0.44,0.33,9
B-Pasient,0.73,0.79,0.76,94
I-Pasient,0.5,0.25,0.33,4
B-Perifert_venekateter,0.85,0.79,0.81,28
I-Perifert_venekateter,0.78,0.88,0.82,8
B-Tegn_Symptom,0.83,0.75,0.79,57
I-Tegn_Symptom,0.17,0.12,0.14,8
B-Vitalt_Tegn,0.83,1.0,0.91,5
I-Vitalt_Tegn,0.67,1.0,0.8,2


In [9]:
# Save dataframe to csv to add into latex as a table
#report_df.to_csv('results_csv/both_sessions_lemma_classification_report.csv')

In [10]:
# Render the trained CRF's weights with ELI5: the label-to-label transition
# table followed by the highest-weighted features per label (top=10 caps the
# number of features listed for each label).
eli5.show_weights(crf, top=10)



From \ To,O,B-IV_generelt,I-IV_generelt,B-Pasient,I-Pasient,B-Perifert_venekateter,I-Perifert_venekateter,B-Tegn_Symptom,I-Tegn_Symptom,B-Vitalt_Tegn,I-Vitalt_Tegn
O,2.645,0.893,-3.992,0.746,-2.699,0.557,-2.749,0.839,-3.391,-0.012,-2.829
B-IV_generelt,0.269,-0.206,3.367,-1.872,-0.125,-0.658,-0.007,-0.413,-0.389,-0.15,-0.475
I-IV_generelt,-0.074,-0.118,4.23,-1.366,0.0,0.0,0.0,-0.369,-0.114,0.0,-0.279
B-Pasient,0.253,-0.011,-0.661,-0.941,3.07,0.0,-0.127,-0.273,-0.87,0.0,-0.661
I-Pasient,0.0,0.0,0.0,-0.703,1.622,0.0,0.0,-0.127,0.0,0.0,0.0
B-Perifert_venekateter,-0.329,-0.159,-0.35,0.0,0.0,-0.025,3.489,-0.187,-0.013,0.0,-0.054
I-Perifert_venekateter,-0.027,0.0,-0.001,0.0,0.0,0.0,2.085,-0.204,0.0,0.0,0.0
B-Tegn_Symptom,0.019,-0.356,-0.818,-0.434,-0.451,-0.31,-0.573,0.0,3.657,-0.123,-0.62
I-Tegn_Symptom,-0.049,-0.018,-0.087,0.0,0.0,0.0,0.0,-0.035,2.812,0.0,0.0
B-Vitalt_Tegn,-1.118,-0.238,-0.492,0.0,0.0,-0.096,0.0,-0.201,-0.242,0.103,3.502

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5,Unnamed: 10_level_5
Weight?,Feature,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6,Unnamed: 8_level_6,Unnamed: 9_level_6,Unnamed: 10_level_6
Weight?,Feature,Unnamed: 2_level_7,Unnamed: 3_level_7,Unnamed: 4_level_7,Unnamed: 5_level_7,Unnamed: 6_level_7,Unnamed: 7_level_7,Unnamed: 8_level_7,Unnamed: 9_level_7,Unnamed: 10_level_7
Weight?,Feature,Unnamed: 2_level_8,Unnamed: 3_level_8,Unnamed: 4_level_8,Unnamed: 5_level_8,Unnamed: 6_level_8,Unnamed: 7_level_8,Unnamed: 8_level_8,Unnamed: 9_level_8,Unnamed: 10_level_8
Weight?,Feature,Unnamed: 2_level_9,Unnamed: 3_level_9,Unnamed: 4_level_9,Unnamed: 5_level_9,Unnamed: 6_level_9,Unnamed: 7_level_9,Unnamed: 8_level_9,Unnamed: 9_level_9,Unnamed: 10_level_9
Weight?,Feature,Unnamed: 2_level_10,Unnamed: 3_level_10,Unnamed: 4_level_10,Unnamed: 5_level_10,Unnamed: 6_level_10,Unnamed: 7_level_10,Unnamed: 8_level_10,Unnamed: 9_level_10,Unnamed: 10_level_10
+2.235,EOS,,,,,,,,,
+2.009,bias,,,,,,,,,
… 905 more positive …,… 905 more positive …,,,,,,,,,
… 820 more negative …,… 820 more negative …,,,,,,,,,
-1.705,word[-3:]:lin,,,,,,,,,
-1.718,word.lower():atb,,,,,,,,,
-1.786,word[-3:]:lon,,,,,,,,,
-1.877,lemma:rød,,,,,,,,,
-1.978,word[-3:]:het,,,,,,,,,
-2.230,lemma:smerte,,,,,,,,,

Weight?,Feature
+2.235,EOS
+2.009,bias
… 905 more positive …,… 905 more positive …
… 820 more negative …,… 820 more negative …
-1.705,word[-3:]:lin
-1.718,word.lower():atb
-1.786,word[-3:]:lon
-1.877,lemma:rød
-1.978,word[-3:]:het
-2.230,lemma:smerte

Weight?,Feature
+2.843,lemma:Furix-infusjon
+2.526,word[-3:]:mpe
+2.255,word[-3:]:ant
+2.188,+1:word.lower():propofolinfusjon
+2.188,+1:lemma:Propofolinfusjon
+1.965,word.lower():furix-infusjonen
+1.935,word.lower():medikamentet
+1.890,lemma:infusjon
+1.886,word[-2:]:pe
+1.873,+1:word.lower():fått

Weight?,Feature
+1.534,+1:lemma:komme
+1.525,+1:word.lower():kom
+1.465,lemma:Pasienten
+1.294,-1:lemma:gå
+1.251,lemma:infusjon
+1.182,lemma:få
+1.163,-1:lemma:ikke
+1.163,-1:word.lower():ikke
+0.978,word.lower():pasienten
+0.967,-1:lemma:cordaroneinfusjon

Weight?,Feature
+3.930,lemma:pasient
+3.144,lemma:Pasient
+2.579,lemma:Pasienten
+2.524,word[-2:]:as
+2.100,word[-3:]:ten
+1.883,word.lower():patient
+1.883,lemma:Patient
+1.762,+1:lemma:Under
+1.652,word[-3:]:ent
+1.525,word.lower():pasienten

Weight?,Feature
+1.769,-1:word.lower():pas.
+1.673,+1:lemma:ta
+1.457,+1:word.lower():tok
+1.336,-1:lemma:lege
+1.329,-1:word.lower():lege
+1.221,word.isupper()
+1.122,+1:lemma:pasient
+1.111,+1:word.lower():oppdaget
+1.104,+1:lemma:oppdage
+1.031,+1:word.lower():pasient

Weight?,Feature
+4.130,word.lower():pvk
+2.214,lemma:veneflon
+2.154,word[-3:]:lon
+1.907,lemma:pvk'er
+1.907,word.lower():pvk'er
+1.907,word[-3:]:'er
+1.737,word[-2:]:vk
+1.735,word.lower():venflon
+1.729,lemma:venflon
+1.615,word[-3:]:nen

Weight?,Feature
+1.311,+1:word.lower():blitt
+1.235,-1:word.lower():koblet
+1.234,-1:lemma:koble
+1.222,-1:lemma:PVK&#8217;en
+1.222,-1:word.lower():pvk&amp;#8217;en
+1.076,-1:lemma:grønn
+0.967,lemma:venflon
+0.947,word.lower():venflon
+0.936,word[-3:]:lon
+0.927,postag:<komma>

Weight?,Feature
+2.959,lemma:smerte
+2.616,lemma:rød
+1.943,lemma:hoven
+1.923,word.lower():hovnet
+1.923,lemma:hovne
+1.831,lemma:Palpasjonsøm
+1.831,word[-3:]:søm
+1.831,word[-2:]:øm
+1.831,word.lower():palpasjonsøm
+1.780,lemma:ømhet

Weight?,Feature
+1.466,word.lower():smerter
+1.224,-1:lemma:lett
+1.224,-1:word.lower():lett
+1.119,lemma:smerte
+1.094,+1:lemma:hud
+0.935,word[-2:]:ar
+0.831,-1:word.lower():ingen
+0.831,-1:lemma:ingen
+0.824,-1:postag[:2]:pr
… 186 more positive …,… 186 more positive …

Weight?,Feature
+3.798,lemma:tungpusthet
+2.344,word.lower():respirasjon
+2.344,lemma:respirasjon
+2.110,word.lower():febril
+2.110,lemma:febril
+2.110,word[-3:]:ril
+1.832,word.lower():tungpustheten
+1.568,word.lower():tungpusthet
+1.336,word[-3:]:all
+1.321,word[-2:]:il

Weight?,Feature
+0.854,word.isdigit()
+0.793,-1:word.lower():pustet
+0.793,-1:lemma:puste
+0.762,-1:word.lower():bt
+0.709,lemma:måling
+0.709,word.lower():måling
+0.706,-1:word.lower():frysninger
+0.706,+1:lemma:Infeksjonsparameter
+0.706,-1:lemma:frysning
… 201 more positive …,… 201 more positive …


In [11]:
#expl = eli5.explain_weights(crf, top=10)
#df = eli5.format_as_dataframes(expl)
#display(df)

In [12]:
"""
eli5_report = eli5.show_weights(crf, top=10).data

with open('eli5_both_with_lemma.html', 'w') as f:
    f.write(eli5_report)
"""

"\neli5_report = eli5.show_weights(crf, top=10).data\n\nwith open('eli5_both_with_lemma.html', 'w') as f:\n    f.write(eli5_report)\n"

In [13]:
"""
options = {
    'format': 'png',
    'encoding': "UTF-8",
}

imgkit.from_file('eli5_both_with_lemma.html', 'eli5_both_with_lemma.png', options=options)
"""


'\noptions = {\n    \'format\': \'png\',\n    \'encoding\': "UTF-8",\n}\n\nimgkit.from_file(\'eli5_both_with_lemma.html\', \'eli5_both_with_lemma.png\', options=options)\n'