In [1]:
import json

import numpy as np
import pandas as pd
from tqdm.autonotebook import tqdm

from medcat.cat import CAT
from medcat.cdb import CDB
from medcat.meta_cat import MetaCAT
from medcat.utils.helpers import run_cv
from medcat.vocab import Vocab


# Config

In [2]:
# --- Inputs -----------------------------------------------------------------
# Starting concept database and the vocabulary loaded alongside it.
cdb_path = "20210304_basic_deid_cdb_wtout_names.dat"
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
vocab_path = "/Users/shek/Desktop/medcat/kch_vocab_300.dat"
# Annotated JSON export used for supervised training and for the stats cells.
data_path = "../Train MedCAT | NER+L/Data/mct-exports/MedCAT_Export_With_Text_2021-03-18_20_02_52.json"

# Optional JSON inputs; leave as None to skip them.
groups_path = None  # path to the JSON file with groups
filter_path = None  # path to a JSON filter restricting training to specific concepts

# --- Output -----------------------------------------------------------------
# Where the CDB is written after supervised training.
cdb_save_path = "../Train MedCAT | NER+L/Data/20210304_supervised_basic_deid_cdb_wtout_names.dat"

# Preprocessing

In [23]:
# Parse the annotated export into memory. The context manager closes the file
# handle as soon as parsing is done instead of leaving it to the garbage collector.
with open(data_path, 'r') as export_file:
    t_file = json.load(export_file)

In [26]:

# Correct url ann to Website: every annotation labelled 'C2500' is relabelled 'W5000'.
# The loop variables reference the dicts stored inside `t_file`, so assigning
# through `ann` updates `t_file` in place.
# NOTE(review): the change lives only in the in-memory `t_file`; no visible cell
# writes it back to `data_path`, which is what the training cell reads - confirm
# whether the corrected export should be saved.
for project in t_file['projects']:
    for doc in project['documents']:
        for ann in doc['annotations']:
            if ann['cui'] != 'C2500':
                continue
            ann['cui'] = 'W5000'
            print("Changed C2500 to W5000")
            

# Train MedCAT

In [3]:
# Load the starting CDB and vocabulary, adjust the CDB's config, then build the
# CAT pipeline from them.
cdb = CDB.load(cdb_path)
vocab = Vocab.load(path=vocab_path)
config = cdb.config  # alias; every change below is made on the CDB's own config

# General options
config.general['spell_check'] = True
config.general['full_unlink'] = True

# NER options
config.ner['min_name_len'] = 2
config.ner['upper_case_limit_len'] = 3

# Linking options (training is switched on for the supervised run)
config.linking['train'] = True
config.linking['train_count_threshold'] = 10
config.linking['similarity_threshold'] = 0.3
config.linking['disamb_length_limit'] = 5

cat = CAT(cdb=cdb, config=config, vocab=vocab)

In [None]:
#cat.cdb.reset_cui_count(n=30)

# Set filter

In [4]:
# Optional concept filter. `concept_filter` is always bound (None when no filter
# file is configured) so later cells can test it without risking a NameError,
# and the file handle is closed once the JSON has been parsed.
# NOTE(review): no visible cell reads `concept_filter`, and the training call
# passes use_filters=False - confirm how the filter is meant to be applied.
concept_filter = None
if filter_path is not None:
    with open(filter_path, 'r') as filter_file:
        concept_filter = json.load(filter_file)

# Train

In [None]:
# Supervised training straight from the export file at `data_path`.
# The call returns eight values, unpacked here into per-metric results,
# counts and examples.
(fp, fn, tp, p, r, f1, cnts, examples) = cat.train_supervised(
    data_path=data_path,
    nepochs=5,
    reset_cui_count=False,
    use_filters=False,
    devalue_others=True,
    never_terminate=False,
    terminate_last=True,
    print_stats=True,
)

# Save 

In [6]:
# Switch linking training off *before* writing the CDB to disk. Previously the
# flag was only cleared after saving, so the persisted config would still have
# had training enabled when reloaded.
# NOTE(review): this assumes CDB.save serialises the config together with the
# concepts - confirm for the installed MedCAT version.
cdb.config.linking['train'] = False
cat.cdb.save(cdb_save_path)

# Test model

In [3]:
# Load the CDB stored at `cdb_save_path`; this rebinds `cdb` to the copy read from disk.
cdb = CDB.load(cdb_save_path)

In [6]:
# Display the `cui2count_train` mapping (CUI -> count) held by `cdb`.
cdb.cui2count_train

{'R0000': 0,
 'N1000': 0,
 'C2000': 0,
 'H3000': 0,
 'D4000': 170,
 'N1100': 15670,
 'N1200': 20200,
 'C2100': 5,
 'C2110': 5330,
 'C2120': 5150,
 'C2200': 4220,
 'C2300': 3885,
 'C2400': 0,
 'C2410': 0,
 'C2420': 0,
 'C2430': 15,
 'H3100': 1625,
 'H3200': 3540,
 'H3300': 2915,
 'H3400': 965,
 'H4100': 1950,
 'C2500': 25,
 'N1300': 880,
 'H3500': 10}

In [27]:
# Display the CUIs that have an entry in `cdb.cui2names`.
cdb.cui2names.keys()

dict_keys(['R0000', 'N1000', 'C2000', 'H3000', 'D4000', 'N1100', 'N1200', 'C2100', 'C2110', 'C2120', 'C2200', 'C2300', 'C2400', 'C2410', 'C2420', 'C2430', 'H3100', 'H3200', 'H3300', 'H3400', 'H4100', 'C2500', 'N1300', 'H3500'])

In [None]:
# Sample letter text kept for manual spot checks.
# NOTE(review): no visible cell reads `text_sample`; the spelling inside the
# string is left untouched because it is runtime data.
text_sample = """
Dear Dr False,

Patient DOB: 13/01/2018

It was a pleasure to meet John Doe in my clinic today. John suffered from chest pains but it turned out to be fine.



Thank you.

Sincerely,
DR Faker

Imagainary hospital
Golden gate
se26 4rl
"""

# Print Stats

### If groups are used, run the two cells below

In [None]:
# Optional groups mapping. `groups` stays None when no groups file is configured.
# The context manager closes the file handle once the JSON has been parsed.
groups = None
if groups_path is not None:
    with open(groups_path) as groups_file:
        groups = json.load(groups_file)

In [None]:
# Rebuild group assignments from the loaded `groups` mapping.
# NOTE(review): stale groups are removed via `cdb`, while new ones are added via
# `cat`; if `cdb` has been reloaded and is no longer the object behind `cat`,
# these act on different stores - confirm this is intended.
if groups is not None:
    # Remove any 'group' entry already stored in the per-CUI info.
    for cui_info in cdb.cui2info.values():
        cui_info.pop("group", None)

    # Each key of `groups` is passed as the group, each listed value as the CUI.
    for group_name, member_cuis in groups.items():
        for member_cui in member_cuis:
            cat.add_cui_to_group(member_cui, group_name)

## Stats

In [None]:
# Score by groups only when a groups mapping was loaded.
use_groups = groups is not None

# Read the export with a context manager so the file handle is closed promptly.
with open(data_path) as data_file:
    data = json.load(data_file)

fp, fn, tp, p, r, f1, cnts, examples = cat._print_stats(data, use_filters=True, use_cui_doc_limit=True, use_groups=use_groups)

In [None]:
# Keys to report on: the group names when groups are in use, otherwise every
# key that received an F1 score.
chosen_cui = list(groups.keys()) if groups is not None else list(f1.keys())

In [None]:
# Assemble one row per chosen CUI (missing metrics default to 0, missing
# names/types fall back to the CUI itself or 'unk'), write the table to
# results_train.csv and preview the first 20 rows.
result_columns = ("NumAnnotatedExamples", "F1", "P", "R", "Name", "CUI", "Type", "Scores Calculated On")
result_rows = [
    [
        cnts.get(cui, 0),
        f1.get(cui, 0),
        p.get(cui, 0),
        r.get(cui, 0),
        cdb.cui2pretty_name.get(cui, cui),
        cui,
        cdb.tui2name.get(cdb.cui2tui.get(cui, 'unk'), 'unk'),
        "Train Set",
    ]
    for cui in chosen_cui
]

df = pd.DataFrame(result_rows, columns=result_columns)
df.to_csv("results_train.csv", index=False)
print(len(df))
df.head(n=20)

### Run CV if necessary

In [None]:
from medcat.utils.helpers import run_cv

In [None]:
# `ignore_termination` is not assigned anywhere else in the visible notebook, so
# this cell raised a NameError on a fresh kernel. It is defined here; False
# mirrors the `never_terminate=False` used for the supervised training run.
ignore_termination = False

# Cross-validation over the annotated export; each returned metric is later
# averaged per key.
fps, fns, tps, ps, rs, f1s, cnts, examples = run_cv(
    cdb_path,
    data_path,
    vocab_path,
    cv=2,
    nepochs=1,
    lr=1,
    groups=groups,
    anneal=True,
    print_stats=True,
    use_filters=True,
    reset_cui_count=True,
    never_terminate=ignore_termination,
    use_cui_doc_limit=True,
)

In [None]:
def _ceil_mean(values):
    """Return the average of `values`, rounded up and converted to int."""
    return int(np.ceil(np.average(values)))


# Collapse each key's list of values (presumably one per CV fold) into a single
# number. Scores are plain averages; counts are averaged and rounded up, with a
# missing key treated as [0].
f1 = {key: np.average(scores) for key, scores in f1s.items()}
p = {key: np.average(ps[key]) for key in f1s}
r = {key: np.average(rs[key]) for key in f1s}

tp = {key: _ceil_mean(tps.get(key, [0])) for key in f1s}
fp = {key: _ceil_mean(fps.get(key, [0])) for key in f1s}
fn = {key: _ceil_mean(fns.get(key, [0])) for key in f1s}

In [None]:
# Used to get counts from the whole dataset: only `cnts` and `examples` are
# kept, the metric outputs are discarded. The context manager closes the file
# handle once the JSON has been parsed.
with open(data_path) as data_file:
    data = json.load(data_file)
_, _, _, _, _, _, cnts, examples = cat._print_stats(data, use_filters=True, use_cui_doc_limit=False)

In [None]:
# Keys to report on: the group names when groups are in use, otherwise every
# key that has an averaged F1 score.
chosen_cui = list(groups.keys()) if groups is not None else list(f1.keys())

In [None]:
# Assemble one row per chosen CUI (missing metrics default to 0, missing
# names/types fall back to the CUI itself or 'unk'), write the table to
# results_test.csv and preview the first 20 rows.
result_columns = ("NumAnnotatedExamples", "F1", "P", "R", "Name", "CUI", "Type", "Scores Calculated On")
result_rows = [
    [
        cnts.get(cui, 0),
        f1.get(cui, 0),
        p.get(cui, 0),
        r.get(cui, 0),
        cdb.cui2pretty_name.get(cui, cui),
        cui,
        cdb.tui2name.get(cdb.cui2tui.get(cui, 'unk'), 'unk'),
        "Test Set",
    ]
    for cui in chosen_cui
]

df = pd.DataFrame(result_rows, columns=result_columns)
df.to_csv("results_test.csv", index=False)
print(len(df))
df.head(n=20)