In [1]:
import psycopg2
from collections import defaultdict
import re
from collections import Counter
from estnltk import Text
import pickle
from tqdm import tqdm
import csv

In [2]:
import os
import sys

In [3]:
# Make the project importable: strip this notebook's sub-directory from the
# current working directory and put what remains on the module search path.
sys.path.append(
    os.getcwd().replace(
        'cda_data_cleaning/fact_extraction/diag_text_parsing/I_diag_explorations', ''
    )
)

In [4]:
from cda_data_cleaning.fact_extraction.diag_text_parsing.development.taggers.diagnosis_text_tagger import DiagnosisTextTagger
from cda_data_cleaning.fact_extraction.diag_text_parsing.development.taggers.diag_stage_tagger import DiagnosisStageTagger
from cda_data_cleaning.fact_extraction.measurement_extraction.taggers.robust_date_number_tagger.robust_date_number_tagger import RobustDateNumberTagger

INFO:base_parser.py:30: Loaded []


In [5]:
# Instantiate the three taggers once; each is applied to every text in the
# tagging loop below via `.tag(text)`.
# NOTE(review): presumably these produce the 'diagnosis', 'stages' and
# 'dates_numbers' layers respectively — confirm against the tagger classes.
diag_tagger = DiagnosisTextTagger()
stage_tagger = DiagnosisStageTagger()
date_tagger = RobustDateNumberTagger()

In [6]:
# Load the pre-computed mapping of leftover text fragments to their counts.
# Keys are indexable: element 0 is the text, element 1 a code-range label
# such as 'I00-I99' (see how the mapping is filtered in the next cell).
# SECURITY: pickle.load can execute arbitrary code; only load this file if it
# comes from a trusted source.
# The context manager closes the file handle, which the bare open() call leaked.
with open("../all_diag_explorations2/real_extras2.pickle", "rb") as fin:
    real_extras = pickle.load(fin)

In [7]:
# Keep only the entries whose key carries the 'I00-I99' code-range label
# (second element of the key); values (counts) are carried over unchanged.
i_texts = {
    key: n_occurrences
    for key, n_occurrences in real_extras.items()
    if key[1] == 'I00-I99'
}

In [8]:
# Run all three taggers over every selected text and split the texts into
# "tagged" (at least one useful annotation) and "not tagged".
#   tagged_texts     - Text objects with a diagnosis span, a stage span or a DATE span
#   not_tagged_texts - Text objects with none of the above
#   not_tagged_rows  - original (key -> count) entries of the not-tagged texts
tagged_texts = []
not_tagged_texts = []
not_tagged_rows = Counter()

for thing, count in tqdm(i_texts.items()):
    # thing[0] is the raw text; the whole key is kept for the not-tagged report.
    text = Text(thing[0])
    diag_tagger.tag(text)
    stage_tagger.tag(text)
    date_tagger.tag(text)

    has_diag_or_stage = len(text['diagnosis']) > 0 or len(text['stages']) > 0

    # A text without diagnosis/stage spans still counts as tagged when the
    # 'dates_numbers' layer holds at least one span whose first grammar symbol
    # is 'DATE'. any() over an empty layer is False, so no length check is needed.
    if has_diag_or_stage or any(
        span.grammar_symbol[0] == 'DATE' for span in text['dates_numbers']
    ):
        tagged_texts.append(text)
    else:
        not_tagged_texts.append(text)
        not_tagged_rows[thing] = count

100%|██████████| 3191/3191 [00:02<00:00, 1343.85it/s]


In [9]:
# Number of entries selected with the 'I00-I99' label.
len(i_texts)

3191

In [10]:
# Number of texts that received a diagnosis, stage or DATE annotation.
len(tagged_texts)

2255

In [11]:
# Number of texts that received none of the diagnosis, stage or DATE annotations.
len(not_tagged_texts)

936

In [12]:
# Eyeball every 10th tagged text together with the spans found in it:
# diagnosis spans, DATE spans from the dates/numbers layer, then stage spans.
for sample in tagged_texts[::10]:
    print(sample.text)
    print('\t===>')
    for diag_span in sample.diagnosis:
        print('\t', diag_span.text, ': ', diag_span.grammar_symbol[0])
    for dn_span in sample.dates_numbers:
        # Only spans whose first grammar symbol is 'DATE' are shown.
        if d_is_date := (dn_span.grammar_symbol[0] == 'DATE'):
            print('\t', dn_span.text, ': ', 'DATE')
    for stage_span in sample.stages:
        print('\t', stage_span.text, ': ', stage_span.grammar_symbol[0])
    print('---------')
    print()

Vasakpoolne spastiline hemiparees
	===>
	 hemipare :  HEMIPARESIS
---------

(18.01.2011)
	===>
	 18.01.2011 :  DATE
---------

(dekompensatsioon: kopsupais, kopsuturse, perifeersed tursed --> NYHA III aste)
	===>
	 dekompens :  SPECIFICATION
	 NYHA III :  STAGE
	 III aste :  STAGE
---------

I kahtlus.
	===>
	 kahtlus :  SPECIFICATION
	 I  :  STAGE
---------

Fikseerunud
	===>
	 Fikseerunud :  SPECIFICATION
---------

Insuff. cordis NYHA III.
	===>
	 NYHA III :  STAGE
---------

. Fibrillatio atriorum persistens  sanata ( EKV )
	===>
	 Fibrillat :  FIBRILLATION
	 persistens :  SPECIFICATION
---------

Funktsionaalne klass IV
	===>
	  IV :  STAGE
---------

Paroksüsmalis
	===>
	 Paroksüsm :  SPECIFICATION
---------

(dekompensatsioon --> NYHA III staadium)
	===>
	 dekompens :  SPECIFICATION
	 III staadium :  STAGE
---------

Äge seinaläbine [e transmuraalne] müokardi allseina infarkt, esmane, 24.02.2010, STEMI, trombolüüs
	===>
	 seinaläbine :  SPECIFICATION
	 STEMI :  STEMI
	 trombolü

In [13]:
# List the not-tagged entries ordered from the highest count to the lowest.
not_tagged_rows.most_common()

[(('I11.9  Südamekahjustusega hüpertooniatõbi.', 'I00-I99'), 21),
 (('Anamneesis', 'I00-I99'), 20),
 (('VES', 'I00-I99'), 18),
 (('( mõõdukas )', 'I00-I99'), 15),
 (('-> sanata', 'I00-I99'), 14),
 (('C2a', 'I00-I99'), 13),
 (('RBBB', 'I00-I99'), 12),
 (('(-)', 'I00-I99'), 11),
 (('Insuff c/v chr', 'I00-I99'), 10),
 (('Varem läbipõetud', 'I00-I99'), 10),
 (('Mikroalbum +', 'I00-I99'), 10),
 (('-mõõdukas', 'I00-I99'), 10),
 (('(sanata)', 'I00-I99'), 9),
 (('Ventrikulaarne ekstrasüstoolia', 'I00-I99'), 9),
 (('Sves', 'I00-I99'), 9),
 (('(fixata)', 'I00-I99'), 9),
 (('( FA persist)', 'I00-I99'), 8),
 (('SVES', 'I00-I99'), 8),
 (('(min NLV lamenemine)', 'I00-I99'), 8),
 (('(marevaniseeritud, cordaron ravil)', 'I00-I99'), 7),
 (('VES+SVES', 'I00-I99'), 7),
 (('( mõõdukas)', 'I00-I99'), 7),
 (('( kerge)', 'I00-I99'), 7),
 (('Ventrikulaarsed', 'I00-I99'), 6),
 (('Risk -#', 'I00-I99'), 6),
 (('C2s', 'I00-I99'), 6),
 (('(korduvad)', 'I00-I99'), 6),
 (('( vasakul )', 'I00-I99'), 6),
 (('--pingutu

In [14]:
# Count how often each word form occurs across all not-tagged texts.
freqs = Counter()
for untagged in tqdm(not_tagged_texts):
    # tag_layer returns the text object with the 'words' layer available.
    with_words = untagged.tag_layer(['words'])
    freqs.update(word.text for word in with_words.words)

100%|██████████| 936/936 [00:01<00:00, 651.43it/s]


In [17]:
# Export the not-tagged texts with their counts, most frequent first.
# Each CSV row is: text (element 0 of the key), count.
# newline="" is what the csv module requires of files it writes to (otherwise
# extra blank lines appear on platforms with \r\n line endings); the explicit
# UTF-8 encoding keeps non-ASCII characters intact regardless of locale.
with open("not_tagged_rows.csv", "w", newline="", encoding="utf-8") as fout:
    writer = csv.writer(fout)
    for key, count in not_tagged_rows.most_common():
        writer.writerow([key[0], count])

In [18]:
# Export word frequencies of the not-tagged texts, most frequent first.
# Each CSV row is: word, count.
# newline="" is what the csv module requires of files it writes to (otherwise
# extra blank lines appear on platforms with \r\n line endings); the explicit
# UTF-8 encoding keeps non-ASCII characters intact regardless of locale.
with open("not_tagged_word_freqs.csv", "w", newline="", encoding="utf-8") as fout:
    writer = csv.writer(fout)
    for word, count in freqs.most_common():
        writer.writerow([word, count])