In [14]:
import pandas as pd
import numpy as np
from nltk import word_tokenize
from nltk.util import ngrams
import re

### Load dataframe:

In [15]:
# Read the file as raw bytes so that byte sequences that are not valid UTF-8
# are kept as backslash escapes instead of raising during decoding.
with open('../data_per_simpCode_00_clean_anamnesa.txt', 'rb') as anamnesis_file:
    raw_bytes = anamnesis_file.read()

# One row per line of the input file, stored in column 0 with pandas' string dtype.
lines = raw_bytes.decode("utf-8", "backslashreplace").splitlines()
df = pd.DataFrame(lines, dtype='string')

### Clean with regular expressions:

In [16]:
def replace_numbers(sentence):
    """Replace every run of digits in `sentence` with the placeholder 'NUM'."""
    # Raw string: '\d' in a normal string literal is an invalid escape sequence.
    return re.sub(r'\d+', 'NUM', sentence)

def replace_dates(sentence):
    """Collapse three 'NUM' placeholders joined by single characters into 'DATE'.

    The separators are the regex wildcard `.`, so any single character
    (dot, space, dash, letter, ...) between the placeholders matches.
    """
    date_pattern = re.compile('NUM.NUM.NUM')
    return date_pattern.sub('DATE', sentence)

def replace_float_numbers(sentence):
    """Merge two 'NUM' placeholders separated by one character into a single 'NUM'.

    The separator is the regex wildcard `.`, so it is not limited to a
    decimal point: 'NUM NUM' is merged as well.
    """
    float_pattern = re.compile('NUM.NUM')
    return float_pattern.sub('NUM', sentence)

def remove_multiple_spaces(sentence):
    """Replace every run of whitespace characters with a single space."""
    # Raw string: '\s' in a normal string literal is an invalid escape sequence.
    return re.sub(r'\s+', ' ', sentence)

def remove_multiple_tabs(sentence):
    """Replace every run of tab characters with a single space."""
    tab_run = re.compile('\t+')
    return tab_run.sub(' ', sentence)

def space_out_semicolon(sentence):
    """Surround every semicolon with a space on each side."""
    # The pattern is a plain literal, so a string replace is equivalent.
    return sentence.replace(';', ' ; ')

def add_space_before_numbers(sentence):
    """Insert a space before every 'NUM' that directly follows a word character.

    A lookbehind is used instead of a greedy `(\\w+)` group: the greedy group
    swallowed earlier 'NUM' placeholders in the same word run, so only the
    last one got separated (e.g. 'xNUMyNUM' became 'xNUMy NUM').
    """
    return re.sub(r"(?<=\w)NUM", " NUM", sentence)

def add_space_after_numbers(sentence):
    """Insert a space after every 'NUM' that is directly followed by a word character.

    A lookahead is used instead of a greedy `(\\w+)` group: the greedy group
    consumed later 'NUM' placeholders in the same word run, so only the first
    one got separated (e.g. 'NUMaNUMb' became 'NUM aNUMb').
    """
    return re.sub(r"NUM(?=\w)", "NUM ", sentence)

def remove_semicolons(sentence):
    """Replace each semicolon with a space."""
    # The pattern is a plain literal, so a string replace is equivalent.
    return sentence.replace(';', ' ')

def remove_colons(sentence):
    """Replace each colon with a space."""
    # The pattern is a plain literal, so a string replace is equivalent.
    return sentence.replace(':', ' ')

def remove_commas(sentence):
    """Replace each comma with a space."""
    # The original pattern '\,' was an invalid escape sequence in a normal
    # string literal; a comma needs no escaping in a regular expression.
    return re.sub(r',', ' ', sentence)

def remove_slashes(sentence):
    """Replace every run of forward slashes with a single space."""
    # The original pattern '\/+' was an invalid escape sequence in a normal
    # string literal; a slash needs no escaping in a regular expression.
    return re.sub(r'/+', ' ', sentence)

def remove_backslashes(sentence):
    """Replace every run of backslashes with a single space."""
    backslash_run = re.compile(r'\\+')
    return backslash_run.sub(' ', sentence)

def remove_plus_signs(sentence):
    """Replace each plus sign with a space."""
    # Raw string: '\+' in a normal string literal is an invalid escape sequence.
    return re.sub(r'\+', ' ', sentence)

def remove_underscores(sentence):
    """Replace each underscore with a space."""
    # The original pattern '\_' was an invalid escape sequence in a normal
    # string literal; an underscore needs no escaping in a regular expression.
    return re.sub(r'_', ' ', sentence)

def replace_invalid_dash(sentence):
    """Replace each '–' character with an ASCII hyphen-minus '-'."""
    # The pattern is a plain literal, so a string replace is equivalent.
    return sentence.replace('–', '-')

def replace_multiple_dashes(sentence):
    """Replace every run of two or more hyphens with ' - '."""
    dash_run = re.compile('-{2,}')
    return dash_run.sub(' - ', sentence)

def replace_multiple_dots(sentence):
    """Collapse every run of two or more dots into a single dot."""
    # Raw string: '\.' in a normal string literal is an invalid escape sequence.
    return re.sub(r'\.{2,}', '.', sentence)

def remove_dashes_followed_by_space(sentence):
    """Drop a hyphen that sits between some character and a whitespace character.

    The preceding character is kept and the '-' plus whitespace pair becomes
    one space. A hyphen at the very start of the string is left alone because
    the pattern requires a preceding character.
    """
    trailing_dash = re.compile(r"(.)-\s")
    return trailing_dash.sub(r"\1 ", sentence)

def remove_dashes_following_a_space(sentence):
    """Drop a hyphen that sits between a whitespace character and some character.

    The whitespace plus '-' pair becomes one space and the following character
    is kept. A hyphen at the very end of the string is left alone because the
    pattern requires a following character.
    """
    leading_dash = re.compile(r"\s-(.)")
    return leading_dash.sub(r" \1", sentence)

def remove_negative_examination(sentence):
    """Replace every literal '(-)' marker with a space."""
    # Raw string: '\(' and '\)' in a normal string literal are invalid escape sequences.
    return re.sub(r'\(-\)', ' ', sentence)

def remove_starting_special_characters(sentence):
    """Strip a leading run of '.', '?', '-' and ',' characters from `sentence`."""
    # Raw string: '\.' and '\?' in a normal string literal are invalid escape
    # sequences. Only '-' needs escaping inside the character class.
    return re.sub(r'^[.?\-,]+', '', sentence)

def remove_ending_special_characters(sentence):
    """Strip a trailing run of '.', '?', '-' and ',' characters from `sentence`."""
    # Raw string: '\.' and '\?' in a normal string literal are invalid escape
    # sequences. Only '-' needs escaping inside the character class.
    return re.sub(r'[.?\-,]+$', '', sentence)

def remove_dots(sentence):
    """Replace every run of dots with a single space."""
    # Raw string: '\.' in a normal string literal is an invalid escape sequence.
    return re.sub(r'\.+', ' ', sentence)

def remove_quotation_marks(sentence):
    """Replace every run of backticks, single quotes and double quotes with a space."""
    # The original literal contained '\`', an invalid escape sequence. None of
    # these characters need escaping inside a regex character class; only the
    # single quote is escaped, for the Python string literal itself.
    return re.sub('[`\'"]+', ' ', sentence)

def remove_dot_before_word(sentence):
    """Replace a dot with a space when it sits between a non-word and a word character.

    Both neighbouring characters are kept. A dot at the very start of the
    string is untouched because the pattern requires a preceding character.
    """
    dot_before_word = re.compile(r"([^\wа-яА-Я])\.([\wа-яА-Я])")
    return dot_before_word.sub(r"\1 \2", sentence)

def remove_dot_followed_by_dash(sentence):
    """Turn every '.-' sequence into '. ' (the dot stays, the hyphen becomes a space)."""
    dot_dash = re.compile(r"\.\-")
    return dot_dash.sub('. ', sentence)

def space_out_dot_before_number(sentence):
    """Rewrite '.-NUM' and '.=NUM' as '. NUM'."""
    dot_sign_number = re.compile(r"[\.][\-=]NUM")
    return dot_sign_number.sub('. NUM', sentence)

def split_words_with_capital_after_lowercase(sentence):
    """Insert a space between a lowercase letter and an uppercase letter that follows it.

    Only the ranges а-я / a-z (lowercase) and А-Я / A-Z (uppercase) are recognised.
    """
    lower_then_upper = re.compile(r"([а-яa-z])([А-ЯA-Z])")
    return lower_then_upper.sub(r"\1 \2", sentence)


# Ordered cleaning pipeline. Order matters: punctuation is normalised first,
# then numbers become NUM / DATE placeholders, then the placeholders are
# spaced out, and whitespace is collapsed last.
# `remove_dots` and `space_out_semicolon` are currently not part of the pipeline.
cleaning_steps = [
    remove_semicolons,
    remove_colons,
    remove_commas,
    remove_slashes,
    remove_backslashes,
    remove_plus_signs,
    remove_underscores,
    replace_invalid_dash,
    replace_multiple_dashes,
    replace_multiple_dots,
    remove_dashes_followed_by_space,
    remove_dashes_following_a_space,
    remove_negative_examination,
    remove_starting_special_characters,
    remove_ending_special_characters,
    replace_numbers,
    replace_dates,
    replace_float_numbers,
    add_space_before_numbers,
    add_space_after_numbers,
    space_out_dot_before_number,
    remove_dot_before_word,
    remove_dot_followed_by_dash,
    split_words_with_capital_after_lowercase,
    remove_quotation_marks,
    remove_multiple_tabs,
    remove_multiple_spaces,
]

# Start from the raw text in column 0 and apply each step to the running result.
df['clean_sentence'] = df[0]
for cleaning_step in cleaning_steps:
    df['clean_sentence'] = df['clean_sentence'].apply(cleaning_step)

### Special cases cleaning:

In [17]:
def remove_dash_after_special_words(sentence):
    """Replace the hyphen directly after any of the listed words with a space.

    Matching is case-insensitive and has no word-boundary check, so a listed
    word is also matched when it is the tail of a longer word. The words are
    substituted one after another, in list order.
    """
    special_words = (
        'апарат', 'вени', 'възли', 'глава', 'гърло', 'далак', 'дейност', 'дроб', 'език',
        'жлеза', 'жлези', 'зрение', 'кожа', 'корем', 'крайници', 'нос', 'ода', 'одс',
        'простата', 'пулмо', 'пулс', 'рефлекси', 'с-ма', 'слезка', 'сливици',
        'слух', 'статус', 'сърце', 'състояние',
        'тегло', 'температура', 'тонзили', 'шия', 'чмн', 'rr',
    )
    result = sentence
    for word in special_words:
        word_with_dash = re.compile(f"({word})-", flags=re.IGNORECASE)
        result = word_with_dash.sub(r"\1 ", result)
    return result

def replace_special_expressions(sentence):
    """Normalise a fixed table of abbreviations by applying regex rewrites in order.

    Each entry is a (pattern, replacement) pair passed to `re.sub`; the rules
    run sequentially, so a later rule sees the output of the earlier ones.
    """
    # NOTE(review): in the 'Cor' rule the leading group uses `*`, so it can
    # match zero characters, and the trailing captured character is not put
    # back by the replacement. Confirm that both are intended.
    rewrite_rules = (
        ('V O S', 'VOS'),
        ('V O D', 'VOD'),
        (r"([^\wа-яА-Я])(?:рр|РР|Рр|rr|PP)", r"\1 RR"),
        (r"(?:рр|РР|Рр|rr|PP)([^\wа-яА-Я])", r"RR \1"),
        (r"([^\wа-яА-Я]*)(?:кор|Кор|КОР|kor|Кор)([^\wа-яА-Я])", r"\1 Cor "),
        (r"(?:б\.о\.|\sБО\s|\sБо\s|\sбо\s|\sб\sо\s|\sБ\sО\s)", ' б.о. '),
        (r"(?:РСД|р\.с\.д\.|рсд|PCД)", ' РСД '),
        ('ССС', ' ССС '),
    )
    result = sentence
    for pattern, replacement in rewrite_rules:
        result = re.sub(pattern, replacement, result)
    return result

# Apply the special-case rewrites, then repeat a few generic clean-up steps
# on the result (dash/dot removal and whitespace collapsing).
special_case_steps = [
    remove_dash_after_special_words,
    replace_special_expressions,
    remove_dashes_followed_by_space,
    remove_dashes_following_a_space,
    remove_dot_before_word,
    remove_multiple_spaces,
]
for special_case_step in special_case_steps:
    df['clean_sentence'] = df['clean_sentence'].apply(special_case_step)

### Tokenize:

In [18]:
# Persist the cleaned sentences so the tokenizer input can be inspected later.
df['clean_sentence'].to_csv('sentence_cleaned_anamnesa.csv')

# Tokens that are dropped before counting.
discarded_tokens = ['.', 'NUM', 'DATE']

unigram_counts = {}
bigram_counts = {}
for sentence in df['clean_sentence']:
    tokens = [token for token in word_tokenize(sentence) if token not in discarded_tokens]

    # Unigram frequencies.
    for token in tokens:
        unigram_counts[token] = unigram_counts.get(token, 0) + 1

    # Bigram frequencies, built from the already filtered token list.
    for bigram in ngrams(tokens, 2):
        bigram_counts[bigram] = bigram_counts.get(bigram, 0) + 1

# Lists of (item, count) pairs, most frequent first.
unigrams = sorted(unigram_counts.items(), key=lambda item: item[1], reverse=True)
bigrams = sorted(bigram_counts.items(), key=lambda item: item[1], reverse=True)

print('====Unigrams length====')
print(len(unigrams))
print('====First 50 unigrams====')
print(unigrams[0:50])
print('====First 50 bigrams====')
print(bigrams[0:50])

====Unigrams length====
8722
====First 50 unigrams====
[('на', 3807), ('в', 2078), ('от', 2005), ('се', 1929), ('за', 1880), ('и', 1871), ('с', 1502), ('по', 832), ('е', 739), ('Оплаква', 604), ('лечение', 549), ('болки', 533), ('повод', 468), ('при', 435), ('терапия', 431), ('преглед', 403), ('да', 371), ('болка', 314), ('не', 309), ('оплаквания', 305), ('От', 298), ('С', 296), ('след', 269), ('изписване', 266), ('За', 241), ('главоболие', 229), ('години', 215), ('има', 206), ('След', 197), ('оплаква', 197), ('редовно', 196), ('без', 196), ('лекарства', 193), ('г.', 191), ('отпадналост', 185), ('карцином', 185), ('със', 183), ('химиотерапия', 179), ('Са', 177), ('кашлица', 176), ('контрол', 175), ('корема', 172), ('контролен', 167), ('област', 166), ('лесна', 163), ('хоспитализация', 162), ('На', 162), ('умора', 162), ('изследвания', 160), ('хипертония', 158)]
====First 50 bigrams====
[(('се', 'от'), 756), (('Оплаква', 'се'), 597), (('по', 'повод'), 415), (('болки', 'в'), 412), (('се'

### Collocations extraction:

In [19]:
def get_collocations(bigrams, freq_threshold=1):
    """Select bigrams whose first or second word occurs in only one bigram entry.

    Args:
        bigrams: iterable of (bigram, frequency) pairs; it is iterated twice.
        freq_threshold: pairs with a frequency below this value are skipped.

    Returns:
        List of qualifying bigrams, in the order they appear in `bigrams`.
    """
    # Number of entries of `bigrams` in which each word appears in first /
    # second position (entries are counted, frequencies are not summed).
    entries_as_first = {}
    entries_as_second = {}
    for pair, _ in bigrams:
        first_word, second_word = pair[0], pair[1]
        entries_as_first[first_word] = entries_as_first.get(first_word, 0) + 1
        entries_as_second[second_word] = entries_as_second.get(second_word, 0) + 1

    return [
        pair
        for pair, freq in bigrams
        if not freq < freq_threshold
        and (entries_as_first[pair[0]] == 1 or entries_as_second[pair[1]] == 1)
    ]

# Keep only bigrams seen at least 5 times.
collocations = get_collocations(bigrams, 5)

print('====Collocations length====', len(collocations), sep='\n')
print('====First 50 collocations====', collocations[0:50], sep='\n')

====Collocations length====
341
====First 50 collocations====
[('може', 'да'), ('Явява', 'се'), ('условията', 'на'), ('на', 'извънболничната'), ('извънболничната', 'помощ'), ('се', 'постигне'), ('тъй', 'като'), ('да', 'заплати'), ('заплати', 'при'), ('при', 'назначаването'), ('назначаването', 'им'), ('с', 'бланка'), ('Лечебната', 'цел'), ('да', 'бъда'), ('бъда', 'диспансеризиран'), ('да', 'осигури'), ('осигури', 'достатъчен'), ('достатъчен', 'РС'), ('РС', 'а'), ('Не', 'желая'), ('желая', 'да'), ('за', 'преписване'), ('преписване', 'на'), ('шум', 'в'), ('млечна', 'жлеза'), ('с', 'оглед'), ('взет', 'на'), ('е', 'съгласен'), ('съгласен', 'да'), ('на', 'Диспансерно'), ('начин', 'на'), ('провеждане', 'на'), ('назначената', 'терапия'), ('по', 'хода'), ('издаване', 'на'), ('Административно', 'посещение'), ('Води', 'се'), ('наличие', 'на'), ('Преглед', 'за'), ('на', 'Артериалното'), ('подд.мес', 'терапия'), ('рискови', 'фактори'), ('при', 'обичайни'), ('сухота', 'в'), ('направление', 'за'), ('

### Most common mistakes

In [20]:
from nltk.metrics.distance import jaccard_distance

# Tunable parameters of the misspelling search.
TOP_UNIGRAM_FRACTION = 0.03   # share of the most frequent unigrams to inspect
MIN_UNIGRAM_LENGTH = 3        # shorter unigrams are skipped
MAX_JACCARD_DISTANCE = 0.15   # strict upper bound for counting as a candidate

top_unigrams_count = round(len(unigrams) * TOP_UNIGRAM_FRACTION)
top_unigrams = unigrams[0:top_unigrams_count]

# Build each word's character set once instead of once per comparison.
unigram_char_sets = [(word, set(word)) for word, _ in unigrams]

top_mistakes = []

for unigram, _ in top_unigrams:
    if len(unigram) < MIN_UNIGRAM_LENGTH:
        continue
    unigram_chars = set(unigram)
    # The distance is computed on character *sets*, so letter order and
    # repeated letters are ignored (anagrams have distance 0).
    similar_unigrams = [
        other_unigram
        for other_unigram, other_chars in unigram_char_sets
        if other_unigram != unigram
        and jaccard_distance(unigram_chars, other_chars) < MAX_JACCARD_DISTANCE
    ]
    top_mistakes.append({unigram: similar_unigrams})

print('====First 50 unigram mistakes====')
print(top_mistakes[0:50])

====First 50 unigram mistakes====
[{'Оплаква': []}, {'лечение': ['лечени', 'лечиние', 'личен']}, {'болки': ['иболки', 'блоки', 'облки', 'боилки']}, {'повод': ['повоод']}, {'при': ['прии']}, {'терапия': ['терапията', 'артерия', 'приета', 'теряапия', 'терарпия', 'Терапията', 'тераипята', 'периартри', 'препарати', 'терапи', 'теряапи', 'неприятна', 'периартрит']}, {'преглед': ['прегледа', 'прегледи', 'преглд']}, {'болка': ['облак', 'болака', 'болак']}, {'оплаквания': ['оплакванията', 'оплакваниея', 'плаквания', 'оплаквния', 'аплаквания', 'оплакваяния', 'оплакавания', 'оплакваниья', 'оплаквани', 'оплакванишя']}, {'след': []}, {'изписване': ['изсписване', 'исписване', 'изпислване', 'из-писване', 'изсипване', 'изписане', 'заспиване', 'неспазване', 'Заизписване', 'Изписване']}, {'главоболие': ['главоболието', 'главобалие', 'главоблие', 'главололие']}, {'години': ['годинин']}, {'има': []}, {'След': []}, {'оплаква': ['оплакава', 'оплакваа', 'оплакваот']}, {'редовно': ['проведено', 'Проведено', '