In [30]:
import pandas as pd
import numpy as np
from nltk import word_tokenize
from nltk.util import ngrams
import re

### Load dataframe:

In [31]:
with open('../data_per_simpCode_00_clean_anamnesa.txt', 'rb') as f:
    lines = f.read().decode("utf-8", "backslashreplace").splitlines()
# with open('../data_per_simpCode_00_clean_status.txt', 'r') as f:
#     lines = f.read().splitlines()
with open('../data_per_simpCode_00_clean_status.txt', 'r') as f:
    lines = lines + f.read().splitlines()
df = pd.DataFrame(lines, dtype='string')

### Clean with regular expressions:

In [32]:
def replace_numbers(sentence):
    return re.sub('\d+', 'NUM', sentence)

def replace_dates(sentence):
    return re.sub('NUM.NUM.NUM', 'DATE', sentence)

def replace_float_numbers(sentence):
    return re.sub('NUM.NUM', 'NUM', sentence)

def remove_multiple_spaces(sentence):
    return re.sub('\s+', ' ', sentence)

def remove_multiple_tabs(sentence):
    return re.sub('\t+', ' ', sentence)

def add_space_before_numbers(sentence):
    return re.sub(r"(\w+)NUM", r"\1 NUM", sentence)

def add_space_after_numbers(sentence):
    return re.sub(r"NUM(\w+)", r"NUM \1", sentence)

def remove_question_marks(sentence):
    return re.sub('\?', ' ', sentence)

def remove_exclamation_marks(sentence):
    return re.sub('\!', ' ', sentence)

def remove_semicolons(sentence):
    return re.sub(';', ' ', sentence)

def remove_colons(sentence):
    return re.sub(':', ' ', sentence)

def remove_commas(sentence):
    return re.sub('\,', ' ', sentence)

def remove_slashes(sentence):
    return re.sub('\/+', ' ', sentence)

def remove_backslashes(sentence):
    return re.sub('\\\\+', ' ', sentence)

def remove_plus_signs(sentence):
    return re.sub('\+', ' ', sentence)

def remove_equal_signs_not_surrounded_by_words(sentence):
    return re.sub(r"([^\wа-яА-Я]*)\=([\wа-яА-Я])", r"\1 \2", sentence)

def remove_underscores(sentence):
    return re.sub('\_', ' ', sentence)

def replace_invalid_dash(sentence):
    return re.sub(r'[\–\—]', '-', sentence)

def replace_multiple_dashes(sentence):
    return re.sub('-{2,}', ' - ', sentence)

def replace_multiple_dots(sentence):
    return re.sub('\.{2,}', '.', sentence)

def remove_dashes_followed_by_space(sentence):
    return re.sub(r"(.)-\s", r"\1 ", sentence)

def remove_dashes_following_a_space(sentence):
    return re.sub(r"\s-(.)", r" \1", sentence)

def remove_negative_examination(sentence):
    return re.sub(r"(?:\(\-\)|\{\-\})", ' ', sentence)

def remove_starting_special_characters(sentence):
    return re.sub('^[\.\?\-,]+', '', sentence)

def remove_ending_special_characters(sentence):
    return re.sub('[\.\?\-,]+$', '', sentence)

def remove_dots(sentence):
    return re.sub('[\.]+', ' ', sentence)

def remove_quotation_marks(sentence):
    return re.sub('[\`\'\"\“\”\„]+', ' ', sentence)

def remove_dot_before_word(sentence):
    return re.sub(r"([^\wа-яА-Я])\.([\wа-яА-Я])", r"\1 \2", sentence)

def remove_dot_followed_by_dash(sentence):
    return re.sub(r"\.\-", '. ', sentence)

def space_out_dot_before_number(sentence):
    return re.sub(r"[\.][\-=]NUM", '. NUM', sentence)

def split_words_with_capital_after_lowercase(sentence):
    return re.sub(r"([а-яa-z])([А-ЯA-Z])", r"\1 \2", sentence)

def remove_comparison_operators(sentence):
    return re.sub(r"(?:\&lt|\&gt|\>|\<)", ' ', sentence)

def remove_misc_special_characters(sentence):
    return re.sub(r"[\~§@\*\|\#\&\{\}\$]", ' ', sentence)


df['clean_sentence'] = df[0].apply(remove_semicolons)
df['clean_sentence'] = df['clean_sentence'].apply(remove_question_marks)
df['clean_sentence'] = df['clean_sentence'].apply(remove_exclamation_marks)
df['clean_sentence'] = df['clean_sentence'].apply(remove_colons)
df['clean_sentence'] = df['clean_sentence'].apply(remove_commas)
df['clean_sentence'] = df['clean_sentence'].apply(remove_slashes)
df['clean_sentence'] = df['clean_sentence'].apply(remove_backslashes)
df['clean_sentence'] = df['clean_sentence'].apply(remove_plus_signs)
df['clean_sentence'] = df['clean_sentence'].apply(remove_equal_signs_not_surrounded_by_words)
df['clean_sentence'] = df['clean_sentence'].apply(remove_underscores)
df['clean_sentence'] = df['clean_sentence'].apply(replace_invalid_dash)
df['clean_sentence'] = df['clean_sentence'].apply(replace_multiple_dashes)
df['clean_sentence'] = df['clean_sentence'].apply(replace_multiple_dots)
df['clean_sentence'] = df['clean_sentence'].apply(remove_dashes_followed_by_space)
df['clean_sentence'] = df['clean_sentence'].apply(remove_dashes_following_a_space)
df['clean_sentence'] = df['clean_sentence'].apply(remove_negative_examination)
df['clean_sentence'] = df['clean_sentence'].apply(remove_starting_special_characters)
df['clean_sentence'] = df['clean_sentence'].apply(remove_ending_special_characters)
df['clean_sentence'] = df['clean_sentence'].apply(replace_numbers)
df['clean_sentence'] = df['clean_sentence'].apply(replace_dates)
df['clean_sentence'] = df['clean_sentence'].apply(replace_float_numbers)
df['clean_sentence'] = df['clean_sentence'].apply(add_space_before_numbers)
df['clean_sentence'] = df['clean_sentence'].apply(add_space_after_numbers)
df['clean_sentence'] = df['clean_sentence'].apply(space_out_dot_before_number)
df['clean_sentence'] = df['clean_sentence'].apply(remove_dot_before_word)
df['clean_sentence'] = df['clean_sentence'].apply(remove_dot_followed_by_dash)
df['clean_sentence'] = df['clean_sentence'].apply(split_words_with_capital_after_lowercase)
df['clean_sentence'] = df['clean_sentence'].apply(remove_comparison_operators)
df['clean_sentence'] = df['clean_sentence'].apply(remove_misc_special_characters)
# df['clean_sentence'] = df['clean_sentence'].apply(remove_dots) # commetnt this line
df['clean_sentence'] = df['clean_sentence'].apply(remove_quotation_marks)
df['clean_sentence'] = df['clean_sentence'].apply(remove_multiple_tabs)
df['clean_sentence'] = df['clean_sentence'].apply(remove_multiple_spaces)

### Special cases cleaning:

In [33]:
def remove_dash_after_special_words(sentence):
    special_words = [
        'ан', 'апарат', 'бми', 'вени', 'възли', 'глава', 'гърло', 'далак', 'дейност', 'дроб', 'език',
        'жлеза', 'жлези', 'зрение', 'кожа', 'корем', 'крайници', 'нос', 'ода', 'одс', 
        'простата', 'пулмо', 'пулс', 'рефлекси', 'с-ма', 'слезка', 'сливици', 
        'слух', 'статус', 'сърце', 'състояние',
        'тегло', 'температура', 'тонзили', 'шия', 'чмн', 'bmi', 'rr']
    for word in special_words:
        sentence = re.sub(f"({word})-", r"\1 ", sentence, flags=re.IGNORECASE)
    return sentence

def replace_special_expressions(sentence):
    special_expressions = [
        ['V O S', 'VOS'],
        ['V O D', 'VOD'],
        [r"([^\wа-яА-Я])(?:рр|РР|Рр|rr|PP)", r"\1 RR"],
        [r"(?:рр|РР|Рр|rr|PP)([^\wа-яА-Я])", r"RR \1"],
        [r"([^\wа-яА-Я]*)(?:кор|Кор|КОР|kor|Кор)([^\wа-яА-Я])", r"\1 Cor "],
        [r"(?:б\.о\.|[^\wа-яА-Я]бо\.|\sБО\s|\sБо\s|\sбо\s|\sб\sо\s|\sБ\.*\sО\s|\sБ\.*О\.*\s)", ' б.о. '],
        [r"(?:РСД|р\.с\.д\.|рсд|PCД)", ' РСД '],
        ['ССС', ' ССС '],
        ['\.NUM', '. NUM']
    ]
    for find, replace in special_expressions:
        sentence = re.sub(find, replace, sentence)
    return sentence

df['clean_sentence'] = df['clean_sentence'].apply(remove_dash_after_special_words)
df['clean_sentence'] = df['clean_sentence'].apply(replace_special_expressions)

df['clean_sentence'] = df['clean_sentence'].apply(remove_dashes_followed_by_space)
df['clean_sentence'] = df['clean_sentence'].apply(remove_dashes_following_a_space)
df['clean_sentence'] = df['clean_sentence'].apply(remove_dot_before_word)
df['clean_sentence'] = df['clean_sentence'].apply(remove_multiple_spaces)

### Tokenize:

In [34]:
# save cleaned data so we know what tokenizer works on
df['clean_sentence'].to_csv('sentence_cleaned_both.csv')

discarded_tokens = ['.', 'NUM', 'DATE']

unigrams = {}
bigrams = {}
for sentence in df['clean_sentence']:
    tokens = word_tokenize(sentence)
    tokens = list(filter(lambda token: token not in discarded_tokens, tokens))

    #get unigrams
    for token in tokens:
        if token not in unigrams:
            unigrams[token] = 0
        unigrams[token] += 1

    #get bigrams
    bigram_list = list(ngrams(tokens, 2))
    for bigram in bigram_list:
        if bigram not in bigrams:
            bigrams[bigram] = 0
        bigrams[bigram] += 1

unigrams = sorted(unigrams.items(), key=lambda x: x[1], reverse=True)
bigrams = sorted(bigrams.items(), key=lambda x: x[1], reverse=True)
df_bigrams = pd.DataFrame([(i[0], i[1], j) for i, j in bigrams], columns =['First', 'Second', 'Count'])
df_bigrams.to_csv('bigrams.csv')
print('====Unigrams length====')
print(len(unigrams))
print('====First 10 unigrams====')
print(unigrams[0:10])
print('====Bigrams length====')
print(len(bigrams))
print('====First 10 bigrams====')
print(bigrams[0:10])

====Unigrams length====
43291
====First 10 unigrams====
[('и', 31679), ('на', 27252), ('без', 27204), ('б.о', 22399), ('RR', 22049), ('Корем', 17239), ('Cor', 16872), ('в', 16477), ('дишане', 14264), ('мек', 12803)]
====Bigrams length====
156892
====First 10 bigrams====
[(('Корем', 'мек'), 9189), (('без', 'отоци'), 7400), (('везикуларно', 'дишане'), 7013), (('Cor', 'РСД'), 6939), (('сърдечна', 'дейност'), 6613), (('ясни', 'тонове'), 6034), (('не', 'се'), 5940), (('RR', 'Корем'), 4462), (('се', 'палпират'), 4303), (('дишане', 'без'), 4297)]


### Collocations extraction:

In [35]:
def get_collocations(bigrams, freq_threshold=1):
    collocations = []

    bigram_map_first_word = {}
    bigram_map_second_word = {}
    for bigram, _ in bigrams:
        if bigram[0] not in bigram_map_first_word:
            bigram_map_first_word[bigram[0]] = 0
        bigram_map_first_word[bigram[0]] += 1
        if bigram[1] not in bigram_map_second_word:
            bigram_map_second_word[bigram[1]] = 0
        bigram_map_second_word[bigram[1]] += 1

    for bigram, freq in bigrams:
        if freq < freq_threshold:
            continue
        if bigram_map_first_word[bigram[0]] == 1 or bigram_map_second_word[bigram[1]] == 1:
            collocations.append(bigram)

    return collocations

collocations = get_collocations(bigrams, 5)
df_collocations = pd.DataFrame([(i, j) for i, j in collocations], columns=['First', 'Second'])
df_collocations.to_csv('collocations.csv')
print('====Collocations length====')
print(len(collocations))
print('====First 100 collocations====')
print(collocations[0:100])

====Collocations length====
2653
====First 100 collocations====
[('ПИКОЧО-ПОЛОВО', 'С-МА'), ('Артериално', 'налягане'), ('(', 'Score'), ('Score', ')'), ('Съдечен', 'статус'), ('Корем', 'меко-еластичен'), ('везиколарно', 'дишане'), ('сърд', 'гран'), ('Явява', 'се'), ('Ч.ДР', 'И'), ('на', 'артериите'), ('на', 'гръд.кош'), ('смущ', 'в'), ('без', 'доб'), ('доб', 'хрипове'), ('Авто', 'и'), ('С-Фр', 'у'), ('рефлекси', 'Отпадна'), ('условията', 'на'), ('РСД', 'яс'), ('яс', 'тон'), ('сърд', 'гран.в'), ('гран.в', 'норма'), ('се', 'постигне'), ('НА', 'ВЪРХА'), ('Цветно', 'зрение'), ('на', 'Аорта'), ('на', 'извънболничната'), ('извънболничната', 'помощ'), ('тъй', 'като'), ('Лечебната', 'цел'), ('Височина', 'см'), ('Об.талия', 'см'), ('Нв', 'Корем'), ('мек', 'респиратнорно'), ('респиратнорно', 'подвижен'), ('да', 'бъда'), ('да', 'осигури'), ('да', 'заплати'), ('заплати', 'при'), ('при', 'назначаването'), ('назначаването', 'им'), ('за', 'ХСМ.Мек'), ('ХСМ.Мек', 'Физ'), ('без', 'хроп'), ('хроп', 'нах

### Export unigrams to table

In [36]:
export_df = pd.DataFrame(unigrams, columns=['Word', 'Count'])

export_df.to_excel('unigrams.xlsx')
export_df.to_csv('unigrams.csv')

### Most common mistakes

In [37]:
from nltk.metrics.distance import jaccard_distance

top_unigrams_count = round(len(unigrams) * 0.03)
top_unigrams = unigrams[0:top_unigrams_count]

top_mistakes = []

for unigram, _ in top_unigrams:
    if len(unigram) < 3: continue
    mistakes = { unigram: [] }
    for other_unigram, _ in unigrams:
        if unigram.lower() == other_unigram.lower(): continue
        distance = jaccard_distance(set(unigram), set(other_unigram))
        if distance < 0.15:
            mistakes[unigram].append(other_unigram)
    top_mistakes.append(mistakes)

print('====First 50 unigram mistakes====')
print(top_mistakes[0:50])

====First 50 unigram mistakes====
[{'без': ['безз', 'безе', 'ебез']}, {'б.о': ['б.о.', 'б.оо', 'бб.о']}, {'Корем': ['Корме']}, {'Cor': []}, {'дишане': ['вдишване', 'дишанес', 'дишанел', 'издишане', 'дишаен', 'диишане', 'диашне', 'дишване', 'дишишане', 'дишанне', 'диша-не', 'деишане', 'дишанеи', 'дишавне', 'дишание', 'иьдишане', 'дишане.', 'дишанеRR', 'дишанев', 'дишанееедри', 'дишанеез', 'дзишане', 'идишване', 'дишарне', 'дишанене', 'дишаяне', 'издишнане']}, {'мек': []}, {'РСД': ['РДС', 'СРД', 'ДРС']}, {'тонове': ['тоновете', 'тоонве', 'тононве', 'товнове', 'тоонове', 'тонве', 'ттонове', 'тнове', 'тоновет', 'тоннове', 'тоновве', 'тонвое', 'тонтове', 'тооннове']}, {'отоци': ['оттоци', 'отци', 'тоци', 'отоиц', 'оитоци', 'отооци', 'ототци']}, {'дейност': ['дейнос', 'с.дейност', 'деност', 'дейности', 'дейост', 'дейност.', 'дейнот', 'дейносст', 'дейноскт', 'недост', 'десното', 'деййност', 'дейнст', 'дейножст', 'деайност', 'дейниост', 'дейностс', 'десност', 'дейнсот', 'дейностq', 'дейностг',