In [4]:
%pip install pandas
%pip install spacy
%pip install nltk

from nltk.corpus import wordnet
import spacy
import os
import pandas as pd
import spacy
from collections import defaultdict
import time
import csv
import unicodedata

PATH = "word_indexes"
DEBUG = False

# Ensure the spaCy model is downloaded
if not spacy.util.is_package("es_core_news_lg"):
    os.system("python -m spacy download es_core_news_lg")

nlp = spacy.load("es_core_news_lg")

os.makedirs(PATH, exist_ok=True)

def d_print(*args, **kwargs):
    if not DEBUG:
        print(*args, **kwargs)

def normalize_key(key):
    return ''.join(c for c in unicodedata.normalize('NFD', key) if unicodedata.category(c) != 'Mn')

def normalize_token(token, lemma):
    if token.pos_ in ['ADJ', 'VERB'] and lemma.endswith('a'):
        lemma = lemma[:-1].lower() + 'o'
    return lemma

def replace_substrings(description):
    replacements = {
        '(s)': '',
        '(': '',
        ')': '',
        '[': '',
        ']': '',
        'de tipo': 'neom',
        'no especificado': 'neom',
        'no especificada': 'neom',
        'neoms': 'neom',
        'debido a': '',
        'debida a': '',
        'asociado a': '',
        'asociada a': '',
        'neom': ''
    }

    # Apply all replacements in one pass
    for old, new in replacements.items():
        description = description.replace(old, new)

    return description



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


# Sort by lemma, add codes as set

In [5]:
import multiprocessing as mp
from tqdm import tqdm


csv_file_path = '../csv_import_scripts/cie10-es-diagnoses.csv'

# Read CSV file using pandas
start_time = time.time()
df_pandas = pd.read_csv(csv_file_path)

# Initialize dictionary to store words and associated codes
word_index_pandas = defaultdict(set)

# Process each row in the dataframe using pandas
start_time = time.time()
processed_lemmas = set()

def process_row(row_data):
    """Process a single row"""
    code, description = row_data
    word_index = defaultdict(set)
    doc = nlp(description)
    for token in doc:
        if not token.is_stop and token.is_alpha:
            lemma = token.lemma_.lower()
            lemma = normalize_token(token, lemma)

            word_index[lemma].add(code)
            if token.text.lower() != lemma.lower():
                word_index[token.text.lower()].add(code)
    return word_index

def parallel_process_dataframe(df):
    """Process DataFrame in parallel"""
    row_data = list(zip(df['code'], df['description']))
    n_cores = mp.cpu_count()
    d_print(f"Using {n_cores} CPU cores")

    word_index = defaultdict(set)
    with mp.Pool(n_cores) as pool:
        results = list(tqdm(
            pool.imap(process_row, row_data),
            total=len(row_data),
            desc="Processing records"
        ))

    # Combine results
    for result in results:
        for key, value in result.items():
            word_index[key].update(value)

    return word_index

# Replace original loop with parallel version
word_index_by_lemmas = parallel_process_dataframe(df_pandas)

pandas_processing_time = time.time() - start_time

def save_word_index_to_csv(word_index):
    file_path = f'{PATH}/word_index_by_lemmas.csv'
    with open(file_path, mode='w', newline='') as file:
        writer = csv.writer(file)
        # Sort the word_index items by key
        for key in sorted(word_index.keys()):
            # Sort the values for each key
            sorted_values = sorted(word_index[key])
            # Remove accents from keys before writing
            writer.writerow([normalize_key(key)] + [';'.join(sorted_values)])

save_word_index_to_csv(word_index_by_lemmas)

# d_print the time taken for each operation
d_print(f"Pandas processing time: {pandas_processing_time:.4f} seconds")

Using 24 CPU cores


Processing records: 100%|██████████| 101246/101246 [00:23<00:00, 4250.03it/s]


Pandas processing time: 24.9791 seconds


In [6]:
import sys
import csv

def load_lemma_index_from_csv():
    file_path = f'{PATH}/word_index_by_lemmas.csv'
    word_index = defaultdict(set)
    csv.field_size_limit(sys.maxsize)
    with open(file_path, mode='r') as file:
        reader = csv.reader(file)
        for row in reader:
            key = row[0]
            values = set(row[1].split(';'))
            word_index[key].update(values)
    return word_index


# Function to query the dictionary
def query_string_by_lemmas(query):
    # Memoization: Check if the word_index_by_lemmas is already loaded, if not, load it
    word_index_by_lemmas = globals().get('word_index_by_lemmas')
    if word_index_by_lemmas is None:
        word_index_by_lemmas = load_lemma_index_from_csv()
        d_print("Loaded word index:", len(word_index_by_lemmas))

    query_doc = nlp(query)
    result = set()
    token_results = []
    for token in query_doc:
        if not token.is_stop and token.is_alpha:
            lemma = token.lemma_.lower()
            text = token.text.lower()

            if normalize_key(normalize_token(token, lemma)) in word_index_by_lemmas:
                token_results.append(word_index_by_lemmas[lemma])
                d_print(f"Word '{lemma}' <- {token} found in the index -> {word_index_by_lemmas[lemma]}")
            elif lemma in word_index_by_lemmas:
                token_results.append(word_index_by_lemmas[lemma])
                d_print(f"Word '{lemma}' <- {token} found in the index -> {word_index_by_lemmas[lemma]}")
            elif text in word_index_by_lemmas:
                token_results.append(word_index_by_lemmas[text])
                d_print(f"Word '{token.text}' found in the index -> {word_index_by_lemmas[token.text]}")
            else:
                d_print(f"Word '{lemma}' not found in the index")
                found_synonym = False

                # Collect synonyms in Spanish
                synonyms = set()
                for syn in wordnet.synsets(lemma, lang='spa'):
                    for l in syn.lemmas('spa'):
                        synonyms.add(l.name().lower())

                d_print(synonyms)

                for syn in synonyms:
                    if syn in word_index_by_lemmas:
                        token_results.append(word_index_by_lemmas[syn])
                        found_synonym = True
                        break
                if not found_synonym:
                    d_print(f"Word '{lemma}' not found in the index even after checking synonyms")

    # Calculate intersection of all results
    if token_results:
        result = set.intersection(*token_results)
    else:
        result = set()
    return sorted(result)


query = "fractura en cadera"

print("Common codes for the query:", query_string_by_lemmas(query))


Word 'fractura' <- fractura found in the index -> {'S52.231N', 'S99.222D', 'S52.336N', 'S72.351D', 'M84.36', 'S02.849K', 'S49.031G', 'S49.109G', 'M84.622S', 'S72.124A', 'S82.033J', 'S52.233J', 'S82.65XS', 'S82.102J', 'S52.271H', 'S72.462E', 'S82.09', 'S42.211P', 'S92.331A', 'S42.312S', 'S72.409J', 'S32.613A', 'S42.214G', 'S92.302B', 'S92.422B', 'S92.331D', 'S99.031K', 'S72.466N', 'S72.434K', 'S99.119D', 'S72.465F', 'S82.035E', 'S12.64XB', 'S89.111D', 'S99.111', 'S72.043M', 'S42.354P', 'S92.424S', 'S82.121C', 'S59.041D', 'S52.379K', 'S72.001G', 'S62.610B', 'S32.481A', 'S32.476', 'S82.451', 'S72.031R', 'S02.32XK', 'S92.201B', 'S02.80XB', 'S92.121K', 'S92.345B', 'S92.052G', 'S92.066D', 'S72.033D', 'S82.126K', 'S52.301F', 'S42.122D', 'S72.091P', 'S82.256G', 'S32.422S', 'S42.014K', 'M84.312P', 'S32.422B', 'S52.266', 'S72.21', 'S02.122B', 'S72.134E', 'S82.892A', 'M84.521', 'S82.492', 'S89.029', 'S52.20', 'S92.526A', 'S72.309E', 'S82.462B', 'S52.043Q', 'S82.134K', 'S72.471D', 'M80.041D', 'S82

# Sort by code, add lemmas as set

In [7]:

# save code as key and lemmas as sets
csv_file_path = '../csv_import_scripts/cie10-es-diagnoses.csv'

# Read CSV file using pandas
start_time = time.time()
df_pandas = pd.read_csv(csv_file_path)

# Initialize dictionary to store words and associated codes
word_index_by_codes = defaultdict(list)

# Process each row in the dataframe using pandas
import multiprocessing as mp
from tqdm import tqdm

def process_row(row_data):
    """Procesa una fila individual del DataFrame"""
    code, unprocessed_description = row_data
    description = replace_substrings(unprocessed_description.lower())
    #if unprocessed_description != description:
    #    d_print(f"***{unprocessed_description} -> {description}***")
    lemmas = []

    for subdescription in description.split("|"):
        subdescription = subdescription.strip()
        doc = nlp(subdescription)
        sublemmas = []
        for token in doc:
            # Mirar si alguna entidad es util
            #if not token.is_stop and not token.is_alpha and token.pos_ != "PUNCT" and token.pos_ != "SPACE" and token.pos_ != "PROPN":
                #d_print(f"**{code}-{token.lemma_.lower()}->{token.pos_}**")
            if not token.is_stop and token.is_alpha or token.pos_ == "PROPN" or token.pos_ == "NOUN":
                lemma = token.lemma_.lower()
                #d_print(lemma) if "arboviru" in subdescription else None
                lemma = normalize_token(token, lemma)
                #d_print(lemma) if "arboviru" in subdescription else None
                lemma = normalize_key(lemma)
                #d_print(lemma) if "arboviru" in subdescription else None
                sublemmas.append(lemma)

        lemmas.append(set(sorted(sublemmas)))
    return code, lemmas

def parallel_process_dataframe(df):
    """Procesa el DataFrame en paralelo"""
    # Preparar los datos para procesamiento paralelo
    row_data = list(zip(df['code'], df['description']))

    # Inicializar el diccionario compartido
    word_index_by_codes = defaultdict(list)

    # Configurar el número de procesos
    n_cores = mp.cpu_count()
    d_print(f"Utilizando {n_cores} núcleos CPU")

    # Procesar en paralelo con barra de progreso
    with mp.Pool(n_cores) as pool:
        results = list(tqdm(
            pool.imap(process_row, row_data),
            total=len(row_data),
            desc="Procesando registros"
        ))

    # Combinar resultados
    for code, lemmas in results:
        word_index_by_codes[code] = lemmas

    return word_index_by_codes

# Reemplazar el bucle original con la versión paralela
start_time = time.time()
word_index_by_codes = parallel_process_dataframe(df_pandas)
pandas_processing_time = time.time() - start_time

# d_print the time taken for each operation
d_print(f"Pandas processing time: {pandas_processing_time:.4f} seconds")

#cadero cadera
#gluteo glutea
#metatarsiano

Utilizando 24 núcleos CPU


Procesando registros: 100%|██████████| 101246/101246 [00:24<00:00, 4052.39it/s]

Pandas processing time: 25.6728 seconds





In [8]:
def save_word_index_to_csv(word_index):
    file_path = f'{PATH}/word_index_by_codes.csv'
    with open(file_path, mode='w', newline='') as file:
        writer = csv.writer(file)
        # Sort the word_index items by key
        for index, key in enumerate(word_index.keys()):
            unique_sorted_values = word_index[key]
            # Sort the values for each key
            row_value = []
            for values in unique_sorted_values:
                row_value.append([' '.join(list(values))])
            # Remove accents from keys before writing
            #d_print(f"{index}-{row_value}")
            # Join the values using '|' as separator, and flatten the nested list
            flattened_values = [item[0] for item in row_value]  # Extract strings from inner lists
            writer.writerow([normalize_key(key)] + [';'.join(flattened_values)])

save_word_index_to_csv(word_index_by_codes)

In [None]:
import sys
import csv

def load_codes_index_from_csv():
    file_path = f'{PATH}/word_index_by_codes.csv'
    word_index = defaultdict(list)
    csv.field_size_limit(sys.maxsize)
    with open(file_path, mode='r') as file:
        reader = csv.reader(file)
        for row in reader:
            code = row[0]
            values = row[1].split(';')
            list_values = []
            for value in values:
                list_values.append(value.split(' '))
            word_index[code] = list_values
    return word_index


# Function to query the dictionary
def query_string_for_codes(query):
    word_index_by_codes = globals().get('word_index_by_codes')
    if word_index_by_codes is None:
        word_index_by_codes = load_lemma_index_from_csv()
        d_print("Loaded word index:", len(word_index_by_codes))

    query_doc = nlp(query)
    token_results = []
    for token in query_doc:
        if not token.is_stop and token.is_alpha:
            lemma = token.lemma_.lower()
            text = token.text.lower()
            for code, list_of_lemmas in word_index_by_codes.items():
                for lemmas in list_of_lemmas:
                    if normalize_key(normalize_token(token, lemma)) in lemmas:
                        token_results.append(code)
                        #d_print(f"Word '{lemma}' <- {token} found in the index -> {code}")
                    elif lemma in lemmas:
                        token_results.append(code)
                        #d_print(f"Word '{lemma}' <- {token} found in the index -> {code}")
                    elif text in lemmas:
                        token_results.append(code)
                        #d_print(f"Word '{token.text}' found in the index -> {code}")
                    else:
                        # Collect synonyms in Spanish
                        synonyms = set()
                        for syn in wordnet.synsets(lemma, lang='spa'):
                            for l in syn.lemmas('spa'):
                                synonyms.add(l.name().lower())

                        #d_print(synonyms)

                        for syn in synonyms:
                            syn = normalize_key(normalize_token(token, lemma))
                            if syn in lemmas:
                                token_results.append(code)
                                d_print(f"Word '{lemma}' found as {syn} -> {code}")
                                break
    # Collect all unique codes from token results
    unique_codes = set(token_results)

    # Calculate match percentages for each code
    code_matches = {}
    for code in unique_codes:
        lemmas_for_code = word_index_by_codes[code]
        # Count how many words from the query match with lemmas in this code
        for lemmas in lemmas_for_code:
            matches = 0
            normalized_lemmas = []
            for token in query_doc:
                # Skip stop words and non-alphabetic tokens
                if not token.is_stop and token.is_alpha:
                    # Normalize the token's lemma
                    lemma = token.lemma_.lower()
                    text = token.text.lower()
                    normalized = normalize_key(normalize_token(token, lemma))
                    normalized_lemmas.append(normalized)

            # Create sets for each collection of lemmas
            query_lemma_set = set(normalized_lemmas) - {''}
            code_lemma_set = set(lemmas) - {''}

            # Calculate intersection
            for elem in query_lemma_set:
                for elem2 in code_lemma_set:
                    if elem in elem2 or elem2 in elem:
                        matches += 1

            if len(query_lemma_set) == 0 or len(code_lemma_set) == 0 or matches == 0:
                continue # Continue with the next item, as this one doesn't match

            percentage_on_query = matches / len(query_lemma_set) * 100
            percentage_on_code = matches / len(code_lemma_set) * 100
            #import pdb; pdb.set_trace()
            percentage = (percentage_on_query + percentage_on_code) / 2

            if percentage_on_query > 50 or percentage_on_code > 50:
                d_print(f"CODE: {code} - {percentage}% of match for QUERY:{query_lemma_set} and CODE:{code_lemma_set}")
            if code not in code_matches:
                code_matches[code] = percentage
            else:
                if code_matches[code] < percentage:
                    code_matches[code] = percentage

    # Sort codes by match percentage
    return sorted(code_matches.items(), key=lambda x: x[1], reverse=True)

query = "fractura de cadera"
#query = "osteoarticular"
#query = "infeccion por arbovirus"
#arbovirus
#metatarsiano
print("Common codes for the query:", query_string_for_codes(query)[0:10])


CODE: M84.459 - 83.33333333333333% of match for QUERY:{'cadera', 'fractura'} and CODE:{'patologico', 'cadera', 'fractura'}
CODE: M84.459A - 70.0% of match for QUERY:{'cadera', 'fractura'} and CODE:{'contacto', 'cadera', 'patologico', 'inicial', 'fractura'}
CODE: M97.01 - 62.5% of match for QUERY:{'cadera', 'fractura'} and CODE:{'cadera', 'periprotesico', 'protesis', 'interno', 'derecho', 'torno', 'fractura', 'articular'}
CODE: M97.0 - 64.28571428571428% of match for QUERY:{'cadera', 'fractura'} and CODE:{'cadera', 'periprotesico', 'protesis', 'interno', 'torno', 'fractura', 'articular'}
CODE: M84.359P - 64.28571428571428% of match for QUERY:{'cadera', 'fractura'} and CODE:{'contacto', 'cadera', 'union', 'estres', 'defectuoso', 'sucesivo', 'fractura'}
CODE: M84.559P - 61.111111111111114% of match for QUERY:{'cadera', 'fractura'} and CODE:{'enfermedad', 'contacto', 'cadera', 'union', 'patologico', 'defectuoso', 'sucesivo', 'fractura', 'neoplasico'}
CODE: M84.459K - 64.28571428571428% of 

# GET KEYWORDS on the text

In [10]:
test_sentence = """Describimos el caso de un varón de 37 años con vida previa activa que refiere dolores osteoarticulares de localización variable en el último mes y fiebre en la última semana con picos (matutino y vespertino) de 40 C las últimas 24-48 horas, por lo que acude al Servicio de Urgencias. Antes de comenzar el cuadro estuvo en Extremadura en una región endémica de brucella, ingiriendo leche de cabra sin pasteurizar y queso de dicho ganado. Entre los comensales aparecieron varios casos de brucelosis. Durante el ingreso para estudio del síndrome febril con antecedentes epidemiológicos de posible exposición a Brucella presenta un cuadro de orquiepididimitis derecha.
La exploración física revela: Tª 40,2 C; T.A: 109/68 mmHg; Fc: 105 lpm. Se encuentra consciente, orientado, sudoroso, eupneico, con buen estado de nutrición e hidratación. En cabeza y cuello no se palpan adenopatías, ni bocio ni ingurgitación de vena yugular, con pulsos carotídeos simétricos. Auscultación cardíaca rítmica, sin soplos, roces ni extratonos. Auscultación pulmonar con conservación del murmullo vesicular. Abdomen blando, depresible, sin masas ni megalias. En la exploración neurológica no se detectan signos meníngeos ni datos de focalidad. Extremidades sin varices ni edemas. Pulsos periféricos presentes y simétricos. En la exploración urológica se aprecia el teste derecho aumentado de tamaño, no adherido a piel, con zonas de fluctuación e intensamente doloroso a la palpación, con pérdida del límite epidídimo-testicular y transiluminación positiva.
Los datos analíticos muestran los siguentes resultados: Hemograma: Hb 13,7 g/dl; leucocitos 14.610/mm3 (neutrófilos 77%); plaquetas 206.000/ mm3. VSG: 40 mm 1ª hora. Coagulación: TQ 87%; TTPA 25,8 seg. Bioquímica: Glucosa 117 mg/dl; urea 29 mg/dl; creatinina 0,9 mg/dl; sodio 136 mEq/l; potasio 3,6 mEq/l; GOT 11 U/l; GPT 24 U/l; GGT 34 U/l; fosfatasa alcalina 136 U/l; calcio 8,3 mg/dl. Orina: sedimento normal.
Durante el ingreso se solicitan Hemocultivos: positivo para Brucella y Serologías específicas para Brucella: Rosa de Bengala +++; Test de Coombs > 1/1280; Brucellacapt > 1/5120. Las pruebas de imagen solicitadas ( Rx tórax, Ecografía abdominal, TAC craneal, Ecocardiograma transtorácico) no evidencian patología significativa, excepto la Ecografía testicular, que muestra engrosamiento de la bolsa escrotal con pequeña cantidad de líquido con septos y testículo aumentado de tamaño con pequeñas zonas hipoecoicas en su interior que pueden representar microabscesos.
Con el diagnóstico de orquiepididimitis secundaria a Brucella se instaura tratamiento sintomático (antitérmicos, antiinflamatorios, reposo y elevación testicular) así como tratamiento antibiótico específico: Doxiciclina 100 mg vía oral cada 12 horas (durante 6 semanas) y Estreptomicina 1 gramo intramuscular cada 24 horas (durante 3 semanas). El paciente mejora significativamente de su cuadro tras una semana de ingreso, decidiéndose el alta a su domicilio donde completó la pauta de tratamiento antibiótico. En revisiones sucesivas en consultas se constató la completa remisión del cuadro."""

word_index_by_lemmas = load_lemma_index_from_csv()

def get_keywords_from_text(sentence):
    # Process the sentence with spaCy
    doc = nlp(sentence)
    found_keys = set()

    # Process each token in the sentence
    for token in doc:
        if not token.is_stop and token.is_alpha:
            lemma = token.lemma_.lower()
            text = token.text.lower()

            # Check normalized versions
            normalized_lemma = normalize_key(normalize_token(token, lemma))

            # Check for matches in different forms
            if normalized_lemma in word_index_by_lemmas:
                found_keys.add(normalized_lemma)
            elif lemma in word_index_by_lemmas:
                found_keys.add(lemma)
            elif text in word_index_by_lemmas:
                found_keys.add(text)
            else:
                # Check synonyms
                for syn in wordnet.synsets(lemma, lang='spa'):
                    for l in syn.lemmas('spa'):
                        syn_word = normalize_key(l.name().lower())
                        #Fix me
                        if syn_word in word_index_by_lemmas:
                            found_keys.add(syn_word)

    # Return sorted list of found keys
    return sorted(found_keys)

# Example usage:
keywords = get_keywords_from_text(test_sentence)

# Group keywords by proximity in text
threshold = 3  # Words threshold for grouping
grouped_keywords = []

sentences = test_sentence.split('.')
for sentence in sentences:
    current_group = []
    last_index = -1
    if sentence.strip():  # Process only non-empty sentences
        doc = nlp(sentence.strip())
        for token in doc:
            # d_print debug information about the match
            lemma = normalize_key(normalize_token(token, token.lemma_.lower()))
            text = token.text.lower()
            matched_keyword = None

            if lemma in keywords:
                matched_keyword = lemma
            elif text in keywords:
                matched_keyword = text

            if matched_keyword:
                d_print(f"Match found: lemma='{lemma}', text='{text}', matched_keyword='{matched_keyword}' -> {word_index_by_lemmas[matched_keyword]}")
                if last_index == -1 or token.i - last_index <= threshold:
                    current_group.append((token.text, token.i))
                else:
                    if current_group:
                        grouped_keywords.append(current_group)
                    current_group = [(token.text, token.i)]
                last_index = token.i

        if current_group:
            grouped_keywords.append(current_group)

print("\nGrouped keywords by proximity:")
for group in grouped_keywords:
    group_words = [word for word, idx in group]
    group_indices = [idx for word, idx in group]
    print(f"Position {min(group_indices)}-{max(group_indices)}: {', '.join(group_words)}")
print("Found keywords:", keywords)

Match found: lemma='caso', text='caso', matched_keyword='caso' -> {'Z39.0', 'R69'}
Match found: lemma='varon', text='varón', matched_keyword='varon' -> {'F52.0', 'Y07.432'}
Match found: lemma='ano', text='años', matched_keyword='ano' -> {'K62.89', 'T18.5X', 'S30.867S', 'S31.831S', 'D48.5', 'S30.827', 'T21.35', 'K62.82', 'A56.3', 'R40.236', 'S30.3XXS', 'R40.233', 'T18.5', 'D22.5', 'S30.827A', 'A18.32', 'S31.839S', 'T18.5XXD', 'C21.8', 'A54.6', 'C43.51', 'S30.867D', 'S31.835A', 'A50.5', 'S30.3XXD', 'K62.81', 'S30.877', 'S31.839', 'T18.5XX', 'S30.817', 'T21.25', 'S30.817S', 'A50.0', 'S31.832', 'S30.857S', 'S30.877S', 'Z52.811', 'K62.4', 'Q43.5', 'C21', 'S30.857D', 'S30.867A', 'O09.6', 'R85.615', 'A50.6', 'R85.81', 'S31.832A', 'S30.827S', 'D03.51', 'T18.5XXS', 'K62', 'T21.75', 'T21.05', 'R85.61', 'D23.5', 'S31.839D', 'S31.831A', 'S30.827D', 'S30.98XD', 'S31.832D', 'K62.9', 'R40.225', 'R40.222', 'S31.839A', 'Z85.048', 'R85.614', 'R85.618', 'S31.834', 'C4A.51', 'S30.857A', 'S31.834D', 'A52.8

In [11]:
from concurrent.futures import ThreadPoolExecutor
from threading import Lock
import multiprocessing as mp

# Process each group of keywords
# Create a lock for synchronized d_printing and appending to the list
lock = Lock()
results = []
def process_group(group):
    # Join the words in the group
    group_words = [word for word, idx in group]
    query = ' '.join(group_words)

    # Get matches using existing function
    matches = query_string_for_codes(query, word_index_by_codes)

    # Format results string
    print_lines = []
    print_lines.append(f"Analyzing group: {query}")
    print_lines.append("Top 10 matching codes:")
    for code, percentage in matches[:10]:
        print_lines.append(f"Query: {query} - Code: {code} - Match: {percentage:.2f}% - {df_pandas[df_pandas['code'] == code]['description'].iloc[0]}")

    # d_print results in a thread-safe way
    with lock:
        results.append(matches)[:10]
        print('\n'.join(print_lines))

# Process groups in parallel using thread pool
n_cores = mp.cpu_count()
with ThreadPoolExecutor(max_workers=n_cores) as executor:
    executor.map(process_group, grouped_keywords)

print("Final results:")
print(results)

#n44.8-inflamacion testicular
#z20.818-Contacto y (sospecha de) exposición a otras enfermedades transmisibles bacterianas
#r60.9 edema no especificado'
#r52 dolor no especificado'
#a23.9 brucelosis no especificada'
#i83.90 Venas varicosas asintomáticas de extremidad inferior no especificada
#i87.8 Otros trastornos especificados de venas
#r50.9-fiebre
#n45.3-Orquiepididimitis
#m25.50-Dolor en articulación no especificada

Final results:
[]


# ML -> NER

In [12]:
sentence = "Describimos el caso de un varón de 37 años con vida previa activa que refiere dolores osteoarticulares de localización variable en el último mes y fiebre en la última semana con picos (matutino y vespertino) de 40 C las últimas 24-48 horas, por lo que acude al Servicio de Urgencias. Antes de comenzar el cuadro estuvo en Extremadura en una región endémica de brucella, ingiriendo leche de cabra sin pasteurizar y queso de dicho ganado. Entre los comensales aparecieron varios casos de brucelosis. Durante el ingreso para estudio del síndrome febril con antecedentes epidemiológicos de posible exposición a Brucella presenta un cuadro de orquiepididimitis derecha."

from transformers import RobertaForTokenClassification, AutoTokenizer, pipeline
import re

TOKENIZATION_REGEX = re.compile(
    r'([0-9A-Za-zÀ-ÖØ-öø-ÿ]+|[^0-9A-Za-zÀ-ÖØ-öø-ÿ])')

MODEL_PATH = "PlanTL-GOB-ES/bsc-bio-es"
model = RobertaForTokenClassification.from_pretrained(MODEL_PATH)

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

tokens = [t for t in TOKENIZATION_REGEX.split(sentence) if t and not t.isspace()]

pipe = pipeline("token-classification", model=MODEL_PATH, aggregation_strategy='first')

sentence_pretokenized = ' '.join(tokens)

def get_added_spaces(sentence, sentence_pretokenized):
    i = j = 0
    added_spaces = []
    while j < len(sentence_pretokenized):
        a = sentence[i]
        b = sentence_pretokenized[j]
        c = sentence_pretokenized[min(len(sentence_pretokenized)-1,j+1)]
        if  a == b:
            i += 1
            j += 1
        elif a == c and b == ' ':
            added_spaces.append(j)
            j += 1
        else:
            raise AssertionError("This should never be called.")
    return added_spaces

added_spaces = get_added_spaces(sentence, sentence_pretokenized)
results_pre = pipe(sentence_pretokenized)

def align_results(results_pre, added_spaces):
    aligned_results = []
    for entity in results_pre:
        aligned_entity = entity.copy()
        num_added_spaces = len(list(filter(lambda offset: offset < entity['start'], added_spaces)))
        aligned_entity['word'] = entity['word'].strip()
        aligned_entity['start'] = entity['start'] - num_added_spaces
        aligned_entity['end'] = entity['end'] - num_added_spaces
        aligned_results.append(aligned_entity)
    return aligned_results


aligned_results = align_results(results_pre, added_spaces)
aligned_results


Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at PlanTL-GOB-ES/bsc-bio-es and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at PlanTL-GOB-ES/bsc-bio-es and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cuda:0


[{'entity_group': 'LABEL_0',
  'score': np.float32(0.5661028),
  'word': 'Describimos',
  'start': 0,
  'end': 11},
 {'entity_group': 'LABEL_1',
  'score': np.float32(0.50304914),
  'word': 'el caso',
  'start': 12,
  'end': 19},
 {'entity_group': 'LABEL_0',
  'score': np.float32(0.5299948),
  'word': 'de un varón de 37 años con',
  'start': 20,
  'end': 46},
 {'entity_group': 'LABEL_1',
  'score': np.float32(0.5191083),
  'word': 'vida',
  'start': 47,
  'end': 51},
 {'entity_group': 'LABEL_0',
  'score': np.float32(0.54015374),
  'word': 'previa activa que refiere dolores osteoarticulares de localización variable',
  'start': 52,
  'end': 127},
 {'entity_group': 'LABEL_1',
  'score': np.float32(0.5110326),
  'word': 'en el',
  'start': 128,
  'end': 133},
 {'entity_group': 'LABEL_0',
  'score': np.float32(0.5391337),
  'word': 'último mes y fiebre en la última semana con picos ( matutino',
  'start': 134,
  'end': 194},
 {'entity_group': 'LABEL_1',
  'score': np.float32(0.5048084),
 