# GPTChallenge: diagnóstico a partir de HCE

Vamos a trabajar con el corpus CodEsp (textos de historial clínico etiquetados con sus códigos CIE-10 Diagnóstico)

In [3]:
import pandas as pd
import os, re
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

pd.options.display.max_colwidth = None

In [4]:
#los códigos están en un TSV con un código por línea
train_diag = pd.read_csv("data/train/train.tsv", sep="\t", header=None, names=["archivo", "codigo"])
train_diag.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8316 entries, 0 to 8315
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   archivo  8316 non-null   object
 1   codigo   8316 non-null   object
dtypes: object(2)
memory usage: 130.1+ KB


In [5]:
train_diag['codigo'].value_counts()

codigo
r52        163
r69        150
r50.9      142
i10        116
r59.9       95
          ... 
d37.030      1
c79.71       1
g25.0        1
l76.3        1
n81.2        1
Name: count, Length: 2194, dtype: int64

In [6]:
#cogemos la categoría superior de cada código y las agrupamos
train_diag['cat'] = train_diag['codigo'].str.extract(r'(\w\d\d)')
print(train_diag['cat'].value_counts())
train_diag['cat'].nunique()

cat
r52    163
r10    163
r59    160
r69    150
r50    144
      ... 
c31      1
d62      1
s53      1
s34      1
n81      1
Name: count, Length: 918, dtype: int64


918

In [7]:
categories=train_diag['cat'].value_counts()[:10]
top_categorias = categories.index.to_list()
print(top_categorias)

['r52', 'r10', 'r59', 'r69', 'r50', 'r60', 'i10', 'r11', 'n28', 'd49']


In [8]:
type(train_diag['cat'])

pandas.core.series.Series

In [9]:
#seleccionamos sólo las etiquetas de este subconjunto
train_diag = train_diag[np.isin(train_diag['cat'], top_categorias)]

In [10]:
#cargamos los dos conjuntos de train
path = 'data/train/text_files_en/'

corpus = []
for f in [f for f in os.listdir(path) if f.endswith('.txt')]:
    with open(os.path.join(path, f), encoding="utf8") as text:
        texto = text.read()
    #buscamos códigos
    file = f[:-4]
    codigos = train_diag.query('archivo==@file')['cat'].to_list()
    codigos = list(set(codigos))
    if codigos:
        corpus.append({
            'archivo': file,
            'texto': texto,
            'codigos': codigos
        })
    
df_train = pd.DataFrame(corpus).set_index('archivo')
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 562 entries, S0004-06142005000700014-1 to S2340-98942015000100005-1
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   texto    562 non-null    object
 1   codigos  562 non-null    object
dtypes: object(2)
memory usage: 13.2+ KB


In [11]:
df_train.sample(1)

Unnamed: 0_level_0,texto,codigos
archivo,Unnamed: 1_level_1,Unnamed: 2_level_1
S0376-78922011000500012-1,"A 31-year-old male with a history of post-traumatic stenosisectomy 14 years earlier and overweight.\nPneumococcal sepsis caused multiorgan failure (renal, respiratory, hematologic and hemodynamic) and admission to the Intensive Care Unit (ICU).\nThe patient remained intubated for 11 days and then transferred to the Internal Medicine ward.\nDue to poor perfusion secondary to sepsis, necrotic lesions appeared in the upper and mainly lower distal limbs.\nInitially assessed by vascular surgery, treatment with prostaglandins was indicated for 21 days.\nAfter improvement of the general condition and disappearance of vital risk and after 7 weeks admitted to Internal Medicine, he underwent Plastic Surgery for treatment of residual necrotic lesions.\nOn admission to our service she presented multiple small lesions in forearms, hands and thighs that epithelized adequately with topical treatment.\nAs a major complication of left plantar necrosis with 10 importance existed in the plantar mute zone that was debrided in the hospitalization room, left plantar anesthesia, dry necrosis of 3rd, 4th and 5th fingers of the right foot.\n1.\nSurgical intervention for left and right pretibial plantar debridement and daily debridement in the room to reduce most sloughs and debridement was performed.\nAfter 20 days of daily debridement in the patient's room, without local anesthesia (the patient had no pain due to plantar anesthesia and because it was necrotic tissue), V.A.C.® therapy was initiated to avoid a large wound coverage.\nPolyurethane sponges (VAC GranuFoam®) were applied, with continuous negative pressure of 125 mmHg in the first 48 hours and then continued with intermittent negative pressure (5 minutes with aspiration and 2 without aspiration).\nThe first week, tapered sponges (VAC GranuFam Silver®) were used to improve the antibacterial effect given the persistence of slough.\nThe VAC container is abundantly exudated during treatment.\nWe used the VAC ATS® Therapy System.\nComplete treatment lasted 20 days and sponges were changed 3 times a week.\n1.\nThe patient was able to continue performing physiotherapy exercises during treatment, comfortably transporting his VAC® system in his hand, when he attended wheeled sessions.\nThe evolution was very favorable, with disappearance of residual sloughs, formation of granulation tissue of live red color, without appearance of superinfection and obtaining a suitable bed to receive a skin graft.\n1.\nOne week after VAC® therapy was withdrawn, the patient underwent surgery to cover the left plantar defect with a partial thickness skin graft taken from the thigh on the same side, amputation of the 3rd, 4th and 5th residual skin graft.\nThe grafts were properly secured and the wounds healed well.\n1.\nThe patient was discharged after 4 and a half months of hospitalization and 2 weeks after the skin graft coverage intervention.\nUpon discharge, the patient was ambulation with help of a prescription.\nSix weeks after discharge, the patient walked independently with the help of a special template placed by the podiatrist, although he required routine care by the podiatrist due to plantar anesthesia, which required strict monitoring of the skin.\n",[r52]


## Cargar los textos del conjunto de test

In [12]:
#los códigos están en un TSV con un código por línea
test_diag = pd.read_csv("data/test/test.tsv", sep="\t", header=None, names=["archivo"])
test_diag.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 192 entries, 0 to 191
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   archivo  192 non-null    object
dtypes: object(1)
memory usage: 1.6+ KB


In [13]:
path = 'data/test/text_files_en/'

corpus = []
for f in [f for f in os.listdir(path) if f.endswith('.txt')]:
    if os.path.splitext(f)[0] in test_diag['archivo'].values:
        with open(os.path.join(path, f), encoding="utf8") as text:
            texto = text.read()
            corpus.append({
                'archivo': file,
                'texto': texto
            })
    
df_test = pd.DataFrame(corpus).set_index('archivo')
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 192 entries, S2340-98942015000100005-1 to S2340-98942015000100005-1
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   texto   192 non-null    object
dtypes: object(1)
memory usage: 3.0+ KB


In [14]:
df_test.sample(1)

Unnamed: 0_level_0,texto
archivo,Unnamed: 1_level_1
S2340-98942015000100005-1,"A 67-year-old man consulted with the ophthalmologist for presenting in the last days pain in the right eye with blurred vision, incessant tear and diplopia in the extreme look.\nThe examination revealed ocular discomfort marked by profusion of the right eye, which presented generalized limitation to mobility, especially in the face of consolidation, and intense swelling of the conjunctiva that prolapsed through the cleft lip and palate.\nAfter performing an orbital CT and ruling out intraorbital involvement, the patient was referred to Internal Medicine.\nHe was a male smoker of 30 cigarettes a day since his youth without any other relevant background.\nIn the anamnesis, besides the ocular symptoms that led to the consultation, she reported having presented some respiratory distress and dysphonia, but without chest pain, hemoptysis, fever or other systemic symptoms.\nOn physical examination her general condition was good and she had no signs of increased respiratory work.\nIn the physical examination, besides the alterations of the right eye, a striking increase in volume of the right side of the face and neck was observed.\nThe right jugular vein remained visible and engorged continuously and at the level of the right temple and anterior thoracic plane dilation of the superficial venous network could be appreciated.\nThe neck fixation showed a hard, adhered and non-painful tumor at the level of the right supraclavicular fossa.\nVital signs, axillary involvement, and pulmonary, cardiac, abdominal and lower limb examination showed no relevant findings.\nComplete blood count, coagulation study, biochemical profile including tumor markers and arterial gas analysis were normal.\nChest X-ray showed mediastinal widening of the right hilum.\nIn the cervicothoracic CT, multiple nodular images were observed in the upper lobe of the right lung, adenopatic conglomerate occupying the anterior mediastinum and with cranial extension to the right lateral neck vein and jugular trunk.\nCranial-orbital CT showed mild right exophthalmos, thickening of the superior ophthalmic vein and absence of orbital tumor.\nA needle aspiration biopsy of the right cervical adenopathic mass was diagnostic of metastasis of small cell carcinoma.\n"


## Binarizar las etiquetas

In [15]:
# para entrenar un clasificador multi-etiqueta generamos una matriz binaria de las etiquetas
mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(df_train['codigos'])

#Guardamos las clases utilizadas en el conjunto de train
clases = mlb.classes_
num_classes = clases.shape
print(num_classes[0])

10


## Procesamiento del lenguaje natural

In [53]:
fileName = 'data/data_with_med_embeddings_en.tsv'
if not os.path.isfile(fileName):
    from transformers import AutoTokenizer, AutoModel
    from torch import no_grad

    tokenizer = AutoTokenizer.from_pretrained("Charangan/MedBERT")
    model = AutoModel.from_pretrained("Charangan/MedBERT")

    # Function to tokenize and obtain embeddings
    def get_bert_embeddings(text):
        # Tokenize text
        tokens = tokenizer(text, return_tensors='pt', padding = True, truncation = True, max_length = 100, add_special_tokens = False)
        # Get BERT model output
        with no_grad():
            output = model(**tokens)
        # Extract embeddings for [CLS] token
        embeddings = output.last_hidden_state[:,0,:].squeeze(0).numpy()
        return embeddings

    # Assuming 'df' is your DataFrame and 'text_column' is the name of the column containing the text
    df_train['embedding'] = df_train['texto'].apply(lambda x: get_bert_embeddings(x))

    # Save the DataFrame with the embeddings column
    df_train.to_csv(fileName, index=False, sep = "\t")
else:
    df_train = pd.read_csv(fileName, index_col=False, sep = "\t")
    df_train['codigos'] = df_train['codigos'].apply(lambda s: eval(s))

    def insert_space_before_negative(string):
        return re.sub(r'(\s-)', r' \1', string)

    df_train["embedding"] = df_train["embedding"].apply(lambda x: insert_space_before_negative(x))
    df_train["embedding"] = df_train["embedding"].apply(lambda s: s.replace("\n", ""))
    df_train["embedding"] = df_train["embedding"].apply(lambda s: s.replace("  ", ", "))
    df_train["embedding"] = df_train["embedding"].apply(lambda s: s.replace(", , ", ", "))
    df_train["embedding"] = df_train["embedding"].apply(lambda s: eval(s))

X = [list(x) for x in df_train['embedding'].values]

# Binarizar
mlb = MultiLabelBinarizer() 
y_train = mlb.fit_transform(df_train['codigos'])

#Guardamos las clases utilizadas en el conjunto de train
clases = mlb.classes_
num_classes = clases.shape
print(num_classes[0])

10


## Modelos

In [55]:
x_train, x_test, y_train_train, y_train_test = train_test_split(X, y_train, random_state = 3)

In [56]:
multi_target_classifier = MLPClassifier(activation = 'tanh', solver = 'adam', max_iter = 5000, n_iter_no_change = np.inf, random_state = 3, verbose = 1)
multi_target_classifier.fit(x_train, y_train_train)
y_pred = multi_target_classifier.predict(x_test)
report = classification_report(y_train_test, y_pred, zero_division = 0)
print(report)

Iteration 1, loss = 7.12362547
Iteration 2, loss = 5.07473520
Iteration 3, loss = 4.96046949
Iteration 4, loss = 4.95446883
Iteration 5, loss = 4.83642524
Iteration 6, loss = 4.67613759
Iteration 7, loss = 4.50482763
Iteration 8, loss = 4.36820224
Iteration 9, loss = 4.25609505
Iteration 10, loss = 4.18237970
Iteration 11, loss = 4.12662643
Iteration 12, loss = 4.05373709
Iteration 13, loss = 3.97593527
Iteration 14, loss = 3.90194051
Iteration 15, loss = 3.83637460
Iteration 16, loss = 3.76733785
Iteration 17, loss = 3.70425860
Iteration 18, loss = 3.65307817
Iteration 19, loss = 3.58700234
Iteration 20, loss = 3.52785554
Iteration 21, loss = 3.47496763
Iteration 22, loss = 3.42907173
Iteration 23, loss = 3.37871990
Iteration 24, loss = 3.32662446
Iteration 25, loss = 3.27791104
Iteration 26, loss = 3.23675284
Iteration 27, loss = 3.20485983
Iteration 28, loss = 3.16285272
Iteration 29, loss = 3.10976258
Iteration 30, loss = 3.06084801
Iteration 31, loss = 3.01743762
Iteration 32, los



In [57]:
estimator = GradientBoostingClassifier(loss = 'exponential', criterion = 'friedman_mse', n_estimators = 100, max_depth = 5, min_samples_split = 5, random_state = 3)
base_classifier = AdaBoostClassifier(estimator = estimator, n_estimators = 100, random_state = 3)
multi_target_classifier = MultiOutputClassifier(base_classifier, n_jobs = 4)
multi_target_classifier.fit(x_train, y_train_train)
y_pred = multi_target_classifier.predict(x_test)
report = classification_report(y_train_test, y_pred, zero_division = 0)
print(report)

              precision    recall  f1-score   support

           0       1.00      0.12      0.21        26
           1       0.65      0.31      0.42        35
           2       0.00      0.00      0.00        13
           3       0.62      0.17      0.27        29
           4       0.00      0.00      0.00        23
           5       0.40      0.11      0.17        36
           6       0.45      0.16      0.24        31
           7       0.17      0.04      0.06        25
           8       1.00      0.04      0.08        25
           9       0.50      0.03      0.05        35

   micro avg       0.51      0.11      0.18       278
   macro avg       0.48      0.10      0.15       278
weighted avg       0.51      0.11      0.17       278
 samples avg       0.19      0.11      0.13       278

