In [1]:
%pip install pandas scikit-learn torch transformers datasets matplotlib torchviz
%pip install jupyter

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd

# Load the training dataset
df_train = pd.read_csv('../csv_import_scrips/cie10-es-diagnoses.csv', skip_blank_lines=True)
df_train = df_train[['description', 'code']]
df_train = df_train.rename(columns={'description': 'text', 'code': 'label'})

# Load the evaluation dataset
df_eval = pd.read_csv('../generated-diagnoses/diagnosticos_medicos_10000.csv', skip_blank_lines=True)
df_eval = df_eval[['Diagnóstico', 'CIE-10']]
df_eval = df_eval.rename(columns={'Diagnóstico': 'text', 'CIE-10': 'label'})

In [3]:
import re
with open('spanish_stopwords.txt', 'r') as file:
    stop_words = [line.strip() for line in file.readlines()]

def preprocess_text(text):
    # Combine Latin Spanish and Castilian Spanish stop words
    text = text.lower()
    return text

    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'[áàäâã]', 'a', text)
    text = re.sub(r'[éèëê]', 'e', text)
    text = re.sub(r'[íìïî]', 'i', text)
    text = re.sub(r'[óòöôõ]', 'o', text)
    text = re.sub(r'[úùüû]', 'u', text)
    # text = re.sub(r'ñ', 'n', text)
    # añadir codigo para quitar los grados -> 39°
    # text = re.sub(r'\d+°', '', text)

    words = text.split()

    for stop_word in stop_words:
        if stop_word in words:
            words.remove(stop_word)

    for word in words:
        if len(word) < 2:
            words.remove(word)

    if "de" in words:
        words.remove("de")

    return ' '.join(words)

df_train['text'] = df_train['text'].apply(lambda x: preprocess_text(x))
df_train.to_csv('../csv_import_scrips/cie10-es-diagnoses_processed.csv', index=False)
df_train.head()

df_eval['text'] = df_eval['text'].apply(lambda x: preprocess_text(x))
df_eval.to_csv('../generated-diagnoses/diagnosticos_medicos_10000_processed.csv', index=False)
df_eval.head()

Unnamed: 0,text,label
0,mujer de 32 años con disuria y urgencia miccio...,N30.0
1,paciente masculino de 60 años con pérdida prog...,M48.0
2,"niño de 8 años con fiebre persistente de 39°c,...",B05.9
3,"mujer de 29 años con antecedentes de ansiedad,...",F41.0
4,paciente masculino de 65 años con tos crónica ...,J44.9


In [4]:
# Preprocess data
df_train['label'] = df_train['label'].astype('category').cat.codes
df_eval['label'] = df_eval['label'].astype('category').cat.codes
df_train.head(), df_eval.head()

(                            text  label
 0  clamidia psittaci infecciones    557
 1                        tracoma    558
 2       etapa inicial de tracoma    559
 3         fase activa de tracoma    560
 4       tracoma, no especificado    561,
                                                 text  label
 0  mujer de 32 años con disuria y urgencia miccio...     16
 1  paciente masculino de 60 años con pérdida prog...     14
 2  niño de 8 años con fiebre persistente de 39°c,...      1
 3  mujer de 29 años con antecedentes de ansiedad,...      6
 4  paciente masculino de 65 años con tos crónica ...     11)

In [8]:
# Tokenize and encode data
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(examples):
    return tokenizer(examples['text'], truncation=True)

train_dataset = Dataset.from_pandas(df_train)
eval_dataset = Dataset.from_pandas(df_eval)

tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_eval_dataset = eval_dataset.map(tokenize_function, batched=True)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

README.md:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/649k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/75.7k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/308k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3668 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/408 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1725 [00:00<?, ? examples/s]

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

ImportError: Called a Tensorflow-specific function but Tensorflow is not installed.

In [None]:
from torchviz import make_dot
from IPython.display import Image

# Check if the model is already trained and saved
model_path = './trained_model'
logging_steps = 1000

if os.path.exists(model_path):
    # Load the saved model and tokenizer
    model = BertForSequenceClassification.from_pretrained(model_path)
    tokenizer = BertTokenizer.from_pretrained(model_path)
    print('Model loaded from saved files.')
else:
    # Load pre-trained BERT model
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(df_train['label'].unique()))

    # Fine-tune BERT model
    training_args = TrainingArguments(
        output_dir='./results',
        evaluation_strategy='epoch',
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=logging_steps,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train_dataset,
        eval_dataset=tokenized_eval_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # Train model
    trainer.train()

    model.save_pretrained(model_path)
    tokenizer.save_pretrained(model_path)
    print('Model trained and saved.')

# Define the trainer for 2nd training
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=logging_steps,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_eval_dataset,
    eval_dataset=tokenized_eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# 2nd train model performance
trainer.train()

# Define the trainer for evaluation
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    #gradient_accumulation_steps=32
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=1000,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Evaluate model performance
trainer.evaluate()

In [None]:
# Plot training metrics
training_metrics = trainer.state.log_history

losses = [x['loss'] for x in training_metrics if 'loss' in x]
steps = [x['step'] for x in training_metrics if 'loss' in x]
plt.figure(figsize=(10, 5))
plt.plot(steps, losses, label='Training Loss')
plt.xlabel('Steps')
plt.ylabel('Loss')
plt.title('Training Loss')
plt.legend()
plt.show()

eval_losses = [x['eval_loss'] for x in training_metrics if 'eval_loss' in x]
eval_steps = [x['step'] for x in training_metrics if 'eval_loss' in x]
plt.figure(figsize=(10, 5))
plt.plot(eval_steps, eval_losses, label='Validation Loss')
plt.xlabel('Steps')
plt.ylabel('Loss')
plt.title('Validation Loss')
plt.legend()
plt.show()

In [None]:
# Example queries to the model
label_to_cie10 = {v: k for v, k in df_train[['label', 'text']].drop_duplicates().set_index('label')['text'].to_dict().items()}

# Ensure all labels are in the dictionary
unique_labels = df_train['label'].unique()
for label in unique_labels:
    if label not in label_to_cie10:
        label_to_cie10[label] = 'Unknown'

def predict(text):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True).to(device)
    outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=-1)
    return predictions.item()

# Example queries
examples = [
    'apendicitis',
    'diabetes',
    'fractura de tibia',
    'dolor de cabeza',
    'covid-19'
]

for example in examples:
    label = predict(example)
    cie10_code = label_to_cie10.get(label, 'Unknown')
    print(f'Text: {example}\nPredicted Label: {label} - cie10 code: {cie10_code}\n')