# Necessary Links
---
[Dataset Description](https://github.com/mila-iqia/ddxplus)

# Imports
---

In [2]:
import pandas as pd
import torch

# Downloads
---

In [None]:
# Fetch the evidence (question/answer) dictionary; -nc skips the download when the
# file already exists, so re-running the cell does not create "*.json.1" copies.
!wget -nc 'https://huggingface.co/datasets/aai530-group6/ddxplus/resolve/main/release_evidences.json'

In [None]:
# Fetch the validation split; -nc skips the download when the file already exists,
# so re-running the cell does not create "validate.csv.1" copies.
!wget -nc 'https://huggingface.co/datasets/aai530-group6/ddxplus/resolve/main/validate.csv'

# Preprocessing the Dataset for BioBERT
---
Example of the target input format (one record):

`{"text": "Question: Characterize your pain. Answer: Burning.", "label": ["Acute Laryngitis"]}`


In [None]:
# Peek at the validation split: only the first 10 rows are read from disk.
val_set = pd.read_csv(
    'validate.csv',
    nrows=10,
)
val_set

In [None]:
import json
import ast

def parse_literal(raw):
    """Return ``raw`` parsed as a Python literal when it is a string, else ``raw`` unchanged.

    The patient fields are parsed with ``ast.literal_eval`` because they are stored as
    stringified lists; a value that is already a list/dict is passed through instead
    of raising.
    """
    return ast.literal_eval(raw) if isinstance(raw, str) else raw


def describe_evidence(evidence, questionnaire):
    """Turn one evidence token into a sentence of text.

    Args:
        evidence: Either a bare evidence code, or ``"<code>_@_<value>"``.
        questionnaire: Mapping from evidence code to its questionnaire entry; the
            entry's ``"question_en"`` and ``"value_meaning"`` keys are read.

    Returns:
        ``"<question> <value meaning>."`` when the code is known, otherwise
        ``"Evidence code: <evidence>."`` so that no evidence is silently dropped.
    """
    # partition() splits on the FIRST separator only, so a value that itself contains
    # "_@_" no longer breaks the two-way unpacking.
    code, separator, value = evidence.partition("_@_")
    if not separator or code not in questionnaire:
        return f"Evidence code: {evidence}."

    entry = questionnaire[code]
    question = entry.get("question_en", "Unknown question")
    value_meaning = entry.get("value_meaning", {}).get(value, "Unknown")
    # NOTE(review): assumes a per-language mapping such as {"fr": ..., "en": ...} may
    # appear here (as in the DDXPlus evidence file) -- confirm against questionnaire.json.
    if isinstance(value_meaning, dict):
        value_meaning = value_meaning.get("en", "Unknown")
    return f"{question} {value_meaning}."


def tsv_field(value):
    """Collapse tabs, newlines and repeated whitespace so ``value`` fits in one TSV cell."""
    return " ".join(str(value).split())


# Load data from JSON files
# NOTE(review): neither "patient_data.json" nor "questionnaire.json" is created by the
# download cells of this notebook (they fetch release_evidences.json and validate.csv)
# -- confirm where these two files come from before a fresh Run All.
with open("patient_data.json", "r", encoding="utf-8") as patient_file:
    patient_data = json.load(patient_file)

with open("questionnaire.json", "r", encoding="utf-8") as questionnaire_file:
    questionnaire = json.load(questionnaire_file)

# Parse evidences from the patient data and match each one with the questionnaire
evidences = parse_literal(patient_data["EVIDENCES"])
parsed_evidences = [describe_evidence(evidence, questionnaire) for evidence in evidences]

# Combine all parsed information into a BioBERT-compatible input format
bioBERT_input = {
    "AGE": patient_data["AGE"],
    "SEX": patient_data["SEX"],
    "DIFFERENTIAL_DIAGNOSIS": parse_literal(patient_data["DIFFERENTIAL_DIAGNOSIS"]),
    "TEXT": " ".join(parsed_evidences),
    "LABEL": patient_data["PATHOLOGY"]
}

# Save the data to a TSV file for BioBERT
output_file = "bioBERT_input.tsv"
with open(output_file, "w", encoding="utf-8") as f:
    f.write("TEXT\tLABEL\n")
    # Fields are sanitised so an embedded tab/newline cannot shift or split the row.
    f.write(f"{tsv_field(bioBERT_input['TEXT'])}\t{tsv_field(bioBERT_input['LABEL'])}\n")

# Print the processed BioBERT input for verification
print("Processed BioBERT input:")
print(json.dumps(bioBERT_input, indent=4))


# Fine-Tuning BioBERT
---

## TRAIN

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# One checkpoint name feeds both the tokenizer and the classification model.
biobert_checkpoint = "dmis-lab/biobert-base-cased-v1.1"

# NOTE(review): `num_conditions` and `label` are not defined in any cell visible in
# this notebook -- they must be set (number of classes, integer class id) before
# this cell can run on a fresh kernel.
tokenizer = AutoTokenizer.from_pretrained(biobert_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(
    biobert_checkpoint,
    num_labels=num_conditions,
)

# Tokenize input data (a single hand-written example)
train_texts = ["Question: Characterize your pain. Answer: Exhausting."]
inputs = tokenizer(train_texts, padding=True, truncation=True, return_tensors="pt")

# Train the model (simplified): one forward pass with the label supplied so that
# the output carries a loss. No backward pass or optimizer step happens here.
train_labels = torch.tensor([label])
outputs = model(**inputs, labels=train_labels)
loss = outputs.loss


## PREDICT

In [None]:
# Inference only: switch the model to evaluation mode (disables dropout) so the
# prediction is deterministic regardless of what earlier cells did to the model.
model.eval()

inputs = tokenizer(["Question: Characterize your pain. Answer: Burning."],
                   padding=True, truncation=True, return_tensors="pt")

# No gradients are needed to predict; skipping autograd tracking saves memory.
with torch.no_grad():
    predictions = model(**inputs).logits

# Index of the highest-scoring class for each input sentence.
predicted_label = torch.argmax(predictions, dim=1)