<a href="https://colab.research.google.com/github/deondrae4088/WebMd_chat/blob/main/medmd_llm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets



In [None]:
#Importing libraries and dependencies needed for the project
import transformers
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline

#library not covered in class requirement met
import torch
import pandas as pd

In [None]:
# Pretrained extractive question-answering checkpoint from the Hugging Face Hub
model_name = "deepset/roberta-base-squad2"

tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
# Load the checkpoint with a question-answering head. Loading it as a causal LM
# leaves the lm_head weights newly (randomly) initialized, because the checkpoint
# does not contain them.
model = transformers.AutoModelForQuestionAnswering.from_pretrained(model_name)

If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`
Some weights of RobertaForCausalLM were not initialized from the model checkpoint at deepset/roberta-base-squad2 and are newly initialized: ['lm_head.bias', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Mount Google Drive so its files are reachable under /content/drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Read the medical question/answer dataset from the CSV stored on the mounted drive
medical_df = pd.read_csv('/content/drive/MyDrive/webmd/medical_qa_data.csv')

In [None]:
# Preview the first rows to check which columns were loaded
medical_df.head()

Unnamed: 0,Question Type,Question,Answer
0,susceptibility,Who is at risk for Lymphocytic Choriomeningiti...,LCMV infections can occur after exposure to fr...
1,symptoms,What are the symptoms of Lymphocytic Choriomen...,LCMV is most commonly recognized as causing ne...
2,susceptibility,Who is at risk for Lymphocytic Choriomeningiti...,Individuals of all ages who come into contact ...
3,exams and tests,How to diagnose Lymphocytic Choriomeningitis (...,"During the first phase of the disease, the mos..."
4,treatment,What are the treatments for Lymphocytic Chorio...,"Aseptic meningitis, encephalitis, or meningoen..."


## BASELINE PRETRAINED MODEL FROM HUGGINGFACE

In [None]:
# Test out the roberta-base QA model on simple medical questions
model_name = "deepset/roberta-base-squad2"
nlp = pipeline('question-answering', model=model_name, tokenizer=model_name)

# This is a smoke test of the baseline, so only a handful of rows are queried;
# running the pipeline over every row of the dataframe would take a long time.
BASELINE_SAMPLE_SIZE = 5
baseline_question = 'What are the different treaments for cancer'

for index, row in medical_df.head(BASELINE_SAMPLE_SIZE).iterrows():
    context = str(row['Answer'])

    # a) Get predictions
    QA_input = {
        'context': context,
        'question': baseline_question
    }
    # Inference and printing happen inside the loop so that every sampled
    # context gets a prediction, not only the last row that was iterated.
    response = nlp(QA_input)
    print(response)

Device set to use cuda:0


{'score': 0.029269898310303688, 'start': 302, 'end': 377, 'answer': 'abdominal pain, abdominal mass and symptoms of gastrointestinal obstruction'}




## FINETUNE (OPTIMIZE PRETRAINED MODEL)

In [None]:
# Keep only the two text columns that fine-tuning uses
medical_df_filtered = medical_df.loc[:, ["Question", "Answer"]]

In [None]:
# Preprocess medical_df_filtered for fine-tuning

def preprocess_function(examples):
    """Tokenize a batch of question/answer pairs and attach answer-span labels.

    Each ``Answer`` string is given to the tokenizer as the context (second
    sequence) and is also treated as the answer text, so the labelled span
    covers characters ``0`` to ``len(answer)`` of the context. Uses the
    module-level ``tokenizer``.

    Args:
        examples: Batch mapping with ``"Question"`` and ``"Answer"`` entries,
            each indexable by example position and holding strings.

    Returns:
        The tokenizer output with ``offset_mapping`` removed and two lists
        added, ``start_positions`` and ``end_positions``, holding one token
        index per example. Examples with no context tokens are labelled (0, 0).
    """
    questions = [q.strip() for q in examples["Question"]]
    inputs = tokenizer(
        questions,
        examples["Answer"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["Answer"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = 0
        end_char = start_char + len(answer)
        sequence_ids = inputs.sequence_ids(i)

        # Token indices that belong to the context (sequence id 1)
        context_indices = [idx for idx, seq_id in enumerate(sequence_ids) if seq_id == 1]
        if not context_indices:
            # No context tokens (e.g. an empty answer string): nothing to label
            start_positions.append(0)
            end_positions.append(0)
            continue
        context_start = context_indices[0]
        context_end = context_indices[-1]

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            # Clamp: if the first context token starts after start_char the scan
            # never advances and idx - 1 would point before the context.
            start_positions.append(max(idx - 1, context_start))

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            # Clamp: if the context was truncated (or ends before end_char) the
            # scan never moves and idx + 1 would point past the last context token.
            end_positions.append(min(idx + 1, context_end))

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [None]:
# Split data into training and testing sets
from sklearn.model_selection import train_test_split
from datasets import Dataset

# A fixed random_state puts the same rows in each split on every run, so
# training and evaluation results are comparable across kernel restarts.
train_df, test_df = train_test_split(medical_df_filtered, test_size=0.2, random_state=42)

In [None]:
# Write both splits to CSV files in the working directory (row index not written)
train_df.to_csv('train.csv', index=False)
test_df.to_csv('test.csv', index=False)

In [None]:
# Convert the pandas splits into Hugging Face Dataset objects.
# NOTE(review): train_df / test_df are rebound from DataFrames to Datasets here,
# so re-running this cell on its own would pass a Dataset to from_pandas —
# consider distinct names for the converted objects.
train_df = Dataset.from_pandas(train_df)
test_df = Dataset.from_pandas(test_df)

In [None]:
# Apply preprocess_function to both splits in batches. The 'Question', 'Answer' and
# '__index_level_0__' columns are removed from the result ('__index_level_0__' is
# presumably the DataFrame index carried over by from_pandas — verify).
train_dataset_tokenized = train_df.map(preprocess_function, batched=True, remove_columns=['Question', 'Answer', '__index_level_0__'])
test_dataset_tokenized = test_df.map(preprocess_function, batched=True, remove_columns=['Question', 'Answer', '__index_level_0__'])

Map:   0%|          | 0/13125 [00:00<?, ? examples/s]

Map:   0%|          | 0/3282 [00:00<?, ? examples/s]

In [None]:
# Train the model
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer
from transformers import default_data_collator

# Start from the pretrained QA checkpoint
model = AutoModelForQuestionAnswering.from_pretrained("deepset/roberta-base-squad2")

data_collator = default_data_collator
training_args = TrainingArguments(
    output_dir="my_medical_qa_model",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_tokenized,
    # Evaluate on the held-out split; evaluating on the training data would
    # report a validation loss that says nothing about generalization.
    eval_dataset=test_dataset_tokenized,
    processing_class=tokenizer,
    data_collator=data_collator,
)

trainer.train()

# Save the final model (and the tokenizer given as processing_class) into
# output_dir itself, so it can be reloaded with from_pretrained("my_medical_qa_model").
trainer.save_model(training_args.output_dir)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mdaina92[0m ([33mdaina92-george-washington-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss


## TEST THE FINETUNED MODEL

In [None]:
from transformers import AutoTokenizer
question = "What are the different sysmtoms of lung cancer"

# Define the passage to search here rather than depending on a `context`
# variable leaked from a loop in an earlier cell. The answer text of the last
# dataset row is used as the context.
context = str(medical_df.iloc[-1]['Answer'])

tokenizer = AutoTokenizer.from_pretrained("my_medical_qa_model")
# Truncate the context the same way as during fine-tuning so that long passages
# cannot exceed the model's maximum input length.
inputs = tokenizer(
    question,
    context,
    return_tensors="pt",
    truncation="only_second",
    max_length=384,
)

In [None]:
# Load the fine-tuned model from the directory used as the training output_dir
# (the same directory the tokenizer is loaded from).
model = AutoModelForQuestionAnswering.from_pretrained("my_medical_qa_model")
# Inference only: no gradients needed
with torch.no_grad():
    outputs = model(**inputs)

# Most likely start and end token positions of the answer span
answer_start_index = outputs.start_logits.argmax()
answer_end_index = outputs.end_logits.argmax()

In [None]:
# Take the input ids from the predicted start token through the predicted end
# token (inclusive) of the first sequence and decode them back to text.
predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
tokenizer.decode(predict_answer_tokens)

## EVALUATION METRICS