In [1]:
!pip install -q transformers datasets accelerate


In [2]:
import torch
from transformers import (
    GPT2Tokenizer,
    GPT2LMHeadModel,
    Trainer,
    TrainingArguments,
    pipeline
)


In [3]:
medical_text = [
    "Healthcare systems rely on accurate diagnosis to provide effective treatment.",
    "Early detection of diseases significantly improves patient outcomes.",
    "Medical imaging techniques such as MRI and CT scans assist doctors in diagnosis.",
    "Preventive medicine focuses on reducing the risk of illness before symptoms appear.",
    "Artificial intelligence is increasingly used to support clinical decision making.",
    "Electronic health records improve the efficiency of patient data management.",
    "Public health initiatives aim to control the spread of infectious diseases.",
    "Clinical trials are essential for evaluating the safety of new medications.",
    "Telemedicine enables patients to consult doctors remotely using digital platforms.",
    "Vaccination programs help protect populations from serious illnesses.",
    "Personalized medicine tailors treatment plans based on individual patient data.",
    "Medical professionals follow ethical guidelines to ensure patient safety.",
    "Health education encourages individuals to adopt healthier lifestyles.",
    "Chronic diseases require long-term monitoring and continuous care.",
    "Advancements in biotechnology have transformed modern healthcare.",
    "Accurate medical documentation is critical for continuity of care.",
    "Mental health awareness is an important aspect of overall wellbeing.",
    "Hospital management systems improve coordination between departments.",
    "Medical research contributes to evidence-based healthcare practices.",
    "Patient-centered care emphasizes respect and communication in treatment."
]

with open("medical_train.txt", "w") as f:
    for sentence in medical_text:
        f.write(sentence + "\n")


In [4]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# GPT-2 does not include padding token
tokenizer.pad_token = tokenizer.eos_token

model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Embedding(50257, 768)

In [5]:
with open("medical_train.txt", "r") as f:
    data = f.readlines()

def tokenize_text(texts):
    return tokenizer(
        texts,
        truncation=True,
        padding=True,
        max_length=64,
        return_tensors="pt"
    )

tokenized_data = tokenize_text(data)


In [6]:
class GPT2Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return self.encodings["input_ids"].size(0)

    def __getitem__(self, idx):
        return {
            "input_ids": self.encodings["input_ids"][idx],
            "attention_mask": self.encodings["attention_mask"][idx],
            "labels": self.encodings["input_ids"][idx]
        }

train_dataset = GPT2Dataset(tokenized_data)


In [7]:
training_args = TrainingArguments(
    output_dir="gpt2_finetuned",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    learning_rate=5e-5,
    logging_steps=50,
    save_steps=200,
    save_total_limit=1,
    fp16=True,
    report_to="none"
)


In [8]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset
)

trainer.train()


`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss


TrainOutput(global_step=30, training_loss=3.4685740152994793, metrics={'train_runtime': 25.9272, 'train_samples_per_second': 2.314, 'train_steps_per_second': 1.157, 'total_flos': 489922560000.0, 'train_loss': 3.4685740152994793, 'epoch': 3.0})

In [9]:
model.save_pretrained("fine_tuned_gpt2")
tokenizer.save_pretrained("fine_tuned_gpt2")


('fine_tuned_gpt2/tokenizer_config.json',
 'fine_tuned_gpt2/special_tokens_map.json',
 'fine_tuned_gpt2/vocab.json',
 'fine_tuned_gpt2/merges.txt',
 'fine_tuned_gpt2/added_tokens.json')

In [10]:
text_generator = pipeline(
    "text-generation",
    model="fine_tuned_gpt2",
    tokenizer="fine_tuned_gpt2"
)

prompt = "Artificial intelligence"

generated_text = text_generator(
    prompt,
    max_length=60,
    temperature=0.8,
    top_k=50,
    top_p=0.95,
    num_return_sequences=1
)

print(generated_text[0]["generated_text"])


Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Both `max_new_tokens` (=256) and `max_length`(=60) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Artificial intelligence programs are increasingly being used to help manage the health of individuals.


In [12]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, pipeline

tokenizer = GPT2Tokenizer.from_pretrained("fine_tuned_gpt2")
model = GPT2LMHeadModel.from_pretrained("fine_tuned_gpt2")



In [13]:
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer
)


Device set to use cuda:0


In [14]:
prompt = "Medical research has shown that"


In [15]:
result = generator(
    prompt,
    max_length=80,
    temperature=0.8,
    top_k=50,
    top_p=0.95,
    num_return_sequences=1
)

print("INPUT PROMPT:")
print(prompt)

print("\nGENERATED OUTPUT:")
print(result[0]["generated_text"])


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Both `max_new_tokens` (=256) and `max_length`(=80) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


INPUT PROMPT:
Medical research has shown that

GENERATED OUTPUT:
Medical research has shown that individuals with low testosterone levels are more vulnerable to developing prostate cancer.


In [16]:
prompts = [
    "Healthcare systems are evolving",
    "Artificial intelligence in medicine",
    "Patient-centered care focuses on"
]

for p in prompts:
    out = generator(p, max_length=60)
    print("\nPROMPT:", p)
    print("OUTPUT:", out[0]["generated_text"])


Both `max_new_tokens` (=256) and `max_length`(=60) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=60) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



PROMPT: Healthcare systems are evolving
OUTPUT: Healthcare systems are evolving faster and more efficiently.


Both `max_new_tokens` (=256) and `max_length`(=60) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



PROMPT: Artificial intelligence in medicine
OUTPUT: Artificial intelligence in medicine is a real possibility, and hopefully the next phase of artificial intelligence will enable more people to find cures.

-

PROMPT: Patient-centered care focuses on
OUTPUT: Patient-centered care focuses on helping patients manage their symptoms with care.


In [17]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import math


In [18]:
model = GPT2LMHeadModel.from_pretrained("fine_tuned_gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("fine_tuned_gpt2")

model.eval()


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [19]:
with open("medical_train.txt", "r") as f:
    texts = f.readlines()


In [20]:
def calculate_perplexity(model, tokenizer, texts):
    total_loss = 0
    total_tokens = 0

    for text in texts:
        inputs = tokenizer(text, return_tensors="pt")
        with torch.no_grad():
            outputs = model(**inputs, labels=inputs["input_ids"])
            loss = outputs.loss

        total_loss += loss.item() * inputs["input_ids"].size(1)
        total_tokens += inputs["input_ids"].size(1)

    avg_loss = total_loss / total_tokens
    perplexity = math.exp(avg_loss)
    return perplexity


In [21]:
ppl = calculate_perplexity(model, tokenizer, texts)
print("Model Perplexity:", round(ppl, 2))


Model Perplexity: 14.25


In [22]:
base_model = GPT2LMHeadModel.from_pretrained("gpt2")
base_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

base_ppl = calculate_perplexity(base_model, base_tokenizer, texts)
print("Base GPT-2 Perplexity:", round(base_ppl, 2))
print("Fine-tuned GPT-2 Perplexity:", round(ppl, 2))


Base GPT-2 Perplexity: 57.22
Fine-tuned GPT-2 Perplexity: 14.25
