## Data Preparation

#### Installs

In [1]:
!pip install transformers -q



#### Imports

In [2]:
import time
import torch
import random
import numpy as np
import pandas as pd

from torch.optim import AdamW

from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.utils.data import RandomSampler
from torch.utils.data import SequentialSampler

from sklearn.model_selection import train_test_split

from transformers import GPT2LMHeadModel
from transformers import GPT2Config
from transformers import AutoTokenizer
from transformers import get_linear_schedule_with_warmup


In [3]:
# One seed shared by every random number generator this notebook touches.
RANDOM_STATE = 2023

# NumPy and the standard-library generator are seeded independently of each other.
np.random.seed(RANDOM_STATE)
random.seed(RANDOM_STATE)

# Kept as the last expression so the cell still echoes the torch Generator.
torch.manual_seed(RANDOM_STATE)

<torch._C.Generator at 0x7f3606ac1e58>

#### Parameters

In [4]:
# --- Hardware -----------------------------------------------------------------
device = torch.device("cpu")

# --- Data split ---------------------------------------------------------------
# `test_size` of the rows are held out first; `valid_size` of that hold-out
# then becomes the validation set.
test_size, valid_size = 0.2, 0.5

# --- Model / tokenizer --------------------------------------------------------
pretrained_path = 'distilgpt2'

bos_token, eos_token, pad_token = '<|startoftext|>', '<|endoftext|>', '<|pad|>'

# --- Batching -----------------------------------------------------------------
batch_size, max_length = 2, 128

# --- Optimisation -------------------------------------------------------------
epochs = 10
learning_rate = 5e-4
warmup_steps = 1e2
epsilon = 1e-8

# During training, sample text is printed whenever `step % sample_every == 0`.
sample_every = 100

# --- Evaluation ---------------------------------------------------------------
# Number of generations requested per label in the Evaluate section.
trials = 50

#### Classes

In [5]:
class GPT2Dataset(Dataset):
    """Pre-tokenized text dataset yielding ``(input_ids, attention_mask)`` pairs.

    Every text is wrapped as ``bos_token + text + eos_token`` and handed to
    ``tokenizer`` with truncation enabled and ``padding="max_length"``. The
    resulting ``'input_ids'`` and ``'attention_mask'`` entries are stored as
    tensors, one per text, in the order the texts were given.

    Args:
        txt_list: Iterable of strings to encode.
        tokenizer: Callable invoked as
            ``tokenizer(text, truncation=True, max_length=..., padding="max_length")``
            whose result is indexed with ``'input_ids'`` and ``'attention_mask'``.
        gpt2_type: Accepted for interface compatibility; never read here.
        max_length: Forwarded to the tokenizer as ``max_length``.
        bos_token: String prepended to every text before tokenizing.
        eos_token: String appended to every text before tokenizing.
    """

    def __init__(self, txt_list, tokenizer, gpt2_type="gpt2", max_length=768, bos_token='<|startoftext|>', eos_token='<|endoftext|>'):
        self.tokenizer = tokenizer

        # Tokenize each text on its own, keeping the input order.
        encoded = [
            tokenizer(
                bos_token + text + eos_token,
                truncation=True,
                max_length=max_length,
                padding="max_length",
            )
            for text in txt_list
        ]

        self.input_ids = [torch.tensor(enc['input_ids']) for enc in encoded]
        self.attn_masks = [torch.tensor(enc['attention_mask']) for enc in encoded]

    def __len__(self):
        """Return the number of encoded texts."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, attention_mask)`` tensors stored at ``idx``."""
        return self.input_ids[idx], self.attn_masks[idx]

#### Data Load

In [6]:
# Read the samples using the first CSV column as the index, discard every row
# that has a missing value, and renumber the remaining rows from zero.
df = (
    pd.read_csv('mtsamples.csv', index_col=0)
    .dropna()
    .reset_index(drop=True)
)
df.head()

Unnamed: 0,description,medical_specialty,sample_name,transcription,keywords
0,A 23-year-old white female presents with comp...,Allergy / Immunology,Allergic Rhinitis,"SUBJECTIVE:, This 23-year-old white female pr...","allergy / immunology, allergic rhinitis, aller..."
1,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 2,"PAST MEDICAL HISTORY:, He has difficulty climb...","bariatrics, laparoscopic gastric bypass, weigh..."
2,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 1,"HISTORY OF PRESENT ILLNESS: , I have seen ABC ...","bariatrics, laparoscopic gastric bypass, heart..."
3,2-D M-Mode. Doppler.,Cardiovascular / Pulmonary,2-D Echocardiogram - 1,"2-D M-MODE: , ,1. Left atrial enlargement wit...","cardiovascular / pulmonary, 2-d m-mode, dopple..."
4,2-D Echocardiogram,Cardiovascular / Pulmonary,2-D Echocardiogram - 2,1. The left ventricular cavity size and wall ...,"cardiovascular / pulmonary, 2-d, doppler, echo..."


In [7]:
# Distinct specialties, in order of first appearance; these are reused later
# as generation prompts.
labels = (
    df['medical_specialty']
    .drop_duplicates()
    .dropna()
    .tolist()
)
print(labels)

# From here on `df` is rebound to a single text column of the form
# "<specialty> | <description>".
df = (
    df['medical_specialty']
    + ' | '
    + df['description']
)

[' Allergy / Immunology', ' Bariatrics', ' Cardiovascular / Pulmonary', ' Dentistry', ' Urology', ' General Medicine', ' Surgery', ' Speech - Language', ' SOAP / Chart / Progress Notes', ' Sleep Medicine', ' Rheumatology', ' Radiology', ' Psychiatry / Psychology', ' Podiatry', ' Physical Medicine - Rehab', ' Pediatrics - Neonatal', ' Pain Management', ' Orthopedic', ' Ophthalmology', ' Office Notes', ' Obstetrics / Gynecology', ' Neurosurgery', ' Neurology', ' Nephrology', ' Letters', ' Lab Medicine - Pathology', ' IME-QME-Work Comp etc.', ' Hospice - Palliative Care', ' Hematology - Oncology', ' Gastroenterology', ' ENT - Otolaryngology', ' Endocrinology', ' Emergency Room Reports', ' Discharge Summary', ' Diets and Nutritions', ' Dermatology', ' Cosmetic / Plastic Surgery', ' Consult - History and Phy.', ' Chiropractic']


#### Data Processing

In [9]:
# Split on index values: hold out `test_size` of the rows, then split that
# hold-out again so `valid_size` of it becomes the validation set.
ids = list(df.index.drop_duplicates().values)
id_train, id_test = train_test_split(
    ids,
    test_size=test_size,
    shuffle=True,
    random_state=RANDOM_STATE,
)
id_test, id_valid = train_test_split(
    id_test,
    test_size=valid_size,
    shuffle=True,
    random_state=RANDOM_STATE,
)

# Boolean-mask selection keeps every subset in the original row order.
df_train, df_valid, df_test = (
    df[df.index.isin(chosen)]
    for chosen in (id_train, id_valid, id_test)
)

print(len(df_train), len(df_valid), len(df_test), sep='\n')

3118
390
390


#### Create Dataset

In [10]:
# Model configuration taken from the pretrained checkpoint, with hidden-state
# outputs switched off.
config = GPT2Config.from_pretrained(
    pretrained_path,
    output_hidden_states=False,
)

# Tokenizer from the same checkpoint, set up with the notebook's own
# begin/end/padding marker strings.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_path,
    bos_token=bos_token,
    eos_token=eos_token,
    pad_token=pad_token,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [11]:
# Tokenize each split with the shared tokenizer and sequence length.
dataset_train, dataset_valid, dataset_test = (
    GPT2Dataset(texts, tokenizer, max_length=max_length)
    for texts in (df_train, df_valid, df_test)
)

# Train and validation batches are drawn in random order; test batches keep
# the dataset order.
dataloader_train = DataLoader(
    dataset_train,
    batch_size=batch_size,
    sampler=RandomSampler(dataset_train),
)
dataloader_valid = DataLoader(
    dataset_valid,
    batch_size=batch_size,
    sampler=RandomSampler(dataset_valid),
)
dataloader_test = DataLoader(
    dataset_test,
    batch_size=batch_size,
    sampler=SequentialSampler(dataset_test),
)

#### Save Dataset

In [12]:
# Serialize each DataLoader object (dataset and sampler included) to its own file.
for _loader, _path in (
    (dataloader_train, './data_train.bin'),
    (dataloader_valid, './data_valid.bin'),
    (dataloader_test, './data_test.bin'),
):
    torch.save(_loader, _path)

## Model Training

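#### Imports

#### Parameters

#### Classes

No additional imports, parameters, or classes are defined here; this section reuses the ones from Data Preparation.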

#### Load Dataset

In [13]:
# Restore the three DataLoader objects written by the "Save Dataset" cell.
# NOTE(review): newer PyTorch releases default `torch.load` to
# `weights_only=True`, which may refuse to unpickle DataLoader objects —
# confirm against the installed version before upgrading.
dataloader_train, dataloader_valid, dataloader_test = (
    torch.load(path)
    for path in ('./data_train.bin', './data_valid.bin', './data_test.bin')
)

#### Model Create

In [14]:
# Start from the pretrained weights using the configuration loaded earlier.
model = GPT2LMHeadModel.from_pretrained(
    pretrained_path,
    config=config,
)

# The tokenizer was given extra special tokens, so the embedding table is
# resized to the tokenizer's current length.
model.resize_token_embeddings(len(tokenizer))

model = model.to(device)

In [15]:
# One scheduler step is taken per batch, so the schedule spans every batch of
# every epoch.
total_steps = len(dataloader_train) * epochs

optimizer = AdamW(
    model.parameters(),
    lr=learning_rate,
    eps=epsilon,
)

# Linear schedule with `warmup_steps` warm-up steps over `total_steps` steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps,
)

print(total_steps)

15590


#### Model Train

In [None]:
# Fine-tune the language model for `epochs` passes over the training batches,
# printing a few sampled generations every `sample_every` steps and a loss
# summary at the end of every epoch.
for epoch_i in range(epochs):
    model.train()
    loss_total = 0.

    for step, batch in enumerate(dataloader_train):
        model.zero_grad()

        ids_batch = batch[0].to(device)
        mask_batch = batch[1].to(device)
        # Language-modelling targets are the inputs themselves, except that
        # padded positions (attention mask == 0) are set to -100 so the loss
        # ignores them instead of training the model to emit padding.
        labels_batch = ids_batch.masked_fill(mask_batch == 0, -100)

        output = model(ids_batch, labels=labels_batch, attention_mask=mask_batch)
        loss = output[0]
        loss_batch = loss.item()
        loss_total += loss_batch

        if (step % sample_every) == 0:
            # Switch to eval mode while sampling, then back to train mode
            # before the backward pass below.
            model.eval()
            prompt_embedding = torch.tensor(tokenizer.encode(random.choice(labels) + ' | ')).unsqueeze(0).to(device)
            sampled_sequences = model.generate(
                prompt_embedding,
                pad_token_id=tokenizer.pad_token_id,
                do_sample=True,
                top_k=50,
                max_length=max_length,
                top_p=0.99,
                num_return_sequences=3
            )
            for example in sampled_sequences:
                print(tokenizer.decode(example, skip_special_tokens=True))
            print()
            model.train()

        loss.backward()
        optimizer.step()
        scheduler.step()

    # Report once per epoch (inside the epoch loop) with both the summed and
    # the per-batch average loss.
    avg_loss = loss_total / len(dataloader_train)
    print(f'Epochs:{epoch_i+1}; TotLoss:{loss_total}; AvgLoss:{avg_loss};')

[2023-01-18 23:12:51.941 1-8-1-cpu-py36-ml-t3-medium-05a4a7868130c7575335c53b16c7:12743 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None
[2023-01-18 23:12:52.044 1-8-1-cpu-py36-ml-t3-medium-05a4a7868130c7575335c53b16c7:12743 INFO profiler_config_parser.py:102] Unable to find config at /opt/ml/input/config/profilerconfig.json. Profiler is disabled.
 Letters | 
 Letters | 
 Letters | 

 Hematology - Oncology |  Progress in patient.

 Hematology - Oncology |  Colonoscopy | Bilateral cotterisomy of the lung chamber.  The patient is a very close and non-nervous bladder and an apparent hemoglobin deficiency.  Oncology | C4  Hematogenic C6 with his thorax, but for the right hemoglobin deficiency.
 Hematology - Oncology |  Ophthalmology and   Left End Endoscopy |  Endoscopy |  Right Endoscopy |   Left Endoscope of left anterior medial medial medial medial occipital anterior extremity of endoscopy.

 Letters |  Left upper lobe lobe lobe lobotomy.  Closed mid lobe lobe lobe and anterior lob

#### Model Save

In [None]:
# Persisting the fine-tuned model and tokenizer is currently disabled.
# NOTE(review): the "Model Load" cell reads from this same directory, so the
# directory must already exist on disk while these lines stay commented out —
# confirm this is intentional.
#model.save_pretrained("20230118_distilgpt2_medical_generator/")
#tokenizer.save_pretrained("20230118_distilgpt2_medical_generator/")

#### Model Load

In [9]:
# Reload tokenizer, configuration, and weights from the locally saved
# fine-tuned checkpoint directory.
checkpoint_dir = "20230118_distilgpt2_medical_generator"

tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
config = GPT2Config.from_pretrained(checkpoint_dir)
model = GPT2LMHeadModel.from_pretrained(
    checkpoint_dir,
    config=config,
)

## Evaluate

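#### Imports

#### Parameters

No additional imports or parameters are defined here; this section reuses the ones from Data Preparation.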

#### Generate Examples

In [18]:
# Generate `trials` sampled descriptions for every label and collect them,
# together with the label that prompted them, into `df_generated`.
model.eval()
generated_samples = []
generated_labels = []

for lbl in labels:
    # The prompt is identical for every trial of a label, so encode it once.
    prompt_start = lbl + ' | '
    prompt = torch.tensor(tokenizer.encode(prompt_start)).unsqueeze(0)
    # `.to()` returns the moved tensor rather than moving it in place, so the
    # result must be kept; targeting the model's own device keeps the two
    # together wherever the model was loaded.
    prompt = prompt.to(model.device)

    for _ in range(trials):
        samples = model.generate(
            prompt,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id,
            top_k=75,
            max_length=max_length,
            top_p=.99,
            num_return_sequences=1
        )
        for sample in samples:
            text = tokenizer.decode(sample, skip_special_tokens=True)
            # Strip only the leading prompt; a later repetition of the same
            # "<label> | " text inside the generation is left untouched.
            if text.startswith(prompt_start):
                text = text[len(prompt_start):]
            # Record the label with its sample so the two lists cannot
            # drift out of alignment.
            generated_samples.append(text)
            generated_labels.append(lbl)

df_generated = pd.DataFrame({'medical_specialty':generated_labels, 'description':generated_samples})
df_generated

Unnamed: 0,medical_specialty,description
0,Allergy / Immunology,This is a 14-month-old baby boy Caucasian who...
1,Allergy / Immunology,"Chronic glossitis, xerostomia, probable envir..."
2,Allergy / Immunology,Sample/template for a normal female multisyst...
3,Allergy / Immunology,"Chronic glossitis, xerostomia, probable envir..."
4,Allergy / Immunology,Nissen fundoplication. A 2 cm midline incisi...
...,...,...
1945,Chiropractic,MRI head without contrast.
1946,Chiropractic,Sample Radiology report of knee (growth arres...
1947,Chiropractic,Left shoulder pain. Evaluate for rotator cuff...
1948,Chiropractic,Right upper quadrant pain. Nuclear medicine ...


#### Save Generations

In [19]:
# Remove exact duplicate (specialty, description) rows, then write the rest
# to a comma-separated file.
unique_generations = df_generated.drop_duplicates()
unique_generations.to_csv('./mtsamples_generated.csv', sep=',')