In [216]:
!pip install transformers
!pip install sacremoses



In [217]:
import json
import torch
from transformers import BertTokenizer, BertModel
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import AutoTokenizer
from transformers import BioGptTokenizer, BioGptForCausalLM


In [218]:
# Load pre-trained model tokenizer (vocabulary)
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
biobert_tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-v1.1")
gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
gpt2_model = GPT2LMHeadModel.from_pretrained('gpt2')
bioGPT_tokenizer = BioGptTokenizer.from_pretrained("microsoft/biogpt")
bioGPT_model = BioGptForCausalLM.from_pretrained("microsoft/biogpt")
clinBERT_tokenizer = AutoTokenizer.from_pretrained("medicalai/ClinicalBERT")

In [219]:
f = open('data.json')
data = json.load(f)

In [220]:
x_key_to_extract = "abstract"

for data_key, data_serial in data.items():
    for level1_key, level1_data in data_serial.items():
        if x_key_to_extract in level1_data:
            abstract = level1_data[x_key_to_extract]
            print(f"{data_key}: {x_key_to_extract}: {abstract}")

1: abstract: {'1': 'Muscle cramps are a common problem characterized by a sudden, painful, involuntary contraction of muscle.', '2': 'These true cramps, which originate from peripheral nerves, may be distinguished from other muscle pain or spasm.', '3': 'Medical history, physical examination, and a limited laboratory screen help to determine the various causes of muscle cramps.', '4': 'Despite the "benign" nature of cramps, many patients find the symptom very uncomfortable.', '5': 'Treatment options are guided both by experience and by a limited number of therapeutic trials.', '6': 'Quinine sulfate is an effective medication, but the side-effect profile is worrisome, and other membrane-stabilizing drugs are probably just as effective.', '7': 'Patients will benefit from further studies to better define the pathophysiology of muscle cramps and to find more effective medications with fewer side-effects.'}
1: abstract: {'1': 'The dystonias are a group of disorders characterized by excessiv

In [221]:
# y_key_to_extract = "adaptations"

# for data_key, data_serial in data.items():
#     for level1_key, level1_data in data_serial.items():
#         if y_key_to_extract in level1_data:
#           adapt_raw = level1_data[y_key_to_extract]
#           print(f"{data_key}: {y_key_to_extract}: {adapt_raw}")

In [222]:
abstract = data["1"]["25432724"]["abstract"]
# adapt = data["1"]["15902691"]["adaptations"]["adaptation2"]

In [223]:
sent1 =  abstract["5"]
# marked_sent = sent1
marked_sent = marked_pair = "[CLS] " + sent1 + " [SEP]"
print (f'Sentence: {marked_sent}, Count : {len(marked_sent)}')

# sent2 = adapt["1"]
# marked_pair = "[CLS] " + sent1 + " [SEP]" + sent2 + " [SEP]"

# # Tokenize our sentence with the BERT tokenizer.
# tokenized_pair = tokenizer.tokenize(marked_pair)

# # Print out the tokens.
# print (tokenized_pair)

Sentence: [CLS] Treatment options include counseling, education, oral medications, botulinum toxin injections, and several surgical procedures. [SEP], Count : 139


In [225]:
# Encode the input using the BioBERT tokenizer with attention mask
# encoded_input = biobert_tokenizer(marked_sent, return_tensors='pt', padding='max_length', truncation=True, max_length=64)

encoded_input = clinBERT_tokenizer(marked_sent, return_tensors='pt', padding='max_length', truncation=True, max_length=64)


# # Encode the input using the BioGPT tokenizer with attention mask
# encoded_input = bioGPT_tokenizer(marked_sent,
#                                  return_tensors='pt',
#                                  padding='max_length',
#                                  truncation=True,
#                                  max_length=64
#                                  )

# Ensure that the pad_token_id is set for GPT-2
# gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token

#Try BioGPT, change eos
bioGPT_tokenizer.pad_token = bioGPT_tokenizer.eos_token
bioGPT_tokenizer.padding_side ='left'

print (f'Encoded inputs: {encoded_input}')

Encoded inputs: {'input_ids': tensor([[   101,    101,  21379,  53121,  12363,  11170,  78067,  10230,    117,
          14943,    117,  36965,  10172, 102096,    117,  41960,  78887,  10465,
          10114,  76750,  91879,  10107,    117,  10111,  11736,  10326,  60764,
          70176,    119,    102,    102,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])}


In [None]:
# indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_pair)
# print (indexed_tokens)

# for tup in zip(tokenized_pair, indexed_tokens):
#     print('{:<12} {:>6,}'.format(tup[0], tup[1]))

In [228]:
bioGPT_model.resize_token_embeddings(len(clinBERT_tokenizer))
bioGPT_model.config.pad_token_id = bioGPT_model.config.eos_token_id

In [229]:
with torch.no_grad():
    beam_output = bioGPT_model.generate(
                                input_ids= encoded_input['input_ids'],
                                min_length=50,
                                max_length=70,
                                num_beams=5,
                                attention_mask=encoded_input['attention_mask'],
                                early_stopping=True
                                )

generated_text = bioGPT_tokenizer.decode(beam_output[0], skip_special_tokens=True)
# Remove any trailing padding tokens
generated_text = generated_text.rstrip('<pad>')
print (generated_text)

cancer cancer ecule <unk>resveratrol propionate <unk>plantar based corticbased illance mutagenic <unk>based د <unk>piglets ple <unk><unk>EB based undifferentiated lenCarbon <unk><unk>use no no.


In [None]:
# generated_prompt_ids = gpt2_model.generate(
#     max_length=100,  # Adjust max length as needed
#     input_ids=encoded_input['input_ids'],
#     num_return_sequences=1,  # Number of prompts to generate
#     no_repeat_ngram_size=3,  # Avoid repeating n-grams
#     attention_mask=encoded_input['attention_mask'],  # Pass the attention mask
#     top_k=100,  # Sample from top-k most probable words
#     top_p=0.9,  # Sample from top-p cumulative distribution
#     temperature=0  # Control randomness (0.2 for more deterministic, higher for more random)
# )


In [None]:
# # Decode and print the generated prompt
# gpt2_tokenizer.decode(generated_prompt_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)