### Loading the model checkpoint using fairseq extensions

In [1]:
import torch
from pathlib import Path

In [2]:
qa_model_checkpoint_path = Path.cwd().joinpath(
    'models', 'QA-PubMedQA-BioGPT', "checkpoint.pt")

After looking at the architecture, let us load the model directly using fairseq and later we will decide to load it using transformers and may be transform it to huggingface.

In [3]:
qa_model_checkpoint_path

PosixPath('/Users/esp.py/Projects/Personal/end-to-end-rag/models/QA-PubMedQA-BioGPT/checkpoint.pt')

In [4]:
from biogpt_model.transformer_lm_prompt import TransformerLanguageModelPrompt

2024-02-17 05:33:50 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX


In [5]:
data_path = Path.cwd().joinpath('datasets', 'biogpt', 'pqal_qcl_ansis-bin')
bpe_code_path = data_path.parent.joinpath('raw', 'bpecodes')
assert data_path.exists()
assert bpe_code_path.exists()

In [6]:
model_fairseq = TransformerLanguageModelPrompt.from_pretrained(
    qa_model_checkpoint_path.parent, 
    "checkpoint.pt",
    data_path.__str__(),
    tokenizer="moses",
    bpe="fastbpe",
    bpe_codes=bpe_code_path.__str__(),
    )

2024-02-17 05:33:53 | INFO | fairseq.file_utils | loading archive file /Users/esp.py/Projects/Personal/end-to-end-rag/models/QA-PubMedQA-BioGPT from cache at /Users/esp.py/Projects/Personal/end-to-end-rag/models/QA-PubMedQA-BioGPT
2024-02-17 05:33:53 | INFO | fairseq.file_utils | loading archive file /Users/esp.py/Projects/Personal/end-to-end-rag/datasets/biogpt/pqal_qcl_ansis-bin


2024-02-17 05:33:55 | INFO | biogpt_model.language_modeling_prompt | dictionary: 42384 types
2024-02-17 05:33:58 | INFO | fairseq.models.fairseq_model | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': None, 'log_file': None, 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': True, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': '../../src', 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, '

In [7]:
model_fairseq.cfg.get("generation")

{'_name': None, 'beam': 5, 'nbest': 1, 'max_len_a': 0.0, 'max_len_b': 200, 'min_len': 1, 'match_source_len': False, 'unnormalized': False, 'no_early_stop': False, 'no_beamable_mm': False, 'lenpen': 1.0, 'unkpen': 0.0, 'replace_unk': None, 'sacrebleu': False, 'score_reference': False, 'prefix_size': 0, 'no_repeat_ngram_size': 0, 'sampling': False, 'sampling_topk': -1, 'sampling_topp': -1.0, 'constraints': None, 'temperature': 1.0, 'diverse_beam_groups': -1, 'diverse_beam_strength': 0.5, 'diversity_rate': -1.0, 'print_alignment': None, 'print_step': False, 'lm_path': None, 'lm_weight': 0.0, 'iter_decode_eos_penalty': 0.0, 'iter_decode_max_iter': 10, 'iter_decode_force_max_iter': False, 'iter_decode_with_beam': 1, 'iter_decode_with_external_reranker': False, 'retain_iter_history': False, 'retain_dropout': False, 'retain_dropout_modules': None, 'decoding_format': None, 'no_seed_provided': False, 'eos_token': None}

In [8]:
from pprint import pprint

In [9]:
contexts = [
    'Programmed cell death (PCD) is the regulated death of cells within an organism. The lace plant (Aponogeton madagascariensis) produces perforations in its leaves through PCD. The leaves of the plant consist of a latticework of longitudinal and transverse veins enclosing areoles. PCD occurs in the cells at the center of these areoles and progresses outwards, stopping approximately five cells from the vasculature. The role of mitochondria during PCD has been recognized in animals; however, it has been less studied during PCD in plants.',
    'The following paper elucidates the role of mitochondrial dynamics during developmentally regulated PCD in vivo in A. madagascariensis. A single areole within a window stage leaf (PCD is occurring) was divided into three areas based on the progression of PCD; cells that will not undergo PCD (NPCD), cells in early stages of PCD (EPCD), and cells in late stages of PCD (LPCD). Window stage leaves were stained with the mitochondrial dye MitoTracker Red CMXRos and examined. Mitochondrial dynamics were delineated into four categories (M1-M4) based on characteristics including distribution, motility, and membrane potential (ΔΨm). A TUNEL assay showed fragmented nDNA in a gradient over these mitochondrial stages. Chloroplasts and transvacuolar strands were also examined using live cell imaging. The possible importance of mitochondrial permeability transition pore (PTP) formation during PCD was indirectly examined via in vivo cyclosporine A (CsA) treatment. This treatment resulted in lace plant leaves with a significantly lower number of perforations compared to controls, and that displayed mitochondrial dynamics similar to that of non-PCD cells.'
]

In [10]:
contexts[0]

'Programmed cell death (PCD) is the regulated death of cells within an organism. The lace plant (Aponogeton madagascariensis) produces perforations in its leaves through PCD. The leaves of the plant consist of a latticework of longitudinal and transverse veins enclosing areoles. PCD occurs in the cells at the center of these areoles and progresses outwards, stopping approximately five cells from the vasculature. The role of mitochondria during PCD has been recognized in animals; however, it has been less studied during PCD in plants.'

In [11]:
question = "Do mitochondria play a role in remodelling lace plant leaves during programmed cell death?"

In [45]:
prompt = f"question: {question} context: { ' '.join(contexts)}"

In [46]:
pprint(prompt)

('question: Do mitochondria play a role in remodelling lace plant leaves '
 'during programmed cell death? context: Programmed cell death (PCD) is the '
 'regulated death of cells within an organism. The lace plant (Aponogeton '
 'madagascariensis) produces perforations in its leaves through PCD. The '
 'leaves of the plant consist of a latticework of longitudinal and transverse '
 'veins enclosing areoles. PCD occurs in the cells at the center of these '
 'areoles and progresses outwards, stopping approximately five cells from the '
 'vasculature. The role of mitochondria during PCD has been recognized in '
 'animals; however, it has been less studied during PCD in plants. The '
 'following paper elucidates the role of mitochondrial dynamics during '
 'developmentally regulated PCD in vivo in A. madagascariensis. A single '
 'areole within a window stage leaf (PCD is occurring) was divided into three '
 'areas based on the progression of PCD; cells that will not undergo PCD '
 '(NPCD)

In [47]:
source_tokens  = model_fairseq.encode(question)

In [48]:
from transformers import set_seed

In [49]:
set_seed(42)

In [50]:
source_tokens

tensor([10964,  2754,   882,    14,   151,    10, 13101,  8199,   707,  1041,
         3436,    70,  7295,    49,   589,   927,     2])

In [51]:
generated_text = model_fairseq.generate([source_tokens],
                                        beam=5,
                                        min_len=100,
                                        max_len_a=512,
                                        max_len_b=512,
                                        temperature=0.25,
                                        sampling_topk=50,
                                        sampling_topp=0.95,
                                        sampling=True,)[0]

In [52]:
model_fairseq.max_positions

1024

In [53]:
len(source_tokens)

17

In [54]:
generated_text[0]['tokens']

tensor([10964,  2754,   882,    14,   151,    10, 13101,  8199,   707,  1041,
         3436,    70,  7295,    49,   589,   927, 42384, 42385, 42386, 42387,
        42388, 42389, 42390, 42391, 42392,     6,  9412,    13,     6,  2969,
          690,     6,  1544,    21, 22700,     4,  1544,    20, 23082,  7262,
           49,   589,    12, 17928,    11,    21,     6,  1134,   589,     5,
           42,   192,    32,  4175,     4,    18,  8199,   707,  1041,    12,
         4410,   896,  2432, 37687,   676,  1753, 34842,  5202,  5428,    11,
         3862, 19982,    10,   106,  3436,   199, 17928,     4,    18,  3436,
            5,     6,  1041,  9109,     5,    14,  7653,  3752,   768,   488,
            5,  2265,     8,  5786,  5662,   987, 11985,  6348, 11658,     4,
        17928,  1395,    10,     6,    42,    34,     6,  1594,     5,    55,
         6348, 11658,     8, 18252, 21640, 12095,     7, 16387,   792,   508,
           42,    29,     6,  8136,     4,    18,   151,     5, 

In [55]:
output_text = model_fairseq.decode(generated_text[0]['tokens'])

In [56]:
pprint(output_text)

('Do mitochondria play a role in remodelling lace plant leaves during '
 'programmed cell death? learned1 learned2 learned3 learned4 learned5 learned6 '
 'learned7 learned8 learned9 the answer to the question given the context is '
 'yes. context: Programmed cell death (PCD) is the regulated death of cells '
 'within an organism. The lace plant (Aponogeton madagascariensis) produces '
 'perforations in its leaves through PCD. The leaves of the plant consist of a '
 'latticework of longitudinal and transverse veins enclosing areoles. PCD '
 'occurs in the cells at the center of these areoles and progresses outwards, '
 'stopping approximately five cells from the vasculature. The role of '
 'mitochondria during PCD has been recognized in animals; however, it has been '
 'less studied during PCD in plants. The following paper elucidates the role '
 'of mitochondrial dynamics during developmentally regulated PCD in vivo in A. '
 'madagascariensis. A single areole within a window stage leaf

In [57]:
pprint(output_text)

('Do mitochondria play a role in remodelling lace plant leaves during '
 'programmed cell death? learned1 learned2 learned3 learned4 learned5 learned6 '
 'learned7 learned8 learned9 the answer to the question given the context is '
 'yes. context: Programmed cell death (PCD) is the regulated death of cells '
 'within an organism. The lace plant (Aponogeton madagascariensis) produces '
 'perforations in its leaves through PCD. The leaves of the plant consist of a '
 'latticework of longitudinal and transverse veins enclosing areoles. PCD '
 'occurs in the cells at the center of these areoles and progresses outwards, '
 'stopping approximately five cells from the vasculature. The role of '
 'mitochondria during PCD has been recognized in animals; however, it has been '
 'less studied during PCD in plants. The following paper elucidates the role '
 'of mitochondrial dynamics during developmentally regulated PCD in vivo in A. '
 'madagascariensis. A single areole within a window stage leaf

The prompt model seems to be working but always returning learned. What am I missing here?


Not sure why the model generated this type of data, that may be because of the data. But it worth checking what went wrong with the model.

### Uploading the model to hugginface transformer

In [31]:
model_fairseq.state_dict().keys()

odict_keys(['_float_tensor', 'models.0.decoder.version', 'models.0.decoder.embed_tokens.weight', 'models.0.decoder.embed_positions.weight', 'models.0.decoder.layers.0.self_attn.k_proj.weight', 'models.0.decoder.layers.0.self_attn.k_proj.bias', 'models.0.decoder.layers.0.self_attn.v_proj.weight', 'models.0.decoder.layers.0.self_attn.v_proj.bias', 'models.0.decoder.layers.0.self_attn.q_proj.weight', 'models.0.decoder.layers.0.self_attn.q_proj.bias', 'models.0.decoder.layers.0.self_attn.out_proj.weight', 'models.0.decoder.layers.0.self_attn.out_proj.bias', 'models.0.decoder.layers.0.self_attn_layer_norm.weight', 'models.0.decoder.layers.0.self_attn_layer_norm.bias', 'models.0.decoder.layers.0.fc1.weight', 'models.0.decoder.layers.0.fc1.bias', 'models.0.decoder.layers.0.fc2.weight', 'models.0.decoder.layers.0.fc2.bias', 'models.0.decoder.layers.0.final_layer_norm.weight', 'models.0.decoder.layers.0.final_layer_norm.bias', 'models.0.decoder.layers.1.self_attn.k_proj.weight', 'models.0.decod

This is the model architecture, the next step is to convert the architecture to the huggingface

Will start here tommorow!

https://github.com/huggingface/transformers/blob/main/src/transformers/models/biogpt/convert_biogpt_original_pytorch_checkpoint_to_pytorch.py

### Model Conversion to HF

In [32]:
from pathlib import Path

In [33]:
from convert_biogpt_original_pytorch_checkpoint_to_pytorch import convert_biogpt_checkpoint_to_pytorch

In [34]:
model_path = Path.cwd().joinpath('datasets', 'biogpt', 'raw')

In [35]:
assert model_path.exists(), "Model path does not exist"

In [36]:
biogpt_qa_hf_path = Path.cwd().joinpath('models', 'bio-gpt-qa')

###### Convert me to python cell to execute
convert_biogpt_checkpoint_to_pytorch(biogpt_checkpoint_path=model_path, 
                                     pytorch_dump_folder_path=biogpt_qa_hf_path)

Now we have a problem, the fairseq  model has a vocabulary size  of 42384 while the model embedding layers has a size of 42393 words. It looks like in the embedding layers we have added  9 words which are learned1, learned2, learned3.... and learned9. Those words aret he words that the model is always generating before putting the final answer.

Let us see how the model will perform

In [37]:
from transformers import BioGptForCausalLM, BioGptTokenizer

In [38]:
tokenizer = BioGptTokenizer.from_pretrained(biogpt_qa_hf_path)
bio_gpt_model = BioGptForCausalLM.from_pretrained(biogpt_qa_hf_path)

In [39]:
prompt

'Question: Do mitochondria play a role in remodelling lace plant leaves during programmed cell death? Context: Programmed cell death (PCD) is the regulated death of cells within an organism. The lace plant (Aponogeton madagascariensis) produces perforations in its leaves through PCD. The leaves of the plant consist of a latticework of longitudinal and transverse veins enclosing areoles. PCD occurs in the cells at the center of these areoles and progresses outwards, stopping approximately five cells from the vasculature. The role of mitochondria during PCD has been recognized in animals; however, it has been less studied during PCD in plants. The following paper elucidates the role of mitochondrial dynamics during developmentally regulated PCD in vivo in A. madagascariensis. A single areole within a window stage leaf (PCD is occurring) was divided into three areas based on the progression of PCD; cells that will not undergo PCD (NPCD), cells in early stages of PCD (EPCD), and cells in l

In [40]:
tokenized_text = tokenizer.encode(prompt, return_tensors="pt")

In [41]:
tokenized_text

tensor([[    2,  4790, 20925,    20, 10964,  2754,   882,    14,   151,    10,
         13101,  8199,   707,  1041,  3436,    70,  7295,    49,   589,   927,
         22343,    20, 23082,  7262,    49,   589,    12, 17928,    11,    21,
             6,  1134,   589,     5,    42,   192,    32,  4175,     4,    18,
          8199,   707,  1041,    12,  4410,   896,  2432, 37687,   676,  1753,
         34842,  5202,  5428,    11,  3862, 19982,    10,   106,  3436,   199,
         17928,     4,    18,  3436,     5,     6,  1041,  9109,     5,    14,
          7653,  3752,   768,   488,     5,  2265,     8,  5786,  5662,   987,
         11985,  6348, 11658,     4, 17928,  1395,    10,     6,    42,    34,
             6,  1594,     5,    55,  6348, 11658,     8, 18252, 21640, 12095,
             7, 16387,   792,   508,    42,    29,     6,  8136,     4,    18,
           151,     5,  2754,    70, 17928,    57,    58,  1903,    10,   546,
            44,   523,     7,   114,    57,    58,  

In [42]:
pprint(prompt)

('Question: Do mitochondria play a role in remodelling lace plant leaves '
 'during programmed cell death? Context: Programmed cell death (PCD) is the '
 'regulated death of cells within an organism. The lace plant (Aponogeton '
 'madagascariensis) produces perforations in its leaves through PCD. The '
 'leaves of the plant consist of a latticework of longitudinal and transverse '
 'veins enclosing areoles. PCD occurs in the cells at the center of these '
 'areoles and progresses outwards, stopping approximately five cells from the '
 'vasculature. The role of mitochondria during PCD has been recognized in '
 'animals; however, it has been less studied during PCD in plants. The '
 'following paper elucidates the role of mitochondrial dynamics during '
 'developmentally regulated PCD in vivo in A. madagascariensis. A single '
 'areole within a window stage leaf (PCD is occurring) was divided into three '
 'areas based on the progression of PCD; cells that will not undergo PCD '
 '(NPCD)

In [43]:
generate_tokens = bio_gpt_model.generate(tokenized_text, 
                                         num_beams=5, 
                                         do_sample=True,
                                         top_k=50,
                                         top_p=0.95,
                                         max_length=512)

In [44]:
pprint(tokenizer.decode(generate_tokens[0], skip_special_tokens=True))

('Question: Do mitochondria play a role in remodelling lace plant leaves '
 'during programmed cell death? Context: Programmed cell death (PCD) is the '
 'regulated death of cells within an organism. The lace plant (Aponogeton '
 'madagascariensis) produces perforations in its leaves through PCD. The '
 'leaves of the plant consist of a latticework of longitudinal and transverse '
 'veins enclosing areoles. PCD occurs in the cells at the center of these '
 'areoles and progresses outwards, stopping approximately five cells from the '
 'vasculature. The role of mitochondria during PCD has been recognized in '
 'animals; however, it has been less studied during PCD in plants. The '
 'following paper elucidates the role of mitochondrial dynamics during '
 'developmentally regulated PCD in vivo in A. madagascariensis. A single '
 'areole within a window stage leaf (PCD is occurring) was divided into three '
 'areas based on the progression of PCD; cells that will not undergo PCD '
 '(NPCD)

In [39]:
bio_gpt_model.push_to_hub("BioGPT-Large-QA-PubMedQA")

pytorch_model.bin: 100%|██████████| 1.39G/1.39G [32:16<00:00, 717kB/s]   


CommitInfo(commit_url='https://huggingface.co/espoir/BioGPT-Large-QA-PubMedQA/commit/a016a8329297428945c49b302c0338c5244fc950', commit_message='Upload BioGptForCausalLM', commit_description='', oid='a016a8329297428945c49b302c0338c5244fc950', pr_url=None, pr_revision=None, pr_num=None)

This si where we stop today, I will comeback tommorow to learn why the prompt is working.

In [41]:
tokenizer.push_to_hub("BioGPT-Large-QA-PubMedQA")

CommitInfo(commit_url='https://huggingface.co/espoir/BioGPT-Large-QA-PubMedQA/commit/c7be40dc77fa11d996b3699d91893ae9511e3700', commit_message='Upload tokenizer', commit_description='', oid='c7be40dc77fa11d996b3699d91893ae9511e3700', pr_url=None, pr_revision=None, pr_num=None)