### Loading the model checkpoint using fairseq extensions

In [1]:
import torch
from pathlib import Path

In [2]:
qa_model_checkpoint_path = Path.cwd().joinpath(
    'models', 'QA-PubMedQA-BioGPT', "checkpoint.pt")

In [3]:
torch_model = torch.load(qa_model_checkpoint_path)

In [4]:
torch_model.keys()

dict_keys(['args', 'cfg', 'model', 'criterion', 'optimizer_history', 'task_state', 'extra_state', 'last_optimizer_state'])

In [5]:
torch_config = torch_model["cfg"]

In [6]:
torch_config.keys()

dict_keys(['_name', 'common', 'common_eval', 'distributed_training', 'dataset', 'optimization', 'checkpoint', 'bmuf', 'generation', 'eval_lm', 'interactive', 'model', 'task', 'criterion', 'optimizer', 'lr_scheduler', 'scoring', 'bpe', 'tokenizer', 'ema'])

In [7]:
torch_config.get("task")

{'_name': 'language_modeling_prompt',
 'data': '../../data/PubMedQA/pqal_cased_qcl_ansis-bin',
 'sample_break_mode': 'none',
 'tokens_per_sample': 1024,
 'output_dictionary_size': -1,
 'self_target': False,
 'future_target': False,
 'past_target': False,
 'add_bos_token': False,
 'max_target_positions': 1024,
 'shorten_method': 'none',
 'shorten_data_split_list': '',
 'pad_to_fixed_length': False,
 'pad_to_fixed_bsz': False,
 'seed': 1,
 'batch_size': None,
 'batch_size_valid': None,
 'dataset_impl': None,
 'data_buffer_size': 10,
 'tpu': False,
 'use_plasma_view': False,
 'plasma_path': '/tmp/plasma',
 'source_lang': None,
 'target_lang': None,
 'max_source_positions': 900,
 'manual_prompt': None,
 'learned_prompt': 9,
 'learned_prompt_pattern': 'learned',
 'prefix': False,
 'sep_token': '<seqsep>'}

After looking at the architecture, let us load the model directly using fairseq and later we will decide to load it using transformers and may be transform it to huggingface.

In [8]:
qa_model_checkpoint_path

PosixPath('/Users/esp.py/Projects/Personal/end-to-end-rag/models/QA-PubMedQA-BioGPT/checkpoint.pt')

In [9]:
from biogpt_model.transformer_lm_prompt import TransformerLanguageModelPrompt

INFO:fairseq.tasks.text_to_speech:Please install tensorboardX: pip install tensorboardX


In [10]:
data_path = Path.cwd().joinpath('datasets', 'biogpt', 'pqal_qcl_ansis-bin')
bpe_code_path = data_path.parent.joinpath('raw', 'bpecodes')
assert data_path.exists()
assert bpe_code_path.exists()

In [11]:
model_fairseq = TransformerLanguageModelPrompt.from_pretrained(
    qa_model_checkpoint_path.parent, 
    "checkpoint.pt",
    data_path.__str__(),
    tokenizer="moses",
    bpe="fastbpe",
    bpe_codes=bpe_code_path.__str__(),
    )

INFO:fairseq.file_utils:loading archive file /Users/esp.py/Projects/Personal/end-to-end-rag/models/QA-PubMedQA-BioGPT from cache at /Users/esp.py/Projects/Personal/end-to-end-rag/models/QA-PubMedQA-BioGPT
INFO:fairseq.file_utils:loading archive file /Users/esp.py/Projects/Personal/end-to-end-rag/datasets/biogpt/pqal_qcl_ansis-bin


INFO:biogpt_model.language_modeling_prompt:dictionary: 42384 types
INFO:fairseq.models.fairseq_model:{'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': None, 'log_file': None, 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': True, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': '../../src', 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppress_crashes': False, 'use_plasma_view': False, 

In [12]:
model_fairseq.cfg.get("generation")

{'_name': None, 'beam': 5, 'nbest': 1, 'max_len_a': 0.0, 'max_len_b': 200, 'min_len': 1, 'match_source_len': False, 'unnormalized': False, 'no_early_stop': False, 'no_beamable_mm': False, 'lenpen': 1.0, 'unkpen': 0.0, 'replace_unk': None, 'sacrebleu': False, 'score_reference': False, 'prefix_size': 0, 'no_repeat_ngram_size': 0, 'sampling': False, 'sampling_topk': -1, 'sampling_topp': -1.0, 'constraints': None, 'temperature': 1.0, 'diverse_beam_groups': -1, 'diverse_beam_strength': 0.5, 'diversity_rate': -1.0, 'print_alignment': None, 'print_step': False, 'lm_path': None, 'lm_weight': 0.0, 'iter_decode_eos_penalty': 0.0, 'iter_decode_max_iter': 10, 'iter_decode_force_max_iter': False, 'iter_decode_with_beam': 1, 'iter_decode_with_external_reranker': False, 'retain_iter_history': False, 'retain_dropout': False, 'retain_dropout_modules': None, 'decoding_format': None, 'no_seed_provided': False, 'eos_token': None}

In [13]:
from pprint import pprint

In [14]:
contexts = [
    'Programmed cell death (PCD) is the regulated death of cells within an organism. The lace plant (Aponogeton madagascariensis) produces perforations in its leaves through PCD. The leaves of the plant consist of a latticework of longitudinal and transverse veins enclosing areoles. PCD occurs in the cells at the center of these areoles and progresses outwards, stopping approximately five cells from the vasculature. The role of mitochondria during PCD has been recognized in animals; however, it has been less studied during PCD in plants.',
    'The following paper elucidates the role of mitochondrial dynamics during developmentally regulated PCD in vivo in A. madagascariensis. A single areole within a window stage leaf (PCD is occurring) was divided into three areas based on the progression of PCD; cells that will not undergo PCD (NPCD), cells in early stages of PCD (EPCD), and cells in late stages of PCD (LPCD). Window stage leaves were stained with the mitochondrial dye MitoTracker Red CMXRos and examined. Mitochondrial dynamics were delineated into four categories (M1-M4) based on characteristics including distribution, motility, and membrane potential (ΔΨm). A TUNEL assay showed fragmented nDNA in a gradient over these mitochondrial stages. Chloroplasts and transvacuolar strands were also examined using live cell imaging. The possible importance of mitochondrial permeability transition pore (PTP) formation during PCD was indirectly examined via in vivo cyclosporine A (CsA) treatment. This treatment resulted in lace plant leaves with a significantly lower number of perforations compared to controls, and that displayed mitochondrial dynamics similar to that of non-PCD cells.'
]

In [15]:
contexts[0]

'Programmed cell death (PCD) is the regulated death of cells within an organism. The lace plant (Aponogeton madagascariensis) produces perforations in its leaves through PCD. The leaves of the plant consist of a latticework of longitudinal and transverse veins enclosing areoles. PCD occurs in the cells at the center of these areoles and progresses outwards, stopping approximately five cells from the vasculature. The role of mitochondria during PCD has been recognized in animals; however, it has been less studied during PCD in plants.'

In [16]:
question = "Do mitochondria play a role in remodelling lace plant leaves during programmed cell death?"

In [27]:
prompt = f"Question: {question} Context: { ' '.join(contexts)}"

In [28]:
pprint(prompt)

('Question: Do mitochondria play a role in remodelling lace plant leaves '
 'during programmed cell death? Context: Programmed cell death (PCD) is the '
 'regulated death of cells within an organism. The lace plant (Aponogeton '
 'madagascariensis) produces perforations in its leaves through PCD. The '
 'leaves of the plant consist of a latticework of longitudinal and transverse '
 'veins enclosing areoles. PCD occurs in the cells at the center of these '
 'areoles and progresses outwards, stopping approximately five cells from the '
 'vasculature. The role of mitochondria during PCD has been recognized in '
 'animals; however, it has been less studied during PCD in plants. The '
 'following paper elucidates the role of mitochondrial dynamics during '
 'developmentally regulated PCD in vivo in A. madagascariensis. A single '
 'areole within a window stage leaf (PCD is occurring) was divided into three '
 'areas based on the progression of PCD; cells that will not undergo PCD '
 '(NPCD)

In [29]:
source_tokens  = model_fairseq.encode(question)

In [30]:
from transformers import set_seed

In [31]:
set_seed(42)

In [32]:
source_tokens

tensor([10964,  2754,   882,    14,   151,    10, 13101,  8199,   707,  1041,
         3436,    70,  7295,    49,   589,   927,     2])

In [33]:
generated_text = model_fairseq.generate([source_tokens],
                                        beam=5,
                                        min_len=100,
                                        max_len_a=512,
                                        max_len_b=1024,
                                        temperature=0.25,
                                        sampling_topk=50,
                                        sampling_topp=0.95,
                                        sampling=True,)[0]

INFO:fairseq.tasks.fairseq_task:can_reuse_epoch_itr = False
INFO:fairseq.tasks.fairseq_task:reuse_dataloader = True
INFO:fairseq.tasks.fairseq_task:rebuild_batches = False
INFO:fairseq.tasks.fairseq_task:creating new batches for epoch 1


In [34]:
model_fairseq.max_positions

1024

In [35]:
len(source_tokens)

17

In [36]:
generated_text[0]['tokens']

tensor([10964,  2754,   882,    14,   151,    10, 13101,  8199,   707,  1041,
         3436,    70,  7295,    49,   589,   927, 42384, 42385, 42386, 42387,
        42388, 42389, 42390, 42391, 42392,     6,  9412,    13,     6,  2969,
          690,     6,  1544,    21, 22700,     4,  1544,    20, 23082,  7262,
           49,   589,    12, 17928,    11,    21,     6,  1134,   589,     5,
           42,   192,    32,  4175,     4,    18,  8199,   707,  1041,    12,
         4410,   896,  2432, 37687,   676,  1753, 34842,  5202,  5428,    11,
         3862, 19982,    10,   106,  3436,   199, 17928,     4,    18,  3436,
            5,     6,  1041,  9109,     5,    14,  7653,  3752,   768,   488,
            5,  2265,     8,  5786,  5662,   987, 11985,  6348, 11658,     4,
        17928,  1395,    10,     6,    42,    34,     6,  1594,     5,    55,
         6348, 11658,     8, 18252, 21640, 12095,     7, 16387,   792,   508,
           42,    29,     6,  8136,     4,    18,   151,     5, 

In [37]:
output_text = model_fairseq.decode(generated_text[0]['tokens'])

In [38]:
pprint(output_text)

('Do mitochondria play a role in remodelling lace plant leaves during '
 'programmed cell death? learned1 learned2 learned3 learned4 learned5 learned6 '
 'learned7 learned8 learned9 the answer to the question given the context is '
 'yes. context: Programmed cell death (PCD) is the regulated death of cells '
 'within an organism. The lace plant (Aponogeton madagascariensis) produces '
 'perforations in its leaves through PCD. The leaves of the plant consist of a '
 'latticework of longitudinal and transverse veins enclosing areoles. PCD '
 'occurs in the cells at the center of these areoles and progresses outwards, '
 'stopping approximately five cells from the vasculature. The role of '
 'mitochondria during PCD has been recognized in animals; however, it has been '
 'less studied during PCD in plants. The following paper elucidates the role '
 'of mitochondrial dynamics during developmentally regulated PCD in vivo in A. '
 'madagascariensis. A single areole within a window stage leaf

In [39]:
pprint(output_text)

('Do mitochondria play a role in remodelling lace plant leaves during '
 'programmed cell death? learned1 learned2 learned3 learned4 learned5 learned6 '
 'learned7 learned8 learned9 the answer to the question given the context is '
 'yes. context: Programmed cell death (PCD) is the regulated death of cells '
 'within an organism. The lace plant (Aponogeton madagascariensis) produces '
 'perforations in its leaves through PCD. The leaves of the plant consist of a '
 'latticework of longitudinal and transverse veins enclosing areoles. PCD '
 'occurs in the cells at the center of these areoles and progresses outwards, '
 'stopping approximately five cells from the vasculature. The role of '
 'mitochondria during PCD has been recognized in animals; however, it has been '
 'less studied during PCD in plants. The following paper elucidates the role '
 'of mitochondrial dynamics during developmentally regulated PCD in vivo in A. '
 'madagascariensis. A single areole within a window stage leaf

The prompt model seems to be working but always returning learned. What am I missing here?


Not sure why the model generated this type of data, that may be because of the data. But it worth checking what went wrong with the model.

### Uploading the model to hugginface transformer

In [35]:
model_fairseq.state_dict().keys()

odict_keys(['_float_tensor', 'models.0.decoder.version', 'models.0.decoder.embed_tokens.weight', 'models.0.decoder.embed_positions.weight', 'models.0.decoder.layers.0.self_attn.k_proj.weight', 'models.0.decoder.layers.0.self_attn.k_proj.bias', 'models.0.decoder.layers.0.self_attn.v_proj.weight', 'models.0.decoder.layers.0.self_attn.v_proj.bias', 'models.0.decoder.layers.0.self_attn.q_proj.weight', 'models.0.decoder.layers.0.self_attn.q_proj.bias', 'models.0.decoder.layers.0.self_attn.out_proj.weight', 'models.0.decoder.layers.0.self_attn.out_proj.bias', 'models.0.decoder.layers.0.self_attn_layer_norm.weight', 'models.0.decoder.layers.0.self_attn_layer_norm.bias', 'models.0.decoder.layers.0.fc1.weight', 'models.0.decoder.layers.0.fc1.bias', 'models.0.decoder.layers.0.fc2.weight', 'models.0.decoder.layers.0.fc2.bias', 'models.0.decoder.layers.0.final_layer_norm.weight', 'models.0.decoder.layers.0.final_layer_norm.bias', 'models.0.decoder.layers.1.self_attn.k_proj.weight', 'models.0.decod

This is the model architecture, the next step is to convert the architecture to the huggingface

Will start here tommorow!

https://github.com/huggingface/transformers/blob/main/src/transformers/models/biogpt/convert_biogpt_original_pytorch_checkpoint_to_pytorch.py

### Model Conversion to HF

In [40]:
from pathlib import Path

In [41]:
from convert_biogpt_original_pytorch_checkpoint_to_pytorch import convert_biogpt_checkpoint_to_pytorch

In [42]:
model_path = Path.cwd().joinpath('datasets', 'biogpt', 'raw')

In [43]:
model_path

PosixPath('/Users/esp.py/Projects/Personal/end-to-end-rag/datasets/biogpt/raw')

In [44]:
biogpt_qa_hf_path = Path.cwd().joinpath('models', 'bio-gpt-qa')

In [45]:
model_fairseq.state_dict().keys()

odict_keys(['_float_tensor', 'models.0.decoder.version', 'models.0.decoder.embed_tokens.weight', 'models.0.decoder.embed_positions.weight', 'models.0.decoder.layers.0.self_attn.k_proj.weight', 'models.0.decoder.layers.0.self_attn.k_proj.bias', 'models.0.decoder.layers.0.self_attn.v_proj.weight', 'models.0.decoder.layers.0.self_attn.v_proj.bias', 'models.0.decoder.layers.0.self_attn.q_proj.weight', 'models.0.decoder.layers.0.self_attn.q_proj.bias', 'models.0.decoder.layers.0.self_attn.out_proj.weight', 'models.0.decoder.layers.0.self_attn.out_proj.bias', 'models.0.decoder.layers.0.self_attn_layer_norm.weight', 'models.0.decoder.layers.0.self_attn_layer_norm.bias', 'models.0.decoder.layers.0.fc1.weight', 'models.0.decoder.layers.0.fc1.bias', 'models.0.decoder.layers.0.fc2.weight', 'models.0.decoder.layers.0.fc2.bias', 'models.0.decoder.layers.0.final_layer_norm.weight', 'models.0.decoder.layers.0.final_layer_norm.bias', 'models.0.decoder.layers.1.self_attn.k_proj.weight', 'models.0.decod

In [46]:
# disable for running magic command
%%script --no-raise-error
convert_biogpt_checkpoint_to_pytorch(biogpt_checkpoint_path=model_path, pytorch_dump_folder_path=biogpt_qa_hf_path)

UsageError: Line magic function `%%script` not found.


Now we have a problem, the fairseq  model has a vocabulary size  of 42384 while the model embedding layers has a size of 42393 words. It looks like in the embedding layers we have added  9 words which are learned1, learned2, learned3.... and learned9. Those words aret he words that the model is always generating before putting the final answer.

Let us see how the model will perform

In [47]:
from transformers import BioGptForCausalLM, BioGptTokenizer

In [48]:
tokenizer = BioGptTokenizer.from_pretrained(biogpt_qa_hf_path)
bio_gpt_model = BioGptForCausalLM.from_pretrained(biogpt_qa_hf_path)

In [49]:
prompt

'Question: Do mitochondria play a role in remodelling lace plant leaves during programmed cell death? Context: Programmed cell death (PCD) is the regulated death of cells within an organism. The lace plant (Aponogeton madagascariensis) produces perforations in its leaves through PCD. The leaves of the plant consist of a latticework of longitudinal and transverse veins enclosing areoles. PCD occurs in the cells at the center of these areoles and progresses outwards, stopping approximately five cells from the vasculature. The role of mitochondria during PCD has been recognized in animals; however, it has been less studied during PCD in plants. The following paper elucidates the role of mitochondrial dynamics during developmentally regulated PCD in vivo in A. madagascariensis. A single areole within a window stage leaf (PCD is occurring) was divided into three areas based on the progression of PCD; cells that will not undergo PCD (NPCD), cells in early stages of PCD (EPCD), and cells in l

In [50]:
tokenized_text = tokenizer.encode(prompt, return_tensors="pt")

In [51]:
tokenized_text

tensor([[    2,  4790, 20925,    20, 10964,  2754,   882,    14,   151,    10,
         13101,  8199,   707,  1041,  3436,    70,  7295,    49,   589,   927,
         22343,    20, 23082,  7262,    49,   589,    12, 17928,    11,    21,
             6,  1134,   589,     5,    42,   192,    32,  4175,     4,    18,
          8199,   707,  1041,    12,  4410,   896,  2432, 37687,   676,  1753,
         34842,  5202,  5428,    11,  3862, 19982,    10,   106,  3436,   199,
         17928,     4,    18,  3436,     5,     6,  1041,  9109,     5,    14,
          7653,  3752,   768,   488,     5,  2265,     8,  5786,  5662,   987,
         11985,  6348, 11658,     4, 17928,  1395,    10,     6,    42,    34,
             6,  1594,     5,    55,  6348, 11658,     8, 18252, 21640, 12095,
             7, 16387,   792,   508,    42,    29,     6,  8136,     4,    18,
           151,     5,  2754,    70, 17928,    57,    58,  1903,    10,   546,
            44,   523,     7,   114,    57,    58,  

In [55]:
generate_tokens = bio_gpt_model.generate(tokenized_text, 
                                         num_beams=5, 
                                         do_sample=True,
                                         top_k=50,
                                         top_p=0.95,
                                         max_length=512)

In [84]:
pprint(test_input)

("question: 'Do mitochondria play a role in remodelling lace plant leaves "
 'during programmed cell death? Contexts: Programmed cell death (PCD) is the '
 'regulated death of cells within an organism. The lace plant (Aponogeton '
 'madagascariensis) produces perforations in its leaves through PCD. The '
 'leaves of the plant consist of a latticework of longitudinal and transverse '
 'veins enclosing areoles. PCD occurs in the cells at the center of these '
 'areoles and progresses outwards, stopping approximately five cells from the '
 'vasculature. The role of mitochondria during PCD has been recognized in '
 'animals; however, it has been less studied during PCD in plants. The '
 'following paper elucidates the role of mitochondrial dynamics during '
 'developmentally regulated PCD in vivo in A. madagascariensis. A single '
 'areole within a window stage leaf (PCD is occurring) was divided into three '
 'areas based on the progression of PCD; cells that will not undergo PCD '
 '(NPC

In [57]:
pprint(tokenizer.decode(generate_tokens[0], skip_special_tokens=True))

('Question: Do mitochondria play a role in remodelling lace plant leaves '
 'during programmed cell death? Context: Programmed cell death (PCD) is the '
 'regulated death of cells within an organism. The lace plant (Aponogeton '
 'madagascariensis) produces perforations in its leaves through PCD. The '
 'leaves of the plant consist of a latticework of longitudinal and transverse '
 'veins enclosing areoles. PCD occurs in the cells at the center of these '
 'areoles and progresses outwards, stopping approximately five cells from the '
 'vasculature. The role of mitochondria during PCD has been recognized in '
 'animals; however, it has been less studied during PCD in plants. The '
 'following paper elucidates the role of mitochondrial dynamics during '
 'developmentally regulated PCD in vivo in A. madagascariensis. A single '
 'areole within a window stage leaf (PCD is occurring) was divided into three '
 'areas based on the progression of PCD; cells that will not undergo PCD '
 '(NPCD)

In [56]:
bio_gpt_model.push_to_hub("BioGPT-Large-QA-PubMedQA")

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "POST /api/repos/create HTTP/1.1" 200 105
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "GET /api/whoami-v2 HTTP/1.1" 200 674
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "POST /api/models/espoir/BioGPT-Large-QA-PubMedQA/preupload/main HTTP/1.1" 200 225
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "POST /espoir/BioGPT-Large-QA-PubMedQA.git/info/lfs/objects/batch HTTP/1.1" 200 60072
pytorch_model.bin:   0%|          | 0.00/1.39G [00:00<?, ?B/s]DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): hf-hub-lfs-us-east-1.s3-accelerate.amazonaws.com:443
pytorch_model.bin:   1%|          | 16.0M/1.39G [00:21<28:10, 811kB/s]  DEBUG:urllib3.connectionpool:https://hf-hub-lfs-us-east-1.s3-accelerate.amazonaws.com:443 "PUT /repos/03/f8/03f84fe850e23eced8dca753e7f18653664c787f64c8434c53ef86f9ccaedeaa/d5da0c1725ab61225ad6

CommitInfo(commit_url='https://huggingface.co/espoir/BioGPT-Large-QA-PubMedQA/commit/299fec15968219eb87e176f1226958de393eb641', commit_message='Upload BioGptForCausalLM', commit_description='', oid='299fec15968219eb87e176f1226958de393eb641', pr_url=None, pr_revision=None, pr_num=None)

This si where we stop today, I will comeback tommorow to learn why the prompt is working.

In [58]:
tokenizer.push_to_hub("BioGPT-Large-QA-PubMedQA")

DEBUG:urllib3.connectionpool:Resetting dropped connection: huggingface.co
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "POST /api/repos/create HTTP/1.1" 409 110
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "GET /api/whoami-v2 HTTP/1.1" 200 674
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "POST /api/models/espoir/BioGPT-Large-QA-PubMedQA/preupload/main HTTP/1.1" 200 299
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "POST /api/models/espoir/BioGPT-Large-QA-PubMedQA/commit/main HTTP/1.1" 200 204


CommitInfo(commit_url='https://huggingface.co/espoir/BioGPT-Large-QA-PubMedQA/commit/63a37b629f3aff02d2a6e15194136877aa149423', commit_message='Upload tokenizer', commit_description='', oid='63a37b629f3aff02d2a6e15194136877aa149423', pr_url=None, pr_revision=None, pr_num=None)