# Introduction

This notebook shows how to load a GPT-Megatron model on a single node with 8 V100 GPUs.

Reference script (NeMo GPT evaluation example): https://github.com/NVIDIA/NeMo/blob/main/examples/nlp/language_modeling/megatron_gpt_eval.py

Background on Megatron model parallelism: https://developer.nvidia.com/blog/scaling-language-model-training-to-a-trillion-parameters-using-megatron/

## Setting the initial variables

In [3]:
import json
import os
from argparse import ArgumentParser

import torch
from pytorch_lightning.trainer.trainer import Trainer
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

from nemo.collections.nlp.data.language_modeling.megatron.request_dataset import GPTRequestDataset
from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel
from nemo.collections.nlp.modules.common.megatron.megatron_init import fake_initialize_model_parallel
from nemo.collections.nlp.parts.nlp_overrides import NLPDDPPlugin
from nemo.utils import logging
from nemo.utils.app_state import AppState
from nemo.utils.model_utils import inject_model_parallel_rank

[NeMo W 2022-03-23 20:35:38 experimental:27] Module <class 'nemo.collections.nlp.data.language_modeling.megatron.megatron_batch_samplers.MegatronPretrainingRandomBatchSampler'> is experimental, not ready for production and is not fully supported. Use at your own risk.
[NeMo W 2022-03-23 20:35:38 __init__:22] `pynini` is not installed ! 
    Please run the `nemo_text_processing/setup.sh` scriptprior to usage of this toolkit.


In [4]:
# Fail fast when no CUDA device is visible: the Trainer created later in this notebook uses accelerator='gpu'.
assert torch.cuda.is_available()

In [15]:
# Location of the model-parallel checkpoint. The directory holds one sub-directory per
# model-parallel rank (mp_rank_00, mp_rank_01), each containing a file named `checkpoint_name`.
checkpoint_dir = "/5b_checkpoints/checkpoints"
checkpoint_name = "megatron_gpt--val_loss=1.78-step=32121-consumed_samples=46254240.0-last.ckpt"

# The below comes from the checkpoint (two mp_rank_* shards => tensor parallel size 2).
tensor_model_parallel_size = 2
pipeline_model_parallel_size = 1
precision = 16
hparams_file = None

# Everything runs on ONE node, so num_nodes is 1 and one GPU process is started per
# model-parallel rank. Setting num_nodes > 1 on a single machine makes the trainer wait
# for processes on other nodes that are never started.
num_nodes = 1
devices = tensor_model_parallel_size * pipeline_model_parallel_size

# Enforce the invariant instead of only stating it.
assert devices * num_nodes == tensor_model_parallel_size * pipeline_model_parallel_size, (
    "devices * num_nodes should equal tensor_model_parallel_size * pipeline_model_parallel_size"
)

In [16]:
# Show the checkpoint tree, including the per-rank mp_rank_XX sub-directories.
!ls -R /5b_checkpoints

/5b_checkpoints:
checkpoints

/5b_checkpoints/checkpoints:
mp_rank_00  mp_rank_01

/5b_checkpoints/checkpoints/mp_rank_00:
'megatron_gpt--val_loss=1.78-step=32121-consumed_samples=46254240.0-last.ckpt'

/5b_checkpoints/checkpoints/mp_rank_01:
'megatron_gpt--val_loss=1.78-step=32121-consumed_samples=46254240.0-last.ckpt'


In [17]:
 # Lightning Trainer used to drive inference (see the trainer.predict call at the end of the notebook).
 # NOTE(review): devices * num_nodes presumably determines how many processes the trainer expects
 # to join the distributed group — confirm against the PyTorch Lightning Trainer documentation.
 trainer = Trainer(
        plugins=[NLPDDPPlugin()],
        devices=devices,
        num_nodes=num_nodes,
        accelerator='gpu',
        precision=precision,
    )

      rank_zero_deprecation(
    
Using 16bit native Automatic Mixed Precision (AMP)
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [18]:
# Record the model-parallel layout on NeMo's AppState object and derive the
# rank-specific checkpoint path that the next cell loads from.
app_state = AppState()
if tensor_model_parallel_size > 1 or pipeline_model_parallel_size > 1:
    app_state.pipeline_model_parallel_size = pipeline_model_parallel_size
    app_state.tensor_model_parallel_size = tensor_model_parallel_size
    # Set up front because it is passed as world_size to the call below; the third
    # value returned by that call then overwrites it.
    app_state.model_parallel_size = tensor_model_parallel_size * pipeline_model_parallel_size
    (
        app_state.tensor_model_parallel_rank,
        app_state.pipeline_model_parallel_rank,
        app_state.model_parallel_size,
        _,  # fourth return value is not used in this notebook
    ) = fake_initialize_model_parallel(
        world_size=app_state.model_parallel_size,
        rank=trainer.global_rank,
        tensor_model_parallel_size_=app_state.tensor_model_parallel_size,
        pipeline_model_parallel_size_=app_state.pipeline_model_parallel_size,
    )
# inject model parallel rank
# (runs regardless of the branch above; presumably inserts this rank's mp_rank_XX directory into the path — TODO confirm)
checkpoint_path = inject_model_parallel_rank(os.path.join(checkpoint_dir, checkpoint_name))

# The model itself is loaded from checkpoint_path in the following cell.

[NeMo I 2022-03-23 20:36:34 megatron_init:186] Rank 0 has data parallel group: [0]
[NeMo I 2022-03-23 20:36:34 megatron_init:189] All data parallel group ranks: [[0], [1]]
[NeMo I 2022-03-23 20:36:34 megatron_init:190] Ranks 0 has data parallel rank: 0
[NeMo I 2022-03-23 20:36:34 megatron_init:198] Rank 0 has model parallel group: [0, 1]
[NeMo I 2022-03-23 20:36:34 megatron_init:199] All model parallel group ranks: [[0, 1]]
[NeMo I 2022-03-23 20:36:34 megatron_init:209] Rank 0 has tensor model parallel group: [0, 1]
[NeMo I 2022-03-23 20:36:34 megatron_init:213] All tensor model parallel group ranks: [[0, 1]]
[NeMo I 2022-03-23 20:36:34 megatron_init:214] Rank 0 has tensor model parallel rank: 0
[NeMo I 2022-03-23 20:36:34 megatron_init:228] Rank 0 has pipeline model parallel group: [0]
[NeMo I 2022-03-23 20:36:34 megatron_init:240] Rank 0 has embedding group: [0]
[NeMo I 2022-03-23 20:36:34 megatron_init:246] All pipeline model parallel group ranks: [[0], [1]]
[NeMo I 2022-03-23 20:36

In [19]:
# Restore the GPT model from the rank-specific checkpoint path and attach it to the trainer.
# hparams_file is forwarded as configured earlier in the notebook.
model = MegatronGPTModel.load_from_checkpoint(checkpoint_path, hparams_file=hparams_file, trainer=trainer)

[NeMo I 2022-03-23 20:36:55 megatron_init:186] Rank 0 has data parallel group: [0]
[NeMo I 2022-03-23 20:36:55 megatron_init:189] All data parallel group ranks: [[0], [1]]
[NeMo I 2022-03-23 20:36:55 megatron_init:190] Ranks 0 has data parallel rank: 0
[NeMo I 2022-03-23 20:36:55 megatron_init:198] Rank 0 has model parallel group: [0, 1]
[NeMo I 2022-03-23 20:36:55 megatron_init:199] All model parallel group ranks: [[0, 1]]
[NeMo I 2022-03-23 20:36:55 megatron_init:209] Rank 0 has tensor model parallel group: [0, 1]
[NeMo I 2022-03-23 20:36:55 megatron_init:213] All tensor model parallel group ranks: [[0, 1]]
[NeMo I 2022-03-23 20:36:55 megatron_init:214] Rank 0 has tensor model parallel rank: 0
[NeMo I 2022-03-23 20:36:55 megatron_init:228] Rank 0 has pipeline model parallel group: [0]
[NeMo I 2022-03-23 20:36:55 megatron_init:240] Rank 0 has embedding group: [0]
[NeMo I 2022-03-23 20:36:55 megatron_init:246] All pipeline model parallel group ranks: [[0], [1]]
[NeMo I 2022-03-23 20:36

Using sep_token, but it is not set yet.
Using cls_token, but it is not set yet.
Using pad_token, but it is not set yet.
Using mask_token, but it is not set yet.


[NeMo I 2022-03-23 20:36:58 megatron_gpt_model:1203] Padded vocab_size: 50432, original vocab_size: 50257, dummy tokens: 175.


In [20]:
def pad_collate(batch, pad_token_id=50256):
    """Pad the prompts of a request to a common length and expand them into per-prompt tuples.

    Only ``batch[0]`` is read. It must be a mapping with the keys:

    * ``'data'``: a sequence of 1-D token tensors, one per prompt,
    * ``'tokens_to_generate'`` and ``'compute_logprobs'``: values copied unchanged
      into every output tuple,
    * ``'prompt_tags'`` (optional): one tag per prompt.

    Args:
        batch: indexable whose first element is the request mapping described above.
        pad_token_id: value used to right-pad shorter prompts. Defaults to 50256,
            the value this notebook originally hard-coded (presumably the GPT-2
            end-of-text id — confirm against the tokenizer in use).

    Returns:
        A list with one tuple per prompt:
        ``(padded_tokens, original_length, tokens_to_generate, compute_logprobs)``,
        with the prompt tag appended as a fifth element when ``'prompt_tags'`` is present.
    """
    request = batch[0]
    tokens = request['data']
    tokens_to_generate = request['tokens_to_generate']
    compute_logprobs = request['compute_logprobs']
    lens = [len(token) for token in tokens]

    # batch_first=True gives one row per prompt directly, so no transpose is needed.
    tokens_pad = pad_sequence(tokens, batch_first=True, padding_value=pad_token_id)

    if 'prompt_tags' in request:
        # Keep track of soft prompt tags
        return [
            (token, lenn, tokens_to_generate, compute_logprobs, prompt_tag)
            for token, lenn, prompt_tag in zip(tokens_pad, lens, request['prompt_tags'])
        ]

    return [(token, lenn, tokens_to_generate, compute_logprobs) for token, lenn in zip(tokens_pad, lens)]


In [21]:
# The request is a list of prompt strings; this notebook sends a single prompt.
prompt = "Translate German to English: Ich bin müde"
request = [prompt]

In [22]:
# Generation settings handed to GPTRequestDataset, plus the DataLoader batch size.
tokens_to_generate = 100
compute_logprobs = True
batch_size = 8

In [23]:
# Wrap the prompts in the request dataset, tokenizing with the loaded model's tokenizer.
dataset = GPTRequestDataset(request, model.tokenizer, tokens_to_generate, compute_logprobs)
# NOTE: pad_collate is called eagerly on the dataset here (it is not passed as collate_fn),
# so the DataLoader iterates over the list of per-prompt tuples that pad_collate returns.
request_dl = DataLoader(dataset=pad_collate(dataset), batch_size=int(batch_size))

In [24]:
# Snapshot of per-GPU memory use at this point in the notebook (after the model has been loaded).
!nvidia-smi

Wed Mar 23 20:37:10 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.51.06    Driver Version: 450.51.06    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-PCIE...  On   | 00000000:1A:00.0 Off |                    0 |
| N/A   29C    P0    33W / 250W |  16573MiB / 32510MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-PCIE...  On   | 00000000:1B:00.0 Off |                    0 |
| N/A   26C    P0    22W / 250W |      4MiB / 32510MiB |      0%      Default |
|       

In [None]:
# Exploratory: list the attributes of the loaded model object.
# NOTE(review): looks like a leftover inspection cell — consider removing before sharing.
dir(model)

In [None]:
 # Run generation for every batch in request_dl.
 # NOTE(review): the recorded output stops at "MEMBER: 1/2", which suggests the call was still
 # waiting for a second process to join the distributed group — confirm devices/num_nodes.
 response = trainer.predict(model, request_dl)

initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/2
