# Download git repository

In [None]:
# Fetch the GPT-NeoX sources; git creates a ./gpt-neox directory in the current working directory.
!git clone https://github.com/EleutherAI/gpt-neox.git

Cloning into 'gpt-neox'...
remote: Enumerating objects: 16065, done.[K
remote: Counting objects: 100% (762/762), done.[K
remote: Compressing objects: 100% (318/318), done.[K
remote: Total 16065 (delta 513), reused 646 (delta 439), pack-reused 15303[K
Receiving objects: 100% (16065/16065), 104.67 MiB | 34.40 MiB/s, done.
Resolving deltas: 100% (11495/11495), done.


# Install requirements

In [None]:
# Make the cloned repository the working directory: the cells below use paths
# relative to it (requirements/, tools/, data/, configs/).
%cd gpt-neox

/content/gpt-neox


In [None]:
# Use %pip (not !pip) so packages are installed into the environment of the
# interpreter that backs this kernel.
%pip install -r requirements/requirements.txt
%pip install tensorboard==2.14
%pip install 'urllib3<2'
# Optional: only needed when logging with WandB.
%pip install -r requirements/requirements-wandb.txt
# Optional: only needed when logging via tensorboard.
%pip install -r requirements/requirements-tensorboard.txt
# Optional: only needed when using fused kernels.
!python ./megatron/fused_kernels/setup.py install

Collecting deepspeed (from -r requirements/requirements.txt (line 2))
  Cloning https://github.com/EleutherAI/DeeperSpeed.git to /tmp/pip-install-sud48h7a/deepspeed_ba21f307d52144919f15465e8b365de7
  Running command git clone --filter=blob:none --quiet https://github.com/EleutherAI/DeeperSpeed.git /tmp/pip-install-sud48h7a/deepspeed_ba21f307d52144919f15465e8b365de7
  Resolved https://github.com/EleutherAI/DeeperSpeed.git to commit a48c6493f1fc6a1652723ab5542f7703be6f03a5
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting git+https://github.com/EleutherAI/lm_dataformat.git@4eec05349977071bf67fc072290b95e31c8dd836 (from -r requirements/requirements.txt (line 4))
  Cloning https://github.com/EleutherAI/lm_dataformat.git (to revision 4eec05349977071bf67fc072290b95e31c8dd836) to /tmp/pip-req-build-b27smmqd
  Running command git clone --filter=blob:none --quiet https://github.com/EleutherAI/lm_dataformat.git /tmp/pip-req-build-b27smmqd
  Running command git rev-parse -q --verify

Collecting wandb>=0.10.28 (from -r requirements/requirements-wandb.txt (line 1))
  Downloading wandb-0.16.0-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m34.6 MB/s[0m eta [36m0:00:00[0m
Collecting GitPython!=3.1.29,>=1.0.0 (from wandb>=0.10.28->-r requirements/requirements-wandb.txt (line 1))
  Downloading GitPython-3.1.40-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.6/190.6 kB[0m [31m29.6 MB/s[0m eta [36m0:00:00[0m
Collecting sentry-sdk>=1.0.0 (from wandb>=0.10.28->-r requirements/requirements-wandb.txt (line 1))
  Downloading sentry_sdk-1.36.0-py2.py3-none-any.whl (249 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m249.2/249.2 kB[0m [31m34.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docker-pycreds>=0.4.0 (from wandb>=0.10.28->-r requirements/requirements-wandb.txt (line 1))
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)

# Datasets And Tokenizer

In [None]:
# -p keeps the cell re-runnable: no error when data/ already exists.
!mkdir -p data

## Make Custom jsonl Dataset(Optional)

In [None]:
# Convert a plain-text corpus (one document per line) into the jsonl format
# expected by tools/datasets/preprocess_data.py: one {"text": ...} object per line.
import json
import os

# Read the raw corpus. The context manager closes the file handle, and the
# explicit encoding matches the one used for writing below.
with open("my_dataset.txt", "r", encoding="utf-8") as infile:
    my_dataset = infile.read().split("\n")

# One record per non-blank line. Blank lines (including the empty string that
# split() yields after a trailing newline) would otherwise become empty documents.
list_dict = [{"text": line} for line in my_dataset if line.strip()]

# Write into data/, which is where the preprocessing step reads its --input from.
os.makedirs("data", exist_ok=True)
with open(os.path.join("data", "my_dataset.jsonl"), "w", encoding="utf-8") as outfile:
    for entry in list_dict:
        json.dump(entry, outfile)
        outfile.write("\n")

## Tokenizer  
There are three options for the tokenizer.
1. You can download English tokenizers from the links below.
2. You can use any tokenizer you want, as long as it is an HFGPT2Tokenizer, HFTokenizer, GPT2BPETokenizer, or CharLevelTokenizer.
3. You can make a custom tokenizer with the code below.   

GPT2 Tokenizer
- Vocab: https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json
- Merge: https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt

GPT-NeoX 20B Tokenizer  
https://the-eye.eu/public/AI/models/GPT-NeoX-20B/slim_weights/20B_tokenizer.json  


In [None]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing

from transformers import PreTrainedTokenizerFast
# Train a BPE tokenizer from scratch and export it in Hugging Face format to ./data.

# Special tokens reserved in the vocabulary, declared once so the trainer and
# the Hugging Face wrapper below agree on them.
special_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
trainer = BpeTrainer(
    special_tokens=special_tokens,
    vocab_size=60000,
    min_frequency=5,
)

tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()

files = ["./data/tokenizer_data.txt"]  # has to be txt file
tokenizer.train(files, trainer)

# Wrap every encoded sequence as "[CLS] ... [SEP]"; the ids are looked up from
# the freshly trained vocabulary.
tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", tokenizer.token_to_id("[CLS]")),
        ("[SEP]", tokenizer.token_to_id("[SEP]")),
    ],
)

# Register the special tokens on the wrapper as well. Without these arguments
# the saved tokenizer reports no special tokens (unk/pad/cls/sep/mask unset).
awesome_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    unk_token="[UNK]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    pad_token="[PAD]",
    mask_token="[MASK]",
)
awesome_tokenizer.save_pretrained("./data")

('./data/tokenizer_config.json',
 './data/special_tokens_map.json',
 './data/tokenizer.json')

If you are using a custom tokenizer, edit the gpt-neox/tools/datasets/preprocess_data.py file as shown below.  

```
for key, sentences in doc.items():
    for sentence in sentences:
        sentence = list(filter(None.__ne__, sentence))  # add this line (around line 223)
        builders[key].add_item(np.array(sentence, dtype=builders[key].dtype))
```



## Preprocess data
Tokenize the jsonl file and convert it into two files: bin and idx.

In [None]:
# All paths are relative to the repository root (the current working directory),
# so the cell does not depend on the Colab-specific /content prefix.
!python tools/datasets/preprocess_data.py \
            --input ./data/my_dataset.jsonl \
            --output-prefix ./data/mydataset \
            --vocab ./data/tokenizer.json \
            --dataset-impl mmap \
            --tokenizer-type HFTokenizer \
            --append-eod

Setting ds_accelerator to cuda (auto detect)
> building HFTokenizer tokenizer ...
 > padded vocab (size: 60000) with 32 dummy tokens (new size: 60032)
Vocab size: 60000
Output prefix: ./data/mydataset
> building HFTokenizer tokenizer ...
 > padded vocab (size: 60000) with 32 dummy tokens (new size: 60032)
Processed 6900 documents (2026.03 docs/s, 0.80 MB/s).: : 6900it [00:03, 2010.40it/s]


# Train

## Prevent ninja build error

In [None]:
# Install a prebuilt ninja binary and register it as the system `ninja`.
# -nc skips the download when the archive is already present, and -o overwrites
# the extracted binary without prompting, so the cell can be re-run safely.
!wget -nc https://github.com/ninja-build/ninja/releases/download/v1.8.2/ninja-linux.zip
!sudo unzip -o ninja-linux.zip -d /usr/local/bin/
!sudo update-alternatives --install /usr/bin/ninja ninja /usr/local/bin/ninja 1 --force

--2023-11-23 05:21:55--  https://github.com/ninja-build/ninja/releases/download/v1.8.2/ninja-linux.zip
Resolving github.com (github.com)... 140.82.114.3
Connecting to github.com (github.com)|140.82.114.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/1335132/d2f252e2-9801-11e7-9fbf-bc7b4e4b5c83?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20231123%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20231123T052155Z&X-Amz-Expires=300&X-Amz-Signature=9f32555075b13273ded23ed1ff4baef683543c10ba01ad44a98f6e596e8ba8fd&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=1335132&response-content-disposition=attachment%3B%20filename%3Dninja-linux.zip&response-content-type=application%2Foctet-stream [following]
--2023-11-23 05:21:55--  https://objects.githubusercontent.com/github-production-release-asset-2e65be/1335132/d2f252e2-9801-11e7-9fbf-bc7b4e4b5c83?X-Amz-Algorit

## Fix Configuration Files

Edit two configuration files: local_setup.yml and your_chosen_parameter_size.yml.  
**local_setup.yml**

```
{
  "data_path": "data/mydataset_text_document",

  "save": "checkpoints",
  "load": "checkpoints",
  "checkpoint_validation_with_forward_pass": False,

  "tensorboard_dir": "tensorboard",
  "log_dir": "logs",
  "use_wandb": True,
  "wandb_host": "https://api.wandb.ai",
  "wandb_project": "neox"
}
```
**your_chosen_parameter_size.yml**
```
# highlighted hyperparameters are required

# parallelism settings
# the product of these two numbers should equal the number of GPUs
	"pipe_parallel_size": 1,
	"model_parallel_size": 1,

# add tokenizer info

"tokenizer_type": "HFTokenizer",
"vocab_file": "data/tokenizer.json",
"merge_file": "data/merges.txt",


# choose the number of iterations you want
# train_iters and lr_decay_iters should be the same
# choose the checkpoint factor you want

"train_iters": 1000,
"lr_decay_iters": 1000,
"distributed_backend": "nccl",
"lr_decay_style": "cosine",
"warmup": 0.01,
"checkpoint_factor": 500,
"eval_interval": 100000,
"eval_iters": 10,
```


### Parameter explanation
- **pipe_parallel_size**: number of pipeline-parallel stages (handled by DeepSpeed).
Pipeline parallelism partitions the layers of a model into stages that can be processed in parallel, which improves memory and compute efficiency.
- **model_parallel_size**: size of the model parallelism (handled by Megatron-LM).
Model parallelism splits the model across multiple devices.
- **tokenizer_type**: choose between ['GPT2BPETokenizer', 'HFTokenizer', 'HFGPT2Tokenizer', 'SPMTokenizer', 'CharLevelTokenizer', 'TiktokenTokenizer']
- **merge_file**: only needed if you have a merges.txt file
- **train_iters**: number of iterations to run for training
- **lr_decay_iters**: number of iterations to decay learning rate over
after every n training iterations, learning rate is adjusted
default is train_iters
- **lr_decay_style**: learning rate decay function, choose between 'constant', 'linear', 'cosine', 'exponential'
- **warmup**: percentage of total iterations to warmup on
starting with a small learning rate and gradually increasing it during the training
- **checkpoint_factor**: number that controls checkpoint spacing; how it is applied depends on `checkpoint_scale`, which is either ‘log’ or ‘linear’
    - log: checkpoints are saved at powers of the number
    - linear: checkpoints are saved at multiples of the number
- **eval_interval**: interval between running evaluation on validation set
- **eval_iters**: number of iterations to run for each evaluation on the validation/test set

### Solve fused_adam Error

Change `c++14` to `c++17` in these three files:

- /usr/local/lib/python3.10/dist-packages/deepspeed/ops/op_builder/async_io.py
- /usr/local/lib/python3.10/dist-packages/deepspeed/ops/op_builder/builder.py
- /usr/local/lib/python3.10/dist-packages/deepspeed/ops/op_builder/cpu/builder.py

## Start Training

In [None]:
# Run train.py through the deepy.py launcher. -d names the directory in which the
# listed .yml files are looked up (they are loaded as configs/<name>.yml).
# Replace your_chosen_parameter_size.yml with the model config you edited, e.g. 19M.yml.
!python ./deepy.py train.py -d configs your_chosen_parameter_size.yml local_setup.yml

Setting ds_accelerator to cuda (auto detect)
NeoXArgs.from_ymls() ['configs/19M.yml', 'configs/local_setup.yml']
INFO:root:NeoXArgs.calculate_derived() Total number of GPUs determined to be: 1
-------------------- arguments --------------------
  attention_config ................ ['global', 'global', 'global', 'global', 'global', 'global']updated
  attention_dropout ............... 0...........................updated
  batch_size ...................... 4...........................updated
  checkpoint_activations .......... True........................updated
  checkpoint_factor ............... 5...........................updated
  clip_grad ....................... 1.0.........................updated
  config_files .................... {'19M.yml': '{\n  "pipe_parallel_size": 1,\n  "model_parallel_size": 1,\n\n  # model settings\n  "num_layers": 6,\n  "hidden_size": 512,\n  "num_attention_heads": 8,\n  "seq_length": 2048,\n  "max_position_embeddings": 2048,\n  "pos_emb": "rotary",\n  "no

# Convert Model Into Huggingface Format

In [None]:
# Convert the checkpoint stored in ./checkpoints/global_step500 and write the
# result to the GPT-NeoX-pretrain directory.
# NOTE(review): --config_file presumably has to be the same model config that was
# used for training — confirm, and replace 125M.yml if a different one was trained.
!python3 ./tools/ckpts/convert_module_to_hf.py --input_dir ./checkpoints/global_step500 --config_file ./configs/125M.yml --output_dir GPT-NeoX-pretrain

Setting ds_accelerator to cuda (auto detect)
> building HFTokenizer tokenizer ...
 > padded vocab (size: 30000) with 80 dummy tokens (new size: 30080)
Saving weights in fp16 precision...
100% 12/12 [00:00<00:00, 74.84it/s]
saving tokenizer from file data/tokenizer.json
loaded tokenizer:  PreTrainedTokenizerFast(name_or_path='', vocab_size=30000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={}, clean_up_tokenization_spaces=True)
tokenizer saved!


# Test Pretrained Model

In [None]:
import torch
import argparse
from transformers import AutoTokenizer, AutoModelForCausalLM

# Smoke-test the converted model: sample a continuation of a short prompt.
prompt = "Hello this is"

# Both the tokenizer and the weights are read from the local export directory.
tokenizer = AutoTokenizer.from_pretrained("GPT-NeoX-pretrain")
model = AutoModelForCausalLM.from_pretrained("GPT-NeoX-pretrain")

with torch.no_grad():
    # Append .to(device='cuda', non_blocking=True) to the encoded ids to run on GPU.
    prompt_ids = tokenizer.encode(prompt, return_tensors="pt")
    sampled_ids = model.generate(prompt_ids, do_sample=True, max_length=500)

# Only one prompt was passed in, so decode the single returned sequence.
generated = tokenizer.decode(sampled_ids[0])

print(generated)

[CLS] Hello this is [SEP] [CLS] " " S, " she's a big person. I'm a problem to her new child's [UNK] is a " of them " a statement for the " video. " It is the most child that was a child on the victim. We had just his own - profile old on the video : " We're no one, and's the first time! " and will get back with the child, which has been a good - time. " In the day that there was " in a few - election? " was so well - with the world, " will help them a serious idea. " I am my name with a gun in the show you are going to stop to go away and they did not look on, " the source said. " P. said of me, there is an only - way on. " | | | | | M la [SEP] [CLS] Tweet with its first - half. " " We thought, we'll get me that we'd be able to get some things about people and their favorite to help, " the source said. Some of course, a couple said it would [UNK] ll be the most time it for the same [UNK] but the source [UNK] [UNK] so a few men. " The woman has been used the " room in June. " The latest

# Upload To Huggingface

In [None]:
import torch
import argparse
from transformers import AutoTokenizer, AutoModelForCausalLM

# Target repository on the Hugging Face Hub, written as "<user-or-org>/<repo-name>".
# TODO: replace this placeholder with your own repository id before running, and
# authenticate first (e.g. `huggingface-cli login`).
huggingface_path = "your-username/GPT-NeoX-pretrain"

# Reload the converted model and tokenizer from the local export directory.
tokenizer = AutoTokenizer.from_pretrained("GPT-NeoX-pretrain")
model = AutoModelForCausalLM.from_pretrained("GPT-NeoX-pretrain")

# Publish both artifacts to the same Hub repository.
tokenizer.push_to_hub(huggingface_path)
model.push_to_hub(huggingface_path)