# Download git repository

In [None]:
# Clone only when the checkout is missing, so re-running the notebook does not fail on an existing directory.
!test -d gpt-neox || git clone https://github.com/EleutherAI/gpt-neox.git

# Install requirements

In [2]:
# Switch into the cloned repository; the following cells use paths relative to it.
%cd gpt-neox

/content/gpt-neox


In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -r requirements/requirements.txt
%pip install tensorboard==2.14
%pip install 'urllib3<2'
# Optional: only needed when logging with WandB.
%pip install -r requirements/requirements-wandb.txt
# Optional: only needed when logging via TensorBoard.
%pip install -r requirements/requirements-tensorboard.txt
# Optional: only needed when using the fused kernels.
!python ./megatron/fused_kernels/setup.py install

# Datasets And Tokenizer
Create a `data` folder and put your dataset inside it. The dataset must be in jsonl format; if it is not, you can convert your own dataset to jsonl as shown below.

In [4]:
# -p makes the cell safe to re-run: no error when the folder already exists.
!mkdir -p data

## Make Custom jsonl Dataset (Optional)

In [None]:
import json
from pathlib import Path

# Plain-text source: every line becomes one document.
source_path = Path("my_dataset.txt")
# Written into data/ because the preprocessing step reads data/my_dataset.jsonl.
jsonl_path = Path("data") / "my_dataset.jsonl"
jsonl_path.parent.mkdir(parents=True, exist_ok=True)

# Read with an explicit encoding (matching the writer) and close the file promptly.
with source_path.open("r", encoding="utf-8") as infile:
    # Drop empty lines, including the empty string that a trailing newline
    # would otherwise turn into a bogus empty document.
    documents = [line for line in infile.read().split("\n") if line]

# jsonl format: one JSON object per line, with the document under the "text" key.
with jsonl_path.open("w", encoding="utf-8") as outfile:
    for document in documents:
        json.dump({"text": document}, outfile)
        outfile.write("\n")

## Tokenizer  
There are three options for the tokenizer.
1. You can download English tokenizers from the links below.
2. You can use any tokenizer you want, as long as it is an HFGPT2Tokenizer, HFTokenizer, GPT2BPETokenizer, or CharLevelTokenizer.
3. You can make a custom tokenizer with the code below.   

GPT2 Tokenizer
- Vocab: https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json
- Merge: https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt

GPT-NeoX 20B Tokenizer  
https://the-eye.eu/public/AI/models/GPT-NeoX-20B/slim_weights/20B_tokenizer.json  


In [None]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing

from transformers import PreTrainedTokenizerFast

# BPE trainer: reserves the special tokens and caps the vocabulary size;
# tokens seen fewer than min_frequency times are not merged.
trainer = BpeTrainer(
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
    vocab_size=60000,
    min_frequency=5,
)

tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()

files = ["./data/tokenizer_data.txt"]  # has to be txt file
tokenizer.train(files, trainer)

# Wrap every encoded sequence in [CLS] ... [SEP]; the ids are looked up from
# the freshly trained vocabulary.
tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", tokenizer.token_to_id("[CLS]")),
        ("[SEP]", tokenizer.token_to_id("[SEP]")),
    ],
)

# Register the special tokens on the wrapper as well; without these arguments
# the saved tokenizer configuration carries no unk/cls/sep/pad/mask tokens.
awesome_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    unk_token="[UNK]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    pad_token="[PAD]",
    mask_token="[MASK]",
)
# Writes tokenizer.json (and the accompanying config files) into ./data.
awesome_tokenizer.save_pretrained("./data")

If you are using a custom tokenizer, modify the gpt-neox/tools/datasets/preprocess_data.py file as shown below.  

```
for key, sentences in doc.items():  # from 223rd line
    for sentence in sentences:
        sentence = list(filter(None.__ne__, sentence))  # add this line
```



## Preprocess data
Tokenize the jsonl file and write the result as two files, a `.bin` file and an `.idx` file.

In [None]:
# --input:         jsonl file to tokenize
# --output-prefix: name prefix of the output files
# --vocab:         tokenizer file
# --dataset-impl:  dataset implementation, you can choose between lazy, cached, and mmap
# Keep comments out of the command itself: any text after a trailing backslash
# breaks the line continuation and cuts the command short.
!python tools/datasets/preprocess_data.py \
            --input ./data/my_dataset.jsonl \
            --output-prefix ./data/mydataset \
            --vocab ./data/tokenizer.json \
            --dataset-impl mmap \
            --tokenizer-type HFTokenizer \
            --append-eod

# Train

## Prevent ninja build error

In [None]:
# -nc: skip the download when the archive is already present (notebook re-runs).
!wget -nc https://github.com/ninja-build/ninja/releases/download/v1.8.2/ninja-linux.zip
# -o: overwrite without asking; an interactive prompt cannot be answered from a notebook cell.
!sudo unzip -o ninja-linux.zip -d /usr/local/bin/
!sudo update-alternatives --install /usr/bin/ninja ninja /usr/local/bin/ninja 1 --force

## Fix Configuration Files

Edit two configuration files in the gpt-neox/configs folder: local_setup.yml and your_chosen_parameter_size.yml.  
Make sure to move the tokenizer information out of local_setup.yml and into your_chosen_parameter_size.yml. 

**local_setup.yml**

```yaml
{
  "data_path": "data/mydataset_text_document",

  "save": "checkpoints",
  "load": "checkpoints",
  "checkpoint_validation_with_forward_pass": False,

  "tensorboard_dir": "tensorboard",
  "log_dir": "logs",
  "use_wandb": True,
  "wandb_host": "https://api.wandb.ai",
  "wandb_project": "neox"
}
```
**your_chosen_parameter_size.yml**
```yaml
{
# parallelism settings
# multiplication of the numbers should be the number of GPUs
	"pipe_parallel_size": 1,
	"model_parallel_size": 1,

# add tokenizer info
"tokenizer_type": "HFTokenizer",
"vocab_file": "data/tokenizer.json",
"merge_file": "data/merges.txt",

# choose iteration size you want
# train_iters and lr_decay_iters should be the same
# checkpoint factor size you want
"train_iters": 1000,
"lr_decay_iters": 1000,
"distributed_backend": "nccl",
"lr_decay_style": "cosine",
"warmup": 0.01,
"checkpoint_factor": 500,
"eval_interval": 100000,
"eval_iters": 10,
}
```


### Parameter explanation
- **pipe_parallel_size**: number of pipeline-parallel stages (handled by DeepSpeed). Pipeline parallelism partitions the layers of a model into stages that can be processed in parallel, which improves memory and compute efficiency
- **model_parallel_size**: size of the model parallelism (handled by Megatron-LM). Model parallelism splits the model across multiple devices
- **tokenizer_type**: choose between ['GPT2BPETokenizer', 'HFTokenizer', 'HFGPT2Tokenizer', 'SPMTokenizer', 'CharLevelTokenizer', 'TiktokenTokenizer']
- **merge_file**: use only if you have a merges.txt file
- **train_iters**: number of iterations to run for training
- **lr_decay_iters**: number of iterations over which the learning rate is decayed (the learning rate is adjusted as training progresses); the default is train_iters
- **lr_decay_style**: learning rate decay function, choose between 'constant', 'linear', 'cosine', 'exponential'
- **warmup**: fraction of the total iterations to warm up on (e.g. 0.01 = 1%), starting with a small learning rate and gradually increasing it during training
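- **checkpoint_factor**: number that sets how often checkpoints are saved; how it is interpreted depends on `checkpoint_scale`, which is either 'log' or 'linear'
    - log: checkpoints are saved at iterations that are powers of the number
    - linear: checkpoints are saved at iterations that are multiples of the number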
- **eval_interval**: interval between running evaluation on validation set
- **eval_iters**: number of iterations to run for each validation/test evaluation

## Start Training

In [None]:
# Replace your_chosen_parameter_size.yml with the model config edited above; it is passed together with local_setup.yml.
!python ./deepy.py train.py -d configs your_chosen_parameter_size.yml local_setup.yml

# Convert Model Into Huggingface Format

In [None]:
# Replace global_your_checkpoint with the checkpoint folder saved under ./checkpoints
# and your_chosen_parameter_size.yml with the config file used for training.
# The converted model is written to the GPT-NeoX-pretrain folder, which the following cells load.
!python3 ./tools/ckpts/convert_module_to_hf.py \
            --input_dir ./checkpoints/global_your_checkpoint \
            --config_file ./configs/your_chosen_parameter_size.yml \
            --output_dir GPT-NeoX-pretrain

# Test Pretrained Model

In [None]:
import torch
import argparse
from transformers import AutoTokenizer, AutoModelForCausalLM

prompt = "Hello this is"

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained("GPT-NeoX-pretrain")
# The model has to live on the same device as the input ids; leaving it on the
# CPU while the prompt is on the GPU makes generate() fail with a device mismatch.
model = AutoModelForCausalLM.from_pretrained("GPT-NeoX-pretrain").to(device)

# Inference only: no gradients are needed.
with torch.no_grad():
    tokens = tokenizer.encode(prompt, return_tensors='pt').to(device=device, non_blocking=True)
    # Sample a continuation of the prompt, at most 500 tokens long in total.
    gen_tokens = model.generate(tokens, do_sample=True, max_length=500)
    generated = tokenizer.batch_decode(gen_tokens)[0]

print(generated)

# Upload To Huggingface

In [None]:
import torch
import argparse
from transformers import AutoTokenizer, AutoModelForCausalLM

# Target repository on the Hugging Face Hub, in the form "<user-or-organization>/<model-name>".
# Placeholder value: replace it before running. Pushing also requires being logged in to the Hub.
huggingface_path = "your-username/GPT-NeoX-pretrain"

# Load the converted checkpoint from the local GPT-NeoX-pretrain folder.
tokenizer = AutoTokenizer.from_pretrained("GPT-NeoX-pretrain")
model = AutoModelForCausalLM.from_pretrained("GPT-NeoX-pretrain")

# Upload the tokenizer and the model weights to the same Hub repository.
tokenizer.push_to_hub(huggingface_path)
model.push_to_hub(huggingface_path)