<a href="https://colab.research.google.com/github/buckdan/GPasT/blob/main/GPasT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install all of the important library
!pip3 install tokenizers transformers datasets
# Check for GPU
!nvidia-smi
# Create the necessary folders
!mkdir tokenizer
!mkdir models

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tokenizers
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m90.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m95.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.10.1-py3-none-any.whl (469 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m469.0/469.0 KB[0m [31m38.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.0-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.1/199.1 KB[0m [31m23.8 MB/s[0m eta [36m0:00:00[0m
Collecting aiohttp
 

In [3]:
# Tokenizer. Make sure to run this before running the belows
from tokenizers import ByteLevelBPETokenizer
from transformers import GPT2Config, GPT2LMHeadModel, GPT2Tokenizer, DataCollatorForLanguageModeling, \
                            Trainer, TrainingArguments
from datasets import load_dataset

data_paths = ["pascal_dataset_text_code.txt"] # Download the dataset from HuggingFace ("Falcon2006VN/pascal-code-generation-2mb")

tokenizer = ByteLevelBPETokenizer()

tokenizer.train(files=data_paths, vocab_size=52_000, min_frequency=2, special_tokens=[
    "<S>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])

tokenizer.save_model("tokenizer")

tokenizer = GPT2Tokenizer.from_pretrained("tokenizer")

tokenizer.add_special_tokens({
    "eos_token" : "</s>",
    "bos_token" : "<s>",
    "unk_token" : "<unk>",
    "pad_token" : "<pad>",
    "mask_token" : "<mask>"
})

# Testing the Tokenizer
#
inp = "writeln('Hello World'!);"
t = tokenizer.encode(inp)
print(t)
print(tokenizer.decode(t))
#

[2757, 438, 14317, 1345, 11, 5, 701]
writeln('Hello World'!);


In [None]:
# For training the model

from tokenizers import ByteLevelBPETokenizer
from transformers import GPT2Config, GPT2LMHeadModel, GPT2Tokenizer, DataCollatorForLanguageModeling, \
                            Trainer, TrainingArguments
from datasets import load_dataset

data_paths = ["pascal_dataset_text_code.txt"] # Download the dataset from HuggingFace ("Falcon2006VN/pascal-code-generation-2mb")

tokenizer = GPT2Tokenizer.from_pretrained("tokenizer")

tokenizer.add_special_tokens({
    "eos_token" : "</s>",
    "bos_token" : "<s>",
    "unk_token" : "<unk>",
    "pad_token" : "<pad>",
    "mask_token" : "<mask>"
})

config = GPT2Config(
    vocab_size= tokenizer.vocab_size,
    bos_token = tokenizer.bos_token_id,
    eos_token = tokenizer.eos_token_id,
)

model = GPT2LMHeadModel(config)

dataset = load_dataset("text", data_files=data_paths) # Use this line if you downloaded the dataset. Use the below
                                                      # if you want to load from HuggingFace

# dataset = load_dataset("Falcon2006VN/pascal-code-generation-2mb")
def encode(lines):
    return tokenizer(lines['text'], add_special_tokens=True, truncation=True, max_length=512)

dataset.set_transform(encode)
dataset = dataset['train']

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

training_args = TrainingArguments(
    output_dir="models",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=20,
    save_steps=100,
    save_total_limit=2,
    prediction_loss_only=True,
    remove_unused_columns=False
    
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset
)

trainer.train()
trainer.save_model("GPasT")

In [4]:
# Running the model
from tokenizers import ByteLevelBPETokenizer
from transformers import GPT2Config, GPT2LMHeadModel, GPT2Tokenizer, DataCollatorForLanguageModeling, \
                            Trainer, TrainingArguments
from datasets import load_dataset

data_paths = ["pascal_dataset_text_code.txt"] # Download the dataset from HuggingFace ("Falcon2006VN/pascal-code-generation-2mb")

tokenizer = GPT2Tokenizer.from_pretrained("tokenizer")

tokenizer.add_special_tokens({
    "eos_token" : "</s>",
    "bos_token" : "<s>",
    "unk_token" : "<unk>",
    "pad_token" : "<pad>",
    "mask_token" : "<mask>"
})

config = GPT2Config(
    vocab_size= tokenizer.vocab_size,
    bos_token = tokenizer.bos_token_id,
    eos_token = tokenizer.eos_token_id,
)

# Remove the .to("cuda")
# If you want to run from CPU

model = GPT2LMHeadModel.from_pretrained("GPasT").to("cuda")

# Type exit or quit when you done testing the model

while True:
  inp = input(">>> ")
  if inp == "quit" or inp == "exit":
    break
  print("Generating code...")
  input_ids = tokenizer.encode(inp, return_tensors="pt").to("cuda")
  beam_output = model.generate(input_ids,
                               max_length = 512,
                               num_beams = 10,
                               temperature = 0.7,
                               no_repeat_ngram_size = 5,
                               num_return_sequences = 1)
  

  for beam in beam_output:
    output = tokenizer.decode(beam)
    fout = output.replace("<N>", "\n")
    print(str(fout))

>>> w


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generating code...
w
  end;


uses
  {$ENDIF}

  private
end;
  result:=0;

begin

    end;
  public
    procedure(L: string;
  end; cdecl;
    procedure class(L);
    function end;
    function string;
    begin
    result:=1: integer;
    public
      end;
      result:=1: string;

{$ENDIF}
    procedure end;
var
    function boolean;
    {1);
    procedure
    if end;
begin<T_getClassObject(L);

      begin
  if1);
  end);
    {_State): integer;
  if0;
   ;
    [1;
    end; override;
    property:=0;
  begin
      procedure end;
{$ENDIF;
    {$ENDIF}
  end,

function end;
   end;
end);
    [>
    procedure:=luaclass_getClassObject(L,1);
      end.Create;
    class(L: PLua_getClassObject(L: boolean;
      end
    {ENDIF}
      end,
    function:=0);
   >begin
    [);
    if lua_State): integer; overload;
    constructor;
   (L,1;
  lua_N>    procedure begin

{$;
    if string;
      public
  result1;
      functionN>    procedure result:=1);

{$);
    end);
  if lua_getClassObject(Se