# Models Pre-Training
## Installing requirements

In [None]:
! pip install -r powershell-offensive-code-generation/requirements.txt 

**Note:** The pre-training procedure requires a CUDA-capable NVIDIA GPU and the NVIDIA Apex library.

## CodeT5+ Pre-training

In [None]:

# Hugging Face Hub identifier of the checkpoint that pre-training starts from.
model_checkpoint = "Salesforce/codet5p-220m-py"

import pandas as pd
import logging as lg
import os

def extract_data(path) -> pd.DataFrame:
    """Load a text file into a single-column DataFrame, one row per line.

    Leading and trailing whitespace (including the newline) is stripped from
    every line. Blank lines are kept as empty strings.

    Args:
        path: Location of the text file to read.

    Returns:
        A DataFrame with a single ``Command`` column holding the stripped lines
        in file order.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    print(path)
    # Decode explicitly as UTF-8 so the result does not depend on the platform's
    # locale encoding. The `with` block closes the file, so no manual close().
    with open(path, 'r', encoding='utf-8') as f:
        list_cmd = [line.strip() for line in f]
    lg.debug(f"Extracted {path}: {len(list_cmd)}")

    return pd.DataFrame(data={"Command": list_cmd})

# Read the pre-training corpus (one row per line of the file) and display
# its dimensions as the cell output.
df_data = extract_data(
    "powershell-offensive-code-generation/notebooks/00-dataset/pretraining/train.txt"
)
df_data.shape

In [None]:
import datasets
from transformers import AutoTokenizer


# Wrap the pandas frame in a Hugging Face Dataset so it can be processed with `.map`.
data = datasets.Dataset.from_pandas(df_data)

# Tokenizer belonging to the same checkpoint the model is loaded from.
tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint,
    use_fast=True,
    padding='max_length',
    max_length=256,
    truncation=True,
)

def tokenize_function(examples):
    """Tokenize the ``Command`` field of a batch, padding/truncating to 256 tokens."""
    commands = examples["Command"]
    return tokenizer(commands, truncation=True, max_length=256, padding='max_length')

# Tokenize in batches using 4 worker processes; the raw "Command" text column
# is removed from the result.
tokenized_dataset = data.map(
    tokenize_function,
    batched=True,
    num_proc=4,
    remove_columns=["Command"],
)
tokenized_dataset

In [None]:
from transformers import T5ForConditionalGeneration,TrainingArguments, set_seed,DataCollatorForLanguageModeling, Trainer

# Load the checkpoint weights that this run continues training from.
model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)
# Turn off the cache flag on the model config for the training run.
model.config.use_cache = False

set_seed(42)

# Short checkpoint name (the part after the organisation prefix), used to name
# the directory where the Trainer writes its checkpoints.
model_name = model_checkpoint.split("/")[-1]
training_args = TrainingArguments(
    # output_dir must be a filesystem path, not the model object.
    output_dir=f"{model_name}-pretraining",
    save_strategy="steps",
    save_steps=3000,
    save_total_limit=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=1,
    fp16=False,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    # The dataset only holds tokenizer outputs, so nothing needs to be pruned.
    remove_unused_columns=False,
    # No evaluation split is used during pre-training.
    do_eval=False,
    evaluation_strategy="no",
    seed=42,
    logging_steps=200,
)

# Collator that randomly masks 15% of the tokens in each batch.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

In [None]:
from transformers.trainer_utils import get_last_checkpoint

# `resume_from_checkpoint=True` raises ValueError when the output directory does
# not contain a checkpoint yet (e.g. on the very first run). Look for one
# explicitly and fall back to a fresh start (None) when there is none.
checkpoint_dir = training_args.output_dir
last_checkpoint = get_last_checkpoint(checkpoint_dir) if os.path.isdir(checkpoint_dir) else None
trainer.train(resume_from_checkpoint=last_checkpoint)

# Both calls write a directory; ".pth" is simply part of the folder name.
trainer.save_model("codet5p-pretrained.pth")
tokenizer.save_pretrained("codet5p-tokenizer")

## CodeGen Pre-training

This procedure requires a machine equipped with an Nvidia GPU and at least 16 GB of VRAM. If the error `torch.cuda.OutOfMemoryError: CUDA out of memory` is raised, the available VRAM is insufficient.

In [None]:
!CUDA_LAUNCH_BLOCKING=1 python3 powershell-offensive-code-generation/text-to-code/CodeCompletion-token/code/run_lm.py  \
        --data_dir 'powershell-offensive-code-generation/notebooks/00-dataset/pretraining/' \
        --langs 'powershell' \
        --output_dir '.' \
        --pretrain_dir 'Salesforce/codegen-350M-multi' \
        --model_type=codegen \
        --block_size=256 \
        --do_train \
        --not_pretrain \
        --node_index 0 \
        --gpu_per_node 1 \
        --learning_rate=5e-5 \
        --weight_decay=0.01 \
        --per_gpu_train_batch_size=4 \
        --gradient_accumulation_steps=2 \
        --num_train_epochs=1 \
        --logging_steps=200 \
        --save_steps=4000 \
        --save_total_limit=1 \
        --overwrite_output_dir \
        --log_file='log_pretraining_codegen' \
        --seed=42 \
        --hf_token ''

## CodeGPT Pre-training

In [None]:
!python3 powershell-offensive-code-generation/text-to-code/CodeCompletion-token/code/run_lm.py \
        --data_dir 'powershell-offensive-code-generation/notebooks/00-dataset/pretraining/' \
        --langs 'powershell' \
        --output_dir '.' \
        --pretrain_dir 'microsoft/CodeGPT-small-py' \
        --model_type=gpt2 \
        --block_size=256 \
        --do_train \
        --not_pretrain \
        --node_index 0 \
        --gpu_per_node 1 \
        --learning_rate=5e-5 \
        --weight_decay=0.01 \
        --per_gpu_train_batch_size=4 \
        --gradient_accumulation_steps=2 \
        --num_train_epochs=1 \
        --logging_steps=200 \
        --save_steps=10000 \
        --save_total_limit=1 \
        --overwrite_output_dir \
        --log_file='log_pretraining_codegpt' \
        --seed=42 \
        --hf_token ''