# Training
This notebook requires the models produced by the pre-training notebook; before running it, update the path placeholders below so that they point to the pre-trained models.
The pre-trained CodeGPT and CodeGen models were fine-tuned using Microsoft's [CodeXGLUE](https://github.com/microsoft/CodeXGLUE) framework; therefore, fine-tuning only requires running the commands below:



## CodeGPT

In [None]:
# Fine-tune the pre-trained CodeGPT model (model_type gpt2) and run inference with run.py.
# Replace [INSERT PATH] with the location of the pre-trained model before running this cell.
!CUDA_LAUNCH_BLOCKING=1 python3 powershell-offensive-code-generation/text-to-code/Text-Code/text-to-code/code/run.py \
    --data_dir="powershell-offensive-code-generation/notebooks/00-dataset/json/" \
    --langs="powershell" \
    --output_dir="./output" \
    --pretrain_dir='[INSERT PATH]' \
    --model_type=gpt2 \
    --do_train \
    --do_infer \
    --node_index 0 \
    --gpu_per_node 1 \
    --learning_rate=5e-5 \
    --weight_decay=0.01 \
    --block_size=1024 \
    --evaluate_during_training \
    --per_gpu_train_batch_size=1 \
    --per_gpu_eval_batch_size=4 \
    --gradient_accumulation_steps=4 \
    --num_train_epochs=30 \
    --logging_steps=200 \
    --save_steps=400 \
    --save_total_limit=1 \
    --overwrite_output_dir \
    --log_file='./content/text2code.log' \
    --seed=42 \
    --hf_token=""

## CodeGen

In [None]:
# Fine-tune the pre-trained CodeGen model (model_type codegen) and run inference with run.py.
# Replace [INSERT PATH] with the location of the pre-trained model before running this cell.
# Every continued line must end with the backslash itself (no trailing whitespace after it);
# otherwise the command is cut short at that line and the remaining flags are not passed.
!CUDA_LAUNCH_BLOCKING=1 python3 powershell-offensive-code-generation/text-to-code/Text-Code/text-to-code/code/run.py \
    --data_dir="powershell-offensive-code-generation/notebooks/00-dataset/json/" \
    --langs="powershell" \
    --output_dir="./output" \
    --pretrain_dir='[INSERT PATH]' \
    --model_type=codegen \
    --do_train \
    --do_infer \
    --node_index 0 \
    --gpu_per_node 1 \
    --learning_rate=5e-5 \
    --weight_decay=0.01 \
    --block_size=1024 \
    --evaluate_during_training \
    --per_gpu_train_batch_size=1 \
    --per_gpu_eval_batch_size=4 \
    --gradient_accumulation_steps=4 \
    --num_train_epochs=30 \
    --logging_steps=200 \
    --save_steps=400 \
    --save_total_limit=1 \
    --overwrite_output_dir \
    --log_file='./content/text2code.log' \
    --seed=42 \
    --hf_token=""

## CodeT5+

In [None]:
import torch
from torch.utils.data import DataLoader
from transformers import T5ForConditionalGeneration, AutoTokenizer
from datasets import load_dataset
from tqdm import tqdm as tqdm_bar

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Location of the pre-trained model; must be edited before running this cell.
checkpoint = '[PATH OF PRETRAINED MODEL]'                                                               # INSERT PATH
if checkpoint == '[PATH OF PRETRAINED MODEL]':
    # Fail early with a clear message instead of an obscure loading error.
    raise ValueError('Set `checkpoint` to the path of the pre-trained model before running this cell.')

# Load the model and tokenizer
model = T5ForConditionalGeneration.from_pretrained(checkpoint).to(device)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Load the dataset
dataset = load_dataset('dessertlab/offensive-powershell')

# Preprocess the data
def preprocess(data):
    """Tokenize natural-language inputs and target code for seq2seq training.

    Args:
        data: a single example or a batch of examples (as passed by
            ``dataset.map(..., batched=True)``) exposing an ``'nl'`` field
            (model input) and a ``'code'`` field (target).

    Returns:
        The tokenizer encoding of ``data['nl']`` (truncated/padded to 512
        tokens) with an added ``'labels'`` entry holding the token ids of
        ``data['code']``, where every padding position is set to ``-100``.
    """
    def mask_padding(token_ids, attention):
        # -100 is the index ignored by the loss the model computes from `labels`,
        # so padded positions no longer contribute to the training/validation loss.
        return [tok if keep else -100 for tok, keep in zip(token_ids, attention)]

    encoding = tokenizer(data['nl'], truncation=True, padding='max_length', max_length=512)
    target = tokenizer(data['code'], truncation=True, padding='max_length', max_length=512)
    target_ids = target['input_ids']
    target_mask = target['attention_mask']

    if isinstance(data['code'], str):
        # Single (non-batched) example: ids and mask are flat lists.
        encoding['labels'] = mask_padding(target_ids, target_mask)
    else:
        # Batched call: one list of ids / mask values per example.
        encoding['labels'] = [
            mask_padding(ids, attention)
            for ids, attention in zip(target_ids, target_mask)
        ]
    return encoding

# Tokenize every split, then expose the columns as torch tensors placed on `device`.
dataset = dataset.map(preprocess, batched=True)
dataset =  dataset.with_format("torch", device=device)
# Split the dataset
train_dataset = dataset['train']
val_dataset = dataset['dev']

# Create data loaders: one example per batch; only the training data is shuffled.
train_loader = DataLoader(
    train_dataset,
    batch_size=1,
    shuffle=True,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=1,
)

# Define the optimizer and loss function
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=5e-5,
    weight_decay=0.01,
)
# Cross-entropy that skips positions equal to the tokenizer's pad token id.
# NOTE(review): the training loop in this notebook reads `outputs.loss` from the
# model and does not call `loss_fn` — confirm whether it is still needed.
loss_fn = torch.nn.CrossEntropyLoss(
    ignore_index=tokenizer.pad_token_id,
)



In [None]:
# Training loop
#
# For every epoch: iterate over `train_loader` with gradient accumulation
# (one optimizer step every `accumulation_steps` batches, plus a final step on
# the last batch), then compute and print the mean validation loss.

accumulation_steps = 2
num_epochs = 30


for epoch in range(num_epochs):
    # Validation at the end of each epoch switches the model to eval mode, so
    # training mode has to be re-enabled at the start of every epoch.
    model.train(mode=True)
    # Start the epoch with clean gradients; they are cleared again only after
    # each optimizer step so that gradients really accumulate across batches.
    optimizer.zero_grad()

    with tqdm_bar(total=len(train_loader), desc=f'Epoch {epoch + 1}/{num_epochs}', unit='batch') as pbar:
        for batch_idx, batch in enumerate(train_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            # Scale the loss so the accumulated gradient is the mean over the
            # accumulated batches rather than their sum.
            (loss / accumulation_steps).backward()

            # Step once enough batches have been accumulated, or on the last batch
            # so that leftover gradients are not discarded.
            if (batch_idx + 1) % accumulation_steps == 0 or batch_idx == len(train_loader) - 1:
                optimizer.step()
                optimizer.zero_grad()

            pbar.update(1)

            # Report the unscaled loss of the current batch.
            pbar.set_postfix({'Loss': loss.item()})

        # Validation: no gradient tracking, model in eval mode.
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(model.device)
                attention_mask = batch['attention_mask'].to(model.device)
                labels = batch['labels'].to(model.device)
                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                val_loss += outputs.loss.item()
        print(f'Epoch: {epoch}, Validation Loss: {val_loss/len(val_loader)}')