# Loading dataset

In [None]:
import os

from huggingface_hub.hf_api import HfFolder

# Read the Hugging Face access token from the environment instead of
# hard-coding it in the notebook, and skip saving when none is supplied so an
# empty string never overwrites a token that is already stored locally.
hf_token = os.environ.get("HF_TOKEN", "")
if hf_token:
    HfFolder.save_token(hf_token)

# Name of the fine-tuned model; used as the training output directory.
name_model = "codet5p-220m-py-pretraining-powershell-89k"
# Identifier of the pretrained checkpoint that the tokenizer and model load.
model_checkpoint = "Salesforce/codet5p-220m-py"

In [None]:
import torch

# Show whether a CUDA device is usable, followed by the value of
# torch.version.cuda, one value per line.
print(torch.cuda.is_available(), torch.version.cuda, sep="\n")

In [None]:
import pandas as pd
import logging as lg
import os

def extract_data(path, encoding=None) -> pd.DataFrame:
    """Load a text file into a one-column DataFrame, one row per line.

    Args:
        path: Location of the text file to read.
        encoding: Text encoding handed to ``open``. The default ``None``
            keeps the platform default, i.e. the behaviour of calling this
            function without the argument.

    Returns:
        A DataFrame with a single ``Command`` column holding every line of
        the file with leading and trailing whitespace stripped. Blank lines
        are kept as empty strings.
    """
    print(path)
    # Iterate over the file object directly instead of materialising
    # readlines(); the with-block closes the file, so no explicit close().
    with open(path, 'r', encoding=encoding) as f:
        list_cmd = [line.strip() for line in f]
    lg.debug("Extracted %s: %d", path, len(list_cmd))

    return pd.DataFrame(data={"Command": list_cmd})

# Read the training file into a DataFrame; the trailing expression shows
# its shape as the cell output.
df_data = extract_data(path="../dataset/train.txt")
df_data.shape

## Preprocessing with datasets library

In [None]:
import datasets

# Wrap the pandas frame in a `datasets.Dataset` so it can be tokenized with
# `.map()`; the trailing expression displays the dataset as the cell output.
data = datasets.Dataset.from_pandas(df=df_data)
data

## Tokenizer setup

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to the base checkpoint, requesting the
# fast implementation.
# NOTE(review): padding/max_length/truncation are also supplied on every call
# inside tokenize_function — confirm they have any effect when passed to
# from_pretrained.
tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint,
    use_fast=True,
    padding='max_length',
    max_length=256,
    truncation=True,
)

def tokenize_function(examples):
    """Tokenize the ``Command`` field of ``examples``.

    The module-level ``tokenizer`` is called with truncation enabled,
    ``max_length=256`` and ``padding='max_length'``; its result is returned
    unchanged.
    """
    commands = examples["Command"]
    return tokenizer(
        commands,
        truncation=True,
        max_length=256,
        padding='max_length',
    )

# Tokenize the dataset in batches using 4 worker processes and drop the raw
# "Command" text column; the trailing expression displays the result.
tokenized_dataset = data.map(
    tokenize_function,
    batched=True,
    num_proc=4,
    remove_columns=["Command"],
)
tokenized_dataset

# Trainer

In [None]:
from transformers import T5ForConditionalGeneration,TrainingArguments, set_seed,DataCollatorForLanguageModeling, Trainer
# Load the pretrained checkpoint as a T5 conditional-generation model.
model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)
# Switch off the use_cache flag in the model config.
model.config.use_cache = False

# Seed the random number generators.
set_seed(42)

# Last path component of the checkpoint id (the part after the final "/").
model_name = model_checkpoint.rsplit("/", 1)[-1]
training_args = TrainingArguments(
    output_dir=name_model,
    # Reproducibility
    seed=42,
    # Optimisation: a single epoch, batches of 8 per device accumulated over
    # 2 steps, full precision.
    num_train_epochs=1,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    fp16=False,
    # Checkpointing: save every 3000 steps with a limit of 5 checkpoints.
    save_strategy="steps",
    save_steps=3000,
    save_total_limit=5,
    # No evaluation during training.
    do_eval=False,
    evaluation_strategy="no",
    # Logging and column handling
    logging_steps=200,
    remove_unused_columns=False,
)

# Language-modeling collator built on the tokenizer, with a masking
# probability of 0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm_probability=0.15,
)

# Bundle the model, training arguments, tokenized data and collator.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=tokenized_dataset,
)

In [None]:
from torch.cuda import empty_cache
# Release cached CUDA memory held by PyTorch's allocator; this cell sits
# directly before the training cell.
empty_cache()

In [None]:
import os

from transformers.trainer_utils import get_last_checkpoint

# resume_from_checkpoint=True makes Trainer raise when the output directory
# holds no checkpoint (e.g. on the very first run). Look the checkpoint up
# explicitly and start a fresh run when there is none.
last_checkpoint = None
if os.path.isdir(trainer.args.output_dir):
    last_checkpoint = get_last_checkpoint(trainer.args.output_dir)

trainer.train(resume_from_checkpoint=last_checkpoint)

In [None]:
# Persist the trained model and the tokenizer to disk.
# NOTE(review): save_model's argument is an output directory, so "model.pth"
# is presumably created as a folder rather than a single file — confirm.
trainer.save_model(output_dir="model.pth")
tokenizer.save_pretrained(save_directory="tokenizer")
