In [None]:
pip install transformers

In [None]:
pip install transformers[torch]

In [None]:
pip install datasets transformers pandas

In [None]:
pip install sentencepiece

In [None]:
pip install evaluate

In [1]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import Dataset, DatasetDict
import evaluate
import json

In [2]:
# Load the dataset.
# NOTE(review): this is a machine-specific absolute path; point DATA_PATH at your
# own copy of the file (ideally a path relative to the project) before running.
DATA_PATH = r'C:\Users\Jlngo\BTT- Axle Informatic\documents_with_descriptions.json'

# JSON is UTF-8 by specification; without an explicit encoding, open() falls back
# to the platform locale (e.g. cp1252 on Windows) and can mis-decode or fail on
# non-ASCII characters.
with open(DATA_PATH, 'r', encoding='utf-8') as file:
    raw_data = json.load(file)

In [None]:
# Preview only the first few records; echoing the whole corpus floods the cell
# output and bloats the saved notebook.
raw_data[:3]

In [3]:
def _make_prompt(entry):
    """Build the instruction prompt for one raw record.

    Reads 'description' and 'baseCommand' from the record's 'metadata' mapping,
    substituting a generic phrase when either key is absent.
    """
    metadata = entry['metadata']
    description = metadata.get('description', 'an unspecified task')
    base_command = metadata.get('baseCommand', 'a specific tool')
    return f"Create a CWL pipeline for {description} using {base_command}"


# Two parallel columns: the prompt fed to the model and the text it should produce.
data = dict(
    input=list(map(_make_prompt, raw_data)),
    target_text=[record['page_content'] for record in raw_data],
)

In [None]:
# Preview the first few rows of each column instead of echoing every example.
{column: values[:3] for column, values in data.items()}

In [4]:
# Wrap the column-oriented dict (one list per column) in a `datasets.Dataset`.
dataset = Dataset.from_dict(data)

In [5]:
# Hold out 20% of the examples; the fixed seed keeps the split reproducible.
# Note: this rebinds `dataset` from the full dataset to the returned split
# (indexed below by 'train' and 'test').
dataset = dataset.train_test_split(test_size = 0.2, seed =32)

In [6]:
# Split the held-out portion in half with the same seed; its 'train' half is used
# as the validation set and its 'test' half as the test set further down.
test_valid_split = dataset['test'].train_test_split(test_size = 0.5, seed = 32)

In [7]:
# Assemble the final three-way split. The held-out portion's 'train' half becomes
# the validation set and its 'test' half becomes the test set.
final_splits = {}
final_splits['train'] = dataset['train']
final_splits['validation'] = test_valid_split['train']
final_splits['test'] = test_valid_split['test']
dataset = DatasetDict(final_splits)

In [8]:
# Load the tokenizer for the google/flan-t5-base checkpoint (the data itself is
# tokenized later by preprocess_data).
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")

In [9]:
def preprocess_data(examples, max_input_length=512, max_target_length=512,
                    ignore_pad_token_for_loss=False):
    """Tokenize a batch of prompt/target pairs for seq2seq training.

    Args:
        examples: Batch mapping with an "input" list (prompts) and a
            "target_text" list (expected outputs), as passed by
            ``Dataset.map(..., batched=True)``.
        max_input_length: Length the prompts are truncated/padded to.
        max_target_length: Length the targets are truncated/padded to.
        ignore_pad_token_for_loss: When True, padded label positions are
            replaced with -100 so the loss skips them. Defaults to False to
            keep the existing behaviour, where labels keep the pad token id.
            NOTE(review): only enable this together with a metrics function
            that maps -100 back to the pad token id before decoding.

    Returns:
        The tokenizer output for the prompts with an added "labels" entry
        holding the target token ids.
    """
    model_inputs = tokenizer(
        examples["input"],
        max_length=max_input_length,
        truncation=True,
        padding="max_length",
    )
    label_ids = tokenizer(
        examples["target_text"],
        max_length=max_target_length,
        truncation=True,
        padding="max_length",
    ).input_ids

    if ignore_pad_token_for_loss:
        pad_id = tokenizer.pad_token_id
        label_ids = [
            [-100 if token_id == pad_id else token_id for token_id in sequence]
            for sequence in label_ids
        ]

    model_inputs["labels"] = label_ids
    return model_inputs

In [10]:
# Apply tokenization to the dataset; batched=True hands preprocess_data
# batches of examples (lists per column) rather than one example at a time.
tokenized_dataset = dataset.map(preprocess_data, batched=True)

Map:   0%|          | 0/628 [00:00<?, ? examples/s]

Map:   0%|          | 0/78 [00:00<?, ? examples/s]

Map:   0%|          | 0/79 [00:00<?, ? examples/s]

In [None]:
pip install absl-py rouge_score nltk

In [11]:
# Load the evaluation metric. ROUGE compares machine-generated text to one or more
# reference texts written by humans; it is commonly used to measure the quality of
# machine translation and automatic summarization output.
rouge = evaluate.load('rouge')

ROUGE scores provide an evaluation of how similar the generated text is to the reference text, which serves as a measure of quality rather than strict accuracy.

In [20]:
#computing the ROUGE score 
def compute_metrics(eval_pred):
    """Compute ROUGE scores (as percentages) for generated predictions.

    Args:
        eval_pred: A ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may itself be a tuple whose first element holds
            the generated token ids.

    Returns:
        Dict mapping each ROUGE variant to its score scaled to 0-100.
    """
    # Local import keeps this cell self-contained; numpy is already a
    # dependency of transformers.
    import numpy as np

    predictions, labels = eval_pred
    # Some models return extra outputs alongside the generated tokens.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 marks ignored/padded positions and is not a valid token id, so map
    # it back to the pad token before decoding.
    pad_id = tokenizer.pad_token_id
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    # Decode predictions and labels back to text.
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Compute ROUGE.
    raw_scores = rouge.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )

    result = {}
    for key, value in raw_scores.items():
        # `evaluate`'s ROUGE returns plain floats; older metric implementations
        # returned aggregate objects exposing `.mid.fmeasure`. Support both.
        score = value.mid.fmeasure if hasattr(value, "mid") else value
        result[key] = float(score) * 100  # Convert to percentage

    # Print results for each evaluation pass.
    print(f"Epoch evaluation result: {result}")
    return result

In [17]:
# Initialize model and training arguments
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")
training_args = Seq2SeqTrainingArguments(
    output_dir="flan-t5-cwl-finetuned",
    eval_strategy="epoch",
    # The previous value of 0.01 is far above what is recommended for
    # fine-tuning T5 with the Trainer's default AdamW optimizer (roughly
    # 1e-4 to 3e-4) and is likely to make training diverge.
    learning_rate=3e-4,  # edit this if necessary
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,  # edit this if necessary
    save_total_limit=3,
    num_train_epochs=1,  # edit this if necessary
    predict_with_generate=True,
    # Without an explicit limit, evaluation generates only up to the model's
    # default generation length (typically about 20 tokens), so ROUGE would
    # compare short fragments against references of up to 512 tokens.
    generation_max_length=512,
)

In [18]:
# Initialize the Trainer: it trains on the 'train' split, evaluates on the
# 'validation' split, and scores generated text with compute_metrics. The
# 'test' split is not passed here and stays untouched during training.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    processing_class=tokenizer,
    compute_metrics=compute_metrics  # Use the custom compute_metrics function
)


In [21]:
# Run fine-tuning with the trainer's model, arguments and datasets.
trainer.train()

KeyboardInterrupt: 

In [None]:
# Save the model and tokenizer together. Reusing training_args.output_dir keeps a
# single source of truth for the directory instead of repeating the literal.
model.save_pretrained(training_args.output_dir)
tokenizer.save_pretrained(training_args.output_dir)

In [None]:
# Assess generalization capacity: score the model on the data it was trained on
# so the numbers can be compared against the validation metrics.
# metric_key_prefix="train" labels the returned metrics train_* rather than the
# default eval_*, which would make them look like validation results.
train_results = trainer.evaluate(
    eval_dataset=tokenized_dataset["train"],
    metric_key_prefix="train",
)
print("Training Set Results:", train_results)