In [None]:
# Install the packages this notebook needs into the current runtime.
# NOTE(review): a later cell imports `transformers.adapters`, which is presumably
# provided by adapter-transformers rather than by plain `transformers`; the
# `transformers` installs further down may replace or conflict with it — confirm
# the resulting environment after a fresh "Restart and run all".
!pip install -U adapter-transformers sentencepiece
!pip install datasets
# Version ranges for accelerate / transformers[torch] / torch; `datasets` is
# requested a second time on this line.
%pip install "accelerate>=0.16.0,<1" "transformers[torch]>=4.28.1,<5" "torch>=1.13.1,<2" datasets
# NOTE(review): diffusers is not imported by any cell visible in this notebook —
# confirm whether it is still needed.
%pip install diffusers
%pip install transformers
# Client library used by the inference cell to query GPT-3.5 turbo.
%pip install openai

In [None]:
#@title Textual model training
#@markdown The first step in training the model was, of course, selecting the model itself. <br><br> We wanted a model with good performance, but also simple enough to be suitable for our small dataset
#@markdown and, after several trials, we selected a text-generation model from Google called <i>flan-t5-base</i>.<br>This model requires the training set to be encoded and tokenized in a specific format, with padding added. This is the operation performed here.

from transformers import AutoTokenizer

# Hub identifier of the checkpoint used for both the tokenizer and the model.
base_model = "google/flan-t5-base"
# Tokenizer for that checkpoint; reused by the tokenization, training and inference cells.
tokenizer = AutoTokenizer.from_pretrained(base_model)
# Text prepended to every input question before tokenization (empty: nothing is prepended).
prefix = ""

# tokenize the dataset
def encode_batch(examples):
    """Tokenize a batch of question/answer pairs for seq2seq training.

    Rows in which either the question or the answer is empty/missing are
    dropped. Questions get the module-level ``prefix`` prepended and both sides
    are truncated/padded to 512 tokens with the module-level ``tokenizer``.

    Args:
        examples: A batch mapping column names to lists; must contain the
            ``'question'`` and ``'answer'`` columns.

    Returns:
        The tokenizer output for the questions with an extra ``"labels"`` entry
        holding the tokenized answers, where padding positions are set to -100.
    """
    # the name of the input column
    text_column = 'question'
    # the name of the target column
    summary_column = 'answer'
    # used to format the tokens
    padding = "max_length"

    # keep only the rows where both sides are present
    pairs = [
        (question, answer)
        for question, answer in zip(examples[text_column], examples[summary_column])
        if question and answer
    ]
    inputs = [prefix + question for question, _ in pairs]
    targets = [answer for _, answer in pairs]

    model_inputs = tokenizer(inputs, max_length=512, padding=padding, truncation=True)
    labels = tokenizer(targets, max_length=512, padding=padding, truncation=True)

    # Targets are padded up to max_length; replace the pad token id with -100
    # so that padding positions are ignored by the loss instead of being
    # learned as real output tokens.
    pad_token_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_token_id else -100 for token_id in label_ids]
        for label_ids in labels["input_ids"]
    ]

    return model_inputs

In [None]:
#@markdown The function defined here loads a split of the dataset and applies the tokenization step to it
from datasets import load_dataset

# load the dataset
def load_split(split_name, max_items, data_files='/content/preprocessed.csv'):
    """Load one split of the CSV dataset, truncate it and tokenize it.

    Args:
        split_name: Name of the split to take from the loaded dataset
            (e.g. ``"train"``).
        max_items: Maximum number of rows, counted from the start of the
            split, to keep.
        data_files: Path of the CSV file to load. Defaults to the path the
            notebook originally hard-coded.

    Returns:
        The tokenized dataset, formatted as torch tensors with the
        ``input_ids``, ``attention_mask`` and ``labels`` columns.
    """
    # load the split
    dataset = load_dataset('csv', data_files=data_files, delimiter=',')[split_name]
    # only use the first max_items items (select avoids scanning every row)
    dataset = dataset.select(range(min(max_items, len(dataset))))
    # tokenize the dataset
    dataset = dataset.map(
        encode_batch,
        batched=True,
        remove_columns=dataset.column_names,
        desc="Running tokenizer on " + split_name + " dataset",
    )
    # set the format to torch; attention_mask must be kept, otherwise the model
    # attends to the padding added up to max_length
    dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

    return dataset

In [None]:
#@markdown Here, as the final step of the training phase, we set the training parameters and load the pretrained model.<br>
#@markdown The selected parameter values are the following: <ul><li>learning_rate = 3e-4</li><li>training epochs = 1</li><li>batch size = 1</li></ul><br>
#@markdown In addition, a <i>Low-Rank Adaptation (LoRA)</i> configuration was used: this is a technique that simplifies the training of this kind of model on specific tasks
#@markdown <br> Then, at last, the training phase is started.

from transformers import TrainingArguments, Trainer, TrainerCallback
from transformers import AutoModelForSeq2SeqLM
from transformers.adapters import LoRAConfig
import numpy as np


# start with the pretrained base model
model = AutoModelForSeq2SeqLM.from_pretrained(
    base_model
)

# set the parameters for LoRA
config = LoRAConfig(
    r=8,
    alpha=16,
    intermediate_lora=True,
    output_lora=True
)

# Attach the LoRA adapter to the model and switch the model to adapter
# training. Without these two calls the LoRA config above is never used and
# the Trainer would update every weight of the base model instead.
adapter_name = "slogan_lora"
model.add_adapter(adapter_name, config=config)
model.train_adapter(adapter_name)


# small batch size to fit in memory
batch_size = 1

training_args = TrainingArguments(
    learning_rate=3e-4,
    num_train_epochs=1,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    logging_steps=200,
    output_dir="./training_output",
    overwrite_output_dir=True,
    # the dataset is already reduced to the tensors the model needs
    remove_unused_columns=False
)

# train on the first 1000 tokenized rows of the "train" split
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=load_split("train", 1000),
)

trainer.train()

In [None]:
#@title Inference phase
#@markdown After the model has been trained on our dataset, it is time to test it by asking it to produce an advertising slogan given the company name and the field in which it operates<br><br>
#@markdown In addition, we wanted to check the quality of the results by comparing the slogans generated by our model with those obtained from <i>OpenAI GPT-3.5 turbo</i><br><br>
#@markdown Use the following boxes to input the company name and the company field

company = "" #@param {type:"string"}
field = "" #@param {type:"string"}
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
import openai
import os
from getpass import getpass

# Build the prompt; the spaces around the interpolated values keep the company
# name and the field from being glued to the neighbouring words.
input_text = "What could it be a good advertising slogan for a company called " + company + " which operates in the " + field + " field?"
# Apply the same prefix that was used when tokenizing the training inputs.
input_ids = tokenizer(prefix + input_text, return_tensors="pt").input_ids

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
# Evaluation mode disables dropout so generation is not perturbed by it.
model.eval()
input_ids = input_ids.to(device)
outputs = model.generate(input_ids)

# Never hard-code the API key in the notebook: read it from the environment, or
# ask for it interactively without echoing it. The key that used to be written
# here has been exposed and should be revoked.
openai.api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")
completion = openai.ChatCompletion.create(
  model="gpt-3.5-turbo",
  messages=[
    {"role": "user", "content": input_text}
  ]
)



print(input_text)
# skip_special_tokens drops markers such as <pad> and </s> from the decoded slogan
print("Custom model output: " + tokenizer.decode(outputs[0], skip_special_tokens=True))
print("GPT-3.5 turbo output: " + completion.choices[0].message.content)