## Runs Analysis

In [1]:
import sys
# Add the ConvSim checkout to the import path, presumably so the `src.*`
# package can be imported by later cells.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
sys.path.append("/home/ubuntu/ConvSim")

In [2]:
from src.data_classes import ConversationalTurn
import shelve

In [3]:
# Accumulator for training examples: the loading cells append
# {"conversation": [...], "rewrite": ...} dicts to this list.
# Re-run this cell before re-running a loading cell, otherwise that run's
# examples are appended a second time.
dataset = []

In [4]:
# Collect (conversation, rewrite) examples from the `cast_cq_with_feedback`
# run, keeping only turns whose user utterance type is "feedback".
run_name = "cast_cq_with_feedback"
with shelve.open(f"../data/generated_conversations/{run_name}/turns_db") as db:
    for turn_id in db:
        turn = db[turn_id]
        if turn.user_utterance_type != "feedback":
            continue
        # Conversation = every historical utterance followed by the current user utterance.
        utterances = [past_turn['utterance'] for past_turn in turn.conversation_history]
        utterances.append(turn.user_utterance)
        # Drop any "USER: " marker from the rewrite and trim surrounding whitespace.
        cleaned_rewrite = turn.rewritten_utterance.replace("USER: ", "").strip()
        dataset.append({"conversation": utterances, "rewrite": cleaned_rewrite})

In [5]:
# Collect (conversation, rewrite) examples from every turn stored for the
# `cast_rewrites_no_feedback` run (no filtering on utterance type).
run_name = "cast_rewrites_no_feedback"
with shelve.open(f"../data/generated_conversations/{run_name}/turns_db") as db:
    for turn in db.values():
        # Historical utterances first, then the current user utterance.
        utterances = [past_turn['utterance'] for past_turn in turn.conversation_history]
        utterances.append(turn.user_utterance)
        dataset.append({
            "conversation": utterances,
            # Strip any "USER: " marker and surrounding whitespace from the rewrite.
            "rewrite": turn.rewritten_utterance.replace("USER: ", "").strip(),
        })

In [6]:
# Collect (conversation, rewrite) examples from every turn stored for the
# `cast_response_with_feedback` run.
run_name = "cast_response_with_feedback"
turns_db_path = f"../data/generated_conversations/{run_name}/turns_db"
with shelve.open(turns_db_path) as db:
    for turn_id in db:
        turn = db[turn_id]
        history = [entry['utterance'] for entry in turn.conversation_history]
        # The current user utterance closes the conversation.
        full_conversation = history + [turn.user_utterance]
        cleaned_rewrite = turn.rewritten_utterance.replace("USER: ", "").strip()
        dataset.append({
            "conversation": full_conversation,
            "rewrite": cleaned_rewrite,
        })

In [7]:
# Display the first collected example to sanity-check its structure.
dataset[0]

{'conversation': ['I remember Glasgow hosting COP26 last year, but unfortunately I was out of the loop. What was it about?',
  'The COP26 event is a global united Nations summit about climate change and how countries are planning to tackle it. The term “climate change” is often used as if it means the same thing as the term “global warming”. The National Academy of Sciences says “climate change” is growing in favor of “global warming” because it helps convey that there are other changes in addition to rising temperatures. In fact, “climate change” means major changes in temperature, rainfall, snow, or wind patterns lasting for decades or longer.',
  'Interesting. What are the effects of these changes?',
  'Are you looking for information on the effects of climate change on ecosystems, human health, or global economy?',
  "Yes, that's what I'm interested in."],
 'rewrite': 'What are the impacts of climate change on the world?'}

In [8]:
from transformers import AutoTokenizer
from datasets import Dataset

MODEL_NAME = "castorini/t5-base-canard"  # alternative base checkpoint: "t5-base"
SEED = 42  # shared seed so both the shuffle and the train/test split are reproducible

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# NOTE: `dataset` is rebound from the plain list of examples to a Dataset and
# then to a train/test split, so this cell cannot be re-run on its own; re-run
# the loading cells first.
dataset = Dataset.from_list(dataset)
# dataset = dataset.select(range(10000))
dataset = dataset.shuffle(seed=SEED)
# Seed the split too: train_test_split shuffles again by default, so without
# a seed the held-out 10% differs on every fresh run.
dataset = dataset.train_test_split(test_size=0.1, seed=SEED)

  from .autonotebook import tqdm as notebook_tqdm
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [9]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
import evaluate
import numpy as np

# Batch collator for the seq2seq trainer. Its `model` argument expects a model
# instance, not a checkpoint name; the string that used to be passed here was
# silently ignored, so the argument is omitted.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)
# ROUGE metric used by compute_metrics during evaluation.
rouge = evaluate.load("rouge")

2024-01-25 17:23:21.817775: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-01-25 17:23:21.820173: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-01-25 17:23:21.872368: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [10]:
# Text prepended to every model input (currently empty).
prefix = ""

def preprocess_function(examples):
    """Tokenize a batch of conversations and their target rewrites.

    Each conversation (a list of utterances) is flattened into one string with
    " ||| " between turns and prepended with the module-level ``prefix``.

    Args:
        examples: batch mapping with a 'conversation' column (one list of
            utterance strings per example) and a "rewrite" column holding the
            target texts.

    Returns:
        The tokenizer output for the flattened inputs, with the tokenized
        rewrites' ``input_ids`` stored under the "labels" key.
    """
    flattened = []
    for turns in examples['conversation']:
        flattened.append(" ||| ".join(turns))
    # Echo the first flattened conversation of the batch for visual inspection.
    print(flattened[0])
    source_texts = [prefix + text for text in flattened]
    model_inputs = tokenizer(source_texts, max_length=1024, truncation=True)

    target_encoding = tokenizer(text_target=examples["rewrite"], max_length=256, truncation=True)
    model_inputs["labels"] = target_encoding["input_ids"]
    return model_inputs


def compute_metrics(eval_pred):
    """Compute ROUGE scores and mean generation length for an evaluation pass.

    Args:
        eval_pred: pair ``(predictions, labels)`` of token-id arrays.

    Returns:
        Dict with the values returned by ``rouge.compute`` plus ``gen_len``
        (mean count of non-pad tokens per prediction), each rounded to four
        decimal places.
    """
    generated_ids, reference_ids = eval_pred
    pad_id = tokenizer.pad_token_id

    # Replace -100 entries with the pad token id so both arrays can be decoded.
    generated_ids = np.where(generated_ids != -100, generated_ids, pad_id)
    reference_ids = np.where(reference_ids != -100, reference_ids, pad_id)

    decoded_preds = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(reference_ids, skip_special_tokens=True)

    scores = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Generation length = number of non-pad tokens in each prediction.
    scores["gen_len"] = np.mean([np.count_nonzero(seq != pad_id) for seq in generated_ids])

    rounded = {}
    for metric_name, value in scores.items():
        rounded[metric_name] = round(value, 4)
    return rounded


In [11]:
# Apply preprocess_function batch-wise to the train/test splits.
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/548 [00:00<?, ? examples/s]



Map: 100%|██████████| 548/548 [00:00<00:00, 2227.33 examples/s]
Map: 100%|██████████| 61/61 [00:00<00:00, 1998.82 examples/s]

What should I consider when buying a phone? ||| The design of the phone and the overall look and feel of the phone are very important. You should be comfortable with the way the phone looks and feels when you hold it in your hand. In addition, don’t be afraid to get a phone from a different manufacturer than you’re used to. Consider an older, used, or refurbished phone to save money. Sometimes a year-old or even a two-year-old device might give you everything you need. ||| I've heard iPhones look and feel great. Should I get one? ||| To select a cell phone, it is important to research several factors, including price, early termination fees, network availability, and other factors. Choosing a phone with a solid foundation will help you choose a plan that truly fits your needs. Price is also important to consider, as it can lead to a premium. Storage is also important, as it can slow down and cause problems, so it is important to research and compare prices. Additionally, showcasing a d




In [12]:
# Load the seq2seq checkpoint named by MODEL_NAME.
# NOTE(review): device_map="auto" presumably leaves device placement to the library — confirm.
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME, device_map="auto")

In [13]:
# Build a filesystem-safe name WITHOUT overwriting MODEL_NAME: the original
# checkpoint id (containing "/") is still needed if the tokenizer/model
# loading cells are re-run after this one.
model_slug = MODEL_NAME.replace("/", "-")
TUNED_MODEL_NAME = f"../../data/models/tuned-{model_slug}-rewriter-v2"

training_args = Seq2SeqTrainingArguments(
    output_dir=TUNED_MODEL_NAME,
    evaluation_strategy="epoch",  # evaluate once per epoch
    # eval_steps=5,
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,  # 8 * 4 = 32 examples per optimizer step per device
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=20,
    predict_with_generate=True,  # compute_metrics decodes generated token ids
    generation_max_length=128,
    #generation_num_beams=4,
    fp16=True,
    # auto_find_batch_size=True,
    # load_best_model_at_end=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [14]:
# Launch fine-tuning; the returned value is displayed as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
0,No log,1.591093,0.303,0.1674,0.2872,0.2878,26.8033
1,No log,1.269357,0.4274,0.2435,0.3922,0.3958,13.7377
2,No log,1.175722,0.4521,0.2792,0.4282,0.4323,13.6066
4,No log,1.089726,0.4627,0.2779,0.4294,0.432,15.0656
5,No log,1.061272,0.4693,0.2903,0.4359,0.4392,14.9344
6,No log,1.041786,0.4892,0.3121,0.4525,0.4545,15.6885
8,No log,1.009362,0.5075,0.3309,0.4729,0.4756,13.623
9,No log,1.001888,0.5072,0.3244,0.4714,0.4738,13.3934
10,No log,0.998318,0.5122,0.3386,0.477,0.4803,15.6557
12,No log,0.985497,0.5121,0.3333,0.4741,0.476,15.5902


TrainOutput(global_step=340, training_loss=1.0856819601619945, metrics={'train_runtime': 3428.4876, 'train_samples_per_second': 3.197, 'train_steps_per_second': 0.099, 'total_flos': 1.258562293069824e+16, 'train_loss': 1.0856819601619945, 'epoch': 19.71})

In [15]:
from transformers import pipeline

# Wrap the trainer's model and tokenizer in a text2text-generation pipeline
# for ad-hoc inference.
rewriter = pipeline("text2text-generation", model=trainer.model, tokenizer=trainer.tokenizer)

In [23]:
# Smoke-test the rewriter on a hand-written conversation; turns are joined
# with " ||| ", the same separator preprocess_function uses for model inputs.
rewriter("Can you help me find a diet for myself? ||| What kind of diet do you want? ||| I want something that is vegan-friendly, maintainable and not very hard to keep up", max_length=64)

[{'generated_text': 'Can you provide me with a diet that is vegan-friendly, maintainable and not very hard to keep up with?'}]

In [17]:
# Persist the trained model.
# NOTE(review): no path is passed, so this presumably writes to the trainer's
# configured output_dir (TUNED_MODEL_NAME) — confirm.
trainer.save_model()