In [14]:
import json
import os


In [27]:
answer_mapping = {
    "A": 0,
    "B": 1,
    "C": 2,
    "D": 3
}

class Sample:
    def __init__(self, sample_id, answer, options, article, generated_utterances=None):
        self.sample_id = sample_id
        self.answer = answer
        self.options = options
        self.article = article
        
        self.label_descr = options[answer_mapping[answer]]
        
        self.generated_utterances = generated_utterances or []

    def __str__(self):
        return f"Sample ID: {self.sample_id}\nAnswer Number: {self.answer}\nAnswer: {self.label_descr}\nOptions: {self.options}\nContext: {self.article}"

def load_sample_from_file(filename):
    with open(filename, 'r') as file:
        data = json.loads(file.read())
    return Sample(
        data['id'],
        data['answers'],
        data['options'],
        data['article']
    )

def load_all_samples(base_dir,mode):
    data_dir = os.path.join(base_dir, mode)
    samples = []
    for filename in os.listdir(data_dir):
        if filename.endswith('.txt'):
            sample = load_sample_from_file(os.path.join(data_dir, filename))
            samples.append(sample)
    return samples


In [30]:
def compute_averages(samples):
    total_word_count = 0
    total_sentences = 0
    total_turns = 0

    for sample in samples:
        # average sentence length for the responses
        for option in sample.options:
            total_word_count += len(option.split())
            total_sentences += 1

        # average turns in a conversation
        turns = sample.article.split('.')
        total_turns += len([t for t in turns if t.strip()])  # Filtering out any empty turns

    avg_sentence_length = total_word_count / total_sentences
    avg_turns = total_turns / len(samples)

    return avg_sentence_length, avg_turns

In [34]:
base_dir = r"data\mutual"
train_samples = load_all_samples(base_dir, "train")

for i in range(2):
    print(train_samples[i])
    print("   ")

Sample ID: train_1
Answer Number: B
Answer: f : although the suit you sew is the same as it , the material of this suit is imported from italy .
Options: ["f : no suit has the same style as it . it 's the style that makes it special . it is worth the price .", 'f : although the suit you sew is the same as it , the material of this suit is imported from italy .', 'f : the material of this suit is imported from france , it makes the suit special .', 'f : but the color of our suit is very special .']
Context: m : excuse me . how much is this suit ? f : it 's on sale today for $ 750 . it 's normally $ 900 . m : wow , that is pretty expensive ! i was thinking that it might be 4 or 500 . f : this material is imported from italy . it 's the finest in the world , and if you bought a suit made of this material at many department stores , you would pay about $ 2000 . m : uh-hah . but is n't that the point of coming to a market like this , to get a discount compared to the expensive department st

In [35]:
avg_sentence_length, avg_turns = compute_averages(train_samples)

print(f"Average sentence length of responses: {avg_sentence_length:.2f} words")
print(f"Average turns per conversation: {avg_turns:.2f}")

Average sentence length of responses: 17.87 words
Average turns per conversation: 6.66


In [43]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")
model = GPT2LMHeadModel.from_pretrained("gpt2-medium")

def generate_utterances_for_sample(sample, num_sequences=3):
   
    input_text = sample.article
    input_ids = tokenizer.encode(input_text, return_tensors="pt")

    output = model.generate(input_ids, max_length=400, num_return_sequences=num_sequences, pad_token_id=tokenizer.eos_token_id,
                           num_beams=3,early_stopping=True)

    generated_utterances = [tokenizer.decode(o) for o in output]
    sample.generated_utterances = generated_utterances

In [None]:
def augment_samples(samples):
    for sample in samples:
        utterances = generate_utterances_for_sample(sample)
        sample.generated_utterances = utterances

augment_samples(train_samples)

for sample in samples[:5]:
    print(sample)
    print("-" * 50)

Downloading model.safetensors:  42%|████████████████████▏                           | 640M/1.52G [25:18<34:50, 421kB/s]
