<a href="https://colab.research.google.com/github/garik714/call-center-conversation-generation/blob/main/Conversation_Generation_Llama3_ipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

# Cell 1 – Install
!pip install -q transformers[torch] datasets accelerate evaluate rouge_score bitsandbytes peft



  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.7/60.7 MB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone


In [2]:
# Cell 2 – Imports
import torch
import json
import numpy as np
import zipfile
import evaluate
import os
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)


In [3]:
# Cell 3 – Mount Drive & Extract
from google.colab import drive
drive.mount('/content/drive')

zip_path = "/content/drive/MyDrive/Machine Learning Researcher_Task.zip"  # Adjust if needed
extract_path = "/content/task_data"
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)
print("✅ Extraction complete")


Mounted at /content/drive
✅ Extraction complete


In [4]:
# Cell 4 – Load Dataset
base_path = "/content/task_data"
data_files = {
    "train": f"{base_path}/train.json",
    "validation": f"{base_path}/validation.json",
    "test": f"{base_path}/test_summary_only.json"
}
dataset = load_dataset("json", data_files=data_files)
print(dataset)

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'conversation', 'summary'],
        num_rows: 200
    })
    validation: Dataset({
        features: ['id', 'conversation', 'summary'],
        num_rows: 50
    })
    test: Dataset({
        features: ['id', 'conversation', 'summary'],
        num_rows: 50
    })
})


In [5]:
# Cell 5 – Format Data for Mistral
def format_conversation(example):
    prompt = f"<s>[INST] Generate a customer service conversation (3-6 turns) from this summary.\nClient speaks first, Agent responds professionally.\n\nSummary: {example['summary']} [/INST]\n\n{example['conversation']}</s>"
    return {"text": prompt}

def format_test(example):
    prompt = f"<s>[INST] Generate a customer service conversation (3-6 turns) from this summary.\nClient speaks first, Agent responds professionally.\n\nSummary: {example['summary']} [/INST]"
    return {"prompt": prompt}

train_data = dataset["train"].map(format_conversation)
val_data = dataset["validation"].map(format_conversation)
test_data = dataset["test"].map(format_test)

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

In [6]:
from peft import LoraConfig, get_peft_model, TaskType

model_name = "mistralai/Mistral-7B-Instruct-v0.3"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"


bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True
)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

#  Configure LoRA
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"]
)

# Attach LoRA adapters to the quantized model
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

Loading weights:   0%|          | 0/291 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

trainable params: 6,815,744 || all params: 7,254,839,296 || trainable%: 0.0939


In [7]:
# Cell 7 – Tokenize
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=512, padding=False)

tokenized_train = train_data.map(tokenize_function, batched=True, remove_columns=["text", "summary", "conversation"])
tokenized_val = val_data.map(tokenize_function, batched=True, remove_columns=["text", "summary", "conversation"])

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

In [8]:
# Cell 8 – Data Collator and Training Arguments
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

training_args = TrainingArguments(
    output_dir="./mistral_results",
    eval_strategy="epoch",
    learning_rate=2e-4,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=5,
    logging_steps=10,
    fp16=True,
    report_to="none",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False
)

In [9]:
# Cell 10 – Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator
)


trainer.train()

Epoch,Training Loss,Validation Loss
1,0.47315,0.29876
2,0.169169,0.158496
3,0.144294,0.131576
4,0.131203,0.130882
5,0.13069,0.130615


TrainOutput(global_step=125, training_loss=0.3379758749008179, metrics={'train_runtime': 715.5056, 'train_samples_per_second': 1.398, 'train_steps_per_second': 0.175, 'total_flos': 5910058936467456.0, 'train_loss': 0.3379758749008179, 'epoch': 5.0})

In [10]:
# Cell 12 – Generate Test Conversations
def generate_conversation(prompt, max_new_tokens=300):
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=256).to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            num_beams=3
        )
    generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
    if "[/INST]" in generated:
        generated = generated.split("[/INST]")[-1].strip()
    return generated

test_prompts = test_data["prompt"]
test_ids = dataset["test"]["id"]
generated_convs = []
for i, (prompt, sid) in enumerate(zip(test_prompts, test_ids)):
    print(f"Generating {i+1}/{len(test_prompts)}...")
    conv = generate_conversation(prompt)
    generated_convs.append({"id": sid, "conversation": conv})

Generating 1/50...
Generating 2/50...
Generating 3/50...
Generating 4/50...
Generating 5/50...
Generating 6/50...
Generating 7/50...
Generating 8/50...
Generating 9/50...
Generating 10/50...
Generating 11/50...
Generating 12/50...
Generating 13/50...
Generating 14/50...
Generating 15/50...
Generating 16/50...
Generating 17/50...
Generating 18/50...
Generating 19/50...
Generating 20/50...
Generating 21/50...
Generating 22/50...
Generating 23/50...
Generating 24/50...
Generating 25/50...
Generating 26/50...
Generating 27/50...
Generating 28/50...
Generating 29/50...
Generating 30/50...
Generating 31/50...
Generating 32/50...
Generating 33/50...
Generating 34/50...
Generating 35/50...
Generating 36/50...
Generating 37/50...
Generating 38/50...
Generating 39/50...
Generating 40/50...
Generating 41/50...
Generating 42/50...
Generating 43/50...
Generating 44/50...
Generating 45/50...
Generating 46/50...
Generating 47/50...
Generating 48/50...
Generating 49/50...
Generating 50/50...


In [11]:
# Cell 13 – Save & Download
with open("generated_test.json", "w", encoding="utf-8") as f:
    json.dump(generated_convs, f, indent=2, ensure_ascii=False)
print("✅ File saved as generated_test.json")
from google.colab import files
files.download("generated_test.json")
!cat generated_test.json

✅ File saved as generated_test.json


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

[
  {
    "id": 236,
    "conversation": "Generate a customer service conversation (3-6 turns) from this summary.\nClient speaks first, Agent responds professionally.\n\nSummary: The client called about an ongoing issue where the want to replace old AC unit. The agent scheduled a technician to schedule replacement consult and resolve the problem. \n\nClient: Hi, I'm calling because my want to replace old AC unit.\nAgent: Thanks for explaining. When did you first notice this happening?\nClient: It started a couple days ago and keeps repeating.\nAgent: Understood, we'll schedule replacement consult during the service visit.\nClient: Great, when can I expect the technician to call and schedule the service visit?\nAgent: We'll contact you to schedule the service visit and resolve the problem.\nClient: Thanks for explaining and I look forward to resolving the issue.\nAgent: You're welcome! We're here to help and we'll resolve the problem.\n\nConversation (3-6 turns)\n\nClient: Hi, I'm calli

In [12]:
import json
import re


with open("generated_test.json", "r", encoding="utf-8") as f:
    data = json.load(f)

def clean_conversation(text):

    marker = "\n\nClient:"
    if marker in text:

        cleaned = "Client:" + text.split(marker, 1)[1]
    else:

        if "Client:" in text:
            cleaned = text[text.index("Client:"):]
        else:
            cleaned = text  # fallback


    cleaned = cleaned.split("#")[0].strip()
    return cleaned


for item in data:
    item["conversation"] = clean_conversation(item["conversation"])


with open("generated_test_cleaned.json", "w", encoding="utf-8") as f:
    json.dump(data, f, indent=2, ensure_ascii=False)

print("Cleaned file saved as 'generated_test_cleaned.json'")


from google.colab import files
files.download("generated_test_cleaned.json")
!cat generated_test_cleaned.json

Cleaned file saved as 'generated_test_cleaned.json'


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

[
  {
    "id": 236,
    "conversation": "Client: Hi, I'm calling because my want to replace old AC unit.\nAgent: Thanks for explaining. When did you first notice this happening?\nClient: It started a couple days ago and keeps repeating.\nAgent: Understood, we'll schedule replacement consult during the service visit.\nClient: Great, when can I expect the technician to call and schedule the service visit?\nAgent: We'll contact you to schedule the service visit and resolve the problem.\nClient: Thanks for explaining and I look forward to resolving the issue.\nAgent: You're welcome! We're here to help and we'll resolve the problem.\n\nConversation (3-6 turns)\n\nClient: Hi, I'm calling because my want to replace old AC unit.\nAgent: Thanks for explaining. When did you first notice this happening?\nClient: It started a couple days ago and keeps repeating.\nAgent: Understood, we'll schedule replacement consult during the service visit.\nClient: Great, when can I expect the technician to cal