<a href="https://colab.research.google.com/github/garik714/call-center-conversation-generation/blob/main/Conversation_Generation_Llama3_ipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# Cell 1 – Install
!pip install -q transformers[torch] datasets accelerate evaluate rouge_score bitsandbytes peft



In [None]:
# Cell 2 – Imports
import torch
import json
import numpy as np
import zipfile
import evaluate
import os
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)


In [None]:
# Cell 3 – Mount Drive & Extract
from google.colab import drive

# Expose Google Drive to this runtime under /content/drive.
drive.mount('/content/drive')

# Where the task archive lives on Drive, and where its contents are unpacked locally.
zip_path = "/content/drive/MyDrive/Machine Learning Researcher_Task.zip"  # Adjust if needed
extract_path = "/content/task_data"

# Unpack every archive member; the context manager closes the zip file afterwards.
with zipfile.ZipFile(zip_path, 'r') as archive:
    archive.extractall(extract_path)
print("✅ Extraction complete")


In [None]:
# Cell 4 – Load Dataset
# Directory that holds the JSON files of the task.
base_path = "/content/task_data"

# Map every split name to the path of its JSON file under base_path.
data_files = {
    split: f"{base_path}/{file_name}"
    for split, file_name in (
        ("train", "train.json"),
        ("validation", "validation.json"),
        ("test", "test_summary_only.json"),
    )
}

# Load the three splits with the generic JSON loader and print the result.
dataset = load_dataset("json", data_files=data_files)
print(dataset)

In [None]:
# Cell 5 – Format Data for Mistral
# Single source of truth for the instruction wording. Training and inference
# prompts must be identical up to "[/INST]", so both formatters build the
# prompt through _build_prompt instead of repeating the template.
_PROMPT_INSTRUCTION = (
    "Generate a customer service conversation (3-6 turns) from this summary.\n"
    "Client speaks first, Agent responds professionally."
)


def _build_prompt(summary):
    """Return the instruction prompt for one summary, ending at "[/INST]".

    NOTE: the string starts with a literal "<s>"; code that tokenizes it
    should take care not to prepend a second beginning-of-sequence token.
    """
    return f"<s>[INST] {_PROMPT_INSTRUCTION}\n\nSummary: {summary} [/INST]"


def format_conversation(example):
    """Build the supervised training string for one example.

    Args:
        example: mapping with the keys "summary" and "conversation".

    Returns:
        A dict {"text": ...} holding the prompt, a blank line, the reference
        conversation and a literal "</s>" terminator.
    """
    prompt = f"{_build_prompt(example['summary'])}\n\n{example['conversation']}</s>"
    return {"text": prompt}


def format_test(example):
    """Build the inference prompt (no answer part) for one example.

    Args:
        example: mapping with the key "summary".

    Returns:
        A dict {"prompt": ...} holding the prompt up to and including "[/INST]".
    """
    return {"prompt": _build_prompt(example["summary"])}

# Train and validation rows receive the full prompt+answer "text" column;
# test rows receive only the "prompt" column.
train_data, val_data = (
    dataset[split].map(format_conversation) for split in ("train", "validation")
)
test_data = dataset["test"].map(format_test)

In [None]:
from peft import LoraConfig, get_peft_model, TaskType

model_name = "mistralai/Mistral-7B-Instruct-v0.3"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Use a padding token that is distinct from EOS. The language-modeling collator
# used for training excludes every pad-token position from the loss; when the
# pad token *is* the EOS token, the genuine end-of-sequence label is excluded
# as well and the model is never trained to stop generating.
# Falls back to EOS (the previous behavior) if the tokenizer defines no unk token.
tokenizer.pad_token = tokenizer.unk_token or tokenizer.eos_token
tokenizer.padding_side = "right"


# Choose the 4-bit compute dtype from the hardware: bfloat16 is only native on
# GPUs with compute capability >= 8.0 (Ampere and newer). Older GPUs such as
# the T4 fall back to float16.
_has_native_bf16 = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8
bnb_compute_dtype = torch.bfloat16 if _has_native_bf16 else torch.float16

# 4-bit NF4 quantization with double quantization.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=bnb_compute_dtype,
    bnb_4bit_use_double_quant=True
)

# trust_remote_code is intentionally not set: Mistral is implemented inside
# transformers, so there is no reason to allow execution of code shipped with
# the checkpoint repository.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)

#  Configure LoRA
# Imported locally so this cell carries the one extra peft name it needs.
from peft import prepare_model_for_kbit_training

# Low-rank adapters on the four attention projections.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"]
)

# PEFT's documented preparation step for training on top of a k-bit quantized
# base model; it has to run before the adapters are attached.
model = prepare_model_for_kbit_training(model)

# Attach LoRA adapters to the quantized model
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

In [None]:
# Cell 7 – Tokenize
def tokenize_function(examples):
    """Tokenize a batch of formatted training strings.

    Args:
        examples: batch mapping with a "text" entry (list of strings).

    Returns:
        The tokenizer output for the batch, truncated to 512 tokens and
        left unpadded (padding is done later, per batch).
    """
    # The "text" strings already contain a literal "<s>" and "</s>", so the
    # tokenizer must not add its own special tokens on top of them; otherwise
    # every sequence would start with a duplicated beginning-of-sequence token.
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=512,
        padding=False,
        add_special_tokens=False,
    )

# Columns that are dropped once the token ids have been produced.
_raw_text_columns = ["text", "summary", "conversation"]

# Tokenize both supervised splits in batches.
tokenized_train = train_data.map(
    tokenize_function,
    batched=True,
    remove_columns=_raw_text_columns,
)
tokenized_val = val_data.map(
    tokenize_function,
    batched=True,
    remove_columns=_raw_text_columns,
)

In [None]:
# Cell 8 – Data Collator and Training Arguments
# Causal-LM collation (mlm=False): batches are padded dynamically.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Mixed-precision mode follows the hardware instead of being hard-coded:
# bf16 on GPUs with compute capability >= 8.0 (Ampere and newer), fp16 otherwise.
# Exactly one of the two flags is enabled.
_use_bf16 = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8

training_args = TrainingArguments(
    output_dir="./mistral_results",
    eval_strategy="epoch",
    learning_rate=2e-4,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,  # effective batch size per device: 2 * 4 = 8
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=5,
    logging_steps=10,
    bf16=_use_bf16,
    fp16=not _use_bf16,
    report_to="none",
    save_strategy="epoch",  # same cadence as eval_strategy
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False  # lower eval_loss is better
)

In [None]:
# Cell 10 – Trainer

# Wire model, hyper-parameters, both tokenized splits and the collator together.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
)

# Run the fine-tuning loop.
trainer.train()

In [None]:
# Cell 12 – Generate Test Conversations
def generate_conversation(prompt, max_new_tokens=300, max_prompt_tokens=512):
    """Generate a conversation for one formatted prompt.

    Args:
        prompt: prompt string ending with "[/INST]".
        max_new_tokens: upper bound on the number of generated tokens.
        max_prompt_tokens: prompts longer than this many tokens are truncated.
            The default matches the 512-token limit used when tokenizing the
            training data.

    Returns:
        The decoded text of the newly generated tokens only (the prompt is
        not included), with surrounding whitespace stripped.
    """
    # Inference mode for the module: disables dropout, including LoRA dropout.
    model.eval()

    # The prompt already starts with a literal "<s>", so the tokenizer must not
    # prepend another beginning-of-sequence token.
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=max_prompt_tokens,
        add_special_tokens=False,
    ).to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            num_beams=3
        )

    # Slice the prompt off by position instead of searching the decoded text
    # for "[/INST]": with skip_special_tokens=True that marker may be removed
    # during decoding, which would leave the whole prompt in the result.
    completion_ids = outputs[0][prompt_length:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True).strip()

# Prompts and their ids come from the same test split, in the same row order.
test_prompts = test_data["prompt"]
test_ids = dataset["test"]["id"]

# One {"id", "conversation"} record per test prompt, in dataset order.
generated_convs = []
total_prompts = len(test_prompts)
for position, (prompt, sid) in enumerate(zip(test_prompts, test_ids), start=1):
    print(f"Generating {position}/{total_prompts}...")
    generated_convs.append({"id": sid, "conversation": generate_conversation(prompt)})

In [None]:
# Cell 13 – Save & Download
with open("generated_test.json", "w", encoding="utf-8") as f:
    json.dump(generated_convs, f, indent=2, ensure_ascii=False)
print("✅ File saved as generated_test.json")
from google.colab import files
files.download("generated_test.json")
!cat generated_test.json

In [None]:
import json
import re


# Read the generated records back from disk.
with open("generated_test.json", "r", encoding="utf-8") as source_file:
    data = json.loads(source_file.read())

def clean_conversation(text):
    """Strip leading prompt residue and trailing markdown residue from a generation.

    The conversation is taken to start at the first line that begins with
    "Client:". If no line begins with it, the first "Client:" anywhere is
    used; if the marker is absent, the text is kept as is. The result is then
    cut at the first line that starts with "#" (optionally indented).

    Args:
        text: raw generated string.

    Returns:
        The cleaned conversation with surrounding whitespace removed.
    """
    # Anchor on the first line-initial "Client:". Searching for "\n\nClient:"
    # would skip a conversation that already starts at position 0 and resume
    # at a later client turn, silently dropping the opening turns.
    line_start = re.search(r"^Client:", text, flags=re.MULTILINE)
    if line_start is not None:
        cleaned = text[line_start.start():]
    elif "Client:" in text:
        cleaned = text[text.index("Client:"):]
    else:
        cleaned = text  # fallback

    # Cut only at a "#" that opens a line. Splitting on every "#" would
    # truncate ordinary content such as "order #123".
    cleaned = re.split(r"^[ \t]*#", cleaned, maxsplit=1, flags=re.MULTILINE)[0]
    return cleaned.strip()


for item in data:
    item["conversation"] = clean_conversation(item["conversation"])


with open("generated_test_cleaned.json", "w", encoding="utf-8") as f:
    json.dump(data, f, indent=2, ensure_ascii=False)

print("Cleaned file saved as 'generated_test_cleaned.json'")


from google.colab import files
files.download("generated_test_cleaned.json")
!cat generated_test_cleaned.json