In [1]:
import os
import json
from collections import defaultdict
from openai import OpenAI

In [3]:
# Store all loaded data
all_data = []
stats = defaultdict(int)

root_directory = '.'

for root, dirs, files in os.walk(root_directory):
    for file in files:
        if file.endswith('.json'):
            file_path = os.path.join(root, file)
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                
                # Add metadata for tracking
                data['_source_file'] = file_path
                data['_source_dir'] = root
                all_data.append(data)
                
                # Collect basic stats
                stats['total_files'] += 1
                stats['total_records'] += len(data) if isinstance(data, list) else 1
                
                print(f"✓ Loaded {file_path} ({type(data).__name__})")
                
            except Exception as e:
                print(f"✗ Error {file_path}: {e}")

print(f"\nSummary: {stats['total_files']} JSON files, {stats['total_records']} total records")


✓ Loaded ./raw_zips/Copy OSA - Parity 2.0 1/data.json (dict)
✓ Loaded ./raw_zips/Misc)/data.json (dict)
✓ Loaded ./raw_zips/Startek_Compassion- Gift Donation. BEG/data.json (dict)
✓ Loaded ./raw_zips/Sales U - Closing Conclusion/data.json (dict)
✓ Loaded ./raw_zips/Startek_Telco_Visual_BEG_Demo /data.json (dict)
✓ Loaded ./raw_zips/Arise_Disney Resorts Reservations - Booking Multiple Rooms Package Reservation ~ ADV 2/data.json (dict)
✓ Loaded ./raw_zips/Arise_Carnival DS - Sales Inquiries - Booking created ~ BEG/data.json (dict)
✓ Loaded ./raw_zips/H21 Health insurance coverage question – Step 1 (Opening)/data.json (dict)
✓ Loaded ./raw_zips/TRAINEE STARTS HERE_ How did you do_ View your Playback!  /data.json (dict)
✓ Loaded ./raw_zips/H21 Health insurance coverage question – Step 2 (Gather Information)/data.json (dict)
✓ Loaded ./raw_zips/UCSL eApp – State Specifics/data.json (dict)
✓ Loaded ./raw_zips/Demo sym - Prescription Request__BEG - Screen Navigation/data.json (dict)
✓ Loaded 

In [4]:
audioContentItems_extracted = []
for item in all_data:
    audioContentItems_extracted.append(item['audioContentItems'])


In [5]:
# Task 2: Merge dialogue text
merged_transcripts = []

for items in audioContentItems_extracted:
    # 1. Sort by sequenceNumber to ensure the conversation flow is correct
    sorted_items = sorted(items, key=lambda x: x.get('sequenceNumber', 0))
    
    # 2. Format each line as "Actor: Text"
    conversation_parts = []
    for entry in sorted_items:
        actor = entry.get('actor', 'Unknown')
        
        if actor == 'TRAINEE':
            actor = 'Trainee'
            
        text = entry.get('fileTranscript', '')
        conversation_parts.append(f"{actor}: {text}")
    
    # 3. Join all parts into one continuous string
    full_transcript = " ".join(conversation_parts)
    merged_transcripts.append(full_transcript)

In [6]:
merged_transcripts[0]

"Trainee: Thank you for calling Northwestern Mutual, Underwriting Case Management, this is ________. How may I help you? SYM: Hi! This is Norm Weston calling from John Barrett's office. This case for Marty Juana has taken forever, what is the underwriter waiting on? Trainee: Hi Norm, may I start with a policy number to get started? SYM: This case for Marty Juana has taken forever, what is the underwriter waiting on? The policy number is T195129. Trainee: Thank you, Norm. Give me one moment to pull up the case. Trainee: Thank you, Norm, I do have Marty's case pulled up here. You would like to talk about what will move this case to approval, is that correct? SYM: Yes, that is what I'm looking for. Trainee: Please give me a moment so I can review the details of the case and see what we can do to get this moving. SYM: Sounds good! Trainee: Thank you so much, I am all up to speed on this case. It looks like we are awaiting records from Dr. Smith's office and that a special authorization was

In [6]:
from transformers import pipeline

In [7]:
extractor = pipeline("text-generation", model="microsoft/Phi-3-mini-4k-instruct")


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use mps:0


In [7]:
def create_prompt(transcript):
    prompt = f"""<|user|>
Read the following customer service transcript carefully.

TRANSCRIPT:
{transcript}

TASK:
1. The reason the customer called (what they need help with)
2. The Steps the agent took to help them (as a list)

Respond in JSON format:
{{
  "reason": "brief reason for the call",
  "steps": [
    "step 1",
    "step 2",
    "...",
    "step N"
  ]
}}
<|end|>
<|assistant|>"""
    return prompt

In [None]:
results = []

for i, transcript in enumerate(merged_transcripts):
    print(f"Processing transcript {i+1}...")
    full_prompt = create_prompt(transcript)

    outputs = extractor(full_prompt, max_new_tokens=500, return_full_text=False, do_sample=False)

    generated_text = outputs[0]['generated_text']

    results.append({"original_transcript": transcript,
                    "extraction": generated_text})

# Save results to JSON file
with open('reason_step.json', 'w', encoding='utf-8') as f:
    json.dump(results, f, indent=2, ensure_ascii=False)

print(f"Saved {len(results)} results to reason_step.json")

Processing transcript 1...
--- Result 1 ---
 ```json
{
  "reason": "Underwriter delay on Marty Juana's case",
  "steps": [
    "Trainee greets the caller and asks for the policy number",
    "Trainee pulls up the case using the provided policy number",
    "Trainee confirms the caller's request to expedite the case",
    "Trainee reviews the case details and identifies the need for records from Dr. Smith's office and a special authorization",
    "Trainee checks with the APS vendor and finds the authorization was sent to the caller's office",
    "Trainee resends the authorization to the client's office",
    "Trainee requests a risk review from the underwriter and informs the caller of the next steps",
    "Trainee offers further assistance and thanks the caller for their trust in Northwestern Mutual"
  ]
}
```


Processing transcript 2...
--- Result 2 ---
 ```json
{
  "reason": "Customer wants to return a bed frame due to color preference and is within the return period.",
  "steps":

KeyboardInterrupt: 

In [13]:
##OpenAI version
client = OpenAI()

results = []

for i, transcript in enumerate(merged_transcripts):
    print(f"Processing transcript {i+1}...")
    full_prompt = create_prompt(transcript)

    response = client.chat.completions.create(
        model="gpt-5-nano-2025-08-07",  
        messages=[
            {"role": "user", "content": full_prompt}
        ]
    )

    generated_text = response.choices[0].message.content

    results.append({
        "original_transcript": transcript,
        "extraction": generated_text
    })

# Save results to JSON file
with open('reason_step_openai.json', 'w', encoding='utf-8') as f:
    json.dump(results, f, indent=2, ensure_ascii=False)

print(f"Saved {len(results)} results to reason_step_openai.json")

Processing transcript 1...
Processing transcript 2...
Processing transcript 3...
Processing transcript 4...
Processing transcript 5...
Processing transcript 6...
Processing transcript 7...
Processing transcript 8...
Processing transcript 9...
Processing transcript 10...
Processing transcript 11...
Processing transcript 12...
Processing transcript 13...
Processing transcript 14...
Processing transcript 15...
Processing transcript 16...
Processing transcript 17...
Processing transcript 18...
Processing transcript 19...
Processing transcript 20...
Processing transcript 21...
Processing transcript 22...
Processing transcript 23...
Processing transcript 24...
Processing transcript 25...
Processing transcript 26...
Processing transcript 27...
Processing transcript 28...
Processing transcript 29...
Processing transcript 30...
Processing transcript 31...
Processing transcript 32...
Processing transcript 33...
Processing transcript 34...
Processing transcript 35...
Processing transcript 36...
P

'{\n  "reason": "The customer wanted to set up an automated birthday gift for their sponsored child, as no birthday gift automation was currently configured and the customer believed it should already be in place.",\n  "steps": [\n    "Greeted the customer and collected sponsor information, then verified the account name (Kim Reeves) and sponsor number.",\n    "Opened the account and reviewed gift setup, finding only the monthly sponsorship automatic withdrawal and no automated birthday gift.",\n    "Explained that a birthday gift automation was not configured and discussed the need to set one up.",\n    "Verified the payment method on file (Visa ending in 5252) with the customer.",\n    "Configured an automated birthday gift of $40 annually and processed a one-time birthday gift today for the missed date.",\n    "Discussed timing and recipient details: gifts take 2–3 months to reach the child, clarified which child (Peru and Rwanda) and reviewed past gift activity (one automated gift 