In [1]:
import os
import json
from collections import defaultdict


In [2]:
# Store all loaded data
all_data = []
stats = defaultdict(int)

root_directory = '.'

for root, dirs, files in os.walk(root_directory):
    for file in files:
        if file.endswith('.json'):
            file_path = os.path.join(root, file)
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                
                # Add metadata for tracking
                data['_source_file'] = file_path
                data['_source_dir'] = root
                all_data.append(data)
                
                # Collect basic stats
                stats['total_files'] += 1
                stats['total_records'] += len(data) if isinstance(data, list) else 1
                
                print(f"✓ Loaded {file_path} ({type(data).__name__})")
                
            except Exception as e:
                print(f"✗ Error {file_path}: {e}")

print(f"\nSummary: {stats['total_files']} JSON files, {stats['total_records']} total records")


✓ Loaded ./raw_zips/Copy OSA - Parity 2.0 1/data.json (dict)
✓ Loaded ./raw_zips/Misc)/data.json (dict)
✓ Loaded ./raw_zips/Startek_Compassion- Gift Donation. BEG/data.json (dict)
✓ Loaded ./raw_zips/Sales U - Closing Conclusion/data.json (dict)
✓ Loaded ./raw_zips/Startek_Telco_Visual_BEG_Demo /data.json (dict)
✓ Loaded ./raw_zips/Arise_Disney Resorts Reservations - Booking Multiple Rooms Package Reservation ~ ADV 2/data.json (dict)
✓ Loaded ./raw_zips/Arise_Carnival DS - Sales Inquiries - Booking created ~ BEG/data.json (dict)
✓ Loaded ./raw_zips/H21 Health insurance coverage question – Step 1 (Opening)/data.json (dict)
✓ Loaded ./raw_zips/TRAINEE STARTS HERE_ How did you do_ View your Playback!  /data.json (dict)
✓ Loaded ./raw_zips/H21 Health insurance coverage question – Step 2 (Gather Information)/data.json (dict)
✓ Loaded ./raw_zips/UCSL eApp – State Specifics/data.json (dict)
✓ Loaded ./raw_zips/Demo sym - Prescription Request__BEG - Screen Navigation/data.json (dict)
✓ Loaded 

In [3]:
audioContentItems_extracted = []
for item in all_data:
    audioContentItems_extracted.append(item['audioContentItems'])


In [4]:
# Task 2: Merge dialogue text
merged_transcripts = []

for items in audioContentItems_extracted:
    # 1. Sort by sequenceNumber to ensure the conversation flow is correct
    sorted_items = sorted(items, key=lambda x: x.get('sequenceNumber', 0))
    
    # 2. Format each line as "Actor: Text"
    conversation_parts = []
    for entry in sorted_items:
        actor = entry.get('actor', 'Unknown')
        
        if actor == 'TRAINEE':
            actor = 'Trainee'
            
        text = entry.get('fileTranscript', '')
        conversation_parts.append(f"{actor}: {text}")
    
    # 3. Join all parts into one continuous string
    full_transcript = " ".join(conversation_parts)
    merged_transcripts.append(full_transcript)

In [5]:
merged_transcripts[0]

"Trainee: Thank you for calling Northwestern Mutual, Underwriting Case Management, this is ________. How may I help you? SYM: Hi! This is Norm Weston calling from John Barrett's office. This case for Marty Juana has taken forever, what is the underwriter waiting on? Trainee: Hi Norm, may I start with a policy number to get started? SYM: This case for Marty Juana has taken forever, what is the underwriter waiting on? The policy number is T195129. Trainee: Thank you, Norm. Give me one moment to pull up the case. Trainee: Thank you, Norm, I do have Marty's case pulled up here. You would like to talk about what will move this case to approval, is that correct? SYM: Yes, that is what I'm looking for. Trainee: Please give me a moment so I can review the details of the case and see what we can do to get this moving. SYM: Sounds good! Trainee: Thank you so much, I am all up to speed on this case. It looks like we are awaiting records from Dr. Smith's office and that a special authorization was

In [None]:
from transformers import pipeline

In [9]:
extractor = pipeline("text-generation", model="microsoft/Phi-3-mini-4k-instruct")


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

Device set to use mps:0


In [10]:
def create_prompt(transcript):
    prompt = f"""<|user|>
Read the following customer service transcript carefully.

TRANSCRIPT:
{transcript}

TASK:
1. Identify the primary 'reason' for the call.
2. Extract the actionable 'steps' the agent guided the customer through.

Format your response as a JSON object with keys: "reason" and "steps".
<|end|>
<|assistant|>"""
    return prompt

In [11]:
results = []

for i, transcript in enumerate(merged_transcripts[:3]):
    print(f"Processing transcript {i+1}...")
    full_prompt = create_prompt(transcript)

    outputs = extractor(full_prompt, max_new_tokens=500, return_full_text=False)

    generated_text = outputs[0]['generated_text']

    results.append({"original_transcript": transcript,
                    "extraction": generated_text})
    
    print(f"--- Result {i+1} ---")
    print(generated_text)
    print("\n" + "="*50 + "\n")


Processing transcript 1...
--- Result 1 ---
 ```json
{
  "reason": "The primary reason for the call was to inquire about the status of a health underwriting case for an individual named Marty Juana, which had been delayed, and to understand what was causing the delay.",
  "steps": [
    "Request the policy number from the caller to pull up the case.",
    "Confirm and discuss the status of the case with the caller.",
    "Check the case details for pending actions or records needed.",
    "Identify missing records from Dr. Smith's office and determine the last communication with the APS vendor.",
    "Resend the special authorization form to the caller's office.",
    "Consider requesting a risk review from the underwriter due to the delay.",
    "Ensure the caller's office receives the necessary form to prevent further delay.",
    "Confirm with the caller that the issue has been addressed and offer further assistance if needed."
  ]
}
```


Processing transcript 2...
--- Result 2 ---