In [7]:
import os
import json
from collections import defaultdict


In [8]:
# Store all loaded data
all_data = []
stats = defaultdict(int)

root_directory = '.'

for root, dirs, files in os.walk(root_directory):
    for file in files:
        if file.endswith('.json'):
            file_path = os.path.join(root, file)
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                
                # Add metadata for tracking
                data['_source_file'] = file_path
                data['_source_dir'] = root
                all_data.append(data)
                
                # Collect basic stats
                stats['total_files'] += 1
                stats['total_records'] += len(data) if isinstance(data, list) else 1
                
                print(f"✓ Loaded {file_path} ({type(data).__name__})")
                
            except Exception as e:
                print(f"✗ Error {file_path}: {e}")

print(f"\nSummary: {stats['total_files']} JSON files, {stats['total_records']} total records")


✓ Loaded ./raw_zips/Copy OSA - Parity 2.0 1/data.json (dict)
✓ Loaded ./raw_zips/Misc)/data.json (dict)
✓ Loaded ./raw_zips/Startek_Compassion- Gift Donation. BEG/data.json (dict)
✓ Loaded ./raw_zips/Sales U - Closing Conclusion/data.json (dict)
✓ Loaded ./raw_zips/Startek_Telco_Visual_BEG_Demo /data.json (dict)
✓ Loaded ./raw_zips/Arise_Disney Resorts Reservations - Booking Multiple Rooms Package Reservation ~ ADV 2/data.json (dict)
✓ Loaded ./raw_zips/Arise_Carnival DS - Sales Inquiries - Booking created ~ BEG/data.json (dict)
✓ Loaded ./raw_zips/H21 Health insurance coverage question – Step 1 (Opening)/data.json (dict)
✓ Loaded ./raw_zips/TRAINEE STARTS HERE_ How did you do_ View your Playback!  /data.json (dict)
✓ Loaded ./raw_zips/H21 Health insurance coverage question – Step 2 (Gather Information)/data.json (dict)
✓ Loaded ./raw_zips/UCSL eApp – State Specifics/data.json (dict)
✓ Loaded ./raw_zips/Demo sym - Prescription Request__BEG - Screen Navigation/data.json (dict)
✓ Loaded 

In [28]:
all_data

[{'name': 'Copy OSA - Parity 2.0 1',
  'description': None,
  'agentName': 'Agent',
  'symType': 'VISUAL',
  'lang': 'en',
  'sentimentalScoreEnabled': True,
  'keywordScoreEnabled': True,
  'testModeHotspotEnabled': True,
  'isPostSurveyEnabled': False,
  'scriptInRatingModeEnabled': True,
  'ttsRequired': True,
  'voiceName': 'Jim',
  'voiceId': 'a1da97d6-800c-11ed-a9d0-02719d001499',
  'branches': [],
  'labels': [],
  'audioContentItems': [{'externalId': '39b786b7-9c5e-4d06-be43-7e0fb136a766',
    'sequenceNumber': 1,
    'fileTranscript': 'Thank you for calling Northwestern Mutual, Underwriting Case Management, this is ________. How may I help you?',
    'fileId': '',
    'actor': 'TRAINEE',
    'keywords': [{'keyword': 'thank', 'score': 1, 'alternatives': []},
     {'keyword': 'northwestern mutual,', 'score': 1, 'alternatives': []},
     {'keyword': 'how may i help you?', 'score': 1, 'alternatives': []}],
    'settings': None},
   {'externalId': '8a7567ed-b78f-4bb3-bf71-e6b92bff8

In [25]:
audioContentItems_extracted = []
for item in all_data:
    print(item['audioContentItems'])
    audioContentItems_extracted.append(item['audioContentItems'])
    


[{'externalId': '39b786b7-9c5e-4d06-be43-7e0fb136a766', 'sequenceNumber': 1, 'fileTranscript': 'Thank you for calling Northwestern Mutual, Underwriting Case Management, this is ________. How may I help you?', 'fileId': '', 'actor': 'TRAINEE', 'keywords': [{'keyword': 'thank', 'score': 1, 'alternatives': []}, {'keyword': 'northwestern mutual,', 'score': 1, 'alternatives': []}, {'keyword': 'how may i help you?', 'score': 1, 'alternatives': []}], 'settings': None}, {'externalId': '8a7567ed-b78f-4bb3-bf71-e6b92bff89cd', 'sequenceNumber': 2, 'fileTranscript': "Hi! This is Norm Weston calling from John Barrett's office. This case for Marty Juana has taken forever, what is the underwriter waiting on?", 'fileId': 'fdda3b9a-5776-497d-9b92-84f59aa7382e', 'actor': 'SYM', 'keywords': [], 'settings': None}, {'externalId': '7303057d-33e5-48d8-96bb-118e37c4cfbc', 'sequenceNumber': 3, 'fileTranscript': 'Hi Norm, may I start with a policy number to get started?', 'fileId': '', 'actor': 'TRAINEE', 'keyw

In [None]:
# Task 2: Merge dialogue text
merged_transcripts = []

for items in audioContentItems_extracted:
    # 1. Sort by sequenceNumber to ensure the conversation flow is correct
    sorted_items = sorted(items, key=lambda x: x.get('sequenceNumber', 0))
    
    # 2. Format each line as "Actor: Text"
    conversation_parts = []
    for entry in sorted_items:
        actor = entry.get('actor', 'Unknown')
        
        if actor == 'TRAINEE':
            actor = 'Trainee'
            
        text = entry.get('fileTranscript', '')
        conversation_parts.append(f"{actor}: {text}")
    
    # 3. Join all parts into one continuous string
    full_transcript = " ".join(conversation_parts)
    merged_transcripts.append(full_transcript)

In [31]:
merged_transcripts[0]

"Trainee: Thank you for calling Northwestern Mutual, Underwriting Case Management, this is ________. How may I help you? SYM: Hi! This is Norm Weston calling from John Barrett's office. This case for Marty Juana has taken forever, what is the underwriter waiting on? Trainee: Hi Norm, may I start with a policy number to get started? SYM: This case for Marty Juana has taken forever, what is the underwriter waiting on? The policy number is T195129. Trainee: Thank you, Norm. Give me one moment to pull up the case. Trainee: Thank you, Norm, I do have Marty's case pulled up here. You would like to talk about what will move this case to approval, is that correct? SYM: Yes, that is what I'm looking for. Trainee: Please give me a moment so I can review the details of the case and see what we can do to get this moving. SYM: Sounds good! Trainee: Thank you so much, I am all up to speed on this case. It looks like we are awaiting records from Dr. Smith's office and that a special authorization was