In [None]:
import json
from datetime import timedelta

In [None]:
def format_time(seconds):
    """Render a duration given in seconds as a zero-padded ``MM:SS`` string.

    The fractional part of the seconds is discarded, and minutes are not
    rolled over into hours (e.g. 3725 seconds is rendered as "62:05").
    """
    whole_minutes, leftover = divmod(seconds, 60)
    return "{:02d}:{:02d}".format(int(whole_minutes), int(leftover))

# Load the JSON file.
# The encoding is given explicitly so the transcript is decoded as UTF-8
# regardless of the platform's default locale encoding.
with open('output/cnbc oxychem sales_transcription_with_vicky_ref.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Later cells read 'speaker', 'start', 'end' and 'text' from each entry.
segments = data['segments']
print(f"Total segments: {len(segments)}")


In [4]:
# Group continuous speaker segments
def _merge_speaker_runs(segments):
    """Collapse consecutive segments from the same speaker into one entry.

    Args:
        segments: Iterable of dicts, each providing 'speaker', 'start',
            'end' and 'text'.

    Returns:
        A list of dicts with keys 'speaker', 'start', 'end' and 'text'.
        'start' is taken from the first segment of a run, 'end' from its
        last segment, and 'text' is the run's texts concatenated with no
        separator. An empty input yields an empty list.
    """
    runs = []
    for segment in segments:
        speaker = segment['speaker']
        # Compare against the run currently being built instead of a None
        # sentinel, so a segment whose speaker is None (JSON null) is grouped
        # like any other speaker rather than being silently dropped.
        if runs and runs[-1]['speaker'] == speaker:
            runs[-1]['end'] = segment['end']
            runs[-1]['text'].append(segment['text'])
        else:
            runs.append({
                'speaker': speaker,
                'start': segment['start'],
                'end': segment['end'],
                'text': [segment['text']],
            })

    # Join each run's pieces once at the end; key order is preserved.
    return [{**run, 'text': ''.join(run['text'])} for run in runs]


grouped_segments = _merge_speaker_runs(segments)

print(f"Grouped into {len(grouped_segments)} continuous speaker segments")


Grouped into 12 continuous speaker segments


In [None]:
# Format the output: each grouped segment becomes three lines —
# a "start - end - Speaker X" header, the quoted text, and a blank separator.
output_lines = []

for entry in grouped_segments:
    time_range = f"{format_time(entry['start'])} - {format_time(entry['end'])}"
    header = f"{time_range} - Speaker {entry['speaker']}"
    quoted_text = '"' + entry['text'].strip() + '"'
    output_lines.extend([header, quoted_text, ""])

# Preview the first 20 output lines
preview = output_lines[:20]
print("\n".join(preview))


00:00 - 00:20 - Speaker A
"Well, it's official. Berkshire Hathaway is acquiring Occidental's chemical business, OxyChem. It's an all-cash transaction. It's valued at $9.7 billion. And this is Berkshire's biggest purchase since 2022. Joining us right now is Vicki Hollett. She is Occidental Petroleum CEO. And Vicki, first of all, thanks for being here this morning. We appreciate it."

00:21 - 00:23 - Speaker Vicky Hollub
"Thank you. I appreciate the opportunity, Becky."

00:23 - 00:30 - Speaker A
"Let's talk a little bit about this deal, how it came together, and $9.7 billion in cash. What do you plan to do with it?"

00:31 - 01:07 - Speaker Vicky Hollub
"Well, with the cash, we're going to take about $6.5 billion of the cash and we're going to apply it to debt reduction. And that debt reduction amount will get us to below our target debt of $15 billion that we laid out after the Crown Rock acquisition. That then will reduce our interest payments by $350 million. And that will enable us 

In [6]:
# Save to text file
output_path = 'output/cnbc oxychem sales_formatted.txt'
# Write explicitly as UTF-8 so transcript characters outside the platform's
# default locale encoding cannot raise UnicodeEncodeError.
with open(output_path, 'w', encoding='utf-8') as f:
    f.write("\n".join(output_lines))

print(f"Formatted transcription saved to {output_path}")
print(f"Total speaker segments: {len(grouped_segments)}")


Formatted transcription saved to output/cnbc oxychem sales_formatted.txt
Total speaker segments: 12
