# Transcribe and summarize

Use openAI to transcribe and summarize audio files.

In [2]:
import os
import openai
client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY"),)

## Input audio files

ffmpeg -i audio/2021-excerpt.m4a -c:v libx264 -tune zerolatency -preset ultrafast -crf 50 -c:a aac -b:a 10k audio-processed/2021-excerpt.m4a

In [9]:
# Set the directory path you want to search in
#directory_path = '/Users/arno.klein/Documents/Steinberg-TomInsel-sessions/audio-excerpts'
audio_path = '/Users/arno.klein/Documents/Giants-videos/audio/split/processed'
combined_transcripts_path = '/Users/arno.klein/Documents/Giants-videos/audio/split/processed/transcripts_combined'

# List all file paths to files ending with '.m4a'
audio_append = '.m4a'
audio_file_paths = [os.path.join(audio_path, file) for file in os.listdir(audio_path) if file.endswith(audio_append)]

## Output transcript and summary files

In [5]:
# Output transcript and summary file paths
manually_combine_transcripts = True
transcript_file_paths = [file + '_transcript.txt' for file in audio_file_paths]
summary_file_paths = [file + '_summary.txt' for file in transcript_file_paths]

## Transcribe and summarize functions

In [6]:
def transcribe(audio_file_path):

    audio_file = open(audio_file_path, "rb")
    transcript = client.audio.transcriptions.create(
      model="whisper-1", 
      file=audio_file, 
      response_format="text"
    )
    
    return transcript

In [7]:
def summarize(transcript):
    system_prompt = """Please provide a clear summary of the provided transcript, understandable for someone with a high-school education. \n\nTranscript:"""
    summary = client.chat.completions.create(
        model="gpt-4-1106-preview",
        messages=[
            {"role":"system","content": system_prompt},
            {"role":"user","content": transcript}]
    )
    
    return summary.choices[0].message.content

## Loop through, transcribe, and summarize audio files

In [8]:
for ifile, audio_file_path in enumerate(audio_file_paths):
    print('Audio file: {0}'.format(audio_file_path))

    transcript = transcribe(audio_file_path)
    with open(transcript_file_paths[ifile], "w") as file_buffer:
        file_buffer.write(transcript)
    print('Transcript written to {0}'.format(transcript_file_paths[ifile]))
    
    if not manually_combine_transcripts:
        summary = summarize(transcript)
        with open(summary_file_paths[ifile], "w") as file_buffer:
            file_buffer.write(summary)
        print('Summary written to {0}'.format(summary_file_paths[ifile]))

Audio file: /Users/arno.klein/Documents/Giants-videos/audio/split/processed/2022-2-processed.m4a
Transcript written to /Users/arno.klein/Documents/Giants-videos/audio/split/processed/2022-2-processed.m4a_transcript.txt
Audio file: /Users/arno.klein/Documents/Giants-videos/audio/split/processed/2020-3-processed.m4a
Transcript written to /Users/arno.klein/Documents/Giants-videos/audio/split/processed/2020-3-processed.m4a_transcript.txt
Audio file: /Users/arno.klein/Documents/Giants-videos/audio/split/processed/2022-1-processed.m4a
Transcript written to /Users/arno.klein/Documents/Giants-videos/audio/split/processed/2022-1-processed.m4a_transcript.txt
Audio file: /Users/arno.klein/Documents/Giants-videos/audio/split/processed/2023-1-processed.m4a
Transcript written to /Users/arno.klein/Documents/Giants-videos/audio/split/processed/2023-1-processed.m4a_transcript.txt
Audio file: /Users/arno.klein/Documents/Giants-videos/audio/split/processed/2023-2-processed.m4a
Transcript written to /User

In [10]:
if manually_combine_transcripts:

    combined_transcript_file_paths = [os.path.join(combined_transcripts_path, file) for file in os.listdir(combined_transcripts_path) if file.endswith(".txt")]
    summary_file_paths = [file + '_summary.txt' for file in combined_transcript_file_paths]

    for ifile, combined_transcript_file_path in enumerate(combined_transcript_file_paths):
        print('Combined transcript file: {0}'.format(combined_transcript_file_path))

        with open(combined_transcript_file_path) as f:
            combined_transcript = f.read()
    
        summary = summarize(combined_transcript)
        with open(summary_file_paths[ifile], "w") as file_buffer:
            file_buffer.write(summary)
        print('Summary written to {0}'.format(summary_file_paths[ifile]))

Combined transcript file: /Users/arno.klein/Documents/Giants-videos/audio/split/processed/transcripts_combined/2023_roundtable_transcript.txt
Summary written to /Users/arno.klein/Documents/Giants-videos/audio/split/processed/transcripts_combined/2023_roundtable_transcript.txt_summary.txt
Combined transcript file: /Users/arno.klein/Documents/Giants-videos/audio/split/processed/transcripts_combined/2021_transcript.txt
Summary written to /Users/arno.klein/Documents/Giants-videos/audio/split/processed/transcripts_combined/2021_transcript.txt_summary.txt
Combined transcript file: /Users/arno.klein/Documents/Giants-videos/audio/split/processed/transcripts_combined/2020_transcript.txt
Summary written to /Users/arno.klein/Documents/Giants-videos/audio/split/processed/transcripts_combined/2020_transcript.txt_summary.txt
Combined transcript file: /Users/arno.klein/Documents/Giants-videos/audio/split/processed/transcripts_combined/2022_transcript.txt
Summary written to /Users/arno.klein/Documents