In [None]:
import openai

In [None]:
from pathlib import Path

In [None]:
import os

## Identifying speakers

In [None]:
import assemblyai as aai
# File holding the AssemblyAI API key, one directory above the working directory.
aai_api_file = Path('..') / '.assemblyai'
# No transcript is held yet; the transcription cell only submits audio while this is falsy.
transcript = None

In [None]:
# Load the AssemblyAI API key from the key file. Surrounding whitespace is
# stripped because key files commonly end with a newline, which would
# otherwise become part of the key.
aai.settings.api_key = aai_api_file.read_text().strip()

In [None]:
# Display the audio file path as a string.
# NOTE(review): `audio_file` is not assigned in any cell visible here; confirm
# it is defined before this cell runs on a fresh kernel.
str(audio_file)

In [None]:
# String form of `audio_file`; this is the value handed to `transcriber.transcribe`.
FILE_URL = str(audio_file)

In [None]:
# Transcription options: request speaker labels, which the export cell relies
# on when it reads `utterance.speaker`.
config = aai.TranscriptionConfig(speaker_labels=True)


In [None]:
transcriber = aai.Transcriber()
# Reuse a transcript already held in this kernel; only submit the audio for
# transcription when none is available yet.
transcript = transcript or transcriber.transcribe(FILE_URL, config=config)


### If you already have a transcript...
Then you can use the following code to fetch it from the AssemblyAI API.

In [None]:
# Fetch a previously created transcript by its id instead of transcribing again.
existing_transcript_id = 'f136391e-caad-415a-abe3-75b652e2e848'
# NOTE(review): `client` is not referenced again in the visible cells.
client = aai.Client()
# NOTE(review): the export cell reads `ts`, not the `transcript` produced by
# the transcription cell; confirm which one is meant to be written out.
ts = aai.Transcript.get_by_id(existing_transcript_id)
# Show the first utterance as a quick check of the fetched transcript.
ts.utterances[0]

Write the speaker-specific lines out to a file

In [None]:
# Export the transcript as one line per utterance, formatted "<speaker>::<text>".
# The file is created in the current working directory and named after the audio file.
output_file = Path(audio_file.name + '.assemblyai.out')

# Open in write mode (not append) so that re-running this cell replaces the
# file instead of duplicating every utterance; later cells read the whole file
# and would otherwise send the duplicated transcript to the model.
# NOTE(review): this reads `ts`, which is only assigned by the fetch-by-id cell.
with output_file.open('w') as f:
  for utterance in ts.utterances:
    f.write(f"{utterance.speaker}::{utterance.text}\n")


## Running the above transcript through OpenAI's GPT models

In [None]:
# Read the OpenAI API key from the key file and publish it through the
# OPENAI_API_KEY environment variable. Surrounding whitespace is stripped
# because key files commonly end with a newline, which would otherwise become
# part of the key.
openai_key = Path('../.openai')
os.environ['OPENAI_API_KEY'] = openai_key.read_text().strip()
# Constructed without arguments, so the client is expected to pick the key up
# from the environment variable set above.
openai_client = openai.Client()

In [None]:
# System prompt sent with every request: asks the model to copy-edit the
# transcript, then add a summary and a list of next steps.
prompt = "You are a knowledgable software architect. You're reviewing a transcript of two other knowledgable software architects that are having a discussion. Your job is to copy-edit this transcript. Get rid of the filler words (um, like), get rid of repeated words and fix the punctuation. After you're done copy-editing, please provide your summary at the end. Also list out what you believe the next steps to be. Thanks!"

In [None]:
# Preview the first 10 lines of the exported transcript file.
output_file.read_text().split('\n')[:10]

In [None]:
# Post-processing parameters.
# Chat model to use; swap the commented line in to try the alternative.
model = 'gpt-3.5-turbo'
# model = 'gpt-4-turbo'

# Token budget for each supported model. The budget has to match the selected
# model because the models have different token limits, listed at
# https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4
tokens_per_model = {
    'gpt-3.5-turbo': 16_385 // 4,
    'gpt-4-turbo': 30_000,
}
# The splitting cell uses this as the maximum number of tokens per chunk.
max_tokens = tokens_per_model[model]
temperature = 0
# Absolute path of the transcript export with the model name appended.
gpt4_postprocessing_file = Path(f"{output_file.absolute()}{model}")

In [None]:
# Determine how many tokens are in the proposed input file
# https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
import tiktoken


In [None]:
from typing import List, Tuple
# To get the tokeniser corresponding to a specific model in the OpenAI API:
enc = tiktoken.encoding_for_model(model)
# Split the exported transcript into chunks whose summed per-line token counts
# stay within `max_tokens`. Each entry of `splits` is
# (token_count_of_chunk, lines_in_chunk).
splits: List[Tuple[int, List[str]]] = []
split: List[str] = []
total_tokens = 0
for line_num, line in enumerate(output_file.read_text().split('\n')):
    ntokens = len(enc.encode(line))
    # Close the current chunk when adding this line would exceed the budget.
    # The `split` check avoids emitting an empty chunk when a single line is
    # larger than the whole budget; such a line still gets a chunk of its own.
    if split and total_tokens + ntokens > max_tokens:
        print(f'making a split at {line_num+1}')
        splits.append((total_tokens, split))
        # Start the new chunk from zero: the current line's tokens are added
        # below, so seeding the counter with `ntokens` would count it twice.
        total_tokens = 0
        split = []

    split.append(line)
    total_tokens += ntokens

# Keep whatever is left over as the final chunk.
if split:
    splits.append((total_tokens, split))



In [None]:
# Report the token count and number of lines of every chunk.
for split_number, (token_count, chunk_lines) in enumerate(splits, start=1):
    line_count = len(chunk_lines)
    print(f'split: {split_number} has {token_count} tokens and {line_count} lines of conversation')

In [None]:
# Send every chunk to the chat model for copy-editing. Each request carries the
# system prompt, a context message (the previous chunk, or a note that this is
# the start of the conversation) and the chunk to edit. Raw responses are
# collected in `responses`, in chunk order.
responses = []
n_splits = len(splits)
for idx, (tokens, lines) in enumerate(splits):
    print(f'processing line {idx+1} / {n_splits}')
    if idx > 0:
        # Give the model the preceding chunk purely as context.
        prev_tokens, prev_lines = splits[idx-1]
        message_content = "Here is the last few minutes of conversation. Please use this as context for the next set of transcript that you're going to receive. Do not provide edits on the content of this message. Thanks!\n" + '\n'.join(prev_lines)
    else:
        message_content = "This is the beginning of the conversation."

    system_message = {"role": "system", "content": prompt}
    context_message = {"role": "user", "content": message_content}
    edit_message = {
        "role": "user",
        "content": "This is the part of the transcript that I want you to edit. Please only edit this part and return it as your response. Thanks!:" + '\n'.join(lines),
    }

    response = openai_client.chat.completions.create(
        model=model,
        temperature=temperature,
        messages=[system_message, context_message, edit_message],
    )
    responses.append(response)



In [None]:
# Write the run settings followed by every model response to a plain-text
# report, one separator-delimited section per response.
with open('try9', 'w') as f:
    f.write(f'Prompt: {prompt}\n')
    f.write(f'Model: {model}\n')
    for idx, response in enumerate(responses):
        f.write('-----------------------------------------------\n')
        f.write(f'RESPONSE: {idx+1}/{len(splits)}\n')
        # The message content is optional in the OpenAI SDK; treat a missing
        # body as empty text instead of failing on `None.split`.
        content = response.choices[0].message.content or ''
        # Double-space the response by turning each newline into a blank line.
        f.write('\n\n'.join(content.split('\n')))
        # End the section with a newline so the next separator starts on its
        # own line instead of being glued to the last line of this response.
        f.write('\n')