In [1]:
from moviepy.editor import VideoFileClip
import os

def convert_mp4_to_mp3(input_file, output_file):
    """Extract the audio track of a video file and write it to ``output_file``.

    The conversion is best-effort: any failure is reported on stdout and the
    function returns normally instead of raising. The opened clips are closed
    whether or not the audio could be written.

    Args:
        input_file: Path of the source video, passed to ``VideoFileClip``.
        output_file: Path of the audio file to create, passed to
            ``write_audiofile``.

    Returns:
        None.
    """
    try:
        print(f"Trying to conversion from {input_file} to {output_file}")

        video_clip = VideoFileClip(input_file)
        audio_clip = None
        try:
            audio_clip = video_clip.audio
            if audio_clip is None:
                # Report a readable reason instead of the AttributeError that
                # calling a method on ``None`` would produce.
                raise ValueError(f"{input_file} has no audio track")
            audio_clip.write_audiofile(output_file)
        finally:
            # Close both clips even when extraction failed; the nested
            # ``finally`` makes sure the video is closed if closing the audio
            # clip itself raises.
            try:
                if audio_clip is not None:
                    audio_clip.close()
            finally:
                video_clip.close()
        print(f"Conversion from {input_file} to {output_file} successful.")
    except Exception as e:
        print(f"An error occurred: {str(e)}")

In [None]:
import os
from openai import OpenAI

# Shared OpenAI client used by the refinement step. The endpoint and the key
# both come from the environment (OPENAI_EC_URL / EC_API_KEY) rather than the
# default OPENAI_API_KEY variable.
client = OpenAI(
    api_key=os.getenv("EC_API_KEY"),
    base_url=os.getenv("OPENAI_EC_URL"),
)

In [None]:
import assemblyai as aai 

def transcribe_audio(audio_path, language_code="zh"):
    """Transcribe an audio file with AssemblyAI and return the transcript text.

    The API key is read from the ``AAI_KEY`` environment variable on every
    call.

    Args:
        audio_path: Audio source handed to ``Transcriber.transcribe``.
        language_code: Language code for the transcription config. Defaults
            to ``"zh"``, the value that used to be hard-coded.

    Returns:
        The transcript text, or ``None`` when the transcript status is
        ``error`` (the error is printed in that case). Callers must check
        for ``None`` before using the result as a string.
    """
    aai.settings.api_key = os.environ.get("AAI_KEY")
    config = aai.TranscriptionConfig(language_code=language_code)
    transcript = aai.Transcriber().transcribe(audio_path, config=config)
    if transcript.status == aai.TranscriptStatus.error:
        print(transcript.error)
        return None
    return transcript.text

In [None]:
import tiktoken

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Return how many tokens ``string`` encodes to under the named tiktoken encoding."""
    tokens = tiktoken.get_encoding(encoding_name).encode(string)
    return len(tokens)

# Quick sanity check: count the tokens of a short sample sentence.
text = "tiktoken is great!"
num_tokens = num_tokens_from_string(string=text, encoding_name="cl100k_base")
print(f"Number of tokens: {num_tokens}")

In [None]:
def refine_transcription(text, model="gpt-4o", max_tokens=4000, temperature=0):
    """Ask the chat model to punctuate a raw transcript and split it into Q&A pairs.

    Uses the module-level ``client`` to send one chat-completion request with
    a fixed system message and a prompt that embeds ``text``.

    Args:
        text: Raw transcription to refine.
        model: Chat model name. Defaults to ``"gpt-4o"``.
        max_tokens: Upper bound on generated tokens. Defaults to ``4000``.
        temperature: Sampling temperature. Defaults to ``0``.

    Returns:
        The content of the first choice's message, or ``""`` when ``text`` is
        ``None`` or blank (no request is sent in that case).
    """
    # Nothing to refine: skip the request instead of sending an empty (or
    # literal "None") transcript to the model.
    if text is None or not text.strip():
        return ""

    # The format hint tells the model which labels to emit, so its spelling
    # matters ("question", not "qeustion").
    prompt = f'''
    As an expert in transcription and editing, refine the transcription --- \n\n{text}\n\n --- by 
    adding proper punctuation and capitalization. Segment the following text into coherent 
    questions and answers, format like question:question new line  answer:answer 2 new lines.
    '''

    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "system",
                "content": [
                    {
                        "type": "text",
                        "text": "You are an expert in transcription and editing",
                    }
                ],
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": prompt,
                    }
                ],
            },
        ],
        temperature=temperature,
        max_tokens=max_tokens,
    )

    return response.choices[0].message.content

In [None]:
def save_text(text, output_path):
    """Write ``text`` to ``output_path`` as UTF-8, replacing any existing content."""
    handle = open(output_path, mode="w", encoding="utf-8")
    try:
        handle.write(text)
    finally:
        handle.close()


def process_video(vpath, apath, output_dir=None):
    """Run the full pipeline: video -> audio -> transcript -> refined Q&A text.

    Steps, in order: convert ``vpath`` to ``apath``, transcribe the audio,
    save the raw transcript as ``init_trans.txt``, print its token count, then
    refine it and save the result as ``refined.txt``. The pipeline stops early
    (with a printed message) if the audio file is missing after conversion or
    if transcription yields no text.

    Args:
        vpath: Path of the input video.
        apath: Path where the extracted audio is written and read from.
        output_dir: Directory for the two text files. Defaults to a ``class``
            folder under the current working directory; created if missing.

    Returns:
        None.
    """
    if output_dir is None:
        output_dir = os.path.join(os.getcwd(), "class")
    os.makedirs(output_dir, exist_ok=True)
    init_path = os.path.join(output_dir, "init_trans.txt")
    refined_path = os.path.join(output_dir, "refined.txt")

    print("mp4 to mp3")
    convert_mp4_to_mp3(vpath, apath)
    # The converter reports failures by printing only, so verify that it
    # actually produced something to transcribe.
    if not os.path.isfile(apath):
        print(f"audio file not found, stopping: {apath}")
        return

    print(f"init trans {apath =}")
    initial_transcription = transcribe_audio(apath)
    if initial_transcription is None:
        # transcribe_audio prints the service error and yields no text;
        # writing None to a file would raise TypeError.
        print("transcription failed, stopping")
        return
    save_text(initial_transcription, init_path)

    print("refine trans ")
    # Use the in-memory transcript directly: re-reading the UTF-8 file with
    # the platform default encoding can corrupt or fail on non-ASCII text.
    num_tokens = num_tokens_from_string(initial_transcription, "cl100k_base")
    print(f"{num_tokens =}")

    refined_text = refine_transcription(initial_transcription)
    save_text(refined_text, refined_path)

In [None]:
# Input video and the audio file extracted from it, both inside ./class.
# os.path.join yields the same paths on Windows and keeps them valid on
# systems where "\\" is not a path separator.
vpath = os.path.join(os.getcwd(), "class", "09-faq.mp4")
apath = os.path.join(os.getcwd(), "class", "09-faq.mp3")

process_video(vpath, apath)