In [1]:
import os
import requests
from pytubefix import YouTube
import sys
import time
import re
from pprint import pprint

In [3]:
def youtube_download(url, path="."):
    """Download the thumbnail and the audio track of a YouTube video.

    Args:
        url: URL of the YouTube video.
        path: Directory that receives ``thumbnail.jpg`` and the audio file.
            Defaults to the current working directory.

    Returns:
        Always ``0``.
    """
    yt = YouTube(url=url)
    print(yt.title)

    # Both helpers write into the same target directory.
    dl_yt_thumbnail(yt, path=path)
    dl_yt_audio_mp3(yt, path=path)

    return 0


def dl_yt_thumbnail(yt: "YouTube", path=".", timeout=30):
    """Save the video's thumbnail as ``thumbnail.jpg`` inside ``path``.

    Args:
        yt: Video object exposing ``thumbnail_url``.
        path: Target directory; created if it does not exist yet.
        timeout: Seconds to wait for the thumbnail server before giving up.

    A non-200 response is reported on stdout and no file is written.
    """
    # Without a timeout a stalled connection would block the kernel forever.
    thumbnail_response = requests.get(yt.thumbnail_url, timeout=timeout)

    if thumbnail_response.status_code == 200:
        target_dir = path or "."
        os.makedirs(target_dir, exist_ok=True)
        thumbnail_file = os.path.join(target_dir, "thumbnail.jpg")
        with open(thumbnail_file, "wb") as f:
            f.write(thumbnail_response.content)
        print("Thumbnail saved!")
    else:
        print(f"Error: {thumbnail_response.status_code}")


def dl_yt_audio_mp3(yt: "YouTube", path="."):
    """Download the video's audio-only stream into ``path`` as ``audio_test``.

    Args:
        yt: Video object exposing ``streams.get_audio_only()``.
        path: Target directory handed to the stream's ``download`` call.

    Raises:
        ValueError: If the video offers no audio-only stream.
    """
    audio_dl = yt.streams.get_audio_only()
    if audio_dl is None:
        raise ValueError("No audio-only stream is available for this video.")

    # NOTE(review): ``mp3=True`` is forwarded unchanged; later cells read
    # "audio_test.mp3", so it presumably yields that file name — confirm
    # against the installed pytubefix version.
    audio_dl.download(output_path=path, filename="audio_test", mp3=True)

In [4]:
# URL of the video to process. Left empty here; it must be filled in before
# the next cell passes it to youtube_download().
youtube_url = ""

In [None]:
# Fetch the thumbnail and audio for `youtube_url`. As the cell's last
# expression, the function's return value (0) is what gets displayed.
youtube_download(youtube_url)

In [None]:
from dotenv import load_dotenv

# Expected to populate the environment from a local .env file so that the
# OPENAI_API_KEY lookup in the client cell succeeds.
load_dotenv()

In [51]:
from openai import OpenAI

In [53]:
# API client; the key is taken from the environment (None if it is unset).
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
)

In [None]:
# Smoke test: request a short streamed chat completion and echo it as it arrives.
stream = client.chat.completions.create(
    stream=True,
    model="gpt-4",
    messages=[
        {"role": "user", "content": "Say this is a test"},
    ],
)
for chunk in stream:
    # A chunk's delta may carry no text; print an empty string in that case.
    print(chunk.choices[0].delta.content or "", end="")

In [59]:
from openai.types.audio import (
    Transcription,
    TranscriptionSegment,
    TranscriptionVerbose,
    TranscriptionWord,
    TranscriptionCreateResponse,
)

In [63]:
from pathlib import Path

In [None]:
# Pretty-print the result of the client's model listing call.
pprint(client.models.list())

In [69]:
# Send the downloaded audio file to the hosted "whisper-1" transcription
# model, declaring the spoken language as English.
transcription_response = client.audio.transcriptions.create(
    model="whisper-1",
    language="en",
    file=Path("audio_test.mp3"),
)

In [None]:
# Display the transcript text returned by the transcription request.
transcription_response.text

In [71]:
# Ask the chat model for a streamed summary of the transcript. The
# instruction and the transcript travel as two consecutive user messages.
summary_response = client.chat.completions.create(
    stream=True,
    model="chatgpt-4o-latest",
    messages=[
        {"role": "user", "content": "Summarize the following text:"},
        {"role": "user", "content": transcription_response.text},
    ],
)

In [None]:
# Echo the streamed summary piece by piece; a delta without text prints "".
# NOTE(review): summary_response is a stream and can presumably be consumed
# only once — re-run the request cell before re-running this one. Confirm.
for chunk in summary_response:
    print(chunk.choices[0].delta.content or "", end="")

In [None]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

In [None]:
# Display whether PyTorch reports CUDA as available; the next cell uses the
# same check to pick the device and dtype.
torch.cuda.is_available()

In [10]:
# Use the first CUDA device with float16 when CUDA is available; otherwise
# fall back to the CPU with float32.
if torch.cuda.is_available():
    device, torch_dtype = "cuda:0", torch.float16
else:
    device, torch_dtype = "cpu", torch.float32

model_id = "openai/whisper-large-v3-turbo"

# Load the speech-to-text checkpoint, then move it onto the chosen device.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)
model.to(device)

# The processor supplies both the tokenizer and the feature extractor.
processor = AutoProcessor.from_pretrained(model_id)

# Speech-recognition pipeline that also returns timestamps.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
    return_timestamps=True,
)

In [None]:
# Run the local speech-recognition pipeline on the downloaded audio file.
result = pipe("audio_test.mp3")

In [None]:
# Pretty-print the "text" entry of the pipeline output.
pprint(result["text"])

In [13]:
import ollama

In [14]:
# Build the summarisation prompt around the locally produced transcript.
# The transcript is wrapped in a matching pair of single quotes so the model
# can tell where it ends (the opening quote was previously never closed).
# Despite its name, this string is sent with the "user" role in the chat cell.
system_prompt = (
    "Summarize the following transcript from a youtube video: '"
    + result["text"]
    + "'"
)

In [15]:
# Send the prepared prompt to the "llama3.1" model through ollama as a
# single user message.
response = ollama.chat(
    model="llama3.1",
    messages=[
        {"role": "user", "content": system_prompt},
    ],
)

In [None]:
# Pretty-print the complete response object returned by ollama.chat().
pprint(response)

In [None]:
# Pretty-print only the message content of the response.
pprint(response["message"]["content"])