In [None]:
# pip install google-api-python-client youtube-transcript-api

In [None]:
from typing import List, Optional, Tuple
from youtube_transcript_api import YouTubeTranscriptApi
import re
from pathlib import Path
from loguru import logger
import urllib.request
import json

def get_video_title(video_id: str, timeout: float = 10.0) -> Tuple[str, Optional[str]]:
    """
    Fetch a video's title by scraping its YouTube watch page.

    Args:
        video_id: Video ID, interpolated into the watch URL as-is.
        timeout: Seconds to wait on the HTTP request before giving up.

    Returns:
        Tuple of (title, error_message). On success error_message is None.
        On failure title is "Unknown Title" and error_message describes the
        problem; exceptions are caught and reported this way, not raised.
    """
    # Local import: titles are HTML-escaped in the page source (&amp;, &#39;, ...)
    from html import unescape

    try:
        # Get webpage content
        url = f"https://www.youtube.com/watch?v={video_id}"
        # The context manager closes the HTTP response even if read/decode fails
        with urllib.request.urlopen(url, timeout=timeout) as response:
            page = response.read().decode("utf-8", errors="replace")

        # Preferred source: the <meta name="title"> tag
        meta_match = re.search(r'<meta name="title" content="([^"]+)"', page)
        if meta_match:
            return unescape(meta_match.group(1)), None

        # Fallback: the <title> element without the site suffix
        tag_match = re.search(r'<title>([^<]+)</title>', page)
        if tag_match:
            title = tag_match.group(1).replace(' - YouTube', '')
            return unescape(title), None

        return "Unknown Title", "Could not extract title"
    except Exception as e:
        return "Unknown Title", f"Error fetching title: {str(e)}"

def extract_video_id(url: str) -> Optional[str]:
    """Extract the 11-character video ID from a YouTube URL.

    Recognised forms: a ``v=`` query parameter (watch URLs), ``youtu.be/<id>``
    short links, and ``/embed/``, ``/shorts/``, ``/live/`` or ``/v/`` paths.

    Returns:
        The video ID, or None when ``url`` is empty or matches none of the
        recognised forms.
    """
    if not url:
        return None

    # An ID is exactly 11 characters; the lookahead rejects longer tokens
    # instead of silently truncating them.
    video_id = r'([0-9A-Za-z_-]{11})(?![0-9A-Za-z_-])'
    # Each pattern is anchored on a YouTube-specific marker so an arbitrary
    # path segment (e.g. /feed/subscriptions) is not mistaken for an ID.
    patterns = [
        r'[?&]v=' + video_id,
        r'youtu\.be/' + video_id,
        r'/(?:embed|shorts|live|v)/' + video_id,
    ]

    for pattern in patterns:
        match = re.search(pattern, url)
        if match:
            return match.group(1)
    return None

def get_transcript(video_id: str) -> tuple[str, Optional[str]]:
    """
    Download the transcript of one video and flatten it into a single string.

    Returns tuple of (transcript_text, error_message): the transcript entries
    joined by single spaces and None on success, or an empty string and a
    description of the failure otherwise.
    """
    try:
        segments = YouTubeTranscriptApi.get_transcript(video_id)
        pieces = [segment['text'] for segment in segments]
        return ' '.join(pieces), None
    except Exception as exc:
        # Any failure is reported to the caller rather than raised
        return "", f"Error fetching transcript for video {video_id}: {str(exc)}"

def save_transcripts(urls: List[str], output_file: str) -> None:
    """Fetch title and transcript for each URL and write a report file.

    The output file is created (or truncated) up front. URLs from which no
    video ID can be extracted are logged as warnings and left out of the
    report; title and transcript failures are written into the report.
    """
    separator = "\n" + "="*50 + "\n"

    with Path(output_file).open('w', encoding='utf-8') as report:
        for link in urls:
            vid = extract_video_id(link)
            if not vid:
                logger.warning(f"Invalid YouTube URL: {link}")
                continue

            logger.info(f"Processing video: {link}")
            title, title_error = get_video_title(vid)
            transcript, transcript_error = get_transcript(vid)

            # Assemble the record for this video, then emit it piece by piece
            record = [
                "\n=== Video Information ===\n",
                f"URL: {link}\n",
                f"Title: {title}\n",
            ]
            if title_error:
                record.append(f"Title Error: {title_error}\n")
            record.append("\n=== Transcript ===\n")
            if transcript_error:
                record.append(f"Error: {transcript_error}\n")
            else:
                record.append(f"{transcript}\n")
            record.append(separator)
            report.writelines(record)

# Short links of the videos whose transcripts are collected into one report
urls = [
    "https://youtu.be/wwC86t5k77Y?si=uvSH2RJ34WE4o5wY",
    "https://youtu.be/fv1rkctrEPk?si=5xL18SO0S95U-pv9",
    "https://youtu.be/K0it90lnqnk?si=ycfjnE0sEhXT_kwX",
    "https://youtu.be/Nl6SVPpRUQI?si=TVmfJTZTzJk82eyC",
    "https://youtu.be/jnaeB6LjoQs?si=FPRySBEux7h2icIs",
    "https://youtu.be/CG-USxkH_Ho?si=jANv1CLodMNbWmPs",
    "https://youtu.be/2mY9YT2yUy4?si=ZpWhPRa1Vw1R5dzA",
    "https://youtu.be/uSMdnoTdG2E?si=-kM93wOrkr1Y31xv",
]
save_transcripts(urls, output_file="transcripts.txt")

# YouTube

In [None]:
from youtube_transcript_api import YouTubeTranscriptApi
from pytube import YouTube
import yaml
from tqdm import tqdm
import os

def get_youtube_script(video_url: str) -> tuple[str, str, str]:
    """Fetch the title, channel name and transcript text of a YouTube video.

    Args:
        video_url: Either a URL containing ``v=<id>`` or a URL whose last
            path segment is the video ID (e.g. a youtu.be short link).

    Returns:
        (title, channel, script), where script is the transcript entries
        joined by single spaces. If anything fails, the error is printed and
        (None, None, None) is returned instead of raising.
    """
    try:
        # Extract video ID from URL
        if "v=" in video_url:
            # Keep only the value of v=: drop any further query parameters
            # or fragment (e.g. &t=30s, &list=...).
            video_id = video_url.split("v=", 1)[1]
            video_id = video_id.split("&", 1)[0].split("#", 1)[0]
        else:
            # The ID is the last path segment; ignore a trailing slash,
            # the query string and any fragment.
            last_segment = video_url.rstrip("/").split("/")[-1]
            video_id = last_segment.split("?", 1)[0].split("#", 1)[0]

        # Get video title and channel name
        yt = YouTube(video_url)
        title = yt.title
        channel = yt.author

        # Get transcript
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        script = ' '.join(entry['text'] for entry in transcript)

        return title, channel, script
    except Exception as e:
        print(f"Error: {e}. video_url: `{video_url}`")
        return None, None, None


def extract_n_save_info(yaml_file: str):
    """Fill in the title and channel of every video listed in a YAML file.

    Each entry's 'url' is looked up with get_youtube_script; when both a title
    and a channel come back they are stored on the entry. The file is then
    rewritten in place with the updated list.
    """
    with open(yaml_file, 'r') as source:
        videos = yaml.safe_load(source)

    for entry in tqdm(videos):
        fetched_title, fetched_channel, _ = get_youtube_script(entry['url'])
        if not (fetched_title and fetched_channel):
            # Lookup failed or came back incomplete: leave the entry untouched
            continue
        entry['title'] = fetched_title
        entry['channel'] = fetched_channel

    # Persist the enriched list over the original file
    with open(yaml_file, 'w') as target:
        yaml.dump(videos, target)

def process_title(title: str) -> str:
    """Turn a title into a lowercase, underscore-separated slug.

    Characters other than word characters, whitespace and hyphens are dropped,
    then each run of whitespace becomes a single underscore.
    """
    import re
    without_punctuation = re.sub(r'[^\w\s-]', '', title)
    return re.sub(r'\s+', '_', without_punctuation).lower()

def process_videos(yaml_file: str):
    """Normalise titles, backfill channels and sort a YAML list of videos.

    Every entry that has a 'title' gets it rewritten by process_title. Entries
    with a missing or None 'channel' are looked up with get_youtube_script
    (the result may still be None). The list is sorted by channel, with None
    channels last, and written back to the same file.
    """
    with open(yaml_file, 'r') as source:
        videos: list = yaml.safe_load(source)

    for entry in tqdm(videos):
        if 'title' in entry:
            entry['title'] = process_title(entry['title'])

        # Covers both an absent key and an explicit None value
        if entry.get('channel') is None:
            _, fetched_channel, _ = get_youtube_script(entry['url'])
            entry['channel'] = fetched_channel

    # The leading boolean pushes entries without a channel to the end
    videos.sort(key=lambda item: (item['channel'] is None, item['channel']))

    with open(yaml_file, 'w') as target:
        yaml.dump(videos, target)


def save_youtube_script(video_url: str, output_file: str):
    """Write one video's title, channel and transcript to `output_file`.

    Nothing is written, and the file is not created, when no transcript text
    was obtained.
    """
    title, channel, script = get_youtube_script(video_url)
    if not script:
        return
    with open(output_file, 'w', encoding='utf-8') as out:
        out.write(f"Title: {title}\nChannel: {channel}\nScript: {script}\n\n")

def save_youtube_scripts(yaml_file: str, output_file: str, duration_limit: int):
    """Write the transcripts of all videos in a YAML list to numbered part files.

    Parts are named ``<base>_<index><ext>``, derived from `output_file`, and
    live in its directory. A new part is started whenever adding the next
    video would push the current part's summed 'length' above the limit.

    Args:
        yaml_file: YAML file holding a list of entries with 'url' and
            'length' keys.
        output_file: Path used as the naming template for the part files.
        duration_limit: Maximum summed 'length' per part, in the same unit as
            the entries' 'length' values. A single video longer than the
            limit still gets written, in a part of its own.

    Videos for which title, channel or transcript could not be fetched are
    skipped.
    """
    # Split the output file into directory and file name
    directory = os.path.dirname(output_file)
    base_name, extension = os.path.splitext(os.path.basename(output_file))

    # os.makedirs('') raises FileNotFoundError, so only create a directory
    # when the output path actually names one.
    if directory:
        os.makedirs(directory, exist_ok=True)

    # Load the YAML file; an empty file parses to None
    with open(yaml_file, 'r') as file:
        videos = yaml.safe_load(file) or []

    def part_path(index):
        return os.path.join(directory, f'{base_name}_{index}{extension}')

    total_duration = 0
    file_index = 0
    part_has_content = False
    file_handle = open(part_path(file_index), 'w', encoding='utf-8')
    try:
        for video in tqdm(videos):
            title, channel, script = get_youtube_script(video['url'])
            if not (title and channel and script):
                continue

            video_duration = video['length']
            # Rotate only when the current part already holds a video;
            # otherwise an over-long video would leave an empty part behind.
            if part_has_content and total_duration + video_duration > duration_limit:
                file_handle.close()
                file_index += 1
                file_handle = open(part_path(file_index), 'w', encoding='utf-8')
                total_duration = 0
                part_has_content = False

            file_handle.write(f"Title: {title}\nChannel: {channel}\nScript: {script}\n\n" + "-"*80 + "\n\n")
            total_duration += video_duration
            part_has_content = True
    finally:
        # Close the active part even if a lookup or write fails mid-run
        file_handle.close()

In [None]:
# extract_n_save_info(
#     yaml_file="tmp/script_urls/prompt_engineering.yaml",
# )

# process_videos(
# 	yaml_file="tmp/script_urls/prompt_engineering.yaml",
# )

In [None]:
# Export the transcripts of every video listed in the YAML file as numbered parts
save_youtube_scripts(
    yaml_file="tmp/script_urls/prompt_engineering.yaml",
    output_file="./tmp/scripts/prompt_engineering/youtube_scripts.txt",
    duration_limit=60,
)



# GPT

In [None]:
from tqdm import tqdm
import config

from langchain.prompts import PromptTemplate
from langchain_openai import OpenAI
from langchain_openai import ChatOpenAI

In [None]:
youtube_url = "https://www.youtube.com/watch?v=vU2S6dVf79M"
# get_youtube_script returns (title, channel, script). Only the transcript
# text (index 2) can be written to the file, and it is None if fetching failed.
script = get_youtube_script(youtube_url)[2]
if script is None:
    raise RuntimeError(f"No transcript could be fetched for {youtube_url}")

with open("./data/llm_input.txt", 'w', encoding='utf-8') as file:
    file.write(script)

In [None]:
file_path = './data/llm_input.txt'

# Load the whole input text into memory
with open(file_path, 'r', encoding='utf-8') as file:
    content = file.read()

# Blank-line-separated chunks, trimmed, with empty ones discarded
paragraphs = list(filter(None, map(str.strip, content.split('\n\n'))))

# gpt-3.5-turbo-0125, gpt-3.5-turbo-instruct
llm = OpenAI(model='gpt-3.5-turbo-instruct', temperature=0)

# Prompt text for the summarisation request. It lays out the required output
# structure and guidelines; the {text} placeholder at the end is the template
# variable that receives the script to summarise.
template = """\
Your output should use the following template:

# Keywords/Entities/Concepts/Complex Words

## Name
- Definition: Explanation, Core Meaning,  Key Features, Essential Attributes, Distinguishing Traits
- Types: Varieties, Classifications, Different Forms
- Usage: Practical Applications, Common Scenarios, Real-world Examples
- Benefits, Challenges/Limitations/Issues
- Others: Additional Insights, Miscellaneous Information, Noteworthy Details, History, Related Concepts

# Techniques

## Name
- Description: Overview of the technique. Explanation of the fundamental concept/idea behind the technique.
- Components: Breakdown of the key elements or parts involved in the technique.
- Pipeline: Stages/Steps that outline the process flow of the technique, illustrating how data or tasks move through the system.
- Implementation: Details on how to apply or integrate the technique. Recommended guidelines, strategies, Best Practices for using the technique effectively.
- Use Cases: Examples and scenarios where the technique is particularly useful.
- Advantages: Discussion of the benefits and strengths of the technique.
- Limitations: Identification of any drawbacks or constraints associated with the technique. Potential mistakes or issues to be aware of when implementing the technique.

Apply the following guidelines:
- Create a detailed summary of the YouTube video using its transcription.
- Extract important keywords from the transcript.
- Identify complex words that may be unfamiliar to the average reader.
- Extract techniques mentioned in the video.
- If a keyword and a technique share the same name, combine them into one section.
- Ensure that explanations are derived from the entire script.
- Provide a comprehensive and clear understanding of the video's content.
- Don't make it up. Only output content from the script only.

Here is the script:
{text}"""
# Build the prompt object from the raw template string
prompt_template = PromptTemplate.from_template(template)

model = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0)
# Pipe the filled-in prompt into the chat model
chain = prompt_template | model

output_file_path = './data/llm_output.txt'
# Append mode: output from earlier runs stays in the file
with open(output_file_path, 'a', encoding='utf-8') as f:
    for paragraph in tqdm(paragraphs):
        result = chain.invoke({"text": paragraph}).content
        # One write per result; writelines() on a str would emit it
        # character by character.
        f.write(result + "\n\n")