# YouTube

In [57]:
import os
from urllib.parse import parse_qs, urlparse

from youtube_transcript_api import YouTubeTranscriptApi
from pytube import YouTube

def get_youtube_script(video_url: str) -> "tuple[str | None, str | None]":
    """Fetch the title and the full transcript text of a YouTube video.

    The video ID is read from the URL's ``v`` query parameter, so additional
    parameters such as ``&list=...``, ``&index=...`` or ``&pp=...`` are not
    mistaken for part of the ID. ``youtu.be/<id>`` short links are accepted
    as well.

    Args:
        video_url: A watch URL (``...?v=<id>``) or a ``youtu.be`` short link.

    Returns:
        ``(title, script)``, where ``script`` is every transcript entry's
        ``'text'`` joined with single spaces. If anything fails (no video ID
        in the URL, title lookup error, transcript lookup error) the error is
        printed and ``(None, None)`` is returned instead of raising.
    """
    try:
        # Get video ID
        parsed_url = urlparse(video_url)
        candidate_ids = parse_qs(parsed_url.query).get("v", [])
        if candidate_ids:
            video_id = candidate_ids[0]
        elif parsed_url.hostname in ("youtu.be", "www.youtu.be"):
            # Short links carry the ID as the first path segment.
            video_id = parsed_url.path.strip("/").split("/")[0]
        else:
            video_id = ""
        if not video_id:
            raise ValueError(f"could not find a video ID in URL: {video_url!r}")

        # Get video title
        title = YouTube(video_url).title

        # Get transcript
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        script = ' '.join(entry['text'] for entry in transcript)

        return title, script
    except Exception as e:
        # Deliberately broad: callers process many URLs and rely on a
        # (None, None) result to skip the ones that fail.
        print(f"Error: {e}")
        return None, None


def save_youtube_script(video_url: str, output_file: str):
    """Fetch one video's title and transcript and write them to a file.

    Nothing is written (and no directory is created) when the transcript
    could not be retrieved.

    Args:
        video_url: URL passed on to ``get_youtube_script``.
        output_file: Destination path. It is opened in ``'w'`` mode, so an
            existing file is overwritten. Missing parent directories are
            created.
    """
    title, script = get_youtube_script(video_url)
    if not script:
        return

    # open() does not create folders; without this a missing output folder
    # raises FileNotFoundError after the transcript was already fetched.
    parent_dir = os.path.dirname(output_file)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_file, 'w', encoding='utf-8') as file:
        file.write(f"{title}\n{script}\n\n")

def save_youtube_scripts(video_urls: list[str], output_file: str):
    """Fetch several videos' transcripts and write them all to one file.

    URLs for which ``get_youtube_script`` yields no title or no script are
    skipped. Each remaining video is written as a ``Title:`` / ``Script:``
    pair followed by a line of 80 dashes.

    Args:
        video_urls: URLs passed one by one to ``get_youtube_script``.
        output_file: Destination path. It is opened in ``'w'`` mode, so an
            existing file is overwritten. Missing parent directories are
            created.
    """
    # Create the destination folder before fetching anything: open() does not
    # create folders, and failing only after every transcript has been
    # downloaded would throw all of that work away.
    parent_dir = os.path.dirname(output_file)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    scripts = []
    for video_url in video_urls:
        title, script = get_youtube_script(video_url)
        if title and script:
            scripts.append((title, script))

    separator = "-" * 80
    with open(output_file, 'w', encoding='utf-8') as file:
        for title, script in scripts:
            file.write(f"Title: {title}\nScript: {script}\n\n{separator}\n\n")

In [63]:
# Fetch the transcript of every uncommented URL in the list below and write
# them all to ./tmp/youtube_scripts.txt. save_youtube_scripts opens that file
# in 'w' mode, so each run replaces the previous contents. Commented-out URLs
# are not fetched; uncomment a batch to include it in the next run.
save_youtube_scripts(
	video_urls=[
		# 'https://www.youtube.com/watch?v=os-JX1ZQwIA&pp=ygUzQ2hhdEdQVCBHdWlkZTogMTB4IFlvdXIgUmVzdWx0cyB3aXRoIEJldHRlciBQcm9tcHRz',
		# 'https://www.youtube.com/watch?v=Uz_DeqGhbjs',
		# 'https://www.youtube.com/watch?v=Gidc185wnEA',
		# 'https://www.youtube.com/watch?v=NO7eeQJIU3Y',
		# 'https://www.youtube.com/watch?v=pjcl-hjnpfA&list=PL6o08pkcQol7-TlFJl05pEEp4hw418DmM&index=99',
  
		# 'https://www.youtube.com/watch?v=QmA7S2iGBjk&list=PL6o08pkcQol7-TlFJl05pEEp4hw418DmM&index=90',
		# 'https://www.youtube.com/watch?v=C7GvPMsvSUU&list=PL6o08pkcQol7-TlFJl05pEEp4hw418DmM&index=88',
		# 'https://www.youtube.com/watch?v=ao_OZ_bzMP8&list=PL6o08pkcQol7-TlFJl05pEEp4hw418DmM&index=79',
		# 'https://www.youtube.com/watch?v=O52nKYk0lbE&list=PL6o08pkcQol7-TlFJl05pEEp4hw418DmM&index=77',
		# 'https://www.youtube.com/watch?v=XvCq4nPqE0Y&list=PL6o08pkcQol7-TlFJl05pEEp4hw418DmM&index=74',

		# 'https://www.youtube.com/watch?v=Qos2rG3zVAM&list=PL6o08pkcQol7-TlFJl05pEEp4hw418DmM&index=68',
		# 'https://www.youtube.com/watch?v=2CEgqX15a7s&pp=ygVSQUkgRXhwZXJ0IEFuc3dlcnMgUHJvbXB0IEVuZ2luZWVyaW5nIFF1ZXN0aW9ucyBGcm9tIFR3aXR0ZXIgfCBUZWNoIFN1cHBvcnQgfCBXSVJFRA%3D%3D',
		# 'https://www.youtube.com/watch?v=_ZvnD73m40o',
		# 'https://www.youtube.com/watch?v=1c9iyoVIwDs',
		# 'https://www.youtube.com/watch?v=jC4v5AS4RIM',

		# 'https://www.youtube.com/watch?v=jNNatjruXx8&pp=ygUyRGlzY292ZXIgUHJvbXB0IEVuZ2luZWVyaW5nIHwgR29vZ2xlIEFJIEVzc2VudGlhbHM%3D',
		# 'https://www.youtube.com/watch?v=AxfmzLz9xXM',
		# 'https://www.youtube.com/watch?v=zSyo6YtdLnI',
		# 'https://www.youtube.com/watch?v=aOm75o2Z5-o',
		# 'https://www.youtube.com/watch?v=lanRWSAOkrw',

		# 'https://www.youtube.com/watch?v=3jxfk6nH5qk',
		# 'https://www.youtube.com/watch?v=RASh7C9Pm7I',
		# 'https://www.youtube.com/watch?v=pmzZF2EnKaA',
		# 'https://www.youtube.com/watch?v=ydjRYmM19DY',
		# 'https://www.youtube.com/watch?v=0cf7vzM_dZ0',

		# 'https://www.youtube.com/watch?v=aOm75o2Z5-o',
		# 'https://www.youtube.com/watch?v=NBgF8Fi1dNQ',
		# 'https://www.youtube.com/watch?v=cgBVHj9DXXY',
		# 'https://www.youtube.com/watch?v=lTI4FyO0ul8',
		# 'https://www.youtube.com/watch?v=mBYu5NoXBcs&pp=ygUhQ2hhdEdQVCBQcm9tcHQgRW5naW5lZXJpbmcgQ291cnNl',

		# 'https://www.youtube.com/watch?v=jlqgGkh1wzY',
		# 'https://www.youtube.com/watch?v=kr5X3QvPzvM',
		# 'https://www.youtube.com/watch?v=Vx6VwdhDCEc',
		# 'https://www.youtube.com/watch?v=_fdGmlnR0sY',
		# 'https://www.youtube.com/watch?v=6hqExG7mL5c',

		# 'https://www.youtube.com/watch?v=AV1wk4Q4XqI',
		# 'https://www.youtube.com/watch?v=_6lv6yeltW0',
		# 'https://www.youtube.com/watch?v=oipNUWr81Ec',
		# 'https://www.youtube.com/watch?v=2zg3V66-Fzs',
		# 'https://www.youtube.com/watch?v=2sEujkZ79E4',

		# 'https://www.youtube.com/watch?v=vpWHsCTD_Yw',
		# 'https://www.youtube.com/watch?v=BDlmT1z2IJE',
		# 'https://www.youtube.com/watch?v=jC4v5AS4RIM',
		# 'https://www.youtube.com/watch?v=3jxfk6nH5qk',
		# 'https://www.youtube.com/watch?v=aq7fnqzeaPc',

		# 'https://www.youtube.com/watch?v=MSQyu-Am36U',
		# 'https://www.youtube.com/watch?v=FAC27vCMwAs',
		# 'https://www.youtube.com/watch?v=idknpGjs2-I&list=PL6o08pkcQol7-TlFJl05pEEp4hw418DmM&index=20',
		# 'https://www.youtube.com/watch?v=Kar2qfLDQ2c&list=PL6o08pkcQol7-TlFJl05pEEp4hw418DmM&index=21',
		# 'https://www.youtube.com/watch?v=9zS0Wh9WbuE&list=PL6o08pkcQol7-TlFJl05pEEp4hw418DmM&index=23',

		# 'https://www.youtube.com/watch?v=dIz32o3m9Z8&list=PL6o08pkcQol7-TlFJl05pEEp4hw418DmM&index=43',
		# 'https://www.youtube.com/watch?v=Zv0GNvzsHh4&list=PL6o08pkcQol7-TlFJl05pEEp4hw418DmM&index=44',
		# 'https://www.youtube.com/watch?v=S4GfRQ9zIj4&list=PL6o08pkcQol7-TlFJl05pEEp4hw418DmM&index=45',
		# 'https://www.youtube.com/watch?v=j320H2LFx-U&list=PL6o08pkcQol7-TlFJl05pEEp4hw418DmM&index=47',
		# 'https://www.youtube.com/watch?v=PFK5g_kxhVM&list=PL6o08pkcQol7-TlFJl05pEEp4hw418DmM&index=48',

		# 'https://www.youtube.com/watch?v=g30slJamX4c&list=PL6o08pkcQol7-TlFJl05pEEp4hw418DmM&index=53',
		'https://www.youtube.com/watch?v=zNACfPuaqaI&list=PL6o08pkcQol7-TlFJl05pEEp4hw418DmM&index=55',
		# '',
		# '',
		# '',
  ],
	output_file="./tmp/youtube_scripts.txt"
)

In [None]:
# # Example usage:
# youtube_url = "https://www.youtube.com/watch?v=RYZ0FMAKRFs"
# title, script = get_youtube_script(youtube_url)  # returns a (title, script) pair

# if script:
#     print(script)
# else:
#     print("Failed to retrieve script.")

In [None]:
# # Example usage:
# input_file = "input.txt"
# output_file = "output.txt"
# process_input_file(input_file, output_file)

# GPT

In [None]:
from tqdm import tqdm
import config

from langchain.prompts import PromptTemplate
from langchain_openai import OpenAI
from langchain_openai import ChatOpenAI

In [None]:
youtube_url = "https://www.youtube.com/watch?v=vU2S6dVf79M"

# get_youtube_script returns a (title, script) pair, so unpack it; handing the
# whole tuple to file.write() raises TypeError.
title, script = get_youtube_script(youtube_url)
if script is None:
    # Fail before opening the file so an existing llm_input.txt is not
    # truncated to an empty file when the transcript could not be fetched.
    raise RuntimeError(f"Could not fetch a transcript for {youtube_url}")

with open("./data/llm_input.txt", 'w', encoding='utf-8') as file:
    file.write(script)

In [None]:
file_path = './data/llm_input.txt'

# Read the content of the file
with open(file_path, 'r', encoding='utf-8') as file:
    content = file.read()

# Split the content into paragraphs on blank lines (two consecutive newlines)
# and drop paragraphs that are empty or whitespace-only.
paragraphs = [
    paragraph.strip()
    for paragraph in content.split('\n\n')
    if paragraph.strip()
]

# gpt-3.5-turbo-0125, gpt-3.5-turbo-instruct
# NOTE(review): `llm` is not used anywhere in this cell (the chain below is
# built on `model`); kept in case another cell relies on it.
llm = OpenAI(model='gpt-3.5-turbo-instruct', temperature=0)

# Prompt sent once per paragraph; {text} is replaced by the paragraph.
template = """\
Your output should use the following template:

# Keywords/Entities/Concepts/Complex Words

## Name
- Definition: Explanation, Core Meaning,  Key Features, Essential Attributes, Distinguishing Traits
- Types: Varieties, Classifications, Different Forms
- Usage: Practical Applications, Common Scenarios, Real-world Examples
- Benefits, Challenges/Limitations/Issues
- Others: Additional Insights, Miscellaneous Information, Noteworthy Details, History, Related Concepts

# Techniques

## Name
- Description: Overview of the technique. Explanation of the fundamental concept/idea behind the technique.
- Components: Breakdown of the key elements or parts involved in the technique.
- Pipeline: Stages/Steps that outline the process flow of the technique, illustrating how data or tasks move through the system.
- Implementation: Details on how to apply or integrate the technique. Recommended guidelines, strategies, Best Practices for using the technique effectively.
- Use Cases: Examples and scenarios where the technique is particularly useful.
- Advantages: Discussion of the benefits and strengths of the technique.
- Limitations: Identification of any drawbacks or constraints associated with the technique. Potential mistakes or issues to be aware of when implementing the technique.

Apply the following guidelines:
- Create a detailed summary of the YouTube video using its transcription.
- Extract important keywords from the transcript.
- Identify complex words that may be unfamiliar to the average reader.
- Extract techniques mentioned in the video.
- If a keyword and a technique share the same name, combine them into one section.
- Ensure that explanations are derived from the entire script.
- Provide a comprehensive and clear understanding of the video's content.
- Don't make it up. Only output content from the script only.

Here is the script:
{text}"""
prompt_template = PromptTemplate.from_template(template)

model = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0)
chain = prompt_template | model

# Results are appended ('a' mode), so re-running this cell adds to the
# existing output file rather than replacing it.
output_file_path = './data/llm_output.txt'
with open(output_file_path, 'a', encoding='utf-8') as f:
    for paragraph in tqdm(paragraphs):
        result = chain.invoke({"text": paragraph}).content
        # write(), not writelines(): writelines() on a str iterates it
        # one character at a time.
        f.write(result + "\n\n")