In [None]:
import nltk
# Fetch every NLTK resource the preprocessing cells rely on, so the notebook
# still works after "Restart Kernel -> Run All" on a machine with no nltk_data.
nltk.download('stopwords')                       # stopwords.words('english')
nltk.download('wordnet')                         # WordNetLemmatizer
# word_tokenize and pos_tag need their own models; the resource name depends on
# the installed NLTK release, so both the older and the newer names are fetched.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('maxent_ne_chunker')
nltk.download('words')

In [None]:
from youtube_transcript_api import YouTubeTranscriptApi

import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from collections import Counter
from spellchecker import SpellChecker

# YouTube

In [None]:

def _extract_video_id(video_url):
    """Pull the video ID out of a YouTube URL.

    Supported shapes:
      * ``.../watch?v=<id>`` with or without extra query parameters
      * ``youtu.be/<id>`` short links
      * ``.../embed/<id>``, ``.../shorts/<id>``, ``.../live/<id>``

    Args:
        video_url: URL string pointing at a YouTube video.

    Returns:
        The video ID as a string.

    Raises:
        ValueError: if no video ID can be located in ``video_url``.
    """
    # Function-scope import keeps this cell self-contained.
    from urllib.parse import parse_qs, urlparse

    parsed = urlparse(video_url.strip())

    # Standard watch URL: the ID is the value of the ``v`` query parameter,
    # so trailing parameters such as ``&t=10s`` are not part of it.
    query_ids = parse_qs(parsed.query).get("v")
    if query_ids:
        return query_ids[0]

    host = (parsed.hostname or "").lower()
    segments = [segment for segment in parsed.path.split("/") if segment]
    if (host == "youtu.be" or host.endswith(".youtu.be")) and segments:
        return segments[0]
    if len(segments) >= 2 and segments[0] in ("embed", "shorts", "live"):
        return segments[1]

    # Last resort for strings that are not well-formed URLs but still carry
    # a ``v=<id>`` fragment (the only form the previous implementation accepted).
    if "v=" in video_url:
        candidate = video_url.split("v=", 1)[1]
        for delimiter in ("&", "#"):
            candidate = candidate.split(delimiter, 1)[0]
        if candidate:
            return candidate

    raise ValueError(f"Could not find a video ID in URL: {video_url!r}")


def get_youtube_script(video_url):
    """Download a video's transcript and return it as one string.

    Args:
        video_url: URL of the YouTube video (watch, short-link, embed, ...).

    Returns:
        The transcript entries' ``text`` fields joined by single spaces, or
        ``None`` if the ID could not be extracted or the transcript request
        failed (the error is printed in that case).
    """
    try:
        video_id = _extract_video_id(video_url)
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        return ' '.join(entry['text'] for entry in transcript)
    except Exception as e:
        # Best-effort by design: callers check for None instead of catching.
        print(f"Error: {e}")
        return None

In [None]:
# # Example usage:
# youtube_url = "https://www.youtube.com/watch?v=RYZ0FMAKRFs"
# script = get_youtube_script(youtube_url)

# if script:
#     print(script)
# else:
#     print("Failed to retrieve script.")

# Preprocess

In [None]:
def preprocess_text(text):
    """Reduce raw text to a space-separated string of recurring content words.

    Pipeline, in order:
      1. Lowercase and delete every character that is not ``a-z`` or whitespace.
      2. Tokenize, drop English stopwords, lemmatize each token as a verb.
      3. Drop filler words and apply a small replacement map.
      4. POS-tag and keep only tokens tagged exactly ``NN``, ``VB``, ``JJ`` or ``RB``.
      5. Keep tokens that occur more than once but fewer than half as many
         times as there are tokens left after step 4.
      6. Drop tokens of two characters or fewer.

    Args:
        text: The raw input string.

    Returns:
        The surviving tokens, in original order, joined by single spaces.
    """
    # Convert to lowercase and remove special characters, numbers, and punctuation
    text = re.sub(r'[^a-z\s]', '', text.lower())

    lemmatizer = WordNetLemmatizer()

    # Additional custom stopwords
    custom_stopwords = []

    # Build the exclusion set once instead of re-creating the union per token.
    excluded = set(stopwords.words('english')) | set(custom_stopwords)

    # The text is already lowercase, so tokens can be compared directly.
    tokens = [lemmatizer.lemmatize(token, pos='v')
              for token in word_tokenize(text) if token not in excluded]

    # Filler words; entries such as "ill" and "lets" are contractions whose
    # apostrophe was stripped by the regex above.
    words_to_remove = {"ill", "easily", "well", "lets", "usually", "basically", "basic", "okay", "just", "really", "simply",
                       "literally", "quite", "actually", "definitely", "totally", "seriously", "probably", "absolutely", "hopefully", "clearly"}
    tokens = [token for token in tokens if token not in words_to_remove]

    replaced_words = {"leverage": "use", "aai": "ai"}
    tokens = [replaced_words.get(token, token) for token in tokens]

    # Part-of-speech tagging and filter out non-meaningful words
    kept_tags = {'NN', 'VB', 'JJ', 'RB'}
    meaningful_tokens = [word for word, pos in pos_tag(tokens) if pos in kept_tags]

    # Word frequency analysis
    word_frequencies = Counter(meaningful_tokens)

    # Remove very high and very low-frequency words; the upper bound is based
    # on the token count before this filter, so compute it once up front.
    upper_bound = len(meaningful_tokens) / 2
    meaningful_tokens = [word for word in meaningful_tokens
                         if 1 < word_frequencies[word] < upper_bound]

    # Remove very short words
    meaningful_tokens = [word for word in meaningful_tokens if len(word) > 2]

    # Join the meaningful tokens back into a string
    return ' '.join(meaningful_tokens)


def process_input_file(input_filename, output_filename):
    """Run ``preprocess_text`` over a text file and save the result.

    Args:
        input_filename: Path of the UTF-8 text file to read.
        output_filename: Path the processed text is written to (overwritten).

    Returns:
        None. Any exception raised along the way is printed, not propagated.
    """
    try:
        # Load and clean the source text.
        with open(input_filename, 'r', encoding='utf-8') as source:
            cleaned = preprocess_text(source.read())

        # Persist the cleaned text.
        with open(output_filename, 'w', encoding='utf-8') as sink:
            sink.write(cleaned)

        print(f"Processing completed. Output written to {output_filename}")

    except Exception as e:
        print(f"Error: {e}")



In [None]:
# # Example usage:
# input_file = "input.txt"
# output_file = "output.txt"
# process_input_file(input_file, output_file)

# Process YouTube Script

In [None]:
def process_youtube_url(video_url, output_filename):
    """Fetch a video's transcript, preprocess it and save it to a file.

    Args:
        video_url: URL of the YouTube video whose transcript is wanted.
        output_filename: Path the processed text is written to (overwritten).

    Returns:
        None. Prints a failure message and writes nothing when
        ``get_youtube_script`` yields no transcript.
    """
    transcript_text = get_youtube_script(video_url)

    # Nothing to process: report and bail out before touching the output file.
    if not transcript_text:
        print("Failed to retrieve script.")
        return

    cleaned = preprocess_text(transcript_text)

    with open(output_filename, 'w', encoding='utf-8') as sink:
        sink.write(cleaned)

    print(f"Processing completed. Output written to {output_filename}")


In [None]:
# # Example usage:
# youtube_url = "https://www.youtube.com/watch?v=RYZ0FMAKRFs"
# output_file = "./data/youtube_processed.txt"
# process_youtube_url(youtube_url, output_file)

In [None]:
from langchain.document_loaders.text import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Read the transcript as UTF-8 explicitly. Every other file operation in this
# notebook uses UTF-8; without it the loader's decoding is platform-dependent.
text_loader = TextLoader("./data/youtube_input.txt", encoding="utf-8")
text = text_loader.load()

# Cut the loaded document into overlapping chunks of at most 12500 characters,
# with 2000 characters shared between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=12500, chunk_overlap=2000)
docs = text_splitter.split_documents(text)

# Keep only the chunk text and separate chunks with a blank line, which is the
# paragraph delimiter used when the file is read back later in the notebook.
output = [doc.page_content for doc in docs]
output_splitted = "\n\n".join(output)

with open("./data/youtube_output.txt", 'w', encoding='utf-8') as file:
    file.write(output_splitted)

# GPT

In [15]:
from tqdm import tqdm
import config

from langchain.prompts import PromptTemplate
from langchain_openai import OpenAI
from langchain_openai import ChatOpenAI

In [18]:
youtube_url = "https://www.youtube.com/watch?v=vU2S6dVf79M"
script = get_youtube_script(youtube_url)

# get_youtube_script returns None on failure. Stop before opening the file:
# mode 'w' would truncate an existing llm_input.txt and write(None) would then
# raise a TypeError, leaving an empty file behind.
if script is None:
    raise RuntimeError(f"Could not retrieve a transcript for {youtube_url}")

with open("./data/llm_input.txt", 'w', encoding='utf-8') as file:
    file.write(script)

In [19]:
file_path = './data/llm_input.txt'

# Pull the whole transcript into memory.
with open(file_path, 'r', encoding='utf-8') as file:
    content = file.read()

# Blank lines delimit paragraphs: trim each piece and keep the non-empty ones.
paragraphs = [
    chunk
    for chunk in (piece.strip() for piece in content.split('\n\n'))
    if chunk
]

# Model names used in this cell: gpt-3.5-turbo-0125, gpt-3.5-turbo-instruct
# NOTE(review): `llm` is not referenced by the chain built below (which uses
# `model`); confirm no other cell needs it before removing.
llm = OpenAI(model='gpt-3.5-turbo-instruct', temperature=0)

# Prompt sent once per paragraph. `{text}` is the only placeholder and is
# filled with the transcript paragraph at invocation time.
template = """\
Your output should use the following template:

# Keywords/Entities/Concepts/Complex Words

## Name
- Definition: Explanation, Core Meaning,  Key Features, Essential Attributes, Distinguishing Traits
- Types: Varieties, Classifications, Different Forms
- Usage: Practical Applications, Common Scenarios, Real-world Examples
- Benefits, Challenges/Limitations/Issues
- Others: Additional Insights, Miscellaneous Information, Noteworthy Details, History, Related Concepts

# Techniques

## Name
- Description: Overview of the technique. Explanation of the fundamental concept/idea behind the technique.
- Components: Breakdown of the key elements or parts involved in the technique.
- Pipeline: Stages/Steps that outline the process flow of the technique, illustrating how data or tasks move through the system.
- Implementation: Details on how to apply or integrate the technique. Recommended guidelines, strategies, Best Practices for using the technique effectively.
- Use Cases: Examples and scenarios where the technique is particularly useful.
- Advantages: Discussion of the benefits and strengths of the technique.
- Limitations: Identification of any drawbacks or constraints associated with the technique. Potential mistakes or issues to be aware of when implementing the technique.

Apply the following guidelines:
- Create a detailed summary of the YouTube video using its transcription.
- Extract important keywords from the transcript.
- Identify complex words that may be unfamiliar to the average reader.
- Extract techniques mentioned in the video.
- If a keyword and a technique share the same name, combine them into one section.
- Ensure that explanations are derived from the entire script.
- Provide a comprehensive and clear understanding of the video's content.
- Don't make it up. Only output content from the script only.

Here is the script:
{text}"""
prompt_template = PromptTemplate.from_template(template)

# temperature=0 is passed to both model constructors in this cell.
model = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0)
# Compose the prompt template and the chat model into one invocable chain.
chain = prompt_template | model

output_file_path = './data/llm_output.txt'
# NOTE: append mode - re-running this cell adds to the existing file instead
# of replacing it.
with open(output_file_path, 'a', encoding='utf-8') as f:
    # Iterate the paragraphs directly; tqdm still reports progress per item.
    for paragraph in tqdm(paragraphs):
        result = chain.invoke({"text": paragraph}).content
        # write(), not writelines(): the payload is one string, and
        # writelines() would iterate it character by character.
        f.write(result + "\n\n")

100%|██████████| 1/1 [00:04<00:00,  4.97s/it]
