In [None]:
import nltk
# One-time download of the NLTK data this notebook relies on.
# 'stopwords' and 'wordnet' back the stop-word list and the lemmatizer used in
# the preprocessing cell; 'maxent_ne_chunker' and 'words' are kept from the
# original cell.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('maxent_ne_chunker')
nltk.download('words')

# word_tokenize and pos_tag (used in the preprocessing cell) need a tokenizer
# model and a tagger model; without them a fresh environment raises LookupError.
# Older NLTK releases look these up as 'punkt' / 'averaged_perceptron_tagger',
# newer ones as 'punkt_tab' / 'averaged_perceptron_tagger_eng', so both
# spellings are fetched.
for resource in ('punkt', 'punkt_tab',
                 'averaged_perceptron_tagger', 'averaged_perceptron_tagger_eng'):
    nltk.download(resource)

# YouTube

In [7]:
from youtube_transcript_api import YouTubeTranscriptApi


def _extract_video_id(video_url):
    """Return the video id contained in a YouTube URL.

    Recognised forms:
      * ``.../watch?v=<id>`` with or without further query parameters
        (``&t=...``, ``&list=...``) in any order,
      * ``youtu.be/<id>`` short links,
      * ``/embed/<id>``, ``/shorts/<id>``, ``/live/<id>`` and ``/v/<id>`` paths,
      * a bare string containing ``v=<id>`` (legacy fallback).

    Raises:
        ValueError: if no video id can be found in ``video_url``.
    """
    # Function-scope import so this cell does not depend on another cell's imports.
    from urllib.parse import parse_qs, urlparse

    url = video_url.strip()
    parts = urlparse(url)

    # Regular watch URL: the id is the value of the "v" query parameter.
    query_ids = parse_qs(parts.query).get("v")
    if query_ids:
        return query_ids[0]

    # Path-based URLs: the id is a path segment.
    segments = [segment for segment in parts.path.split("/") if segment]
    host = (parts.hostname or "").lower()
    if host in ("youtu.be", "www.youtu.be") and segments:
        return segments[0]
    if len(segments) >= 2 and segments[0] in ("embed", "shorts", "live", "v"):
        return segments[1]

    # Legacy fallback for strings that have no parseable query part: take what
    # follows "v=" and cut it at the next parameter/fragment separator.
    if "v=" in url:
        candidate = url.split("v=", 1)[1]
        for separator in ("&", "#", "?"):
            candidate = candidate.split(separator, 1)[0]
        if candidate:
            return candidate

    raise ValueError(f"Could not find a video id in URL: {video_url!r}")


def get_youtube_script(video_url):
    """Fetch a YouTube video's transcript and return it as one string.

    Args:
        video_url: URL of the video (watch, short-link, embed or shorts form).

    Returns:
        The transcript entries' ``'text'`` values joined with single spaces,
        or ``None`` if the id could not be extracted or the transcript could
        not be fetched. In the failure case the error is printed as
        ``Error: <message>``.
    """
    try:
        video_id = _extract_video_id(video_url)
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        return ' '.join(entry['text'] for entry in transcript)
    except Exception as e:
        # Deliberately broad: any failure (bad URL, network error, missing
        # transcript) is reported and turned into None for the caller to check.
        print(f"Error: {e}")
        return None


# Example usage: fetch one transcript and print it, or report the failure.
youtube_url = "https://www.youtube.com/watch?v=JEBDfGqrAUA"
script = get_youtube_script(youtube_url)

# get_youtube_script returns None on failure; an empty transcript is treated the same way.
print(script if script else "Failed to retrieve script.")

in this course I'll teach you how to use Vector search and embeddings to easily combine your data with large language models like GPT 4 first I'll teach you about the concepts and then I'll guide you through developing three projects in the first project we'll build a semantic search feature to find movies using natural language queries for this we'll use Python machine learning models and at list Vector search next we'll create a simple question answering app that uses the rag architecture and Alis Vector search to answer questions using your own data and in the final project will modify a chat GPT clone so it answers questions about contributing to the freeco camp.org curriculum based on the official documentation and if you like you can use your own data or documentation the first two examples use Python and the third uses JavaScript but you should be able to follow along with just a basic knowledge of either mongodb provided a grant that made this course possible their Atlas Vector

# Preprocess

In [2]:
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from collections import Counter
from spellchecker import SpellChecker


def preprocess_text(text, custom_stopwords=None):
    """Reduce raw text to a space-separated string of recurring content words.

    Steps, in order:
      1. Lowercase the text and delete every character that is not ``a-z`` or
         whitespace (digits, punctuation and apostrophes all disappear, so
         "I'll" becomes "ill").
      2. Tokenize, drop NLTK's English stop words plus ``custom_stopwords``,
         and lemmatize each remaining token as a verb.
      3. Drop a fixed list of filler words, then apply fixed replacements
         (``leverage`` -> ``use``, ``aai`` -> ``ai``).
      4. POS-tag the tokens and keep those tagged exactly NN, VB, JJ or RB.
      5. Keep tokens that occur more than once but fewer times than half the
         number of tokens left after step 4.
      6. Drop tokens of two characters or fewer.

    Args:
        text: The raw input string.
        custom_stopwords: Optional iterable of extra words to discard in
            step 2. Compared case-insensitively. Defaults to no extra words.

    Returns:
        The surviving tokens, in original order, joined by single spaces
        (an empty string if nothing survives).
    """
    # Convert to lowercase and remove special characters, numbers, and punctuation
    text = re.sub(r'[^a-z\s]', '', text.lower())

    lemmatizer = WordNetLemmatizer()

    # Build the exclusion set once, not once per token.
    excluded = set(stopwords.words('english'))
    if custom_stopwords:
        excluded.update(word.lower() for word in custom_stopwords)

    # Tokens are already lowercase because the text was lowercased above.
    tokens = [lemmatizer.lemmatize(token, pos='v')
              for token in word_tokenize(text) if token not in excluded]

    # Remove specific words and replace specified words
    words_to_remove = {"ill", "easily", "well", "lets", "usually", "basically", "basic", "okay", "just", "really", "simply",
                       "literally", "quite", "actually", "definitely", "totally", "seriously", "probably", "absolutely", "hopefully", "clearly"}
    tokens = [token for token in tokens if token not in words_to_remove]

    replaced_words = {"leverage": "use", "aai": "ai"}
    tokens = [replaced_words.get(token, token) for token in tokens]

    # Part-of-speech tagging and filter out non-meaningful words.
    # NOTE(review): the match is exact, so tokens tagged with inflected
    # variants (e.g. NNS, VBD, VBG) are discarded -- confirm this is intended.
    kept_tags = ('NN', 'VB', 'JJ', 'RB')
    meaningful_tokens = [word for word, pos in pos_tag(tokens) if pos in kept_tags]

    # Word frequency analysis
    word_frequencies = Counter(meaningful_tokens)

    # Remove very high and very low-frequency words: a word must appear more
    # than once and fewer times than half of the current token count.
    upper_bound = len(meaningful_tokens) / 2
    meaningful_tokens = [word for word in meaningful_tokens
                         if 1 < word_frequencies[word] < upper_bound]

    # Remove very short words
    meaningful_tokens = [word for word in meaningful_tokens if len(word) > 2]

    # Join the meaningful tokens back into a string
    return ' '.join(meaningful_tokens)


def process_input_file(input_filename, output_filename):
    """Preprocess the contents of one text file and save the result to another.

    The input is read as UTF-8, passed through ``preprocess_text`` and written
    to ``output_filename`` as UTF-8. A confirmation line is printed on success.
    Any exception raised along the way is caught and printed as
    ``Error: <message>``; nothing is returned in either case.
    """
    try:
        # Load the raw text.
        with open(input_filename, 'r', encoding='utf-8') as source:
            raw_text = source.read()

        # Clean it before the output file is opened, so a preprocessing failure
        # does not truncate an existing output file.
        cleaned_text = preprocess_text(raw_text)

        # Persist the cleaned text.
        with open(output_filename, 'w', encoding='utf-8') as sink:
            sink.write(cleaned_text)

        print(f"Processing completed. Output written to {output_filename}")
    except Exception as error:
        print(f"Error: {error}")


# Example usage: clean input.txt and store the result in output.txt.
input_file, output_file = "input.txt", "output.txt"
process_input_file(input_filename=input_file, output_filename=output_file)

Processing completed. Output written to output.txt


# Process YouTube Script

In [8]:
def process_youtube_url(video_url, output_filename):
    """Fetch a video's transcript, preprocess it and save it as UTF-8 text.

    The transcript comes from ``get_youtube_script``. When that yields nothing
    (``None`` or an empty string) a failure message is printed and no file is
    written. Otherwise the transcript is run through ``preprocess_text``,
    written to ``output_filename`` and a confirmation line is printed.
    """
    transcript = get_youtube_script(video_url)

    # Guard clause: nothing to process, so leave the output file untouched.
    if not transcript:
        print("Failed to retrieve script.")
        return

    cleaned_text = preprocess_text(transcript)

    # Write processed text to output file
    with open(output_filename, 'w', encoding='utf-8') as sink:
        sink.write(cleaned_text)

    print(f"Processing completed. Output written to {output_filename}")


# Example usage: download, clean and save the transcript of one video.
youtube_url, output_file = "https://www.youtube.com/watch?v=vGP4pQdCocw", "youtube_processed.txt"
process_youtube_url(video_url=youtube_url, output_filename=output_file)

Processing completed. Output written to youtube_processed.txt


In [5]:
from langchain.document_loaders.text import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Load the transcript file as LangChain documents. The encoding is passed
# explicitly so the read does not depend on the platform's default encoding
# and matches the UTF-8 used for the write at the end of this cell.
text_loader = TextLoader("./data/youtube_input.txt", encoding="utf-8")
text = text_loader.load()

# Split into chunks of up to 12500 units with 2000 units of overlap between
# neighbouring chunks (the splitter measures length in characters by default).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=12500, chunk_overlap=2000)
docs = text_splitter.split_documents(text)

# Keep only the chunk text and separate the chunks with a blank line.
output = [doc.page_content for doc in docs]
output_splitted = "\n\n".join(output)

with open("./data/youtube_output.txt", 'w', encoding='utf-8') as file:
    file.write(output_splitted)