# Data Collection

## Install Dependencies and import





In [None]:
!pip install pytube google-api-python-client pandas slugify tqdm requests huggingface_hub sentence-transformers faiss-cpu unstructured -q

In [None]:
from googleapiclient.discovery import build
import os
import re
import unicodedata
import pandas as pd
from pytube import YouTube

## Setup Directories

In [None]:
# Working folders (relative to the current directory) for audio and transcripts.
audio_dir = "audio_files"
transcript_dir = "transcripts"

# exist_ok=True makes this cell safe to run more than once.
for _folder in (audio_dir, transcript_dir):
    os.makedirs(_folder, exist_ok=True)

## Retrieving YT Channel videos

In [None]:
#YT API Client
# The key is taken from the YOUTUBE_API_KEY environment variable when it is set
# and requested interactively otherwise, so it never has to be saved in the notebook.
from getpass import getpass

api_key = os.environ.get("YOUTUBE_API_KEY") or getpass("Enter YouTube Data API key: ")
youtube = build('youtube', 'v3', developerKey=api_key)

In [None]:
# Channel whose videos are collected; swap in the ID of the channel you want.
channel_id = "UCPjNBjflYl0-HQtUvOx0Ibw" #Greg Isenberg YT Channel

# Page through the search endpoint and accumulate every returned item.
channel_videos = []
next_page_token = None
more_pages = True

while more_pages:
    pl_response = youtube.search().list(
        type="video",
        part="snippet",
        channelId=channel_id,
        pageToken=next_page_token,
        maxResults=50,
    ).execute()

    channel_videos += pl_response["items"]

    # A missing or empty nextPageToken ends the loop.
    next_page_token = pl_response.get("nextPageToken")
    more_pages = bool(next_page_token)

## Extract Video IDs and Titles

In [None]:
# Pull the ID and the title out of every search result, keeping the original order.
video_ids = [item['id']['videoId'] for item in channel_videos]
video_titles = [item['snippet']['title'] for item in channel_videos]

# One row per video: its ID next to its title.
video_df = pd.DataFrame({'Video ID': video_ids, 'Video Title': video_titles})

# Show the resulting table.
print(video_df)

## Download Audio Files

In [None]:
from pytube import YouTube
import pandas as pd
import slugify
import os
from tqdm import tqdm
from pytube.exceptions import VideoUnavailable

In [None]:
def download_high_quality_audio(video_id, video_title, output_dir="/content/audio_files"):
    """Download the highest-bitrate audio-only MP4 stream of a YouTube video.

    The file is named after the slugified title and keeps the ``.wav``
    extension, because the transcription cells select files by that suffix.
    NOTE: the stream is saved as served (MP4 audio); it is not re-encoded to WAV.

    Args:
        video_id: YouTube video ID used to build the watch URL.
        video_title: Title of the video; its slug becomes the file name.
        output_dir: Directory the file is written to.

    Returns:
        Whatever the stream's ``download`` call returns (presumably the path
        of the saved file), or ``None`` when the video is unavailable or
        offers no matching audio stream.
    """
    try:
        yt = YouTube(f"https://www.youtube.com/watch?v={video_id}")
        audio_stream = (
            yt.streams
            .filter(only_audio=True, file_extension='mp4')
            .order_by('abr')
            .desc()
            .first()
        )
        # first() yields None when the filter matched nothing; skip instead of
        # failing with an AttributeError on None.download.
        if audio_stream is None:
            print(f"No audio-only MP4 stream found for video {video_id}; skipping.")
            return None

        # A title made only of symbols slugifies to "", which would produce a
        # file called ".wav"; fall back to the video ID in that case.
        file_stem = slugify.slugify(video_title) or video_id
        return audio_stream.download(output_path=output_dir, filename=f"{file_stem}.wav")
    except VideoUnavailable:
        print(f"The video with ID {video_id} is unavailable.")
        return None

In [None]:
# Make sure the download target exists before fetching anything.
os.makedirs("/content/audio_files", exist_ok=True)

# Walk the ID and title columns of video_df in lockstep and fetch each video's audio.
id_title_pairs = zip(video_df['Video ID'], video_df['Video Title'])
for video_id, video_title in tqdm(id_title_pairs, total=len(video_df), desc="Downloading Audio"):
    download_high_quality_audio(video_id, video_title)

In [None]:
#in-case you need to delete the directory and do it again. I was there. :)
# rmtree lives in shutil, which the import cells above do not bring in.
import shutil

# Define the path to the folder
folder_path = '/content/audio_files'

# Check if the folder exists
if os.path.exists(folder_path):
    # Delete the folder and its contents
    shutil.rmtree(folder_path)
    print("Folder 'audio_files' deleted successfully.")
else:
    print("Folder 'audio_files' does not exist.")

## Transcribing Audio Files
Get a free or paid AssemblyAI API key; the cells below use it to call the AssemblyAI API.

In [None]:
!pip install assemblyai

In [None]:
#Transcription with confidence scores, speaker labels etc.
import requests
import time
import json
import os

# Your AssemblyAI API key
# Taken from the ASSEMBLYAI_API_KEY environment variable when it is set and
# requested interactively otherwise, so it never has to be saved in the notebook.
from getpass import getpass

api_key = os.environ.get("ASSEMBLYAI_API_KEY") or getpass("Enter AssemblyAI API key: ")

def transcribe_audio(file_path, poll_interval=10):
    """Upload one audio file to AssemblyAI and wait for its transcript.

    Uses the module-level ``api_key`` for authorization. The transcript is
    requested with ``iab_categories`` and ``speaker_labels`` enabled.

    Args:
        file_path: Path of the local audio file to upload.
        poll_interval: Seconds to sleep between two status checks.

    Returns:
        The decoded JSON body of the transcript once its status is
        ``'completed'``.

    Raises:
        RuntimeError: If the upload, the transcript request or a status check
            returns a body without the expected key (for example an error
            payload caused by a bad API key).
        Exception: If the transcript status is ``'error'``.
    """
    headers = {'authorization': api_key}

    # Upload the audio file; the handle must stay open while requests streams it.
    with open(file_path, 'rb') as f:
        upload_response = requests.post(
            'https://api.assemblyai.com/v2/upload',
            headers=headers,
            data=f
        ).json()
    if 'upload_url' not in upload_response:
        raise RuntimeError(f"Upload failed for {file_path}: {upload_response}")

    # Ask for a transcript of the uploaded audio.
    transcript_response = requests.post(
        'https://api.assemblyai.com/v2/transcript',
        headers=headers,
        json={
            'audio_url': upload_response['upload_url'],
            'iab_categories': True,
            'speaker_labels': True
        }
    ).json()
    if 'id' not in transcript_response:
        raise RuntimeError(f"Transcription request failed for {file_path}: {transcript_response}")

    # Poll until the transcript reaches a final state.
    status_url = f"https://api.assemblyai.com/v2/transcript/{transcript_response['id']}"
    while True:
        result_response = requests.get(status_url, headers=headers).json()
        status = result_response.get('status')
        if status == 'completed':
            return result_response
        if status == 'error':
            raise Exception(f"Transcription failed: {result_response}")
        if status is None:
            # A body without 'status' is not a transcript object; stop instead
            # of looping on it forever.
            raise RuntimeError(f"Unexpected status response for {file_path}: {result_response}")
        time.sleep(poll_interval)

# Transcribe one specific download.
file_path = '/content/audio_files/elons-perfect-reaction-to-charlie-mungers-takedown.wav'
result = transcribe_audio(file_path)

# Persist the full JSON response next to the other transcripts; the output
# name is the audio file name with ".json" appended.
os.makedirs("/content/transcripts", exist_ok=True)
json_target = f"/content/transcripts/{os.path.basename(file_path)}.json"
with open(json_target, 'w') as f:
    f.write(json.dumps(result))

print("Transcription saved successfully!")

## Transcribing a Single Audio File (Text Only)

In [None]:
#transcription with texts only for individual file
import requests
import time
import json
import os

# Your AssemblyAI API key
api_key = 'ASSEMBLY API KEY'

def transcribe_audio(file_path):
    # Upload the audio file
    with open(file_path, 'rb') as f:
        upload_response = requests.post(
            'https://api.assemblyai.com/v2/upload',
            headers={'authorization': api_key},
            data=f
        ).json()

    # Transcribe the audio file
    transcript_response = requests.post(
        'https://api.assemblyai.com/v2/transcript',
        headers={'authorization': api_key},
        json={
            'audio_url': upload_response['upload_url'],
            'iab_categories': True,
            'speaker_labels': True
        }
    ).json()

    # Poll for the transcript to be ready
    while True:
        result_response = requests.get(
            f"https://api.assemblyai.com/v2/transcript/{transcript_response['id

## Transcription with Speaker Labels

In [None]:
import requests
import time
import json
import os
import concurrent.futures

# Your AssemblyAI API key
# SECURITY: never paste a real key into the notebook. Any key that was ever
# saved in this cell should be treated as leaked and rotated.
# The key is taken from the ASSEMBLYAI_API_KEY environment variable when it is
# set and requested interactively otherwise.
from getpass import getpass

api_key = os.environ.get("ASSEMBLYAI_API_KEY") or getpass("Enter AssemblyAI API key: ")

def transcribe_audio(file_path, poll_interval=10):
    """Upload one audio file to AssemblyAI and return its transcript text.

    Uses the module-level ``api_key`` for authorization. Failures are reported
    with ``print`` and signalled through a ``None`` transcript rather than an
    exception, so one bad file does not abort a batch.

    Args:
        file_path: Path of the local audio file to upload.
        poll_interval: Seconds to sleep between two status checks.

    Returns:
        A ``(file_path, text)`` tuple. ``text`` is the ``'text'`` field of the
        completed transcript, or ``None`` when the upload, the transcript
        request or the transcription itself failed.
    """
    headers = {'authorization': api_key}

    # Upload the audio file; the handle must stay open while requests streams it.
    with open(file_path, 'rb') as f:
        upload_response = requests.post(
            'https://api.assemblyai.com/v2/upload',
            headers=headers,
            data=f
        ).json()
    # An error payload (e.g. bad API key) has no 'upload_url'; report it the
    # same way as a failed transcription instead of raising KeyError.
    if 'upload_url' not in upload_response:
        print(f"Upload failed for {file_path}: {upload_response}")
        return file_path, None

    # Ask for a transcript of the uploaded audio.
    transcript_response = requests.post(
        'https://api.assemblyai.com/v2/transcript',
        headers=headers,
        json={
            'audio_url': upload_response['upload_url'],
            'iab_categories': True,
            'speaker_labels': True
        }
    ).json()
    if 'id' not in transcript_response:
        print(f"Transcription request failed for {file_path}: {transcript_response}")
        return file_path, None

    # Poll until the transcript reaches a final state.
    status_url = f"https://api.assemblyai.com/v2/transcript/{transcript_response['id']}"
    while True:
        result_response = requests.get(status_url, headers=headers).json()
        status = result_response.get('status')
        if status == 'completed':
            return file_path, result_response['text']
        if status == 'error' or status is None:
            # A body without 'status' is not a transcript object; treat it as
            # a failure rather than polling on it forever.
            print(f"Transcription failed for {file_path}: {result_response}")
            return file_path, None
        time.sleep(poll_interval)

# Directory with the audio files
audio_dir = '/content/audio_files'

# Directory to save the transcriptions
transcript_dir = '/content/transcripts'
os.makedirs(transcript_dir, exist_ok=True)

# Create a list of all audio file paths
audio_files = [os.path.join(audio_dir, filename) for filename in os.listdir(audio_dir) if filename.endswith('.wav')]

saved_count = 0
failed_files = []

# Transcribe up to five files at a time.
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    # Map every future back to its input so a job that raises can still be reported by name.
    futures = {executor.submit(transcribe_audio, file_path): file_path for file_path in audio_files}

    for future in concurrent.futures.as_completed(futures):
        source_path = futures[future]
        try:
            file_path, transcription = future.result()
        except Exception as exc:
            # One failing file must not throw away the results of the others.
            print(f"Transcription crashed for {source_path}: {exc}")
            failed_files.append(source_path)
            continue

        # Only save the transcription if it was successful
        if transcription is None:
            print(f"No transcription saved for {file_path}")
            failed_files.append(file_path)
            continue

        # Save the transcription as <audio file name without extension>.txt
        transcript_file_path = os.path.join(transcript_dir, f"{os.path.splitext(os.path.basename(file_path))[0]}.txt")
        # Explicit UTF-8 so non-ASCII transcript text is written the same way on every platform.
        with open(transcript_file_path, 'w', encoding='utf-8') as f:
            f.write(transcription)
        saved_count += 1
        print(f"Transcription completed for {file_path}")

print(f"Saved {saved_count} of {len(audio_files)} transcriptions.")
if failed_files:
    print(f"Files without a transcription: {failed_files}")


# Data PreProcessing

##  Install Dependencies

In [None]:
%%bash
# install dependencies
# (the %%bash cell magic has to be the very first line of the cell)
pip install haystack-ai
pip install "sentence-transformers>=2.2.0" "huggingface_hub>=0.22.0" transformers
pip install markdown-it-py mdit_plain pypdf
pip install gdown
pip install --upgrade --quiet langchain-googledrive unstructured -q

Collecting haystack-ai
  Downloading haystack_ai-2.0.1-py3-none-any.whl (266 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 266.7/266.7 kB 4.3 MB/s eta 0:00:00
Collecting boilerpy3 (from haystack-ai)
  Downloading boilerpy3-1.0.7-py3-none-any.whl (22 kB)
Collecting haystack-bm25 (from haystack-ai)
  Downloading haystack_bm25-1.0.2-py2.py3-none-any.whl (8.8 kB)
Collecting lazy-imports (from haystack-ai)
  Downloading lazy_imports-0.3.1-py3-none-any.whl (12 kB)
Collecting openai>=1.1.0 (from haystack-ai)
  Downloading openai-1.20.0-py3-none-any.whl (292 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 292.8/292.8 kB 8.8 MB/s eta 0:00:00
Collecting posthog (from haystack-ai)
  Downloading posthog-3.5.0-py2.py3-none-any.whl (41 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 41.3/41.3 kB 2.6 MB/s eta 0:00:00
Collecting httpx<1,>=0.23.0 (from openai>=1.1.0->haystack-ai)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 75.6/75.6 kB 7.9 M

## Download Data

Here we download the necessary files from a Google Drive folder, but you can change the link to any folder you like. The current folder contains the YouTube transcripts we transcribed earlier.

In [None]:
#Download all files
import gdown

# Shared Google Drive folder to fetch, and the local folder to put its files in.
url = "https://drive.google.com/drive/folders/1M8qvR0hTYH-qP_U43EbVhj5aFE_u8djW?usp=sharing"
output_dir = "transcripts"

# Kept as the last expression so the returned value is shown as the cell output.
download_options = {"quiet": True, "output": output_dir, "remaining_ok": True}
gdown.download_folder(url, **download_options)

['transcripts/1-hack-to-distribute-your-social-media-content-with-chris-josephs-autopilot.txt',
 'transcripts/1-way-to-instantly-boost-productivity.txt',
 'transcripts/3-non-obvious-networking-strategies-that-work.txt',
 'transcripts/3-things-they-dont-tell-you-about-solopreneurship.txt',
 'transcripts/4-steps-to-become-a-multipreneur.txt',
 'transcripts/5-signs-multipreneurship-isnt-for-you.txt',
 'transcripts/7-tools-i-used-to-build-a-million-dollar-business.txt',
 'transcripts/30m-by-age-19-and-where-you-should-build-today.txt',
 'transcripts/50-realizations-that-changed-my-life.txt',
 'transcripts/ai-will-make-us-relearn-everything-we-do-explains-dave-rogenmoser-jasper-co-founder.txt',
 'transcripts/alexis-ohanian-explains-the-importance-of-minimum-viable-community.txt',
 'transcripts/alexis-ohanians-5-year-predictions-where-it-happens-podcast.txt',
 'transcripts/all-teams-need-a-nerd-in-residence-explains-theo-tabah-late-checkout.txt',
 'transcripts/amazons-unfair-advantage-with-a

## Index Documents

Here we set up the components for indexing documents (PDF, TXT, and MD files) and storing them in an in-memory document store.

In [None]:
from haystack.components.writers import DocumentWriter
from haystack.components.converters import MarkdownToDocument, PyPDFToDocument, TextFileToDocument
from haystack.components.preprocessors import DocumentSplitter, DocumentCleaner
from haystack.components.routers import FileTypeRouter
from haystack.components.joiners import DocumentJoiner
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack import Pipeline
from haystack.document_stores.in_memory import InMemoryDocumentStore

# In-memory store that the writer at the end of this cell is bound to.
document_store = InMemoryDocumentStore()

# Routing: sources are dispatched by MIME type.
SUPPORTED_MIME_TYPES = ["text/plain", "application/pdf", "text/markdown"]
file_type_router = FileTypeRouter(mime_types=SUPPORTED_MIME_TYPES)

# One converter per supported MIME type, plus a joiner for their outputs.
pdf_converter = PyPDFToDocument()
markdown_converter = MarkdownToDocument()
text_file_converter = TextFileToDocument()
document_joiner = DocumentJoiner()

# Cleaning, then word-based chunking; the overlap keeps context across chunk borders.
document_cleaner = DocumentCleaner()
CHUNK_WORDS = 150
CHUNK_OVERLAP_WORDS = 50
document_splitter = DocumentSplitter(
    split_by="word",
    split_length=CHUNK_WORDS,
    split_overlap=CHUNK_OVERLAP_WORDS,
)

# Embedding of the chunks and writing them to the store.
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
document_embedder = SentenceTransformersDocumentEmbedder(model=EMBEDDING_MODEL)
document_writer = DocumentWriter(document_store)

# RAG Pipeline

## Index Pipeline
Here we build the indexing pipeline by connecting the components, then run it over the downloaded files.

In [None]:
from pathlib import Path

#indexing pipeline
preprocessing_pipeline = Pipeline()

# Register each component under the name the connections below refer to.
pipeline_components = {
    "file_type_router": file_type_router,
    "text_file_converter": text_file_converter,
    "markdown_converter": markdown_converter,
    "pypdf_converter": pdf_converter,
    "document_joiner": document_joiner,
    "document_cleaner": document_cleaner,
    "document_splitter": document_splitter,
    "document_embedder": document_embedder,
    "document_writer": document_writer,
}
for component_name, component in pipeline_components.items():
    preprocessing_pipeline.add_component(instance=component, name=component_name)

#connecting the pipeline: router -> converters -> joiner -> cleaner -> splitter -> embedder -> writer
pipeline_connections = [
    ("file_type_router.text/plain", "text_file_converter.sources"),
    ("file_type_router.application/pdf", "pypdf_converter.sources"),
    ("file_type_router.text/markdown", "markdown_converter.sources"),
    ("text_file_converter", "document_joiner"),
    ("pypdf_converter", "document_joiner"),
    ("markdown_converter", "document_joiner"),
    ("document_joiner", "document_cleaner"),
    ("document_cleaner", "document_splitter"),
    ("document_splitter", "document_embedder"),
    ("document_embedder", "document_writer"),
]
for sender, receiver in pipeline_connections:
    preprocessing_pipeline.connect(sender, receiver)

# Feed every path found under output_dir (recursively) to the router.
source_paths = list(Path(output_dir).glob("**/*"))
preprocessing_pipeline.run({"file_type_router": {"sources": source_paths}})

## Question Answering Pipeline

Here we use the Mistral-7B-Instruct-v0.1 model, but you can replace it with any model you like.

In [None]:
import os
from getpass import getpass

from haystack.components.builders import PromptBuilder
from haystack.components.embedders import SentenceTransformersTextEmbedder
from haystack.components.generators import HuggingFaceTGIGenerator
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever

# Prompt for the Hugging Face token only when the environment does not provide one.
if os.environ.get("HF_API_TOKEN") is None:
    os.environ["HF_API_TOKEN"] = getpass("Enter Hugging Face token:")

# Jinja template: the retrieved documents are listed as context above the question.
template = """
Answer the questions based on the given context.

Context:
{% for document in documents %}
    {{ document.content }}
{% endfor %}

Question: {{ question }}
Answer:
"""

# Query pipeline: embed the question, retrieve from document_store, build the prompt, generate.
pipe = Pipeline()
qa_components = [
    ("embedder", SentenceTransformersTextEmbedder(model="sentence-transformers/all-MiniLM-L6-v2")),
    ("retriever", InMemoryEmbeddingRetriever(document_store=document_store)),
    ("prompt_builder", PromptBuilder(template=template)),
    ("llm", HuggingFaceTGIGenerator("mistralai/Mistral-7B-Instruct-v0.1")),
]
for component_name, component in qa_components:
    pipe.add_component(component_name, component)

# Wire the stages together in execution order.
pipe.connect("embedder.embedding", "retriever.query_embedding")
pipe.connect("retriever", "prompt_builder.documents")
pipe.connect("prompt_builder", "llm")

## Chat with the Data

In [None]:
# The same question text goes to the embedder (for retrieval) and to the prompt builder.
question = "Give me 10 startup ideas"

# Inputs keyed by the component names registered on `pipe`.
pipeline_inputs = {
    "embedder": {"text": question},
    "prompt_builder": {"question": question},
    "llm": {"generation_kwargs": {"max_new_tokens": 350}},
}

# Last expression of the cell, so the pipeline result is displayed.
pipe.run(pipeline_inputs)