# The Huberman Lab Chatbot

Created with a selection of episodes from the Huberman Lab podcast taken from YouTube. We pull the transcripts from them and let OpenAI create a structured report in markdown.

I tried Ollama too, but instead of generating comprehensive reports, it created brief summaries. Too much knowledge loss.

I also wondered whether it would be better to use the full transcripts instead of structured versions of them, but chose the structured approach after reading about the pros and cons of both options.

In [None]:
# imports

import os
import glob
from dotenv import load_dotenv
import gradio as gr
from openai import OpenAI
import ollama
from googleapiclient.discovery import build
from youtube_transcript_api import YouTubeTranscriptApi, NoTranscriptFound, VideoUnavailable
import re

In [None]:
# imports for langchain, plotly and Chroma

from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import numpy as np
import plotly.graph_objects as go
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.embeddings import HuggingFaceEmbeddings

In [None]:
# price is a factor for our company, so we're going to use a low cost model

OPENAI_MODEL = "gpt-4o-mini"
db_name = "vector_db"
OLLAMA_API = "http://localhost:11434/api/chat"
OLLAMA_MODEL = "llama3.2"

# Load .env before constructing the client: OpenAI() looks up OPENAI_API_KEY
# when it is created, so a key that lives only in .env must already be in the
# environment at this point.
load_dotenv(override=True)
openai = OpenAI()

In [None]:
# Pull settings from the .env file, then make sure both API keys exist in the
# environment, falling back to a placeholder when a key is not defined

load_dotenv(override=True)
os.environ.setdefault('OPENAI_API_KEY', 'your-key-if-not-using-env')
os.environ.setdefault('YOUTUBE_API_KEY', 'your-key-if-not-using-env')

## You need to pip install some stuff

The google api python client and youtube transcript api.

You also need a YouTube API key.

In [None]:
#!pip install google-api-python-client youtube-transcript-api


In [None]:
#!pip install --upgrade google-api-python-client pytube

In [None]:

# Load the API key from an environment variable
YOUTUBE_API_KEY = os.getenv('YOUTUBE_API_KEY')

# The environment setup stores the placeholder 'your-key-if-not-using-env' when
# no key is configured, so a plain `is None` check would never trigger. Treat
# the placeholder and empty values as "not set" as well.
if not YOUTUBE_API_KEY or YOUTUBE_API_KEY == 'your-key-if-not-using-env':
    print("Error: YOUTUBE_API_KEY environment variable not set.")
    print("Please set the environment variable or ensure it's correctly loaded.")
    # Abort here: the YouTube Data API calls below cannot work without a real key
    raise SystemExit

# Matches the 11-character video ID that follows one of the known URL markers:
# - https://www.youtube.com/watch?v=VIDEO_ID
# - https://youtu.be/VIDEO_ID (one or more slashes, e.g. youtu.be//VIDEO_ID)
# - https://www.youtube.com/embed/VIDEO_ID
# - https://www.youtube.com/shorts/VIDEO_ID
# - https://www.youtube.com/live/VIDEO_ID
# - https://www.youtube.com/VIDEO_ID
_VIDEO_ID_PATTERN = re.compile(
    r"(?:v=|youtu\.be\/+|embed\/|shorts\/|live\/|youtube\.com\/)([a-zA-Z0-9_-]{11})"
)


def get_video_id(youtube_url):
    """
    Extracts the 11-character video ID from various YouTube URL formats.

    Handles watch (v=), shortened (youtu.be), embed, shorts and live URLs.
    Extra query parameters such as "&t=16s" are ignored.

    Args:
        youtube_url (str): The YouTube video URL.

    Returns:
        str | None: The video ID, or None when the input is not a string or
        no ID can be found in it.
    """
    # Guard against None / non-string input instead of letting re raise TypeError
    if not isinstance(youtube_url, str):
        return None

    video_id_match = _VIDEO_ID_PATTERN.search(youtube_url)
    if video_id_match:
        return video_id_match.group(1)
    return None

def get_youtube_info_with_api(youtube_url, api_key):
    """
    Looks up a video's title (YouTube Data API) and its English transcript
    (youtube-transcript-api).

    The two lookups are independent: a failure in one is printed and leaves
    that part of the result as None without preventing the other.

    Args:
        youtube_url (str): The full YouTube video URL.
        api_key (str): Your YouTube Data API v3 key.

    Returns:
        tuple: (video_title, transcript_text). Either element is None when it
        could not be retrieved; (None, None) when no video ID could be
        extracted from the URL.
    """
    video_id = get_video_id(youtube_url)
    if not video_id:
        print("Error: Could not extract video ID from the URL. Please ensure it's a valid YouTube video URL format.")
        return None, None

    # --- Title lookup through the YouTube Data API ---
    title = None
    try:
        client = build("youtube", "v3", developerKey=api_key)
        response = client.videos().list(part="snippet", id=video_id).execute()
        items = response.get('items') if response else None

        if items:
            title = items[0]['snippet']['title']
        else:
            print(f"Could not find video details for ID: {video_id} using YouTube Data API. It might be private, deleted, or geo-restricted.")
    except Exception as err:
        print(f"Error fetching video title with YouTube Data API: {err}")

    # --- Transcript lookup through youtube-transcript-api ---
    # Only English ('en', 'en-US') transcripts are requested. Pass other
    # language codes (e.g. ['es']) to fetch a different language.
    text = None
    try:
        segments = YouTubeTranscriptApi.get_transcript(video_id, languages=['en', 'en-US'])
        text = " ".join(segment['text'] for segment in segments)
    except NoTranscriptFound:
        print(f"No transcript found for video ID: {video_id} in the specified languages (en, en-US).")
        print("This video might not have an auto-generated or uploaded transcript, or it's in a different language.")
    except VideoUnavailable:
        print(f"Video with ID: {video_id} is unavailable for transcript retrieval (e.g., private, deleted, or geo-restricted).")
    except Exception as err:
        print(f"An unexpected error occurred while fetching transcript: {err}")

    return title, text

# Quick manual check: fetch one video and print what came back


youtube_url = "https://www.youtube.com/watch?v=Z3OpxT65fKw"

title, transcript = get_youtube_info_with_api(youtube_url, YOUTUBE_API_KEY)

if title:
    print(f"Video Title: {title}\n")

if transcript:
    print("--- Transcript ---")
    # Preview only: show at most the first 500 characters
    print(transcript if len(transcript) <= 500 else transcript[:500] + "...")
elif title:
    # Title was found but no transcript came back
    print("Transcript not available for this video (as indicated above).")
else:
    print("Failed to retrieve any video information. Please double-check the URL, your API key, and internet connection.")

## Creating the structured markdown versions of the transcripts

In [None]:
# System prompt for the transcript-to-markdown conversion.
# Each sentence ends with a space so the concatenated prompt reads as separate sentences.
system_message = (
    "You're an assistant that converts raw transcripts to markdown files. "
    "You will rewrite the entire transcript and add structure using headings, paragraphs, lists etc. "
    "You will make sure not to leave out any important information. "
    "This is not a summary, but a comprehensive report of the conversation."
)

In [None]:
# Prompt prefix for the user message; the video title and transcript are appended to it
user_message = (
    "Convert this transcript to structured markdown. "
    "Use the video title as h1 in resulting markdown: \n"
)

In [None]:
def create_markdown(youtube_url):
    """
    Creates a structured markdown report for a single YouTube video.

    Fetches the video title and transcript, then asks the OpenAI chat model
    (OPENAI_MODEL) to rewrite the transcript as markdown.

    Args:
        youtube_url (str): The YouTube video URL.

    Returns:
        str: The markdown text returned by the model.

    Raises:
        ValueError: If no transcript could be retrieved for the URL.
    """
    video_title, transcript_text = get_youtube_info_with_api(youtube_url, YOUTUBE_API_KEY)

    # Without a transcript there is nothing to convert; fail with a clear message
    # instead of a TypeError from concatenating None.
    if not transcript_text:
        raise ValueError(f"No transcript available for {youtube_url!r}; cannot create markdown.")
    # The title lookup can fail independently of the transcript lookup
    if not video_title:
        video_title = "Untitled video"

    # Label and separate title and transcript so the title is not fused with
    # the first words of the transcript.
    user_content = f"{user_message}Title: {video_title}\n\nTranscript:\n{transcript_text}"

    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_content},
    ]
    completion = openai.chat.completions.create(
        model=OPENAI_MODEL,
        messages=messages,
    )
    return completion.choices[0].message.content


## Creating the markdown files from a list of Youtube urls

The list of URLs includes:
- the guest series about exercise with Dr. Andy Galpin
- the guest series about sleep with Dr. Matthew Walker
- two episodes with Dr. Layne Norton about exercise and nutrition
- the episode with Dr. Gabrielle Lyon about exercise and nutrition
- the episode about the vagus nerve
- the episode about how to grow by doing hard things with Michael Easter
- the episode about intermittent fasting with Dr. Satchin Panda
- the episode about building strength, endurance with Pavel Tsatsouline

In [None]:
with open('youtube-video-list.txt', 'r', encoding='utf-8') as youtube_list:
    raw_lines = youtube_list.readlines()

# Remove newline characters and surrounding whitespace, and drop blank lines
# so an empty line is never treated as a URL
clean_list = [line.strip() for line in raw_lines if line.strip()]

# Make sure the output folder exists before writing into it
os.makedirs("Huberman", exist_ok=True)

for youtube_url in clean_list:
    # One failing video (e.g. no transcript) should not abort the whole batch
    try:
        transcript_in_markdown = create_markdown(youtube_url)
    except Exception as e:
        print(f"Skipping {youtube_url}: {e}")
        continue

    if not transcript_in_markdown:
        print(f"Skipping {youtube_url}: the model returned no content.")
        continue

    # Build the filename from the first line (expected to be the "# <title>" heading):
    # drop the heading marker, replace characters that are unsafe in filenames,
    # and keep at most 23 characters, as before.
    heading = transcript_in_markdown.strip().split("\n", 1)[0].lstrip("#").strip()
    file_stem = re.sub(r'[\\/:*?"<>|]+', " ", heading)[:23].strip(" .") or "untitled"

    with open(f"Huberman/{file_stem}.md", 'w', encoding='utf-8') as f:
        f.write(transcript_in_markdown)

## Adding a structured report from a single Youtube url to expand the RAG database

I've added categorized folders as in the example. Not sure if this helps if you're not creating visualisations. You need to add newly created .md files to an appropriate folder or create a new folder.

In [None]:
youtube_url = 'https://www.youtube.com/watch?v=zqANjUGarAw&t=16s'

transcript_in_markdown = create_markdown(youtube_url)

# Make sure the output folder exists before writing into it
os.makedirs("Huberman", exist_ok=True)

# Build the filename from the first line (expected to be the "# <title>" heading):
# drop the heading marker, replace characters that are unsafe in filenames,
# and keep at most 23 characters, as before.
heading = transcript_in_markdown.strip().split("\n", 1)[0].lstrip("#").strip()
file_stem = re.sub(r'[\\/:*?"<>|]+', " ", heading)[:23].strip(" .") or "untitled"

with open(f"Huberman/{file_stem}.md", 'w', encoding='utf-8') as f:
    f.write(transcript_in_markdown)


In [None]:
# Read in documents using LangChain's loaders
# Take everything in all the sub-folders of our Huberman folder
# (glob also returns plain files, e.g. freshly generated .md reports that have
# not been moved into a category folder yet, so keep directories only)

folders = [path for path in glob.glob("Huberman/*") if os.path.isdir(path)]

def add_metadata(doc, doc_type):
    """Record doc_type under the "doc_type" key of doc.metadata and return the same doc."""
    doc.metadata.update({"doc_type": doc_type})
    return doc

# Read the markdown files as UTF-8
# (fix contributed by CG and Jon R, students on the course, needed for some users)
text_loader_kwargs = dict(encoding='utf-8')
# Some Windows users might need this line instead if the one above doesn't work:
# text_loader_kwargs={'autodetect_encoding': True}

text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

# Load every folder's markdown files and tag each document with its folder name
documents = []
for folder in folders:
    doc_type = os.path.basename(folder)
    loader = DirectoryLoader(
        folder,
        glob="**/*.md",
        loader_cls=TextLoader,
        loader_kwargs=text_loader_kwargs,
    )
    folder_docs = loader.load()
    documents.extend(add_metadata(doc, doc_type) for doc in folder_docs)

# Split the loaded documents into overlapping chunks
chunks = text_splitter.split_documents(documents)

print(f"Total number of chunks: {len(chunks)}")
print(f"Document types found: {set(doc.metadata['doc_type'] for doc in documents)}")

In [None]:
# Embed every chunk and store the vectors in a Chroma vector store
# persisted under db_name

embeddings = OpenAIEmbeddings()

# To use the free HuggingFace sentence-transformers embeddings instead,
# replace the line above with:
# from langchain.embeddings import HuggingFaceEmbeddings
# embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Start clean: drop the collection left over from a previous run, if any

if os.path.exists(db_name):
    stale_store = Chroma(persist_directory=db_name, embedding_function=embeddings)
    stale_store.delete_collection()

# Build the vector store from the chunks

vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory=db_name,
)
print(f"Vectorstore created with {vectorstore._collection.count()} documents")

In [None]:
# Inspect the store: how many vectors it holds and how long each one is

collection = vectorstore._collection
count = collection.count()

# Fetch a single record with its embedding to measure the vector length
first_record = collection.get(limit=1, include=["embeddings"])
sample_embedding = first_record["embeddings"][0]
dimensions = len(sample_embedding)
print(f"There are {count:,} vectors with {dimensions:,} dimensions in the vector store")

In [None]:
# create a new Chat with OpenAI
# (the model constant defined in this notebook is OPENAI_MODEL; there is no MODEL variable)
llm = ChatOpenAI(temperature=0.7, model_name=OPENAI_MODEL)

# Alternative - if you'd like to use Ollama locally, uncomment this line instead
# llm = ChatOpenAI(temperature=0.7, model_name='llama3.2', base_url='http://localhost:11434/v1', api_key='ollama')

# set up the conversation memory for the chat
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

# the retriever is an abstraction over the VectorStore that will be used during RAG
retriever = vectorstore.as_retriever()

# putting it together: set up the conversation chain with the LLM, the vector store and memory
conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)

In [None]:
# Wrapping that in a function

def chat(question, history):
    """Send the question through conversation_chain and return its "answer" entry.

    The history argument is accepted but not used by this function.
    """
    return conversation_chain.invoke({"question": question})["answer"]

In [None]:
# Build the chat UI around chat() and launch it, opening a browser tab
chat_interface = gr.ChatInterface(chat, type="messages")
view = chat_interface.launch(inbrowser=True)