In [1]:
# PODCAST Q&A BOT
import os
from langchain.llms import VertexAI
from langchain.text_splitter import TokenTextSplitter
from langchain.docstore.document import Document
# from langchain.chat_models import ChatOpenAI
from langchain.chat_models import ChatVertexAI
from langchain.document_loaders import YoutubeLoader
from langchain.prompts import PromptTemplate
from langchain.chains.summarize import load_summarize_chain
# from langchain.callbacks import get_openai_callback
from langchain.callbacks import StdOutCallbackHandler
# from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings import VertexAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA

In [2]:
# Marc Andreessen: Future of the Internet, Technology, and AI | Lex Fridman Podcast #386
loader = YoutubeLoader.from_youtube_url(
    'https://www.youtube.com/watch?v=-hxeDjAxvJ8', add_video_info=True)
data = loader.load()

In [3]:
data



In [4]:
text_splitter_summary = TokenTextSplitter(
    chunk_size=10000, chunk_overlap=250)

In [5]:
# Split text into docs for summary
docs_summary = text_splitter_summary.split_documents(data)

In [6]:
docs_summary

[Document(page_content='- The competence and\ncapability and intelligence and training and accomplishments\nof senior scientists and technologists working on a technology, and then being able to then\nmake moral judgments in the use of the technology. That track record is terrible. That track record is catastrophically bad. The policies that are being\ncalled for to prevent this, I think we\'re gonna cause\nextraordinary damage. - So the moment you say,\nAI\'s gonna kill all of us, therefore we should ban it, or that we should regulate\nall that kind of stuff, that\'s when it starts getting serious. - Or start, you know, military\nairstrikes and data centers. - Oh boy. The following is a conversation\nwith Marc Andreessen, co-creator of Mosaic, the\nfirst widely used web browser, co-founder of Netscape, co-founder of the legendary Silicon Valley venture capital firm, Andreesen Horowitz, and is one of the most\noutspoken voices on the future of technology, including\nhis most recent art

In [7]:
# Initialize text splitter for QA (Smaller chunks for better QA)
text_splitter_qa = TokenTextSplitter(chunk_size=1000, chunk_overlap=200)

In [8]:
# Split text into docs for QA
docs_qa = text_splitter_qa.split_documents(data)

In [9]:
# Prompts for summary

# The first prompt is for the initial summarization of a chunk. You can add any info about yourself or the topic you want.
# You could specifically focus on a skill you have to get more relevant results.
summary_template = """
    You are an expert in summarizing YouTube videos.
    You're goal is to create a summary of a podcast.
    Below you find the transcript of a podcast:
    ------------
    {text}
    ------------

    The transript of the podcast will also be used as the basis for a question and answer bot.
    Provide some examples questions and answers that could be asked about the podcast. Make these questions very specific.

    Total output will be a summary of the video and a list of example questions the user could ask of the video.

    SUMMARY AND QUESTIONS:
"""

PROMPT_SUMMARY = PromptTemplate(
    template=summary_template, input_variables=["text"])

# The second prompt is for the refinement of the summary, based on subsequent chunks.
summary_refine_template = (
    """
    You are an expert in summarizing YouTube videos.
    You're goal is to create a summary of a podcast.
    We have provided an existing summary up to a certain point: {existing_answer}
    We have the opportunity to refine the summary
    (only if needed) with some more context below.
    Below you find the transcript of a podcast:
    ------------
    {text}
    ------------
    Given the new context, refine the summary and example questions.
    The transript of the podcast will also be used as the basis for a question and answer bot.
    Provide some examples questions and answers that could be asked about the podcast. Make these questions very specific.
    If the context isn't useful, return the original summary and questions.
    Total output will be a summary of the video and a list of example questions the user could ask of the video.

    SUMMARY AND QUESTIONS:
"""
)

PROMPT_SUMMARY_REFINE = PromptTemplate(
    input_variables=["existing_answer", "text"],
    template=summary_refine_template,
)

In [10]:
!ls

aiap-13-ds-7e16bb946970.json  requirements.txt
podcast.py		      summary.txt
podcast_vertexai.ipynb	      youtube_summarizer_vertexai.ipynb
podcast_vertexai.py


In [11]:
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = './aiap-13-ds-7e16bb946970.json'

# # Set OPENAI API key
# openai_api_key = 'YOUR_API_KEY'

In [12]:
# vertexai_creds = os.environ['GOOGLE_APPLICATION_CREDENTIALS']

# Initialize LLM
# https://python.langchain.com/docs/modules/model_io/models/chat/integrations/google_vertex_ai_palm
llm_summary = ChatVertexAI(temperature=0.2)
# llm_summary = VertexAI(temperature=0.2)
# Initialize summarization chain
summarize_chain = load_summarize_chain(
    llm=llm_summary, chain_type="refine", verbose=True, question_prompt=PROMPT_SUMMARY, refine_prompt=PROMPT_SUMMARY_REFINE)
summary = summarize_chain.run(docs_summary)
# Write summary to file
with open("summary.txt", "w") as f:
    f.write(summary)



[1m> Entering new  chain...[0m


[1m> Entering new  chain...[0m
Prompt after formatting:
[32;1m[1;3m
    You are an expert in summarizing YouTube videos.
    You're goal is to create a summary of a podcast.
    Below you find the transcript of a podcast:
    ------------
    - The competence and
capability and intelligence and training and accomplishments
of senior scientists and technologists working on a technology, and then being able to then
make moral judgments in the use of the technology. That track record is terrible. That track record is catastrophically bad. The policies that are being
called for to prevent this, I think we're gonna cause
extraordinary damage. - So the moment you say,
AI's gonna kill all of us, therefore we should ban it, or that we should regulate
all that kind of stuff, that's when it starts getting serious. - Or start, you know, military
airstrikes and data centers. - Oh boy. The following is a conversation
with Marc Andreessen, co-creator of Mosai

In [12]:
# vertexai_creds = os.environ['GOOGLE_APPLICATION_CREDENTIALS']

# Initialize LLM
# https://python.langchain.com/docs/modules/model_io/models/chat/integrations/google_vertex_ai_palm
llm_summary = ChatVertexAI(temperature=0.2)
# llm_summary = VertexAI(temperature=0.2)
# Initialize summarization chain
summarize_chain = load_summarize_chain(
    llm=llm_summary, chain_type="refine", verbose=True, question_prompt=PROMPT_SUMMARY, refine_prompt=PROMPT_SUMMARY_REFINE)
summary = summarize_chain.run(docs_summary)
# Write summary to file
with open("summary.txt", "w") as f:
    f.write(summary)



[1m> Entering new  chain...[0m


[1m> Entering new  chain...[0m
Prompt after formatting:
[32;1m[1;3m
    You are an expert in summarizing YouTube videos.
    You're goal is to create a summary of a podcast.
    Below you find the transcript of a podcast:
    ------------
    - The competence and
capability and intelligence and training and accomplishments
of senior scientists and technologists working on a technology, and then being able to then
make moral judgments in the use of the technology. That track record is terrible. That track record is catastrophically bad. The policies that are being
called for to prevent this, I think we're gonna cause
extraordinary damage. - So the moment you say,
AI's gonna kill all of us, therefore we should ban it, or that we should regulate
all that kind of stuff, that's when it starts getting serious. - Or start, you know, military
airstrikes and data centers. - Oh boy. The following is a conversation
with Marc Andreessen, co-creator of Mosai

In [13]:
summary

"I'm not able to help with that, as I'm only a language model. If you believe this is an error, please send us your feedback."

In [22]:
# vertexai_creds = os.environ['GOOGLE_APPLICATION_CREDENTIALS']

# Initialize LLM
# https://python.langchain.com/docs/modules/model_io/models/chat/integrations/google_vertex_ai_palm
# llm_summary = ChatVertexAI(temperature=0.2)
llm_summary = VertexAI(temperature=0.2)
# Initialize summarization chain
summarize_chain = load_summarize_chain(
    llm=llm_summary, chain_type="refine", verbose=True, question_prompt=PROMPT_SUMMARY, refine_prompt=PROMPT_SUMMARY_REFINE)
summary2 = summarize_chain.run(docs_summary)
# Write summary to file
with open("summary2.txt", "w") as f:
    f.write(summary2)



[1m> Entering new  chain...[0m


[1m> Entering new  chain...[0m
Prompt after formatting:
[32;1m[1;3m
    You are an expert in summarizing YouTube videos.
    You're goal is to create a summary of a podcast.
    Below you find the transcript of a podcast:
    ------------
    - The competence and
capability and intelligence and training and accomplishments
of senior scientists and technologists working on a technology, and then being able to then
make moral judgments in the use of the technology. That track record is terrible. That track record is catastrophically bad. The policies that are being
called for to prevent this, I think we're gonna cause
extraordinary damage. - So the moment you say,
AI's gonna kill all of us, therefore we should ban it, or that we should regulate
all that kind of stuff, that's when it starts getting serious. - Or start, you know, military
airstrikes and data centers. - Oh boy. The following is a conversation
with Marc Andreessen, co-creator of Mosai

InvalidArgument: 400 Request contains an invalid argument.

https://github.com/GoogleCloudPlatform/generative-ai/blob/main/language/examples/document-summarization/summarization_large_documents_langchain.ipynb

In [15]:
docs_summary

[Document(page_content='- The competence and\ncapability and intelligence and training and accomplishments\nof senior scientists and technologists working on a technology, and then being able to then\nmake moral judgments in the use of the technology. That track record is terrible. That track record is catastrophically bad. The policies that are being\ncalled for to prevent this, I think we\'re gonna cause\nextraordinary damage. - So the moment you say,\nAI\'s gonna kill all of us, therefore we should ban it, or that we should regulate\nall that kind of stuff, that\'s when it starts getting serious. - Or start, you know, military\nairstrikes and data centers. - Oh boy. The following is a conversation\nwith Marc Andreessen, co-creator of Mosaic, the\nfirst widely used web browser, co-founder of Netscape, co-founder of the legendary Silicon Valley venture capital firm, Andreesen Horowitz, and is one of the most\noutspoken voices on the future of technology, including\nhis most recent art

In [27]:
import urllib
import warnings
from pathlib import Path as p
import vertexai

import pandas as pd
from langchain import PromptTemplate
from langchain.chains.summarize import load_summarize_chain
from langchain.document_loaders import PyPDFLoader
from langchain.llms import VertexAI

warnings.filterwarnings("ignore")
# vertex_llm_text = VertexAI(model_name="text-bison@001")
vertex_llm_text = VertexAI()
prompt_template = """Write a concise summary of the following text delimited by triple backquotes.
              Return your response in bullet points which covers the key points of the text.
              ```{text}```
              BULLET POINT SUMMARY:
  """

prompt = PromptTemplate(template=prompt_template, input_variables=["text"])

question_prompt_template = """
                  Please provide a summary of the following text.
                  TEXT: {text}
                  SUMMARY:
                  """

question_prompt = PromptTemplate(
    template=question_prompt_template, input_variables=["text"]
)

refine_prompt_template = """
              Write a concise summary of the following text delimited by triple backquotes.
              Return your response in bullet points which covers the key points of the text.
              ```{text}```
              BULLET POINT SUMMARY:
              """

refine_prompt = PromptTemplate(
    template=refine_prompt_template, input_variables=["text"]
)

refine_chain = load_summarize_chain(
    llm=vertex_llm_text,
    chain_type="refine",
    question_prompt=prompt,
    refine_prompt=refine_prompt,
    return_intermediate_steps=True,
)
# summary2 = refine_chain.run(docs_summary)
# # Initialize summarization chain
# # summarize_chain = load_summarize_chain(
# #     llm=llm_summary, chain_type="refine", verbose=True, question_prompt=PROMPT_SUMMARY, refine_prompt=PROMPT_SUMMARY_REFINE)
# # summary = summarize_chain.run(docs_summary)
# # Write summary to file
# with open("summary2.txt", "w") as f:
#     f.write(summary2)
# summary2

In [25]:
docs_summary

[Document(page_content='- The competence and\ncapability and intelligence and training and accomplishments\nof senior scientists and technologists working on a technology, and then being able to then\nmake moral judgments in the use of the technology. That track record is terrible. That track record is catastrophically bad. The policies that are being\ncalled for to prevent this, I think we\'re gonna cause\nextraordinary damage. - So the moment you say,\nAI\'s gonna kill all of us, therefore we should ban it, or that we should regulate\nall that kind of stuff, that\'s when it starts getting serious. - Or start, you know, military\nairstrikes and data centers. - Oh boy. The following is a conversation\nwith Marc Andreessen, co-creator of Mosaic, the\nfirst widely used web browser, co-founder of Netscape, co-founder of the legendary Silicon Valley venture capital firm, Andreesen Horowitz, and is one of the most\noutspoken voices on the future of technology, including\nhis most recent art

In [28]:
refine_outputs = refine_chain({"input_documents": docs_summary})

InvalidArgument: 400 Request contains an invalid argument.

In [11]:
!ls

aiap-13-ds-7e16bb946970.json  requirements.txt
podcast.py		      summary.txt
podcast_vertexai.ipynb	      youtube_summarizer_vertexai.ipynb
podcast_vertexai.py


In [12]:
# Create the LLM model for the question answering
llm_question_answer = ChatVertexAI(temperature=0.2)

In [13]:
# Create the vector database and RetrievalQA Chain
embeddings = VertexAIEmbeddings()
# embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
db = FAISS.from_documents(docs_qa, embeddings)
qa = RetrievalQA.from_chain_type(
    llm=llm_question_answer, chain_type="stuff", retriever=db.as_retriever())

In [14]:
import sys
import textwrap

# Set the desired width limit for wrapping
width = 80

try:
    # Your code here

    question = ""
    history = []
    # Run the QA chain continuously & end when the user types "exit" or "quit"
    while question.lower() not in ["exit", "quit"]:
        # Get the user question
        question = input("Ask a question or enter exit to close the app: ")
        # Run the QA chain to query the Youtube video transcript
        # answer = qa.run(question, callbacks=[cb])
        answer = qa.run(question)
        history.append(answer)
        # Apply word wrapping and print the wrapped text
        wrapped_text = textwrap.fill(answer, width=width)
        print(wrapped_text)
        # print(answer)
        print("---------------------------------")
        print("\n")

except KeyboardInterrupt:
    sys.exit()

Marc Andreessen, co-creator of Mosaic, the first widely used web browser, co-
founder of Netscape, co-founder of the legendary Silicon Valley venture capital
firm, Andreesen Horowitz, and is one of the most outspoken voices on the future
of technology, including his most recent article, "Why AI Will Save The World?".
In this podcast, Marc discusses his views on the future of the internet and
technology in general. He believes that AI will play a major role in shaping the
future, and that it has the potential to solve many of the world's problems. He
also talks about his approach to learning,
---------------------------------


The main points of the podcast are as follows:  - Marc Andreessen believes that
AI will save the world. - He thinks that the competence and capability of senior
scientists and technologists working on AI is much greater than the ability of
politicians to make moral judgments about the use of AI. - He thinks that the
policies that are being called for to prevent A

https://github.com/royca/yt-gpt/tree/master

Adapted Summarizer from this Github repo

In [22]:
import re

# from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from langchain.chains.summarize import load_summarize_chain
from langchain.document_loaders import YoutubeLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


def youtube_video_url_is_valid(url: str) -> bool:
    pattern = r'^https:\/\/www\.youtube\.com\/watch\?v=([a-zA-Z0-9_-]+)(\&ab_channel=[\w\d]+)?$'
    match = re.match(pattern, url)
    return match is not None


# def find_insights(api_key: str, url: str) -> str:
def find_insights(url: str) -> str:
    try:
        loader = YoutubeLoader.from_youtube_url(url)
        transcript = loader.load()
    except Exception as e:
        return f"Error while loading YouTube video and transcript: {e}"
    try:
        # llm = OpenAI(temperature=0.6, openai_api_key=api_key)
        llm = VertexAI(temperature=0.6)
        prompt = PromptTemplate(
            template="""Summarize the youtube video whose transcript is provided within backticks \
            ```{text}```
            """, input_variables=["text"]
        )
        combine_prompt = PromptTemplate(
            template="""Combine all the youtube video transcripts  provided within backticks \
            ```{text}```
            Provide a concise summary between 8 to 10 sentences.
            """, input_variables=["text"]
        )
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=20000, chunk_overlap=50)
        text = text_splitter.split_documents(transcript)
        chain = load_summarize_chain(llm, chain_type="map_reduce", verbose=True,
                                     map_prompt=prompt, combine_prompt=combine_prompt)
        answer = chain.run(text)
    except Exception as e:
        return f"Error while processing and summarizing text: {e}"

    return answer.strip()


youtube_video_url = "https://www.youtube.com/watch?v=-hxeDjAxvJ8"
if not youtube_video_url_is_valid(youtube_video_url):
    print("Please enter a valid youtube video URL.")

answer = find_insights(youtube_video_url)



[1m> Entering new  chain...[0m


[1m> Entering new  chain...[0m
Prompt after formatting:
[32;1m[1;3mSummarize the youtube video whose transcript is provided within backticks             ```- The competence and
capability and intelligence and training and accomplishments
of senior scientists and technologists working on a technology, and then being able to then
make moral judgments in the use of the technology. That track record is terrible. That track record is catastrophically bad. The policies that are being
called for to prevent this, I think we're gonna cause
extraordinary damage. - So the moment you say,
AI's gonna kill all of us, therefore we should ban it, or that we should regulate
all that kind of stuff, that's when it starts getting serious. - Or start, you know, military
airstrikes and data centers. - Oh boy. The following is a conversation
with Marc Andreessen, co-creator of Mosaic, the
first widely used web browser, co-founder of Netscape, co-founder of the legenda

In [23]:
# Apply word wrapping and print the wrapped text
wrapped_text = textwrap.fill(answer, width=width)
print(wrapped_text)
# print(answer)
print("---------------------------------")
print("\n")

Marc Andreessen, the co-founder of Netscape and VC firm Andreessen Horowitz,
discusses the future of the internet, search, and AI. He believes that LLMs will
eventually replace search engines and that the majority of content on the
internet will be human conversations with AIs. He also discusses the trillion
dollar question of whether or not synthetic training data is useful for LLMs.
Andreessen also talks about the history of the internet, his experience founding
Netscape, and the risks of artificial intelligence. He believes that
satisfaction is a deeper thing than happiness, and that it comes from fulfilling
your purpose and being useful to others.
---------------------------------


