In [78]:
import os
from dotenv import load_dotenv
from langchain_openai.chat_models import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import ChatPromptTemplate
from operator import itemgetter
import tempfile
import whisper
from pytubefix import YouTube
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai.embeddings import OpenAIEmbeddings
from sklearn.metrics.pairwise import cosine_similarity
from langchain_community.vectorstores import DocArrayInMemorySearch
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain_pinecone import PineconeVectorStore

In [79]:
from google.colab import userdata

# Read both API keys from Colab's secret store so they never appear in the notebook.
OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')
PINECONE_API_KEY = userdata.get('PINECONE_API_KEY')

# The OpenAI key is passed explicitly (api_key=...) wherever it is used, but the
# Pinecone key is never handed to PineconeVectorStore later in this notebook.
# langchain-pinecone looks the key up in the PINECONE_API_KEY environment
# variable, so export it here to make a fresh "Restart & Run All" work.
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY

In [80]:
# Chat model used by the chains built in the cells below.
model = ChatOpenAI(
    model="gpt-4o",
    api_key=OPENAI_API_KEY,
)

# Testing Basic Invoke

In [81]:
# Smoke test: call the chat model directly, without any prompt template or parser.
model.invoke("Is pickleball famous in India?")

AIMessage(content="Pickleball is still relatively new in India compared to more established sports like cricket, football, or badminton. However, its popularity has been growing steadily in recent years. The sport's easy-to-learn nature and the minimal equipment required make it accessible to a wide range of people. Several local clubs and communities have started promoting the sport, and there have been efforts to organize tournaments and build dedicated courts.\n\nThe All India Pickleball Association (AIPA) was established to promote the sport, and it has been working towards increasing awareness and participation. While it may not yet be a mainstream sport in India, its growth trajectory suggests that it could become more popular in the coming years.", response_metadata={'token_usage': {'completion_tokens': 137, 'prompt_tokens': 14, 'total_tokens': 151}, 'model_name': 'gpt-4o-2024-05-13', 'system_fingerprint': 'fp_c4e5b6fa31', 'finish_reason': 'stop', 'logprobs': None}, id='run-0537

In [82]:
# Pipe the model into a string output parser and ask the same question again
# through the resulting chain.
parser = StrOutputParser()
chain = (
    model
    | parser
)
chain.invoke("Is pickleball famous in India?")

"As of my last update, pickleball is not as widely known or popular in India as more traditional sports like cricket, football (soccer), or badminton. However, the sport has been gradually gaining attention and a growing community of enthusiasts. Pickleball clubs and associations have started to form in various cities, and there have been efforts to introduce the sport in schools and recreational centers.\n\nThe popularity of pickleball has been on the rise globally, and India is no exception to this trend, albeit at a slower pace compared to some Western countries. With increased awareness, media coverage, and support from sports organizations, pickleball has the potential to become more popular in India in the coming years.\n\nIf you're interested in pickleball in India, you might want to look for local clubs, community centers, or social media groups dedicated to the sport to get involved."

In [83]:
# Prompt text with two placeholders: {context} and {question}.
template = """
Answer the question based on the context below. If you can't
answer the question, reply "I don't know".

Context: {context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

# Render the prompt once with toy values to inspect the final text.
prompt.format(
    context="I am deril",
    question="who is deril?",
)

'Human: \nAnswer the question based on the context below. If you can\'t\nanswer the question, reply "I don\'t know".\n\nContext: I am deril\n\nQuestion: who is deril?\n'

In [84]:
# Full pipeline: fill the prompt, call the model, parse the reply.
chain = (
    prompt
    | model
    | parser
)
chain.invoke({"context": "I am Deril", "question": "Who am I?"})

'You are Deril.'

In [85]:
translate_prompt = ChatPromptTemplate.from_template(
    """
    Translate {answer} to {language}
    """
)

# "answer" is produced by the question-answering chain, while "language" is
# picked out of the input dict; both feed the translation prompt.
translate_chain = (
    {
        "answer": chain,
        "language": itemgetter("language"),
    }
    | translate_prompt
    | model
    | parser
)

translate_chain.invoke(
    {
        "language": "Hindi",
        # This is a new context, unrelated to the previous one.
        "context": "I have one brother",
        "question": "Who is my brother?",
    }
)

'मैं नहीं जानता। (for a male speaker)\nमैं नहीं जानती। (for a female speaker)'

In [86]:
YOUTUBE_VIDEO = "https://www.youtube.com/watch?v=nAmC7SoVLd8"

# Let's do this only if we haven't created the transcription file yet.
if not os.path.exists("transcription.txt"):
    youtube = YouTube(YOUTUBE_VIDEO)
    audio = youtube.streams.filter(only_audio=True).first()
    # .first() yields None when no stream matches the filter; stop with a clear
    # message instead of an AttributeError on audio.download() below.
    if audio is None:
        raise RuntimeError(f"No audio-only stream found for {YOUTUBE_VIDEO}")

    # Let's load the base model. This is not the most accurate
    # model but it's fast.
    whisper_model = whisper.load_model("base")

    # The audio file is only needed while transcribing, so it lives in a
    # temporary directory that is removed when the block exits.
    with tempfile.TemporaryDirectory() as tmpdir:
        audio_path = audio.download(output_path=tmpdir)
        transcription = whisper_model.transcribe(audio_path, fp16=False)["text"].strip()

    # Use distinct names for the downloaded path and the output handle (one
    # name used to serve both), and write with an explicit encoding so the
    # result does not depend on the platform default.
    with open("transcription.txt", "w", encoding="utf-8") as transcript_file:
        transcript_file.write(transcription)

In [87]:
# Load the transcript from disk. The encoding is given explicitly so the read
# does not depend on the platform's default locale encoding.
with open("transcription.txt", encoding="utf-8") as f:
    transcription = f.read()

# Peek at the first 100 characters only.
transcription[0:100]

"I think it's possible that physics has exploits and we should be trying to find them. arranging some"

In [88]:
# Deliberately pass the ENTIRE transcript as context to show that it is too
# large for a single request. The failure is the point of this cell, so it is
# caught and displayed instead of aborting a "Restart & Run All".
try:
    whole_transcript_answer = chain.invoke({
        "context": transcription,
        "question": "Who is the speaker?"
    })
except Exception as err:
    # The openai package is not imported in this notebook, so the expected
    # error is recognised by its class name; anything else is re-raised as is.
    if type(err).__name__ != "RateLimitError":
        raise
    whole_transcript_answer = f"{type(err).__name__}: {err}"
whole_transcript_answer

RateLimitError: Error code: 429 - {'error': {'message': 'Request too large for gpt-4o in organization org-zflid57cHI5K35GwBTO5jxib on tokens per min (TPM): Limit 30000, Requested 54002. The input or output tokens must be reduced in order to run successfully. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}

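The request was rejected because it needs 54,002 tokens, which exceeds this organization's gpt-4o limit of 30,000 tokens per minute, so we cannot pass the entire transcript as the context. Instead, we split the transcript into chunks and send only the relevant ones.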

In [89]:
# Load the transcript as LangChain documents so it can be split into chunks.
# The encoding is passed explicitly instead of relying on the platform default.
loader = TextLoader("transcription.txt", encoding="utf-8")
text = loader.load()

In [90]:
# Demo split with small chunks (100 characters, 20 of overlap); show only the
# first five chunks.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
)
splitter.split_documents(text)[0:5]

[Document(metadata={'source': 'transcription.txt'}, page_content="I think it's possible that physics has exploits and we should be trying to find them. arranging some"),
 Document(metadata={'source': 'transcription.txt'}, page_content='arranging some kind of a crazy quantum mechanical system that somehow gives you buffer overflow,'),
 Document(metadata={'source': 'transcription.txt'}, page_content='buffer overflow, somehow gives you a rounding error in the floating point. Synthetic intelligences'),
 Document(metadata={'source': 'transcription.txt'}, page_content="intelligences are kind of like the next stage of development. And I don't know where it leads to."),
 Document(metadata={'source': 'transcription.txt'}, page_content='where it leads to. Like at some point, I suspect the universe is some kind of a puzzle. These')]

In [91]:
# Split used for the rest of the notebook: 1000-character chunks with a
# 20-character overlap.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=20,
)
docs = splitter.split_documents(text)

In [92]:
# Embedding client reused by the vector stores below.
embeddings = OpenAIEmbeddings(
    api_key=OPENAI_API_KEY,
)

# Embed one question and display the length of the resulting vector.
embed_ques = embeddings.embed_query("Who is the speaker?")
len(embed_ques)

1536

In [93]:
# Show only the first ten components of the question embedding.
embed_ques[:10]

[-0.003042836906388402,
 -0.015835430473089218,
 -0.0023328415118157864,
 -0.0020000312943011522,
 -0.004573764279484749,
 0.029059091582894325,
 -0.012412238866090775,
 0.0008383649401366711,
 -0.008317087776958942,
 -0.00479563744738698]

In [94]:
# Embed two candidate sentences to compare against the question embedding.
pos_ans_1, pos_ans_2 = [
    embeddings.embed_query(sentence)
    for sentence in (
        "I am the speaker!",
        "speakers are used to play the audio",
    )
]

In [95]:
# Similarity between the question and the first candidate sentence.
cosine_similarity(
    X=[embed_ques],
    Y=[pos_ans_1],
)

array([[0.91591555]])

In [96]:
# Similarity between the question and the second candidate sentence.
cosine_similarity(
    X=[embed_ques],
    Y=[pos_ans_2],
)

array([[0.80963791]])

# Testing Vectorstores

In [97]:
# Small in-memory vector store built from a handful of toy sentences.
vectorstore1 = DocArrayInMemorySearch.from_texts(
    texts=[
        "Mary's sister is Susana",
        "John and Tommy are brothers",
        "Patricia likes white cars",
        "Pedro's mother is a teacher",
        "Lucia drives an Audi",
        "Mary has two siblings",
        "Mercedes has amazing automobiles",
    ],
    embedding=embeddings,
)

In [98]:
# Top three matches for the query, each returned together with its score.
vectorstore1.similarity_search_with_score(
    query="Who is my bro?",
    k=3,
)

[(Document(page_content='John and Tommy are brothers'), 0.8158666055522346),
 (Document(page_content='Mary has two siblings'), 0.7817187450266622),
 (Document(page_content="Mary's sister is Susana"), 0.7631529006877964)]

In [99]:
# Wrap the toy store in a retriever so it can be plugged into a chain, then
# query it directly with an informal spelling.
retriever1 = vectorstore1.as_retriever()
retriever1.invoke(
    "Brothaaa"
)

[Document(page_content='John and Tommy are brothers'),
 Document(page_content='Mercedes has amazing automobiles'),
 Document(page_content='Mary has two siblings'),
 Document(page_content="Mary's sister is Susana")]

In [100]:
# Build the prompt inputs in one step: "context" comes from the retriever and
# "question" is the incoming string passed through unchanged.
setup = RunnableParallel(
    {
        "context": retriever1,
        "question": RunnablePassthrough(),
    }
)
setup.invoke("who is my brohter?")

{'context': [Document(page_content='John and Tommy are brothers'),
  Document(page_content='Mary has two siblings'),
  Document(page_content="Mary's sister is Susana"),
  Document(page_content="Pedro's mother is a teacher")],
 'question': 'who is my brohter?'}

In [101]:
# Retrieval-augmented chain over the toy vector store.
chain = (
    setup
    | prompt
    | model
    | parser
)
chain.invoke("Does Patricia like white cars?")

'Yes, Patricia likes white cars.'

# Try out the text splitter approach

In [104]:
# In-memory vector store built from the transcript chunks in `docs`.
vectorstore2 = DocArrayInMemorySearch.from_documents(
    documents=docs,
    embedding=embeddings,
)

In [105]:
# Inspect the first transcript chunk.
docs[0]

Document(metadata={'source': 'transcription.txt'}, page_content="I think it's possible that physics has exploits and we should be trying to find them. arranging some kind of a crazy quantum mechanical system that somehow gives you buffer overflow, somehow gives you a rounding error in the floating point. Synthetic intelligences are kind of like the next stage of development. And I don't know where it leads to. Like at some point, I suspect the universe is some kind of a puzzle. These synthetic AIs will uncover that puzzle and solve it. The following is a conversation with Andre Kappathi, previously the director of AI at Tesla. And before that, at OpenAI and Stanford, he is one of the greatest scientist engineers and educators in the history of artificial intelligence. This is the Lex Friedman podcast to support it. Please check out our sponsors and now to your friends. Here's Andre Kappathi. What is a neural network? And what does it seem to do such a surprisingly good job of learning?

In [106]:
# Same retrieval-augmented pipeline, now retrieving from the transcript store.
chain = (
    RunnableParallel(
        context=vectorstore2.as_retriever(),
        question=RunnablePassthrough(),
    )
    | prompt
    | model
    | parser
)
chain.invoke("What is AGI? Give in 500 words")

'Artificial General Intelligence (AGI) refers to a type of artificial intelligence that possesses the ability to understand, learn, and apply knowledge across a wide range of domains at a level comparable to human intelligence. Unlike narrow AI, which is designed to perform specific tasks such as language translation or image recognition, AGI aims to replicate the versatile and adaptive problem-solving capabilities of the human mind.\n\nThe concept of AGI is rooted in the idea that an intelligent system should be able to handle any intellectual task that a human being can. This involves not just performing a wide variety of tasks but also understanding the context, learning from experience, and adapting to new and unforeseen challenges. AGI is often seen as the ultimate goal of AI research, encapsulating the dream of creating machines that can think and reason like humans.\n\nOne of the key challenges in developing AGI is understanding and replicating human cognition. Human intelligenc

# Optimize with Pinecone

In [107]:
# Name of the Pinecone index that receives the transcript chunks.
index = "youtube-index"

# Embed the chunks and store them in that index.
# NOTE(review): this runs on every execution of the cell; presumably the chunks
# are upserted again each time, which may leave duplicates in the index —
# confirm, and consider loading the existing index on re-runs.
pinecode = PineconeVectorStore.from_documents(
    documents=docs,
    embedding=embeddings,
    index_name=index,
)

In [108]:
# Same retrieval-augmented pipeline, now retrieving from the Pinecone store.
chain = (
    RunnableParallel(
        context=pinecode.as_retriever(),
        question=RunnablePassthrough(),
    )
    | prompt
    | model
    | parser
)
chain.invoke("What is AGI? Give in 500 words")

'Based on the context provided, AGI, or Artificial General Intelligence, refers to a type of artificial intelligence that aims to understand, learn, and apply knowledge across a wide range of tasks at a level comparable to human intelligence. Unlike narrow AI, which is designed for specific tasks (such as language translation or facial recognition), AGI has the capability to perform any intellectual task that a human can.\n\nThe discussion about AGI in the provided context highlights several key points:\n\n1. **Meta Problem**: AGI is seen as the "ultimate meta problem" because it aims to solve the challenge of creating a system capable of addressing all problems simultaneously. The notion here is that instead of focusing on individual problems, developing AGI means creating a form of intelligence that can autonomously tackle any given problem.\n\n2. **Automation of Intelligence**: One of the goals of AGI is to automate intelligence itself. This involves creating systems that can unders