In [5]:
# Swap the standard-library `sqlite3` module for `pysqlite3`.
# Statement order matters here: the module has to be imported first so that it
# is present in `sys.modules`, and the alias has to be installed before any
# later import in this kernel asks for `sqlite3`.
# NOTE(review): presumably needed because a later dependency (e.g. Chroma)
# requires a newer SQLite than the system one — confirm.
__import__('pysqlite3')
import sys
# Remove the 'pysqlite3' entry and re-register the same module object under
# the name 'sqlite3', so a subsequent `import sqlite3` resolves to pysqlite3.
sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')
# from django.conf import settings

In [12]:
from pinecone import Pinecone, ServerlessSpec, PodSpec
import time
import os
from pathlib import Path
import yaml
# Parent of the working directory the kernel is in when this cell runs.
BASE_DIR = Path('./').resolve().parent


def load_config(path=None):
    """Parse the YAML secrets file and return its contents.

    Args:
        path: Optional location of the YAML file. When omitted, ``keys.yaml``
            in the directory above ``BASE_DIR`` is used (the original,
            hard-coded location).

    Returns:
        dict: The top-level mapping parsed from the file.

    Raises:
        FileNotFoundError: If the file does not exist.
        ValueError: If the file is empty or its top level is not a mapping.
    """
    config_path = Path(path) if path is not None else BASE_DIR.parent / 'keys.yaml'
    # Explicit encoding so parsing does not depend on the platform locale.
    with open(config_path, 'r', encoding='utf-8') as file:
        config = yaml.safe_load(file)
    # An empty document parses to None; fail here with a clear message rather
    # than with an opaque TypeError at the first `config[...]` lookup.
    if not isinstance(config, dict):
        raise ValueError(
            f"{config_path} must contain a top-level mapping, got {type(config).__name__}"
        )
    return config

# Flag read by the index-creation cell to choose between a ServerlessSpec
# and a PodSpec.
use_serverless = True

# Parse the secrets file once and build the Pinecone client from its key.
config = load_config()
pinecone_api_key = config['api_keys']['PINECONE_API_KEY']
pc = Pinecone(
    api_key=pinecone_api_key,
)

In [2]:
# Open a handle to the 'podcasts' index; the stats call is the cell's last
# expression, so its result is displayed as the cell output.
index = pc.Index('podcasts')
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'Skull0412_3': {'vector_count': 49}},
 'total_vector_count': 49}

In [13]:
# Choose the deployment spec for the index.
if use_serverless:
    spec = ServerlessSpec(cloud='aws', region='us-east-1')
else:
    # PodSpec needs an `environment`; the previous bare `PodSpec()` call could
    # never succeed. Read it from the process environment so nothing is
    # guessed here. If not using a starter index, pass a `pod_type` too.
    pod_environment = os.environ.get('PINECONE_ENVIRONMENT')
    if not pod_environment:
        raise RuntimeError(
            "Set PINECONE_ENVIRONMENT to use a pod-based index "
            "(or set use_serverless = True)."
        )
    spec = PodSpec(environment=pod_environment)

# WARNING: destructive. An existing index with this name is dropped, along
# with every vector stored in it, before a fresh one is created.
index_name = 'podcasts'
if index_name in pc.list_indexes().names():
    pc.delete_index(index_name)

# Create a new, empty index.
pc.create_index(
    index_name,
    dimension=1536,  # dimensionality of text-embedding-ada-002
    metric='cosine',
    spec=spec,
)

# Poll until the index reports ready, but give up after a bounded wait so a
# stuck provisioning request cannot hang the notebook forever.
INDEX_READY_TIMEOUT_S = 300
deadline = time.monotonic() + INDEX_READY_TIMEOUT_S
while not pc.describe_index(index_name).status['ready']:
    if time.monotonic() >= deadline:
        raise TimeoutError(
            f"Index '{index_name}' was not ready after {INDEX_READY_TIMEOUT_S} seconds"
        )
    time.sleep(1)


In [14]:
# Re-open the index through the shared `index_name` variable (set in the
# index-creation cell) instead of repeating the literal, matching the later
# stats cells. The trailing expression displays the current vector counts.
index = pc.Index(index_name)
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [2]:
import os
from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_chroma import Chroma
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_community.document_loaders import TextLoader
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_pinecone import PineconeVectorStore


# Export credentials and settings as environment variables in one step.
# Values come from the `config` mapping loaded earlier in the notebook.
os.environ.update(
    {
        "OPENAI_API_KEY": config['api_keys']['OPENAI_API_KEY'],
        "LANGCHAIN_TRACING_V2": "true",
        "LANGCHAIN_API_KEY": config['api_keys']['LANGCHAIN_API_KEY'],
        "PINECONE_API_KEY": config['api_keys']['PINECONE_API_KEY'],
        "PINECONE_INDEX_NAME": "podcasts",
    }
)

# Chat model shared by the question-rewriting and answering steps.
llm = ChatOpenAI(
    model="gpt-3.5-turbo",
    temperature=0,
)

# Transcript to ingest. The default is the original machine-specific path;
# set PODCAST_TRANSCRIPT_PATH to run the notebook elsewhere without editing it.
TRANSCRIPT_PATH = os.environ.get(
    "PODCAST_TRANSCRIPT_PATH",
    "/home/gaurav/Documents/Project/podcast_chatbot/media/podcasts/Skull0412/46_mins/46_mins.txt",
)
# Single source of truth for the namespace used for both writes and reads.
PODCAST_NAMESPACE = "skull0412"

loader = TextLoader(TRANSCRIPT_PATH)
docs = loader.load()

# Split the transcript into overlapping chunks for embedding.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)
print(f"Length of splits: {len(splits)}")

embeddings = OpenAIEmbeddings()
vectorstore = PineconeVectorStore(index_name="podcasts", embedding=embeddings)

# Use stable, position-based ids so re-running this cell overwrites the same
# vectors instead of appending a fresh copy of every chunk each time.
# Caveat: if the transcript later yields fewer chunks, the higher-numbered
# vectors from the earlier run stay in the namespace.
split_ids = [f"{PODCAST_NAMESPACE}-{i}" for i in range(len(splits))]
vectorstore.add_documents(splits, ids=split_ids, namespace=PODCAST_NAMESPACE)

# Restrict retrieval to the namespace that was just written.
retriever = vectorstore.as_retriever(search_kwargs={"namespace": PODCAST_NAMESPACE})

### Contextualize question ###
# System instruction for the rewriting step. Adjacent literals are joined into
# one single-line string, each piece ending with the separating space.
contextualize_q_system_prompt = (
    "Given a chat history and the latest user question "
    "which might reference context in the chat history, formulate a standalone question "
    "which can be understood without the chat history. Do NOT answer the question, "
    "just reformulate it if needed and otherwise return it as is."
)

# Message layout: system instruction, prior turns, then the new user input.
contextualize_q_prompt = ChatPromptTemplate.from_messages([
    ("system", contextualize_q_system_prompt),
    MessagesPlaceholder("chat_history"),
    ("human", "{input}"),
])

# Retriever that is built from the LLM, the base retriever and the prompt above.
history_aware_retriever = create_history_aware_retriever(llm, retriever, contextualize_q_prompt)


### Answer question ###
# System prompt for the answering step. The trailing backslashes join the
# source lines into one paragraph; the one after "concise." leaves exactly one
# newline before the `{context}` placeholder.
# Fixed the typo "Dont't" -> "Don't" in the instruction sent to the model.
qa_system_prompt = """You are a chatbot assistant answering user's questions from the podcast they uploaded. \
Use the following pieces of retrieved context from the podcast's transcript to answer the question. \
If you don't know the answer, just say that you don't know. \
Don't ask the user for more information about the podcast. \
If the user asks something that is not in the context, just say that you don't know. \
Chat with the user in human tone as if you are an expert about that podcast. \
Keep the answer concise.\

{context}"""

# Message layout: system instruction (with context), prior turns, new input.
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", qa_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)

# Chain that answers from the documents placed into the prompt.
question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)

# Full RAG pipeline: history-aware retrieval feeding the answering chain.
rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)


### Statefully manage chat history ###
# In-memory registry of chat histories, keyed by session id. It lives only as
# long as the kernel does.
store = dict()


def get_session_history(session_id: str) -> BaseChatMessageHistory:
    """Return the chat history for ``session_id``, creating it on first use.

    A new, empty ``ChatMessageHistory`` is registered in ``store`` the first
    time a session id is seen; later calls return that same object.
    """
    try:
        return store[session_id]
    except KeyError:
        history = ChatMessageHistory()
        store[session_id] = history
        return history


# Wrap the RAG chain with per-session message history. The key names tell the
# wrapper which input field is the user message, which prompt variable
# receives the history, and which output field is recorded as the reply.
conversational_rag_chain = RunnableWithMessageHistory(
    rag_chain,
    get_session_history,
    history_messages_key="chat_history",
    input_messages_key="input",
    output_messages_key="answer",
)


# First question for session "abc123"; `get_session_history` adds that key to
# `store` if it is not there yet. Only the answer text is displayed.
conversational_rag_chain.invoke(
    {"input": "How much damage has Hamas faced?"},
    config={"configurable": {"session_id": "abc123"}},
)["answer"]

Length of splits: 49


Parent run 5821e5b1-c5ef-46c4-81f7-72fd8fd7bd2e not found for run be47bdac-1751-41e0-9820-3b47235b4610. Treating as a root run.


'Hamas has faced significant damage to its military capabilities, but they are still able to continue launching rockets against Israel. The Israeli government and many in Israel want Hamas destroyed, including their leadership and fighters, but they have not achieved that military outcome yet.'

In [11]:
# Refresh the index handle and display the per-namespace vector counts.
index = pc.Index(index_name)
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'skull0412': {'vector_count': 49}},
 'total_vector_count': 49}

In [12]:
# Small-talk turn sent under the same session id, so it uses the "abc123"
# entry in `store`. Only the answer text is displayed.
conversational_rag_chain.invoke(
    {"input": "Hi, how are you?"},
    config={"configurable": {"session_id": "abc123"}},
)["answer"]

Parent run dce6ded8-91ef-4380-805b-e61f71faeaf8 not found for run 18762578-aa17-4773-84d9-b3aa39c02777. Treating as a root run.


"I'm here and ready to help. How can I assist you today?"

In [13]:
# Ask for a summary under the same session id, so it uses the "abc123" entry
# in `store`. Only the answer text is displayed.
conversational_rag_chain.invoke(
    {"input": "Give me the summary of the podcast."},
    config={"configurable": {"session_id": "abc123"}},
)["answer"]

Parent run dca8b972-9c67-4c64-965e-ac040ce6c3fd not found for run a03b748f-a0ec-40ba-8442-992d8f5dd039. Treating as a root run.


'The podcast discusses the recent military operation in Gaza, focusing on the conflict between the Israeli government and Hamas. The conversation touches on the lack of alignment between the two parties, the potential long-term effects of the conflict, and the implications for future events such as diplomatic trips and political gatherings.'

In [14]:
# Refresh the index handle and display the per-namespace vector counts again.
index = pc.Index(index_name)
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'skull0412': {'vector_count': 49}},
 'total_vector_count': 49}

In [6]:
# Follow-up that can only be resolved from the stored history of session
# "abc123": it refers to "my previous question" without restating it.
conversational_rag_chain.invoke(
    {"input": "Give me answer to my previous question again?"},
    config={"configurable": {"session_id": "abc123"}},
)["answer"]

Parent run b8448b42-15b5-4db1-8dd8-da7b28505d35 not found for run 1fe3d438-5c2f-4ee2-ad4a-0c6a74d994cc. Treating as a root run.


'Hamas has faced significant damage to its military capabilities, but they are still able to continue launching rockets against Israel. The Israeli government and many in Israel want Hamas destroyed, including their leadership and fighters, but they have not achieved that military outcome yet.'