In [1]:
import os
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment so that
# the os.getenv(...) lookups in the following cells can find them.
load_dotenv()

True

# Pinecone Setup

In [2]:
from pinecone import Pinecone, ServerlessSpec

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# 0. Read the Pinecone settings from the process environment.
#    Each name is bound to None when its variable is not set.
PINECONE_API_KEY, PINECONE_ENVIRONMENT, INDEX_NAME = (
    os.environ.get(var_name)
    for var_name in ("PINECONE_API_KEY", "PINECONE_ENVIRONMENT", "INDEX_NAME")
)

In [4]:
# 1. Instantiate the Pinecone client.
#    Only the API key is passed. `environment` is an argument of the legacy
#    pod-based `pinecone.init(...)` API; it is not a parameter of the `Pinecone`
#    class, so supplying it here had no effect. The serverless region is given
#    to ServerlessSpec when the index is created.
pc = Pinecone(api_key=PINECONE_API_KEY)

In [5]:
# 2. Make sure the target index exists, creating it on the first run.
existing = pc.list_indexes().names()
if INDEX_NAME not in existing:
    # Serverless index on AWS; PINECONE_ENVIRONMENT supplies the region name.
    pc.create_index(
        name=INDEX_NAME,
        dimension=1024,  # presumably the embedding model's vector size — TODO confirm
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region=PINECONE_ENVIRONMENT),
    )

In [6]:
# 3. connect to it
# `index` is the handle used by the query cell further down to search this index.
index = pc.Index(INDEX_NAME)

# Querying index and plugging into a QA chain

In [7]:
# 0. Embedding setup: the Cohere model used to turn text into query vectors.
#    The API key is read from the COHERE_API_KEY environment variable.
from langchain_cohere import CohereEmbeddings

embeddings = CohereEmbeddings(model="embed-english-v3.0", cohere_api_key=os.environ.get("COHERE_API_KEY"))

In [8]:
# 1. Embed your question
# The question is embedded with the `embeddings` object configured above; the
# resulting vector is what gets sent to Pinecone in the next step.
query = "What was Nvidia’s revenue growth in 2024 vs. 2023?"
query_vector = embeddings.embed_query(query)

In [9]:
# 2. Ask Pinecone for the k nearest vectors to the question, metadata included
k = 5
resp = index.query(vector=query_vector, top_k=k, include_metadata=True)

In [10]:
# 3. Pull the stored text (the "chunk_content" metadata field) out of every
#    match, keeping the order in which the matches were returned
docs = list(map(lambda hit: hit.metadata["chunk_content"], resp.matches))

In [11]:
# 4. Build a single-prompt context
# The retrieved chunks are joined with a "---" divider line so the excerpts
# stay visually separated inside the prompt text.
# NOTE(review): `prompt` is not passed to any model in the cells shown here;
# the RetrievalQA chain further down runs its own retrieval instead.
context = "\n\n---\n\n".join(docs)
prompt = f"""
Use ONLY the following document excerpts to answer the question.
If it’s not in the excerpts, say “I don’t know.”

{context}

Question: {query}
Answer:
"""

In [12]:
from transformers import pipeline
from langchain.llms import HuggingFacePipeline
from langchain_pinecone.vectorstores import PineconeVectorStore
from langchain.chains import RetrievalQA

[2025-05-01 00:50:24,704] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/gwei4/miniconda3/envs/kaggle_env/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/home/gwei4/miniconda3/envs/kaggle_env/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::runtime_error::~runtime_error()@GLIBCXX_3.4'
/home/gwei4/miniconda3/envs/kaggle_env/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__gxx_personality_v0@CXXABI_1.3'
/home/gwei4/miniconda3/envs/kaggle_env/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::tellp()@GLIBCXX_3.4'
/home/gwei4/miniconda3/envs/kaggle_env/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::chrono::_V2::steady_clock::now()@GLIBCXX_3.4.19'
/home/gwei4/miniconda3/envs/kaggle_env/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_M_replace_aux(unsigned long, unsigned long, unsigned long, char)@GLIBCXX

In [13]:
# 5. Call the LLM
# Set up the Hugging Face text-generation pipeline.
#    - "text2text-generation" means it expects an instruction and returns text.
hf_pipe = pipeline(
    "text2text-generation",
    model="google/flan-t5-base",
    device_map="auto",  # automatic device placement; remove this line if accelerate is not installed
    max_length=512,
    do_sample=False,
)

# Wrap it for LangChain
llm = HuggingFacePipeline(pipeline=hf_pipe)

# Pinecone vector store over the same index and embedding model as the manual query above
vectorstore = PineconeVectorStore(
    index_name=INDEX_NAME,        # your Pinecone index name
    embedding=embeddings,         # an Embeddings object (e.g. CohereEmbeddings)
    text_key="chunk_content",     # metadata field that holds the chunk text
    namespace=None,
)

# Build the RetrievalQA chain
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",  # stuff all chunks together
    retriever=vectorstore.as_retriever(
        # Reuse the top-k setting of the manual Pinecone query so both
        # retrieval paths stay in sync instead of repeating the literal 5.
        search_kwargs={"k": k}
    ),
)

Device set to use cuda:1
  llm = HuggingFacePipeline(pipeline=hf_pipe)


In [14]:
# Run the QA chain on the question defined earlier (`query`) rather than
# repeating the string literal, so the manual retrieval and the chain always
# answer the same question.
result = qa.invoke({"query": query})
print(result)            # full response dict (contains 'query' and 'result')
print(result["result"])  # answer text only

{'query': 'What was Nvidia’s revenue growth in 2024 vs. 2023?', 'result': '2.2 billion'}
2.2 billion
