In [None]:
# Use the %pip magic so packages are installed into the environment of the running kernel.
# pinecone is pinned to the release this notebook was last executed against.
%pip install -q pinecone==6.0.2 sentence-transformers transformers torch

Collecting pinecone
  Downloading pinecone-6.0.2-py3-none-any.whl.metadata (9.0 kB)
Collecting pinecone-plugin-interface<0.0.8,>=0.0.7 (from pinecone)
  Downloading pinecone_plugin_interface-0.0.7-py3-none-any.whl.metadata (1.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collect

In [18]:
  import os
  import pinecone
  from sentence_transformers import SentenceTransformer
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
  import torch
  from typing import List, Dict, Any

  class RAGSystem:
      """
      Retrieval-augmented question answering on top of a Pinecone index.

      A question is embedded with a SentenceTransformer model, the closest
      chunks are fetched from Pinecone, and a Hugging Face language model
      generates an answer from a prompt that contains those chunks.

      NOTE(review): the language model is loaded with ``AutoModelForSeq2SeqLM``,
      so ``llm_model_name`` should be an encoder-decoder checkpoint (for example
      ``google/flan-t5-base``). The default value is a decoder-only chat
      checkpoint and is not expected to load through that class - confirm
      before relying on the default.
      """

      # Hugging Face tokenizers without a configured limit report a very large
      # sentinel as ``model_max_length``; anything above this is treated as "no limit".
      _NO_LIMIT_THRESHOLD = 1_000_000

      def __init__(self, pinecone_api_key: str, index_name: str, model_name: str = "sentence-transformers/all-mpnet-base-v2",
                  llm_model_name: str = "meta-llama/Llama-2-7b-chat-hf"):
          """
          Initialize RAG system with Pinecone and models.

          Args:
              pinecone_api_key: Your Pinecone API key
              index_name: Name of the Pinecone index to query
              model_name: Name of the SentenceTransformer model for embeddings
              llm_model_name: Name of the language model for answer generation
          """
          self.tokenizer = AutoTokenizer.from_pretrained(llm_model_name)

          # Initialize Pinecone client
          self.pc = pinecone.Pinecone(api_key=pinecone_api_key)
          self.index = self.pc.Index(index_name)

          # Initialize embedding model
          self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
          print(f"Using device: {self.device}")
          self.embedding_model = SentenceTransformer(model_name).to(self.device)

          # Half precision only on GPU; device placement is delegated to device_map="auto".
          self.llm = AutoModelForSeq2SeqLM.from_pretrained(
            llm_model_name,
            torch_dtype=torch.float16 if self.device == 'cuda' else torch.float32,
            device_map="auto"
          )

          # Set parameters
          self.top_k = 5  # Number of chunks to retrieve

      def embed_query(self, query: str) -> List[float]:
          """
          Generate embedding for the query using SentenceTransformer.

          Args:
              query: The user's question

          Returns:
              Vector embedding of the query
          """
          return self.embedding_model.encode(query).tolist()

      def retrieve_relevant_chunks(self, query_embedding: List[float], top_k: int = 5) -> List[Dict[str, Any]]:
          """
          Retrieve the most relevant chunks from Pinecone based on the query embedding.

          Args:
              query_embedding: The embedding of the user's question
              top_k: Number of most relevant chunks to retrieve

          Returns:
              List of dicts with the keys ``id``, ``score``, ``text``, ``entities``,
              ``video_id`` and ``chunk_id``, in the order Pinecone returned them.
              Missing metadata fields fall back to placeholder values.
          """
          # Query Pinecone index
          query_response = self.index.query(
              vector=query_embedding,
              top_k=top_k,
              include_metadata=True
          )

          results = []
          for match in query_response['matches']:
              # A match stored without metadata must not crash retrieval,
              # so fall back to an empty mapping and use the defaults below.
              metadata = getattr(match, 'metadata', None) or {}
              results.append({
                  'id': match.id,
                  'score': match.score,
                  'text': metadata.get('text_sample', 'No text available'),
                  'entities': metadata.get('chunk_entities', '[]'),
                  'video_id': metadata.get('video_id', 'unknown'),
                  'chunk_id': metadata.get('chunk_id', -1)
              })

          return results

      def _max_input_tokens(self):
          """Return the tokenizer's input-length limit, or None when it has no usable limit."""
          limit = getattr(self.tokenizer, "model_max_length", None)
          if limit is None or limit <= 0 or limit > self._NO_LIMIT_THRESHOLD:
              return None
          return int(limit)

      def _build_prompt(self, query: str, chunks: List[Dict[str, Any]]) -> str:
          """Render the instruction prompt for ``query`` with ``chunks`` as numbered context."""
          context = "\n\n".join([f"Chunk {i+1} (Score: {chunk['score']:.2f}): {chunk['text']}"
                              for i, chunk in enumerate(chunks)])

          prompt = f"""
          You are a helpful AI assistant. Use the following context from video transcripts to answer the user's question.

          CONTEXT:
          {context}

          USER QUESTION:
          {query}

          Answer the question based on the provided context. If the context doesn't contain enough information to answer the question fully, acknowledge that and provide the best possible answer with the available information.

          ANSWER:
          """
          return prompt

      def _fit_prompt_to_budget(self, query: str, relevant_chunks: List[Dict[str, Any]], max_tokens) -> str:
          """Build the prompt, dropping trailing chunks until it fits within ``max_tokens``."""
          chunks = list(relevant_chunks)  # copy: the caller's list is returned to the user as "sources"
          prompt = self._build_prompt(query, chunks)
          if max_tokens is None:
              return prompt

          # Chunks arrive best-first, so removing from the end discards the weakest
          # evidence while keeping the question and the "ANSWER:" cue intact.
          while chunks and len(self.tokenizer(prompt)["input_ids"]) > max_tokens:
              chunks.pop()
              prompt = self._build_prompt(query, chunks)
          return prompt

      def generate_answer(self, query: str, relevant_chunks: List[Dict[str, Any]]) -> str:
          """
          Generate an answer using the language model based on the query and relevant chunks.

          The prompt is kept within the tokenizer's maximum input length by leaving
          out the lowest-ranked chunks when necessary. Decoding uses sampling
          (temperature 0.7, top-p 0.9), so repeated calls may return different text.

          Args:
              query: The user's question
              relevant_chunks: List of relevant chunks retrieved from Pinecone

          Returns:
              Generated answer
          """
          max_input_tokens = self._max_input_tokens()
          prompt = self._fit_prompt_to_budget(query, relevant_chunks, max_input_tokens)

          # Truncation is only a backstop for the case where the question alone is too long.
          tokenizer_kwargs = {"return_tensors": "pt"}
          if max_input_tokens is not None:
              tokenizer_kwargs.update(truncation=True, max_length=max_input_tokens)

          # The model is placed by device_map="auto", so follow the model's device
          # rather than assuming it matches self.device.
          target_device = getattr(self.llm, "device", self.device)
          inputs = self.tokenizer(prompt, **tokenizer_kwargs).to(target_device)

          # Generate a response
          with torch.no_grad():
              outputs = self.llm.generate(
                  input_ids=inputs["input_ids"],
                  attention_mask=inputs["attention_mask"],
                  max_length=2048,
                  do_sample=True,  # temperature/top_p only take effect when sampling is enabled
                  temperature=0.7,
                  top_p=0.9,
                  num_return_sequences=1
              )

          # Decode the response
          response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

          # Keep only the text after the "ANSWER:" marker when the model echoes it
          answer = response.split("ANSWER:")[1].strip() if "ANSWER:" in response else response

          return answer

      def query(self, question: str) -> Dict[str, Any]:
          """
          Process a user query and return an answer with supporting evidence.

          Args:
              question: The user's question

          Returns:
              Dictionary with the keys ``question``, ``answer`` and ``sources``
              (the chunks retrieved from Pinecone)
          """
          # Step 1: Generate embedding for the question
          print(f"Generating embedding for question: {question}")
          query_embedding = self.embed_query(question)

          # Step 2: Retrieve relevant chunks from Pinecone
          print("Retrieving relevant chunks from Pinecone...")
          relevant_chunks = self.retrieve_relevant_chunks(query_embedding, self.top_k)

          # Step 3: Generate an answer based on the retrieved chunks
          print("Generating answer...")
          answer = self.generate_answer(question, relevant_chunks)

          # Step 4: Return the answer and supporting evidence
          return {
              "question": question,
              "answer": answer,
              "sources": relevant_chunks
          }


  # Example usage
  def main():
      """
      Run a single demo question through the RAG pipeline and print the result.

      The Pinecone API key is read from the ``PINECONE_API_KEY`` environment
      variable.

      Raises:
          RuntimeError: If ``PINECONE_API_KEY`` is not set or is empty.
      """
      # SECURITY: credentials must never be embedded in the notebook. Any key that
      # was previously committed in this cell should be revoked and rotated.
      pinecone_api_key = os.environ.get("PINECONE_API_KEY")
      if not pinecone_api_key:
          raise RuntimeError(
              "PINECONE_API_KEY is not set. Export it in the environment before running this example."
          )
      index_name = "embeddings"

      # Initialize the RAG system
      print("Initializing RAG system...")
      rag = RAGSystem(
          pinecone_api_key=pinecone_api_key,
          index_name=index_name,
          # For demo purposes, we can use a smaller model - replace with your preferred model
          llm_model_name="google/flan-t5-base"  # Smaller model for demonstration
      )

      # Example query
      question = "What is machine learning?"

      print(f"Processing query: '{question}'")
      result = rag.query(question)

      # Print the result
      print("\n" + "="*50)
      print("QUESTION:")
      print(question)
      print("\nANSWER:")
      print(result["answer"])
      print("\nSOURCES:")
      for i, source in enumerate(result["sources"]):
          # Only the first 100 characters of each chunk are shown to keep the output short
          print(f"{i+1}. Score: {source['score']:.2f}, Text: {source['text'][:100]}...")
      print("="*50)

  # Entry point: runs the demo when this code is executed directly, which includes
  # running it as a notebook cell (the captured output below shows main() ran).
  if __name__ == "__main__":
      main()

Initializing RAG system...
Using device: cpu
Processing query: 'What is machine learning?'
Generating embedding for question: What is machine learning?
Retrieving relevant chunks from Pinecone...


Token indices sequence length is longer than the specified maximum sequence length for this model (667 > 512). Running this sequence through the model will result in indexing errors


Generating answer...

QUESTION:
What is machine learning?

ANSWER:
a sub domain of computer science

SOURCES:
1. Score: 0.63, Text: according to Wikipedia machine learning is a field of study in artificial intelligence concerned wit...
2. Score: 0.60, Text: we know humans learn from their past experiences and machines follow instructions given by humans bu...
3. Score: 0.60, Text: a small example in one of the many machine learning algorithms quite easy right believe me it is but...
4. Score: 0.59, Text: label, and I know the true label here is G. So this is this is actually supervised learning. All rig...
5. Score: 0.57, Text: computers are really, really good at understanding math, right at understanding numbers, they're not...
