### Talking to PDFs using Astra DB, LangChain and Vector Search

In [None]:
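# Uncomment to install the dependencies into the kernel's environment
# (PyPDF2 is required by the PDF-reading cell below):
# %pip install -q cassio datasets langchain openai tiktoken PyPDF2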

In [4]:
from langchain.vectorstores.cassandra import Cassandra
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain.llms import OpenAI
from langchain.embeddings import OpenAIEmbeddings
import os
from datasets import load_dataset
import cassio #for connection to DB


In [6]:
from PyPDF2 import PdfReader

# Open the PDF whose text is extracted and indexed in the following cells.
PDF_PATH = "attention.pdf"
pdf_reader = PdfReader(PDF_PATH)

In [7]:
# NOTE(review): Concatenate is not used in this cell; it looks like an
# accidental auto-import. Drop it once confirmed nothing else relies on it.
from typing_extensions import Concatenate

# Pull the text out of every page, then glue the non-empty results together
# in page order. Pages for which extract_text() yields nothing are skipped.
page_texts = [pdf_page.extract_text() for pdf_page in pdf_reader.pages]
raw_text = "".join(page_text for page_text in page_texts if page_text)

In [9]:
# Open the database connection through cassio. Both credentials come from
# the environment; a missing variable raises KeyError before any connection
# is attempted.
astra_token = os.environ['ASTRA_DB_APPLICATION_TOKEN']
astra_database_id = os.environ['ASTRA_DB_ID']
cassio.init(
    token=astra_token,
    database_id=astra_database_id,
)

In [14]:
# Create the LangChain embedding and LLM objects. Both use the same OpenAI
# API key, which is read from the environment once.
openai_key = os.environ['OPENAI_API_KEY']
llm = OpenAI(openai_api_key=openai_key)
embedding = OpenAIEmbeddings(openai_api_key=openai_key)

In [15]:
# Create the LangChain vector store on top of Astra DB.
# session and keyspace are passed as None; presumably the store then falls
# back to the connection registered by cassio.init() -- TODO confirm.
VECTOR_TABLE_NAME = "qa_mini_demo"

astra_vector_store = Cassandra(
    embedding=embedding,
    session=None,
    keyspace=None,
    table_name=VECTOR_TABLE_NAME,
)

In [16]:
from langchain.text_splitter import CharacterTextSplitter

# Split the extracted text into overlapping chunks so that no single piece
# grows too large. Sizes are measured with len(), i.e. in characters.
CHUNK_SIZE = 800
CHUNK_OVERLAP = 200

text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,
)
texts = text_splitter.split_text(raw_text)

### Load the PDF text chunks into the vector store

In [19]:
# Only the first MAX_CHUNKS chunks are stored; any further entries in
# `texts` are deliberately left out of the vector store.
MAX_CHUNKS = 50
chunks_to_insert = texts[:MAX_CHUNKS]

astra_vector_store.add_texts(chunks_to_insert)
# The inserted items are PDF text chunks, so the message reports them as
# chunks (the previous wording called them "headlines").
print("Inserted %i chunks." % len(chunks_to_insert))

# Wrap the store in an index object that exposes a query interface.
astra_vector_index = VectorStoreIndexWrapper(vectorstore=astra_vector_store)

Inserted 50 headlines.


### Q&A Time

In [20]:
# Interactive Q&A loop: keeps prompting until the user types 'quit'.
first_question = True
while True:
    # The prompt wording differs between the first and later questions.
    prompt = (
        "\nEnter your question (or type 'quit' to exit): "
        if first_question
        else "\nWhat's the next question (or type 'quit' to exit): "
    )
    query_text = input(prompt).strip()

    if not query_text:
        # Blank entry: ask again; first_question is left unchanged.
        continue
    if query_text.lower() == "quit":
        break
    first_question = False

    # Answer the question through the index wrapper using the LLM.
    print("\nQuestion: \"%s\""% query_text)
    answer = astra_vector_index.query(query_text, llm=llm).strip()
    print("Answer: \"%s\"\n" % answer)

    # Show the four best-scoring stored chunks, each cut to 84 characters.
    print("First Documents by relevance")
    ranked_hits = astra_vector_store.similarity_search_with_score(query_text, k=4)
    for doc, score in ranked_hits:
        print( "[%0.4f] \"%s ...\"" % (score, doc.page_content[:84]))


Question: "What was the BLEU scorefor the model?"
Answer: "The BLEU score for the model was 28.4."

First Documents by relevance
[0.9411] "BLEU, establishing a new state-of-the-art BLEU score of 28.4. The configuration of t ..."
[0.9125] "ModelBLEU Training Cost (FLOPs)
EN-DE EN-FR EN-DE EN-FR
ByteNet [18] 23.75
Deep-Att  ..."
[0.9123] "less time to train. Our model achieves 28.4 BLEU on the WMT 2014 English-
to-German  ..."
[0.9114] "positional encodings in both the encoder and decoder stacks. For the base model, we  ..."
