Install the required dependencies:

In [1]:
!pip install -q cassio datasets langchain openai tiktoken PyPDF2 sentence-transformers google-generativeai InstructorEmbedding

Import the packages you'll need:

In [2]:
# Standard library
import os
from getpass import getpass

# Third-party
import cassio
import pandas as pd
from PyPDF2 import PdfReader

# LangChain components to use
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain.llms import GooglePalm
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores.cassandra import Cassandra

### Setup

In [5]:
def _require_secret(env_var, prompt):
    """Return a credential without ever storing it in the notebook.

    The value is read from the environment variable ``env_var``; when that
    variable is unset or empty, the user is asked for it with a non-echoing
    prompt instead.

    Args:
        env_var: Name of the environment variable to read.
        prompt: Text shown if the value has to be entered interactively.

    Returns:
        The credential with surrounding whitespace removed.

    Raises:
        ValueError: If no value is available from either source.
    """
    value = (os.environ.get(env_var) or getpass(prompt)).strip()
    if not value:
        raise ValueError("Missing required setting: %s" % env_var)
    return value


# SECURITY: credentials must never be hardcoded in a notebook, because they end
# up in version control and in every shared copy. Any credential that was
# previously committed in this cell should be treated as leaked and rotated.
ASTRA_DB_APPLICATION_TOKEN = _require_secret(
    "ASTRA_DB_APPLICATION_TOKEN",
    'Astra DB application token (the "AstraCS:..." string from your Token JSON file): ',
)
ASTRA_DB_ID = _require_secret("ASTRA_DB_ID", "Astra DB database ID: ")
# This is a Google API key (used by the GooglePalm LLM), not an OpenAI key.
GOOGLE_API_KEY = _require_secret("GOOGLE_API_KEY", "Google API key: ")

#### Provide your secrets:

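Supply your Astra DB connection details and your Google API key in the Setup cell above (this notebook uses Google PaLM, not OpenAI), then point the next cell at the PDF you want to index: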

In [6]:
# Path of the PDF file to index; change it to point at your own document.
pdf_path = 'output.pdf'
pdfreader = PdfReader(pdf_path)

In [7]:
# NOTE(review): Concatenate is not used in any visible cell (likely an IDE
# auto-import); confirm and remove.
from typing_extensions import Concatenate

# Extract the text of every page and join it once. Pages for which
# extract_text() returns nothing (None or an empty string) are skipped.
page_texts = (page.extract_text() for page in pdfreader.pages)
raw_text = ''.join(content for content in page_texts if content)

Initialize the connection to your database:



In [8]:
# Register the Astra DB connection with cassio, using the application token
# and database ID configured in the Setup cell.
cassio.init(
    token=ASTRA_DB_APPLICATION_TOKEN,
    database_id=ASTRA_DB_ID,
)

Create the HuggingFace embedding and LLM objects for later usage:

In [12]:
# Google PaLM LLM used to answer the questions.
llm = GooglePalm(
    google_api_key=GOOGLE_API_KEY,
)

# Instructor embedding model that is handed to the vector store.
embeddings = HuggingFaceInstructEmbeddings(
    model_name="hkunlp/instructor-large",
)

load INSTRUCTOR_Transformer
max_seq_length  512


Create your LangChain vector store ... backed by Astra DB!

In [13]:
# Name of the table that holds the embedded text chunks.
vector_table_name = "temp_db"

# session and keyspace are passed as None explicitly.
# NOTE(review): presumably they are then resolved from the connection set up
# by cassio.init() — confirm against the LangChain Cassandra documentation.
astra_vector_store = Cassandra(
    embedding=embeddings,
    table_name=vector_table_name,
    session=None,
    keyspace=None,
)

In [14]:
# Split the raw text on newlines into chunks of at most ~800 characters
# (measured with len), overlapping by 200 characters, so that no single
# chunk grows too large.
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=800,
    chunk_overlap=200,
    length_function=len,
)
texts = text_splitter.split_text(raw_text)

### Load the dataset into the vector store



In [15]:
# Store every text chunk in the vector store.
# NOTE(review): re-running this cell presumably inserts the same chunks again —
# confirm and guard against duplicates if needed.
astra_vector_store.add_texts(texts)
# The inserted items are chunks of the PDF text, not headlines.
print("Inserted %i text chunks." % len(texts))

# Wrapper that exposes a question-answering query() on top of the vector store.
astra_vector_index = VectorStoreIndexWrapper(vectorstore=astra_vector_store)

Inserted 425 headlines.


In [43]:
# Interactive Q&A session: each question is answered by the LLM through the
# vector index, the best-matching chunk is printed with its score, and the
# collected results are exported to CSV when the session ends.


def _prompt_for_question(is_first):
    """Ask the user for one question and return it stripped of whitespace.

    Args:
        is_first: True until a non-empty question has been asked; selects
            the wording of the prompt.
    """
    if is_first:
        prompt = "\nEnter your question (or type 'quit' to exit): "
    else:
        prompt = "\nWhat's your next question (or type 'quit' to exit): "
    return input(prompt).strip()


def _answer_question(query_text, k=1):
    """Answer one question and collect the top-k retrieved chunks.

    Prints the question, the LLM answer, and every retrieved chunk together
    with the score returned by the vector store.

    Args:
        query_text: The question to answer.
        k: Number of chunks to retrieve from the vector store.

    Returns:
        A tuple ``(answer, doc_contents)`` where ``doc_contents`` holds one
        ``"<answer> <chunk text>"`` string per retrieved chunk.
    """
    print("\nQUESTION: \"%s\"" % query_text)
    answer = astra_vector_index.query(query_text, llm=llm).strip()
    print("ANSWER: \"%s\"\n" % answer)

    print("FIRST DOCUMENTS BY RELEVANCE:")
    doc_contents = []
    for doc, score in astra_vector_store.similarity_search_with_score(query_text, k=k):
        print("    [%0.4f] \"%s ...\"" % (score, doc.page_content))
        # Keep the answer and the retrieved chunk together in one string.
        doc_contents.append(f"{answer} {doc.page_content}")
    return answer, doc_contents


# Columns of the exported table; both lists always have the same length.
questions = []
answers_and_document_contents = []

first_question = True
try:
    while True:
        query_text = _prompt_for_question(first_question)

        if query_text.lower() == "quit":
            break

        # Ignore empty input and ask again with the same prompt.
        if query_text == "":
            continue

        first_question = False

        answer, doc_contents = _answer_question(query_text)
        # Append both columns only after the whole round trip succeeded, so a
        # failed query can never leave the two lists with different lengths.
        questions.append(query_text)
        answers_and_document_contents.append(doc_contents)
finally:
    # Runs on a normal 'quit', on a kernel interrupt, and when a query raises,
    # so the answers collected so far are never lost. Any exception still
    # propagates after the file has been written.
    data = {
        'Question': questions,
        'Answer_and_Document_Content': answers_and_document_contents
    }
    df = pd.DataFrame(data)

    # Save DataFrame to CSV
    df.to_csv('questions_and_answers.csv', index=False)



QUESTION: "Canoo Overview"
ANSWER: "Canoo is an American automotive company that designs and manufactures electric vehicles."

FIRST DOCUMENTS BY RELEVANCE:
    [0.9410] "Canoo is an American automotive company based in Torrance, California, that develops and
manufactures electric vehicles., Canoo's research development team is based in Michigan, in the
Detroit region Auburn Hills, Livonia , and production operations are in Justin, Texas., The company also
plans to produce commercial electric vehicles such as vans for fleet, vehicle rental and ride sharing
services., Canoo was founded in under the name Evelozcity by Stefan Krause and Ulrich Kranz.,
Krause worked for Deutsche Bank as its chief financial officer while Kranz worked for BMW as a senior
executive., Both men met at rival EV company Faraday Future before leaving together to form their own
company in , due to disagreement with Faraday Future's leadership., Krause took on the role of chief ..."

QUESTION: "Industry in which Ca

Output

In [44]:
# Preview the first five collected question/answer rows (head() defaults to n=5).
df.head()

Unnamed: 0,Question,Answer_and_Document_Content
0,Canoo Overview,[Canoo is an American automotive company that ...
1,"Industry in which Canoo operates, along with i...","[The electric vehicle industry, which is proje..."
2,"Analyze Canoo's main competitors, including th...","[Canoo's main competitors are Tesla, Rivian, L..."
3,"Identify key trends in the market, including c...","[The market is shifting towards EVs, as consum..."
4,Gather information on Canoo's financial perfor...,[Canoo's revenue was $8.2 million in the first...
