In [47]:
# pip install openai
# pip install PyPDF2

# pip install faiss-cpu
# pip install sentence_transformers

In [48]:
import PyPDF2
import openai
import pandas as pd

# Reading PDF

In [49]:
pdf_file = open("tesla.pdf",'rb')
pdf_reader = PyPDF2.PdfReader(pdf_file)

## Implementing RAG

#### Creating chunk of each page (not using)

In [50]:
# page_text = []
# for page_num in range(len(pdf_reader.pages)):
#     page_text.append(pdf_reader.pages[page_num].extract_text().lower())
# df = pd.DataFrame({"page":page_text})
# df['page'] = df.page.apply(lambda x: x.replace("\n",""))

#### Creating chunk of 100 words each

In [51]:
page_text = ""
for page_num in range(len(pdf_reader.pages)):
    page_text += (pdf_reader.pages[page_num].extract_text().lower())
page_text = page_text.replace("\n","")
page_text = page_text.split(" ")
page_text = list(filter(None, page_text))

In [52]:
i=0
chunk_list = []
while i < len(page_text):
    chunk = " " 
    for i in range(i,i+100):
        chunk += page_text[i]
        chunk += " "
    chunk_list.append(chunk)
    i += 100
page_text = chunk_list
df = pd.DataFrame({"page":page_text})

In [53]:
len(chunk_list)

312

In [54]:
df['id'] = df.index

#### RAG using FAISS (creating embeddings and storing in vector index)

In [55]:
from sentence_transformers import SentenceTransformer

In [56]:
model = SentenceTransformer(
    "all-MiniLM-L6-v2")
faiss_embedding = model.encode(df.page.values.tolist())

len(faiss_embedding),len(faiss_embedding[0])

(312, 384)

In [57]:
import faiss
import numpy as np

In [58]:
pdf_to_index = df.set_index(['id'],drop=False)

In [59]:
id_index = np.array(pdf_to_index.id.values).flatten().astype('int')

In [60]:
content_encoded_normalized = faiss_embedding.copy()
faiss.normalize_L2(content_encoded_normalized)

In [61]:
index_content = faiss.IndexIDMap(faiss.IndexFlatIP(len(faiss_embedding[0])))
index_content.add_with_ids(content_encoded_normalized,id_index)

##### Search function for Retrieval of nearest chunks/vectors

In [179]:
def search_context(query):
    query_vector = model.encode([query])
    faiss.normalize_L2(query_vector)
    
    top_k = index_content.search(query_vector,3)
    ids = top_k[1][0].tolist()
    similarities = top_k[0][0].tolist()
    results = pdf_to_index.loc[ids]
    results['similarity'] = similarities
    
    return results

##### Defininig the query and fetching the required chunks

In [180]:
# query = "Financial Performance of tesla"
query = "Operation Efficiency of tesla"

res = search_context(query)

In [181]:
text_summarize = " "
for item in res.page:
    text_summarize += item
    text_summarize += " "

##### Using retrieved chunks to generate the required summary using Hugging Face models

In [182]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

In [183]:
model_id = "gpt2"

tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    cache_dir = "")
lm_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    cache_dir = "")

In [184]:
pipe = pipeline(
    "text-generation", model = lm_model, tokenizer = tokenizer, max_new_tokens = 128, device_map = 'auto')

In [185]:
# question = "summarizing Tesla's financial performance"
question = "summarizing Tesla's Operational Efficiency"

In [186]:
prompt_template = f"Relevant context: {text_summarize}\n\n The user's question: {question}"

In [187]:
lm_response = pipe(prompt_template)
print(lm_response[0]["generated_text"])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Relevant context:   and service operations to meet demand; competition and uncertainty regarding the future of electric vehicles or our other products and services; ourquarterly production and sales performance compared with market expectations; and other factors including those over which we have no control. in particular,tesla’s products, business, results of operations, and statements and actions of tesla and its management are subject to significant amounts of commentary by arange of third parties. such attention can include criticism, which may be exaggerated or unfounded, such as speculation regarding the sufficiency or stabilityof our management team. any such negative perceptions, whether caused by us or   substantiallyincrease our production and installation capabilities. if we experience production delays or inaccurately forecast demand, our business, financial condition andoperating results may be harmed.moreover, because of our unique expertise with our vehicles, we recomme

#### Results can be improved by exploring a better model (Chatgpt 4), redefining the chunk size.