# Question & Answering Using LangChain


## 1.0 Loading packages


In [8]:
import os
import re
import pickle

from dotenv import load_dotenv
from langchain import FAISS, OpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains.qa_with_sources import load_qa_with_sources_chain

## 2.0 Reading OpenAI API Key from .env file


In [9]:
# Load environment variables from a .env file located two directories above
# the current working directory.
cwd = os.getcwd()
dotenv_filepath = os.path.join(cwd, "../../.env")
load_dotenv(dotenv_filepath)

# Read the OpenAI API key from the environment (None when OPENAI_API_KEY is unset).
# NOTE(review): this variable is not passed to any client in the cells below;
# presumably the LangChain OpenAI classes read OPENAI_API_KEY from the
# environment themselves - confirm.
OPEN_AI_API_KEY = os.environ.get("OPENAI_API_KEY")

## 3.0 Defining a service class to create and load embeddings from external documents


In [10]:
class EmbeddingsService:
    """Create, cache and load vector-store embeddings for raw text documents.

    Raw documents are read from ``FILE_PATH_ASSETS + "/raw"``.  For each
    document a ``[docsearch, texts]`` pair is pickled to
    ``EMBEDDINGS_PATH + "/<document_name>.embedding.pkl"`` so that the
    embeddings only have to be computed once.
    """

    CURRENT_DIRECTORY = "."
    FILE_PATH_ASSETS = "../../data/qna_doc_retrieval"
    EMBEDDINGS_PATH = FILE_PATH_ASSETS + "/embeddings"

    @staticmethod
    def get_embedded_file_path(document_name):
        """Return the path of the pickled embeddings for ``document_name``."""
        return (
            EmbeddingsService.EMBEDDINGS_PATH + "/" + document_name + ".embedding.pkl"
        )

    @staticmethod
    def get_raw_file_path(document_name):
        """Return the path of the raw text file for ``document_name``."""
        return EmbeddingsService.FILE_PATH_ASSETS + "/raw/" + document_name

    @staticmethod
    def create_embeddings(file_name):
        """Create embeddings for one document, or for every raw document.

        Args:
            file_name: Name of a file inside the raw folder, or the literal
                string ``"ALL"`` to process every regular, non-hidden file
                found there.
        """
        if file_name != "ALL":
            # Previously any value other than "ALL" was silently ignored.
            EmbeddingsService.create_embeddings_for_file(file_name)
            return

        raw_directory = EmbeddingsService.FILE_PATH_ASSETS + "/raw"
        # Sorted so the processing order does not depend on the file system.
        for file in sorted(os.listdir(raw_directory)):
            # Skip sub-directories and dotfiles (.DS_Store, .gitkeep, ...):
            # they are not documents and cannot be read as text.
            if file.startswith("."):
                continue
            if not os.path.isfile(os.path.join(raw_directory, file)):
                continue
            EmbeddingsService.create_embeddings_for_file(file)

    @staticmethod
    def create_embeddings_for_file(file_name):
        """Create and save embeddings for ``file_name`` unless already cached.

        The cache is the file returned by ``get_embedded_file_path``; when it
        exists the document is skipped and nothing is recomputed.
        """
        embedded_file_path = EmbeddingsService.get_embedded_file_path(file_name)
        if os.path.exists(embedded_file_path):
            print("Embeddings file already exists.  Skipping..." + embedded_file_path)
            return

        raw_file_path = EmbeddingsService.get_raw_file_path(file_name)
        print(
            "Creating embeddings for file: "
            + raw_file_path
            + " and saving to: "
            + embedded_file_path
        )
        EmbeddingsService.create_embeddings_and_save(raw_file_path, embedded_file_path)
        print("Embeddings created successfully for: " + embedded_file_path)

    @staticmethod
    def create_embeddings_and_save(raw_file_path, embedded_file_path):
        """Split a raw text file into chunks, embed them and pickle the result.

        Args:
            raw_file_path: Path of the text file to embed.
            embedded_file_path: Destination of the pickle, which holds the
                list ``[docsearch, texts]``.
        """
        print("Creating embeddings...")
        # Explicit encoding so the result does not depend on the platform default.
        with open(raw_file_path, encoding="utf-8") as f:
            file_to_split = f.read()

        # Make sure the destination folder exists before doing the embedding
        # work, so a missing folder fails fast instead of after the API calls.
        output_directory = os.path.dirname(embedded_file_path)
        if output_directory:
            os.makedirs(output_directory, exist_ok=True)

        text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
        texts = text_splitter.split_text(file_to_split)
        embeddings = OpenAIEmbeddings()

        # Vector store.  Object which stores the embeddings and allows for fast retrieval.
        # Each chunk's "source" metadata is its index in ``texts``.
        docsearch = FAISS.from_texts(
            texts, embeddings, metadatas=[{"source": i} for i in range(len(texts))]
        )

        # save to pickle
        with open(embedded_file_path, "wb") as f:
            pickle.dump([docsearch, texts], f)

    @staticmethod
    def load_embeddings(document_name):
        """Load the pickled embeddings of ``document_name``.

        Returns:
            dict with keys ``"docsearch"`` and ``"texts"``, the two objects
            stored by ``create_embeddings_and_save``.

        Raises:
            FileNotFoundError: if no embeddings file exists for the document.
        """
        embedded_file_path = EmbeddingsService.get_embedded_file_path(document_name)
        if not os.path.exists(embedded_file_path):
            raise FileNotFoundError(
                "Embeddings file does not exist.  Please create embeddings file first."
            )

        print("Loading embeddings from file...")
        # SECURITY: pickle.load can execute arbitrary code; only load embedding
        # files that were produced locally by this class.
        with open(embedded_file_path, "rb") as f:
            docsearch, texts = pickle.load(f)
        return {"docsearch": docsearch, "texts": texts}

## 4.0 Defining a class that finds the document chunks most similar to a query (question) and returns an answer


In [11]:
class AnswerRetriever:
    """Answer a question about a document using its previously built embeddings."""

    def get_answer(self, embeddings, query):
        """Run a QA-with-sources chain for ``query`` and return answer plus sources.

        Args:
            embeddings: dict with keys ``"docsearch"`` (an object exposing
                ``similarity_search(query)``) and ``"texts"`` (the list of
                text chunks that the cited source indexes refer to).
            query: The question to ask.

        Returns:
            dict with ``"answer"`` (the chain output with the ``SOURCES:``
            section removed) and ``"sources"`` (the chunks of ``texts`` whose
            indexes were cited, in citation order, without duplicates).
        """
        texts = embeddings["texts"]
        docsearch = embeddings["docsearch"]
        docs = docsearch.similarity_search(query)
        chain = load_qa_with_sources_chain(OpenAI(temperature=0), chain_type="stuff")
        answer = chain(
            {"input_documents": docs, "question": query}, return_only_outputs=True
        )
        answer_str, sources_list = AnswerRetriever._split_answer_and_sources(
            answer["output_text"], texts
        )
        return {"answer": answer_str, "sources": sources_list}

    @staticmethod
    def _split_answer_and_sources(output_text, texts):
        """Split raw chain output into the answer text and the cited chunks."""
        # Everything before the marker is the answer; when the marker is
        # missing the whole output is the answer and there are no sources.
        answer_str, marker, sources_part = output_text.partition("\nSOURCES:")
        if not marker:
            # Without a SOURCES section, digits in the answer itself (e.g.
            # "50%" or "2022") must not be mistaken for chunk indexes.
            return answer_str, []

        cited_indexes = [int(number) for number in re.findall(r"\d+", sources_part)]
        # dict.fromkeys drops repeated citations while keeping their order;
        # the range check guards against indexes the model made up.
        sources_list = [
            texts[idx] for idx in dict.fromkeys(cited_indexes) if idx < len(texts)
        ]
        return answer_str, sources_list

## 5.0 Create embeddings from documents


In [12]:
# Build an embeddings pickle for every file in the raw-documents folder;
# documents whose pickle already exists are skipped (see the printed messages).
EmbeddingsService.create_embeddings("ALL")

Embeddings file already exists.  Skipping...../../data/qna_doc_retrieval/embeddings/nvidia_q4_2023_earnings_call_transcript.txt.embedding.pkl
Embeddings file already exists.  Skipping...../../data/qna_doc_retrieval/embeddings/tsla_earnings_transcript_q4_2022.txt.embedding.pkl


## 6.0 Load embeddings from documents


In [13]:
# Pick the document to query and load its pickled embeddings.
# `embeddings` is a dict with the keys "docsearch" and "texts".
document = "tsla_earnings_transcript_q4_2022.txt"
embeddings = EmbeddingsService.load_embeddings(document)

Loading embeddings from file...


## 7.0 Question & Answering


In [15]:
query = "what are the main points from the call?"
results = AnswerRetriever().get_answer(embeddings, query)

# Show the model's answer first, then every document chunk it cited.
print(f"AI: {results['answer']}", "\n")
print("----------", "\n")
print("Sources from document: ", "\n")
for idx, chunk in enumerate(results["sources"]):
    print(f"Source {idx}:")
    print(chunk, "\n")

AI:  The main points from the call include customer interest in Tesla's products remaining high, plans to rapidly increase volume while improving overhead efficiency, and the need to redesign the supply chain to make it more efficient.  

---------- 

Sources from document:  

Source 0:
These improvements include our continued work to gradually move toward a regionally balanced build of vehicles. The energy business had its strongest year yet across all metrics, led by steady improvement in both retail and commercial storage. While much work remains to grow this business and improve costs, we believe we are on a good trajectory. As we look toward 2023, we are moving forward aggressively leveraging our strength and cost.

There are three key points I wanted to make here. First, on demand, as Elon mentioned, customer interest in our products remains high. Second, on cost reduction, we're holding steady on our plans to rapidly increase volume while improving overhead efficiency, which is 

In [16]:
query2 = "Pls summarize or extract out the numbers from the earning call report"
results2 = AnswerRetriever().get_answer(embeddings, query2)

# Show the model's answer first, then every document chunk it cited.
print(f"AI: {results2['answer']}", "\n")
print("----------", "\n")
print("Sources from document: ", "\n")
for idx, chunk in enumerate(results2["sources"]):
    print(f"Source {idx}:")
    print(chunk, "\n")

AI:  In Q4 2022, Tesla reported revenue increased over 50%, operating income doubled, free cash flows increased over 50%, and their margins remained industry-leading. Additionally, non-GAAP opex as a percentage of revenue improved further. 

---------- 

Sources from document:  

Source 0:
Tesla (TSLA 10.97%)
Q4 2022 Earnings Call
Jan 25, 2023, 5:30 p.m. ET

Contents:
Prepared Remarks
Questions and Answers
Call Participants
Prepared Remarks:

Martin Viecha

Good afternoon, everyone, and welcome to Tesla's fourth quarter 2022 Q&A webcast. My name is Martin Viecha, VP of investor relations, and I'm joined today by Elon Musk, Zachary Kirkhorn and a number of other executives. Our Q4 results were announced at about 3:00 p.m. Central Time in the update deck we published at the same link as this webcast.

During this call, we will discuss our business outlook and make forward-looking statements. These comments are based on our predictions and expectations as of today. Actual events or result