In [18]:
import os
import pickle
from decouple import config, AutoConfig

from langchain import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter

# from langchain.embeddings import HuggingFaceEmbeddings


# from app.config.config import OPEN_AI_API_KEY

# NOTE: this rebinds `config`, shadowing the `config` object imported from decouple above.
# NOTE(review): search_path is given the '.env' file name; presumably AutoConfig expects the
# directory to start searching from -- confirm against the python-decouple documentation.
config = AutoConfig(search_path='.env')

# OpenAI API key read through decouple. EmbeddingsService copies it into the
# OPENAI_API_KEY environment variable before creating or loading embeddings.
OPEN_AI_API_KEY = config('OPEN_AI_API_KEY')

class EmbeddingsService:
    """Create, cache and load vector stores for raw text documents.

    Raw documents are read from ``FILE_PATH_ASSETS + '/raw'``. Each document is
    split into chunks, embedded with ``OpenAIEmbeddings`` into a FAISS vector
    store, and pickled together with its chunk list as
    ``EMBEDDINGS_PATH + '/' + <document_name> + '.embedding.pkl'``.
    """

    # Module-relative layout, kept for reference:
    # CURRENT_DIRECTORY = os.path.dirname(__file__)
    # FILE_PATH_ASSETS = os.path.join(CURRENT_DIRECTORY, "../assets")
    # EMBEDDINGS_PATH = FILE_PATH_ASSETS + '/embeddings/'

    # Paths are relative to the current working directory.
    CURRENT_DIRECTORY = '.'
    FILE_PATH_ASSETS = './assets'
    EMBEDDINGS_PATH = FILE_PATH_ASSETS + '/embeddings'

    @staticmethod
    def get_embedded_file_path(document_name):
        """Return the path of the pickle that caches embeddings for ``document_name``."""
        return EmbeddingsService.EMBEDDINGS_PATH + '/' + document_name + '.embedding.pkl'

    @staticmethod
    def get_raw_file_path(document_name):
        """Return the path of the raw text file named ``document_name``."""
        return EmbeddingsService.FILE_PATH_ASSETS + '/raw/' + document_name

    @staticmethod
    def create_embeddings(file_name):
        """Create embeddings for one raw document, or for all of them.

        Args:
            file_name: Name of a file inside the raw directory, or the literal
                ``'ALL'`` to process every regular file found there.
        """
        if file_name != 'ALL':
            # A specific document was requested (previously this was a silent no-op).
            EmbeddingsService.create_embeddings_for_file(file_name)
            return

        raw_directory = EmbeddingsService.FILE_PATH_ASSETS + '/raw'
        # Sorted so that the processing order does not depend on the file system.
        for entry in sorted(os.listdir(raw_directory)):
            # Sub-directories cannot be opened as text documents; skip them.
            if not os.path.isfile(EmbeddingsService.get_raw_file_path(entry)):
                continue
            EmbeddingsService.create_embeddings_for_file(entry)

    @staticmethod
    def create_embeddings_for_file(file_name):
        """Create the embeddings pickle for ``file_name`` unless it already exists.

        Args:
            file_name: Name of a file inside the raw directory.
        """
        embedded_file_path = EmbeddingsService.get_embedded_file_path(file_name)
        # An existing pickle is treated as a valid cache entry and left untouched.
        if os.path.exists(embedded_file_path):
            print('Embeddings file already exists.  Skipping...' + embedded_file_path)
            return

        raw_file_path = EmbeddingsService.get_raw_file_path(file_name)
        print('Creating embeddings for file: ' + raw_file_path + ' and saving to: ' + embedded_file_path)
        EmbeddingsService.create_embeddings_and_save(raw_file_path, embedded_file_path)
        print('Embeddings created successfully for: ' + embedded_file_path)

    @staticmethod
    def create_embeddings_and_save(raw_file_path, embedded_file_path):
        """Split a raw text file, embed the chunks and pickle the result.

        The pickle contains the two-element list ``[docsearch, texts]``: the
        FAISS vector store and the list of text chunks it was built from.

        Args:
            raw_file_path: Path of the text file to read.
            embedded_file_path: Path of the pickle to write. Missing parent
                directories are created.
        """
        print('Creating embeddings...')
        os.environ["OPENAI_API_KEY"] = OPEN_AI_API_KEY
        with open(raw_file_path) as f:
            file_to_split = f.read()
        text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
        texts = text_splitter.split_text(file_to_split)
        embeddings = OpenAIEmbeddings()
        # embeddings = HuggingFaceEmbeddings()

        # Vector store.  Object which stores the embeddings and allows for fast retrieval.
        # Each chunk's "source" metadata is its index in `texts`.
        docsearch = FAISS.from_texts(texts, embeddings, metadatas=[{"source": i} for i in range(len(texts))])

        output_directory = os.path.dirname(embedded_file_path)
        if output_directory:
            os.makedirs(output_directory, exist_ok=True)

        # Write to a temporary file and rename it into place, so that a failed dump
        # never leaves a partial pickle that later runs would skip as "already exists".
        temporary_path = embedded_file_path + '.tmp'
        try:
            with open(temporary_path, 'wb') as f:
                pickle.dump([docsearch, texts], f)
            os.replace(temporary_path, embedded_file_path)
        except BaseException:
            if os.path.exists(temporary_path):
                os.remove(temporary_path)
            raise

    @staticmethod
    def load_embeddings(document_name):
        """Load the cached embeddings of ``document_name``.

        Args:
            document_name: Name of the raw document whose pickle should be loaded.

        Returns:
            dict: ``{'docsearch': <vector store>, 'texts': <list of chunks>}``.

        Raises:
            FileNotFoundError: If no embeddings pickle exists for the document.
        """
        os.environ["OPENAI_API_KEY"] = OPEN_AI_API_KEY
        embedded_file_path = EmbeddingsService.get_embedded_file_path(document_name)
        if not os.path.exists(embedded_file_path):
            raise FileNotFoundError('Embeddings file does not exist.  Please create embeddings file first.')

        print('Loading embeddings from file...')
        # SECURITY: pickle.load can execute arbitrary code; only load pickles that
        # were produced by create_embeddings_and_save from a trusted location.
        with open(embedded_file_path, 'rb') as f:
            docsearch, texts = pickle.load(f)
        return {'docsearch': docsearch, 'texts': texts}

In [19]:
import re

from langchain import OpenAI
from langchain.chains.qa_with_sources import load_qa_with_sources_chain


class AnswerRetriever:
    """Answer a question about a document using its stored embeddings."""

    # Label that introduces the cited sources in the chain's output text.
    _SOURCES_MARKER = "SOURCES:"

    def get_answer(self, embeddings, query):
        """Run a QA-with-sources chain over the chunks most similar to ``query``.

        Args:
            embeddings: Dict with ``'docsearch'`` (object exposing
                ``similarity_search``) and ``'texts'`` (list of text chunks),
                as returned by ``EmbeddingsService.load_embeddings``.
            query: The question to answer.

        Returns:
            dict: ``{'answer': <text before the sources section>,
            'sources': <list of cited chunks from ``texts``>}``. ``sources`` is
            empty when the output has no ``SOURCES:`` section.
        """
        texts = embeddings['texts']
        docsearch = embeddings['docsearch']
        docs = docsearch.similarity_search(query)
        chain = load_qa_with_sources_chain(OpenAI(temperature=0), chain_type="stuff")
        answer = chain({"input_documents": docs, "question": query}, return_only_outputs=True)
        output_text = answer['output_text']

        # remove sources from answer
        answer_str = output_text.split("\nSOURCES:")[0]

        cited = AnswerRetriever._cited_indexes(output_text, len(texts))
        return {
            'answer': answer_str,
            'sources': [texts[idx] for idx in cited],
        }

    @staticmethod
    def _cited_indexes(output_text, chunk_count):
        """Return the valid chunk indexes listed after the last ``SOURCES:`` label.

        Digits are only read from the sources section, so numbers inside the
        answer itself (years, percentages, ...) are never mistaken for chunk
        indexes. Indexes that do not address one of the ``chunk_count`` chunks
        are dropped instead of raising ``IndexError`` in the caller.
        """
        marker = AnswerRetriever._SOURCES_MARKER
        marker_position = output_text.rfind(marker)
        if marker_position == -1:
            return []
        cited_section = output_text[marker_position + len(marker):]
        # Assumes sources are cited by integer chunk index (the "source" metadata) -- TODO confirm.
        candidates = (int(token) for token in re.findall(r'\d+', cited_section))
        return [idx for idx in candidates if idx < chunk_count]

In [12]:
# Build embeddings for every entry in ./assets/raw; documents that already have a
# cached .embedding.pkl file are skipped.
EmbeddingsService.create_embeddings("ALL")

Creating embeddings for file: ./assets/raw/tsla_earnings_transcript_q4_2022.txt and saving to: ./assets/embeddings/tsla_earnings_transcript_q4_2022.txt.embedding.pkl
Creating embeddings...
Embeddings created successfully for: ./assets/embeddings/tsla_earnings_transcript_q4_2022.txt.embedding.pkl


In [14]:
# Raw document to query; its embeddings pickle must already exist, otherwise
# load_embeddings raises.
document = 'tsla_earnings_transcript_q4_2022.txt'
# Dict holding the unpickled vector store ('docsearch') and its chunk list ('texts').
embeddings = EmbeddingsService.load_embeddings(document)

Loading embeddings from file...


In [15]:
# Question to ask about the loaded document.
query = "what are the main points from the call?"

In [16]:
# Runs a similarity search for the query and passes the matching chunks to the
# QA-with-sources chain; returns a dict with 'answer' and 'sources'.
results = AnswerRetriever().get_answer(embeddings, query)

In [17]:
# Display the answer together with the source chunks it cites.
results

{'answer': " The main points from the call include customer interest in Tesla's products remaining high, plans to rapidly increase volume while improving overhead efficiency, and the need to redesign the supply chain to make it more efficient. ",
 'sources': ["These improvements include our continued work to gradually move toward a regionally balanced build of vehicles. The energy business had its strongest year yet across all metrics, led by steady improvement in both retail and commercial storage. While much work remains to grow this business and improve costs, we believe we are on a good trajectory. As we look toward 2023, we are moving forward aggressively leveraging our strength and cost.\n\nThere are three key points I wanted to make here. First, on demand, as Elon mentioned, customer interest in our products remains high. Second, on cost reduction, we're holding steady on our plans to rapidly increase volume while improving overhead efficiency, which is the most effective method