In [None]:
!pip install -qU pinecone-client openai langchain unstructured pdf2image tiktoken

In [19]:
import os
from getpass import getpass

from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

Load the data. Save the PDF to Google Drive, mount Google Drive in your Colab environment, and load the file from there.

In [20]:
# Location of the annual-report PDF on the Google Drive mount.
pdf_path = "/content/drive/MyDrive/generative-ai/tesla-ar-2021.pdf"

# Build the loader first, then read the PDF into a list of documents.
loader = UnstructuredPDFLoader(pdf_path)
data = loader.load()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [21]:
# Report how many documents the loader produced ...
doc_count = len(data)
print(f'You have {doc_count} document(s) in your data')

# ... and how long the text of the first one is.
first_doc_length = len(data[0].page_content)
print(f'There are {first_doc_length} charactors in the document')

You have 1 document(s) in your data
There are 455681 charactors in the document


Split the document into smaller chunks.

In [22]:
# Splitter configured with chunk_size=1000 and no overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
)

# Break the loaded document(s) into chunks and report how many were produced.
texts = text_splitter.split_documents(data)
chunk_count = len(texts)
print(f'Now you have {chunk_count} documents')

Now you have 613 documents


In [23]:
# Display the first chunk as a sanity check on the split.
texts[0]

Document(page_content='UNITED STATES SECURITIES AND EXCHANGE COMMISSION Washington, D.C. 20549 FORM 10-K\n\n(Mark One) ☒ ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES\n\nEXCHANGE ACT OF 1934\n\n☐ TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES\n\nEXCHANGE ACT OF 1934\n\nFor the fiscal year ended December 31, 2021 OR\n\nFor the transition period from _________ to _________ Commission File Number: 001-34756\n\nTesla, Inc.\n\n(Exact name of registrant as specified in its charter)\n\nDelaware (State or other jurisdiction of incorporation or organization)\n\n13101 Tesla Road Austin, Texas (Address of principal executive offices)\n\n91-2197729 (I.R.S. Employer Identification No.)\n\n78725 (Zip Code)\n\n(512) 516-8177 (Registrant’s telephone number, including area code)\n\nSecurities registered pursuant to Section 12(b) of the Act:\n\nTitle of each class Common stock\n\nTrading Symbol(s) TSLA\n\nName of each exchange on which registered The Nasdaq Global 

Create embeddings for semantic search

In [25]:
from langchain.vectorstores import Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone
import openai

In [32]:
# Read the OpenAI key from the OPENAI_API_KEY environment variable and prompt for it
# (without echoing) only when the variable is unset or empty, so the secret is never
# stored in the notebook itself.
openai.api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")

# Embedding client used both to index the chunks and to embed the queries.
embeddings = OpenAIEmbeddings(openai_api_key=openai.api_key)

# Name of the Pinecone index that stores the chunk embeddings.
index_name = 'semantic-search-openai'

In [33]:
# Credentials are taken from the environment, with an interactive prompt as fallback,
# so neither value is stored in the notebook. The environment name is shown next to
# the API key in the Pinecone console.
pinecone.init(
    api_key=os.environ.get("PINECONE_API_KEY") or getpass("Pinecone API key: "),
    environment=os.environ.get("PINECONE_ENVIRONMENT") or input("Pinecone environment: "),
)
# Create the index named by `index_name` only if it does not exist yet.
# The dimension must equal the length of the embedding vectors; 1536 is assumed to
# match the OpenAI embedding model behind `embeddings` -- confirm if that model changes.
if index_name not in pinecone.list_indexes():
    pinecone.create_index(index_name, dimension=1536)
# connect to index
index = pinecone.Index(index_name)

Embed all the chunks and save them to the Pinecone index.

In [36]:
# Pull the raw text out of every chunk, then hand the texts and the embedding client
# to the Pinecone vector store, targeting the index named by `index_name`.
chunk_texts = [chunk.page_content for chunk in texts]
docsearch = Pinecone.from_texts(chunk_texts, embeddings, index_name=index_name)

Return the documents most similar to the query (`similarity_search` returns the top 4 by default; pass `k=5` to get five).

In [None]:
# First test question; look up the stored chunks most similar to it.
query_1 = "what was the initial public offering price per share for Tesla?"
docs = docsearch.similarity_search(query_1)

In [None]:
# Display the retrieved chunks to check that they are relevant to the question.
docs

Query the docs to get the answer back

In [42]:
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain
# temperature=0 removes sampling randomness as far as the API allows, so the factual
# answers produced below are repeatable from run to run.
llm = OpenAI(openai_api_key=openai.api_key, temperature=0)
# Question-answering chain over a set of input documents; chain_type="stuff" is
# expected to place all supplied documents into a single prompt (see LangChain docs).
chain = load_qa_chain(llm, chain_type="stuff")

In [43]:
# Retrieve the chunks closest to the question, then pass them together with the
# question to the QA chain; the chain's return value is the cell output.
result_docs = docsearch.similarity_search(query_1)
chain.run(input_documents=result_docs, question=query_1)

' $3.40 per share.'

In [44]:
# Same retrieve-then-answer steps for a question about production sites.
query_2 = "Which factories produce Model X for Tesla?"
result_docs_2 = docsearch.similarity_search(query_2)
chain.run(input_documents=result_docs_2, question=query_2)

' The Fremont Factory produces Model X for Tesla.'

In [46]:
# Retrieve-then-answer for a second question about production sites.
query_3 = "Which factories produce Tesla Roadster?"
result_docs_3 = docsearch.similarity_search(query_3)
chain.run(input_documents=result_docs_3, question=query_3)

' The Tesla Roadster is not produced at any of the factories listed.'

In [47]:
# Retrieve-then-answer for a single-figure financial question.
query_4 = "What was the total revenues for Tesla for 2021? "
result_docs_4 = docsearch.similarity_search(query_4)
chain.run(input_documents=result_docs_4, question=query_4)

' 52,148 million.'

In [49]:
# Retrieve-then-answer for a year-over-year comparison question.
query_5 = "What was the percentage increase in Tesla's revenues in 2021 compared to 2020?"
result_docs_5 = docsearch.similarity_search(query_5)
chain.run(input_documents=result_docs_5, question=query_5)

' 79%'

In [51]:
# Retrieve-then-answer for an open-ended question; the question text itself asks the
# model to format the answer as a list of points.
query_6 = "What are the reasons for increase in Tesla's revenues in 2021 compared to 2020? Please provide the output in point form"
result_docs_6 = docsearch.similarity_search(query_6)
chain.run(input_documents=result_docs_6, question=query_6)

'\n\n- Increase of 433,815 Model 3 and Model Y cash deliveries\n- Increase in cumulative vehicles under direct operating lease program\n- Increase in direct sales-type leasing cost of revenues\n- Increase in used vehicle cost of revenue\n- Increase in costs to support non-warranty maintenance services revenue\n- Increase in costs of retail merchandise\n- Net release of sales return reserve on vehicles sold with resale value guarantees\n- Decrease in combined average Model 3 and Model Y costs per unit'

In [52]:
# Retrieve-then-answer for a question about taxes paid.
query_7 = "How much tax was paid by Tesla for 2021?"
result_docs_7 = docsearch.similarity_search(query_7)
chain.run(input_documents=result_docs_7, question=query_7)

' Tesla does not provide a specific figure for the amount of taxes paid in 2021. The information provided is related to multi-year sales tax exclusions and incentives related to the expansion and ongoing development of electric vehicles and powertrain production in California.'

Finally, delete the Pinecone index (optional).

In [None]:
# Optional cleanup: uncomment the next line to delete the 'semantic-search-openai'
# index once you are done with the notebook.
# pinecone.delete_index('semantic-search-openai')