In [16]:
# Setup: this notebook was written against langchain 0.0.164.
# To install or upgrade it: pip install langchain --upgrade
# The `dotenv` module imported below is provided by the python-dotenv package.

import dotenv
import os
# Load variables from a local .env file into the process environment
# (presumably the API keys used later in the notebook -- confirm against your .env).
# Kept as the cell's last expression so its boolean result is displayed.
dotenv.load_dotenv()

True

In [17]:
# PDF Loaders. If unstructured gives you a hard time, try PyPDFLoader
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader, PyPDFLoader

from langchain.text_splitter import RecursiveCharacterTextSplitter
import os

### Load your data

In [18]:
# Choose ONE loader; UnstructuredPDFLoader is the active choice.
# Alternatives (swap in if needed):
#   loader = PyPDFLoader("../data/field-guide-to-data-science.pdf")
#   loader = OnlinePDFLoader("https://wolfpaulus.com/wp-content/uploads/2017/05/field-guide-to-data-science.pdf")
pdf_path = "../data/field-guide-to-data-science.pdf"
loader = UnstructuredPDFLoader(pdf_path)

In [19]:
# Run the selected loader; the resulting documents are used by the cells below.
data = loader.load()

In [20]:
# Summarize what the loader returned.
# Note: PyPDFLoader already splits by page, so `data` may hold many documents.
# The character count is therefore totalled over every document instead of
# reading only data[0], which also raised IndexError when nothing was loaded.
document_count = len(data)
total_characters = sum(len(doc.page_content) for doc in data)
print (f'You have {document_count} document(s) in your data')
print (f'There are {total_characters} characters in your document')

You have 1 document(s) in your data
There are 199527 characters in your document


### Chunk your data up into smaller documents

In [21]:
# Split the loaded document(s) into smaller chunks (chunk_size=2000, no overlap).
# If PyPDFLoader was used, the pages are already split, so this is a second,
# optional split -- experiment with your own data.
splitter_settings = dict(chunk_size=2000, chunk_overlap=0)
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)
texts = text_splitter.split_documents(data)

In [22]:
# Report how many chunks the splitter produced.
chunk_count = len(texts)
print (f'Now you have {chunk_count} documents')

Now you have 139 documents


### Create embeddings of your documents to get ready for semantic search

In [40]:
from langchain.vectorstores import Chroma, Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone

In [24]:
# Check whether there is an environment variable with your API keys; if not, use what you put below
# OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY', 'YourAPIKey')

# PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY', 'YourAPIKey')
# PINECONE_API_ENV = os.environ.get('PINECONE_API_ENV', 'us-east1-gcp') # You may need to switch with your env

In [25]:
# Embedding model handed to the Pinecone vector store below.
# No key is passed explicitly; presumably it is picked up from the
# OPENAI_API_KEY environment variable (e.g. loaded by dotenv) -- confirm.
embeddings = OpenAIEmbeddings()

In [41]:
# Initialize the Pinecone client.
# Credentials come from the environment (e.g. the .env file loaded at the top
# of the notebook) instead of being hardcoded: a key committed in a notebook is
# exposed to everyone who can read the file and should be rotated.
#   PINECONE_API_KEY - required; find it at app.pinecone.io
#   PINECONE_API_ENV - optional; shown next to the API key in the console
pinecone_api_key = os.environ.get('PINECONE_API_KEY')
if not pinecone_api_key:
    raise RuntimeError(
        'PINECONE_API_KEY is not set; add it to your environment or .env file'
    )
pinecone.init(
    api_key=pinecone_api_key,
    environment=os.environ.get('PINECONE_API_ENV', 'us-west4-gcp-free'),
)
index_name = "langchain1" # put in the name of your pinecone index here

In [30]:
# Build the Pinecone-backed vector store from the text of every chunk.
# Quick smoke-test alternative:
#   docsearch = Pinecone.from_texts(['hello'], embeddings, index_name=index_name)
chunk_texts = [chunk.page_content for chunk in texts]
docsearch = Pinecone.from_texts(chunk_texts, embeddings, index_name=index_name)

In [31]:
# Semantic search: ask the vector store for the chunks most similar to the question.
query = "What are examples of good data science teams?"
docs = docsearch.similarity_search(query)

In [32]:
# Preview the first returned document (first 450 characters only).
top_hit = docs[0]
print(top_hit.page_content[:450])

Data Science teams need a broad view of the organization. Leaders must be key advocates who meet with stakeholders to ferret out the hardest challenges, locate the data, connect disparate parts of the business, and gain widespread buy-in.

The Short Version

17 17

AN INTRODUCTION TO DATA SCIENCEIf you haven’t heard of Data Science, you’re behind the times. Just renaming your Business Intelligence group the Data Science group is not the solution.


### Query those docs to get your answer back

In [35]:
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain

In [36]:
# LLM used to phrase the answers; temperature=0 requests the least random output.
llm = OpenAI(temperature=0)
# chain_type="stuff": presumably all retrieved documents are placed into a
# single prompt -- confirm against the LangChain docs.
chain = load_qa_chain(llm, chain_type="stuff")

In [37]:
# New question: re-run the similarity search so `docs` matches this `query`.
query = "What is the collect stage of data maturity?"
docs = docsearch.similarity_search(query)

In [38]:
# Run the QA chain on the retrieved documents; the returned answer string is
# displayed as the cell output.
chain.run(input_documents=docs, question=query)

' The collect stage of data maturity focuses on collecting internal or external datasets. An example of this stage is gathering sales records and corresponding weather data.'

In [39]:
# Full round trip in one cell: set the question, retrieve matching chunks,
# then let the QA chain answer from them (the answer is the cell output).
query = "What are examples of good data science teams?"
docs = docsearch.similarity_search(query)
chain.run(input_documents=docs, question=query)

' Good data science teams have a diverse set of skills, including computer science, mathematics, and domain expertise. They should also have a balance between complex and easy projects, and look for opportunities to combine disparate datasets.'

In [43]:
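# Optional cleanup -- uncomment to remove every vector from the index
# (delete_all=True); the upload cell must be re-run afterwards to repopulate it.
# index = pinecone.Index(index_name)
# index.delete(delete_all=True)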

{}