In [3]:
# Import necessary libraries
import os

import openai
from PyPDF2 import PdfReader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import AzureOpenAI

In [4]:
# Configure the baseline configuration of the OpenAI library for Azure OpenAI Service.
# Each setting is read from an environment variable first, so real credentials never have
# to be saved inside the notebook. When a variable is not set, the placeholder string is
# used instead and must be replaced by hand before the later cells can call the service.
#   AZURE_OPENAI_API_KEY               -> service key
#   AZURE_OPENAI_DEPLOYMENT_NAME       -> deployment name of the text model
#   AZURE_OPENAI_EMBEDDING_DEPLOYMENT  -> deployment name of the embedding model
#   AZURE_OPENAI_ENDPOINT              -> resource endpoint (https://<resource>.openai.azure.com/)
OPENAI_API_KEY = os.environ.get(
    "AZURE_OPENAI_API_KEY", "PLEASE_ENTER_YOUR_OWNED_AOAI_SERVICE_KEY"
)
OPENAI_DEPLOYMENT_NAME = os.environ.get(
    "AZURE_OPENAI_DEPLOYMENT_NAME", "PLEASE_ENTER_YOUR_OWNED_AOAI_TEXT_MODEL_NAME"
)
OPENAI_EMBEDDING_MODEL_NAME = os.environ.get(
    "AZURE_OPENAI_EMBEDDING_DEPLOYMENT", "PLEASE_ENTER_YOUR_OWNED_AOAI_EMBEDDING_MODEL_NAME"
)
MODEL_NAME = "text-davinci-003"
openai.api_type = "azure"
openai.api_base = os.environ.get(
    "AZURE_OPENAI_ENDPOINT",
    "https://PLEASE_ENTER_YOUR_OWNED_AOAI_RESOURCE_NAME.openai.azure.com/",
)
openai.api_version = "2022-12-01"
# Reuse the constant rather than repeating the key literal, so the key handed to the
# openai module and the key handed to LangChain can never drift apart.
openai.api_key = OPENAI_API_KEY

In [5]:
# Open the source PDF; the resulting reader exposes the document's pages.
pdfReader = PdfReader(
    "./data/machine_learning_yearning_by_andrew_ng.pdf"
)

In [6]:
# Extract the text from the PDF file.
# Pages are processed lazily and in document order; pages whose extraction yields nothing
# (empty string or None) are skipped. The pieces are joined once at the end instead of
# growing a string with `+=` on every page.
page_texts = (page.extract_text() for page in pdfReader.pages)
raw_text = ''.join(page_text for page_text in page_texts if page_text)

In [7]:
# Show the first 1000 characters of the extracted text as a quick sanity check.
# Being the last expression in the cell, the slice is what the notebook displays.
raw_text[:1000]

' \n \n \n \n \n \n \nMachine Learning Yearning is a\n \n \ndeeplearning.ai project.\n \n \n \n \n \n \n \n \n \n \n \n© 2018 Andrew Ng. All Rights Reserved.\n \n \n \n \nPage 2\nMachine Learning Yearning-Draft\nAndrew Ng\n  \nTable of Contents\n \n \n1 Why Machine Learning Strategy\n \n2 How to use this book to help your team\n \n3 Prerequisites and Notation\n \n4 Scale drives machine learning progress\n \n5 Your development and test sets\n \n6 Your dev and test sets should come from the same distribution\n \n7 How large do the dev/test sets need to be?\n \n8 Establish a single-number evaluation metric for your team to optimize\n \n9 Optimizing and satisficing metrics\n \n10 Having a dev set and metric speeds up iterations\n \n11 When to change dev/test sets and metrics\n \n12 Takeaways: Setting up development and test sets\n \n13 Build your first system quickly, then iterate\n \n14 Error analysis: Look at dev set examples to evaluate ideas\n \n15 Evaluating multiple ideas in parallel

In [8]:
# Cut the raw text on newline boundaries into chunks targeting 1000 characters, letting
# neighbouring chunks overlap by up to 200 characters; lengths are measured with len().
text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=200, length_function=len)
pdfTexts = text_splitter.split_text(raw_text)

In [9]:
# Show how many text chunks the splitter produced (the length of pdfTexts).
len(pdfTexts)

204

In [10]:
# Create the embedding object backed by the Azure OpenAI embedding deployment; it is the
# component that will convert text chunks into vectors when the index is built.
# NOTE(review): chunk_size=1 presumably limits each embedding request to a single text,
# and client="azure" may be overridden internally by LangChain — confirm against the
# installed LangChain version.
embeddings = OpenAIEmbeddings(
    deployment=OPENAI_EMBEDDING_MODEL_NAME,
    openai_api_key=OPENAI_API_KEY,
    client="azure",
    chunk_size=1,
)

In [11]:
# Embed every text chunk and store the vectors in a FAISS index, which is what makes
# similarity search over the PDF text possible in the question-answering cells.
# https://python.langchain.com/en/latest/modules/indexes/vectorstores/examples/faiss.html
pdfDocSearch = FAISS.from_texts(texts=pdfTexts, embedding=embeddings)

In [12]:
# Build the question-answering chain on top of the Azure OpenAI text deployment.
# chain_type="stuff" is the chain variant requested from load_qa_chain.
# https://docs.langchain.com/docs/components/chains/index_related_chains
chain = load_qa_chain(
    llm=AzureOpenAI(
        deployment_name=OPENAI_DEPLOYMENT_NAME,
        model_name=MODEL_NAME,
        openai_api_key=OPENAI_API_KEY,
    ),
    chain_type="stuff",
)

In [20]:
# First sample question: retrieve the chunks most similar to the question from the
# FAISS index, then let the chain answer using those chunks as context.
inquiry = "Who is the author of this book?"
docs = pdfDocSearch.similarity_search(query=inquiry)
chain.run(question=inquiry, input_documents=docs)

' The author of this book is Andrew Ng.'

In [22]:
# Second sample question: same retrieve-then-answer flow, this time asking for a summary.
inquiry = "Please tell me the key summary of this book."
docs = pdfDocSearch.similarity_search(query=inquiry)
chain.run(question=inquiry, input_documents=docs)

' This book is designed to help the reader become the superhero of their machine learning team. It provides short chapters that can be printed out and shared with teammates to help them understand the technical direction and priorities of the project. It also provides advice on how to define a single-number evaluation metric, how to decide whether to use all available data, how to identify bias, variance, and data mismatch errors, and more.'

In [24]:
# Interactive question: read the question from the user, echo it back, then run the
# same retrieve-then-answer flow against the indexed PDF.
inquiry = input('Please enter search terms: ')
print(f'You entered: {inquiry}')
docs = pdfDocSearch.similarity_search(query=inquiry)
chain.run(question=inquiry, input_documents=docs)

You entered: What is machine learning according to this book?


' According to this book, machine learning is the process of building computer systems that can learn from data, improve with experience, and make predictions about the future.'