In [36]:
import os
from environs import Env

env = Env()
env.read_env(".env") # read .env file, if it exists

import openai
import pinecone
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain.llms import OpenAI
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.chains.question_answering import load_qa_chain
from langchain import HuggingFaceHub
import re
import json
from langchain_openai.chat_models import ChatOpenAI
from langchain.schema import HumanMessage
from langchain.prompts import PromptTemplate, ChatPromptTemplate, HumanMessagePromptTemplate
from langchain.output_parsers import StructuredOutputParser, ResponseSchema

https://ptot.texas.gov/ot-acts-and-rules/

In [2]:
# Function to read documents
def load_docs(directory):
  loader = PyPDFDirectoryLoader(directory)
  documents = loader.load()
  return documents

# Passing the directory to the 'load_docs' function
directory = 'docs'
documents = load_docs(directory)
len(documents)

21

In [3]:
# This function will split the documents into chunks
def split_docs(documents, chunk_size=1000, chunk_overlap=20):
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
  docs = text_splitter.split_documents(documents)
  return docs

docs = split_docs(documents)
print(len(docs))

76


In [4]:
embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# Let's test our Embeddings model for a sample text
query_result = embeddings.embed_query("Hello, World")
len(query_result)

query_result

[-0.05609435588121414,
 0.035542551428079605,
 0.004592898767441511,
 0.023865869268774986,
 -0.049443669617176056,
 -0.15514129400253296,
 0.06592562049627304,
 0.02249128185212612,
 -0.021727310493588448,
 0.014119233936071396,
 0.05505148693919182,
 0.024055711925029755,
 0.0050191013142466545,
 -0.00647747004404664,
 -0.03411562368273735,
 -0.05552099272608757,
 -0.006752687506377697,
 -0.023014012724161148,
 -0.17627856135368347,
 -0.02309204451739788,
 1.4255781934480183e-05,
 0.07931108772754669,
 -0.012627905234694481,
 0.03713007643818855,
 -0.09230007976293564,
 -0.023067831993103027,
 0.06069957837462425,
 0.051330260932445526,
 -0.02947750687599182,
 -0.037245482206344604,
 0.03728852421045303,
 0.05159962177276611,
 0.0963367372751236,
 -0.009374146349728107,
 -0.013310291804373264,
 0.0866255909204483,
 -0.08137482404708862,
 -0.06393185257911682,
 0.0056324009783566,
 0.018668053671717644,
 0.05008462443947792,
 -0.07173816859722137,
 -0.055224593728780746,
 -0.046738833

In [9]:
pinecone.init(
    api_key=os.getenv("PINECONE_API_KEY"),
    environment=os.getenv("PINECONE_ENVIRONMENT")
)

index_name = "mcq-creator"

index = Pinecone.from_documents(docs, embeddings, index_name=index_name)

In [10]:
# This function will help us in fetching the top relevent documents from our vector store - Pinecone
def get_similar_docs(query, k=2):
    similar_docs = index.similarity_search(query, k=k)
    return similar_docs

In [15]:
llm=HuggingFaceHub(repo_id="bigscience/bloom", model_kwargs={"temperature":1e-10})
llm



HuggingFaceHub(client=InferenceAPI(api_url='https://api-inference.huggingface.co/pipeline/text-generation/bigscience/bloom', task='text-generation', options={'wait_for_model': True, 'use_gpu': False}), repo_id='bigscience/bloom', model_kwargs={'temperature': 1e-10})

In [20]:
chain = load_qa_chain(llm, chain_type="stuff")

# This function will help us get the answer to the question that we raise
def get_answer(query):
  relevant_docs = get_similar_docs(query)
  print(relevant_docs)
  response = chain.run(input_documents=relevant_docs, question=query)
  return response

In [39]:
# Let's pass our question to the above created function
our_query = "What is the title of this document?"
answer = get_answer(our_query)
print(answer)

[Document(page_content='Acts 1999, 76th Leg., ch. 388, Sec. 1, eff. Sept. 1, 1999.  Amended by Acts 2003, 78th Leg., ch. 1112, Sec. 3.08, eff. \nSept. 1, 2003.  \nAmended by:  \nActs 2007, 80th Leg., R.S., Ch. 928 (H.B. 3249 ), Sec. 4.07, eff. June 15, 2007.  \nActs 2011, 82nd Leg., R.S., Ch. 1232 (S.B. 652), Sec. 3.09, eff. June 17, 2011.  \nActs 2017, 85th Leg., R.S., Ch. 535 (S.B. 317), Sec. 4.01, eff. September 1, 2017.  \nActs 2023, 88th Leg., R.S., C h. 941  (S.B. 1659 ), Sec. 2.09, eff. June 18 , 2023.  \n \nSec. 454.004.   CONFLICT WITH OTHER LAW.   To the extent of any conflict between this chapter and \nChapter 452, Chapter 452 controls.  \n \nActs 1999, 76th Leg., ch. 388, Sec. 1, eff. Sept. 1, 1999.  \n \nSec. 454.005.   APPLICABILITY.   (a)  This chapter does not apply to a holder of a lice nse issued by another', metadata={'page': 4.0, 'source': 'docs\\OT-Practice-Act-Sept-2023.pdf'}), Document(page_content='Texas Occupational Therapy Practice Act                         

In [40]:
response_schemas = [
    ResponseSchema(name="question", description="Question generated from provided input text data."),
    ResponseSchema(name="choices", description="Available options for a multiple-choice question in comma separated."),
    ResponseSchema(name="answer", description="Correct answer for the asked question.")
]

output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
output_parser

StructuredOutputParser(response_schemas=[ResponseSchema(name='question', description='Question generated from provided input text data.', type='string'), ResponseSchema(name='choices', description='Available options for a multiple-choice question in comma separated.', type='string'), ResponseSchema(name='answer', description='Correct answer for the asked question.', type='string')])

In [41]:
# This helps us fetch the instructions the langchain creates to fetch the response in desired format
format_instructions = output_parser.get_format_instructions()
 
print(format_instructions)

The output should be a markdown code snippet formatted in the following schema, including the leading and trailing "```json" and "```":

```json
{
	"question": string  // Question generated from provided input text data.
	"choices": string  // Available options for a multiple-choice question in comma separated.
	"answer": string  // Correct answer for the asked question.
}
```


In [43]:
# Create ChatGPT object
chat_model = ChatOpenAI()

# The below snippet will give out a string that contains instructions for how the response should be formatted, and we then insert that into our prompt.

prompt = ChatPromptTemplate(
    messages=[
        HumanMessagePromptTemplate.from_template("""When a text input is given by the user, please generate 4 multiple choice questions 
        from it along with the correct answer. 
        \n{format_instructions}\n{user_prompt}""")  
    ],
    input_variables=["user_prompt"],
    partial_variables={"format_instructions": format_instructions}
)
final_query = prompt.format_prompt(user_prompt = answer)
print(final_query)

messages=[HumanMessage(content='When a text input is given by the user, please generate 4 multiple choice questions \n        from it along with the correct answer. \n        \nThe output should be a markdown code snippet formatted in the following schema, including the leading and trailing "```json" and "```":\n\n```json\n{\n\t"question": string  // Question generated from provided input text data.\n\t"choices": string  // Available options for a multiple-choice question in comma separated.\n\t"answer": string  // Correct answer for the asked question.\n}\n```\n The Texas Occupational Therapy Practice Act\nCorrect Answer: The Texas Occupational Therapy Practice Act\n\nA:')]
