In [None]:
!pip install openai
!pip install --upgrade langchain
!pip install unstructured
!pip install tiktoken
!pip install pinecone-client
!pip install pypdf
!pip install huggingface_hub
!pip install sentence-transformers

In [5]:
import openai
import pinecone
from langchain.document_loaders import PyPDFDirectoryLoader #Load
from langchain.text_splitter import RecursiveCharacterTextSplitter # Transform - Chunking
from langchain.embeddings import OpenAIEmbeddings # Embed from OpenAI
from langchain.embeddings import SentenceTransformerEmbeddings # Embed from Hugging face
from langchain.vectorstores import Chroma #Store
from langchain.vectorstores import Pinecone #Store

  from tqdm.autonotebook import tqdm


# Data Connections
Steps: Load Data -> Transform -> Text Embedding -> Store in Vector Store -> Retrieve

In [7]:
# Load step
def load_pdf(pdf_path):
    """Load documents from ``pdf_path`` with ``PyPDFDirectoryLoader``.

    Args:
        pdf_path: Path passed straight to ``PyPDFDirectoryLoader``.

    Returns:
        The result of calling ``load()`` on that loader.
    """
    return PyPDFDirectoryLoader(pdf_path).load()

# Transform step (chunking)
def split_docs(documents, chunk_size=1000, chunk_overlap=10):
    """Split ``documents`` into chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        documents: Documents handed to the splitter's ``split_documents``.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        Whatever ``split_documents`` returns for ``documents``.
    """
    chunker = RecursiveCharacterTextSplitter(
        chunk_overlap=chunk_overlap,
        chunk_size=chunk_size,
    )
    chunks = chunker.split_documents(documents)
    return chunks

#Text Embedding and Pinecone vector store
def get_store(docs, embedding, index_name="mcq-creator"):
    """Initialise Pinecone and build a vector store from ``docs``.

    Credentials are read from the environment instead of being written into
    the notebook. Any key that was ever committed in source should be treated
    as compromised and rotated.

    Environment variables:
        PINECONE_API_KEY: Required. API key passed to ``pinecone.init``.
        PINECONE_ENVIRONMENT: Optional. Defaults to ``"gcp-starter"``.

    Args:
        docs: Documents passed to ``Pinecone.from_documents``.
        embedding: Embedding object passed to ``Pinecone.from_documents``.
        index_name: Name of the Pinecone index to write to.

    Returns:
        The object returned by ``Pinecone.from_documents``.

    Raises:
        RuntimeError: If ``PINECONE_API_KEY`` is missing or empty.
    """
    # Imported locally so the function works even when no other cell imports os.
    import os

    api_key = os.environ.get("PINECONE_API_KEY")
    if not api_key:
        raise RuntimeError(
            "PINECONE_API_KEY is not set; export it before calling get_store()."
        )

    pinecone.init(
        api_key=api_key,
        environment=os.environ.get("PINECONE_ENVIRONMENT", "gcp-starter"),
    )
    return Pinecone.from_documents(docs, embedding, index_name=index_name)

# Retrieve step: shortlist stored documents similar to the question
def getRelevantDocuments(query, db, k=2):
    """Run a similarity search for ``query`` against ``db``.

    Args:
        query: Text passed to ``db.similarity_search``.
        db: Object exposing ``similarity_search(query, k=...)``.
        k: Forwarded as the ``k`` keyword of the search.

    Returns:
        The value returned by ``db.similarity_search``.
    """
    return db.similarity_search(query, k=k)

In [12]:
from langchain import HuggingFaceHub
from langchain.chains.question_answering import load_qa_chain

# LLM used for answering: the "bigscience/bloom" repo via HuggingFaceHub.
# NOTE(review): temperature 1e-10 is presumably meant to make generation
# effectively deterministic - confirm.
llm = HuggingFaceHub(
    model_kwargs={"temperature": 1e-10},
    repo_id="bigscience/bloom",
)
# An OpenAI LLM could be substituted for `llm` here.
chain = load_qa_chain(chain_type="stuff", llm=llm)

# Load -> chunk -> embed -> store.
documents = load_pdf(pdf_path='sample_data/')
texts = split_docs(documents=documents)
embedding = SentenceTransformerEmbeddings(model_name='all-MiniLM-L6-v2')
db = get_store(docs=texts, embedding=embedding)

Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [34]:
def getFinalAnswer(query):
    """Answer ``query`` using the notebook-level ``db`` and ``chain``.

    Fetches documents for ``query`` through ``getRelevantDocuments`` and hands
    them to ``chain.run`` together with the question.

    Args:
        query: The question to answer.

    Returns:
        The value returned by ``chain.run``.
    """
    context_docs = getRelevantDocuments(query, db)
    return chain.run(input_documents=context_docs, question=query)

In [35]:
# Run the retrieval + QA chain for one question; `response` is reused below
# as the input text for question generation.
response = getFinalAnswer(query="How is India's economy?")
print(response)

 India's economy is a mixed economy, with a large public sector and a growing private sector. The government


# Format the output and generate Q&A

In [15]:
import re
import json
from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage
from langchain.prompts import PromptTemplate, ChatPromptTemplate, HumanMessagePromptTemplate
from langchain.output_parsers import StructuredOutputParser, ResponseSchema

In [20]:
# Fields the model is asked to return for a generated multiple-choice question.
# The descriptions feed the parser's format instructions, which are inserted
# into the prompt, so their wording is part of what the model reads.
response_schemas = [
    ResponseSchema(name="question", description="Multi-choice Question generated from provided input text data"),
    ResponseSchema(name="choices", description="Available options, for a multi-choice question, in comma separated format"),
    ResponseSchema(name="answer", description="Correct answer for the asked question"),
]

# Parser built from the schemas, plus the instruction text it generates.
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
format_instructions = output_parser.get_format_instructions()
# Chat model with library defaults.
chat_model = ChatOpenAI()

In [36]:
# One human message: the task description, then the parser's format
# instructions, then the text to turn into a question. The template literal is
# whitespace-sensitive and is kept exactly as written.
prompt = ChatPromptTemplate(
    partial_variables={"format_instructions": format_instructions},
    input_variables=["user_prompt"],
    messages=[
        HumanMessagePromptTemplate.from_template("""When a text input is given by the user, please generate multiple choice questions
        from it along with the correct answer.
        \n{format_instructions}\n{user_prompt}""")
    ],
)
# Fill in the answer produced earlier and convert to chat messages.
msg = prompt.format_prompt(
    user_prompt=response,
).to_messages()

In [37]:
# Send the rendered chat messages to the chat model and keep its reply.
final_output = chat_model(msg)
# Bare expression so the notebook displays the returned message object.
final_output

AIMessage(content='```json\n{\n\t"question": "What type of economy does India have?",\n\t"choices": "A. Mixed economy, B. Command economy, C. Market economy, D. Traditional economy",\n\t"answer": "A. Mixed economy"\n}\n```', additional_kwargs={}, example=False)

In [38]:
# Extract the JSON object from the Markdown-fenced text the model returned.
markdown_text = final_output.content
# Match from the first "{" to the last "}" and keep the braces, so the result
# is a complete JSON document and nested objects are not cut short.
json_match = re.search(r'\{.*\}', markdown_text, re.DOTALL)
if json_match is None:
    raise ValueError("No JSON object found in the model response")
json_string = json_match.group(0)
# Parse immediately so malformed JSON fails here rather than in a later cell.
mcq_data = json.loads(json_string)
print(json_string)


	"question": "What type of economy does India have?",
	"choices": "A. Mixed economy, B. Command economy, C. Market economy, D. Traditional economy",
	"answer": "A. Mixed economy"

