In [1]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
from langchain.vectorstores import Chroma, Pinecone
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
import pinecone
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain
from langchain import HuggingFaceHub

  from tqdm.autonotebook import tqdm


# Loading and converting the data into smaller docs

In [2]:
# Location of the National Education Policy 2020 PDF on the author's machine.
PDF_PATH = "/Users/anupam/Documents/Debtanaya docs/National_Education_Policy_2020.pdf"

# Parse the PDF into LangChain Document objects (the captured output shows one
# Document per page, each carrying 'source' and 'page' metadata).
loader = PyPDFLoader(PDF_PATH)
data = loader.load()

# Display the loaded documents as the cell result.
data

[Document(page_content='1 \n \n \nY  \n \n \n \n \nNational  Education   \nPolicy  2020  \n \n \n \nMinistry  of Human  \nResource  Development  \n \nGovernment  of India  \n \n', metadata={'source': '/Users/anupam/Documents/Debtanaya docs/National_Education_Policy_2020.pdf', 'page': 0}),
 Document(page_content='1 \n  \nChapter   Contents  Page  \nNo \n Introduction  3 \n PART   I.  SCHOOL  EDUCATION  \n1  \nEarly  Childhood  Care  and Education:  The Foundation  of Learning   7 \n2 Foundational  Literacy  and Numeracy:  An Urgent  & Necessary  \nPrerequisite  to Learning  8 \n3 Curtailing  Dropout  Rates  and Ensuring  Universal  Access  to Education  at \nAll Levels   10 \n4 Curriculum  and Pedagogy  in Schools:  Learning  Should  be Holistic,  \nIntegrated,  Enjoyable  and Engaging  11 \n5 Teachers  20 \n6 Equitable  and Inclusive  Educa 4tion: Learning  for All 24 \n7 Efficient  Resourcing  and Effective  Governance  through  School  \nComplexes/Clusters  28 \n8 Standard -setting  

In [3]:
# PyPDFLoader has already split the PDF page by page, so `data` holds one entry per page.
page_total = len(data)
print(f'You have {page_total} document(s) in your data')

# Character count of a single sample page (index 30), not of the whole PDF.
sample_page_chars = len(data[30].page_content)
print(f'There are {sample_page_chars} characters in your document')

You have 66 document(s) in your data
There are 4990 characters in your document


In [4]:
# Break the per-page documents into smaller chunks for embedding:
# at most 2000 units per chunk (characters with the splitter's default length
# function, per the LangChain docs) and no overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=0,
)
texts = text_splitter.split_documents(data)

In [5]:
# Report how many chunks the splitter produced from the original pages.
chunk_total = len(texts)
print(f'Now you have {chunk_total} documents')

Now you have 186 documents


In [6]:
# Spot-check one chunk (index 4) to see what the splitter output looks like.
texts[4]

Document(page_content='with multidisciplinary  abilities  across  the scienc es, social  sciences,  and humanities,  will be \nincreasingly  in greater  demand.  With  climate  change,  increasing  pollution,  and depleting  natural  \nresources,  there  will be a sizeable  shift in how we meet  the world ’s energy,  water,  food,  and \nsanitation  needs,  again resulting  in the need  for new skille d labour,  particularly  in biology,  chemistry,  \nphysics,  agriculture,  climate  science,  and social  science.  The growing  emergence  of epidemics  and \npandemics  will also call for collaborative  research  in infectious  diseas e management  and \ndevelopment  of vaccines  and the resultant  social  issues  heightens  the need  for multidisciplinary  \nlearning.  There  will be a growing  demand  for humanities  and art, as India  moves  towards  becoming  a \ndeveloped  country  as well as among  the three largest  economies  in the world.  \nIndeed , with the quickly  changing

# Create embeddings of your documents to get ready for semantic search

In [8]:
# Pinecone credentials come from the environment. Either value is None when its
# variable is unset, and is then passed as-is to pinecone.init further down.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
PINECONE_API_ENV = os.environ.get("PINECONE_ENV")

# Re-export the Hugging Face key under HUGGINGFACEHUB_API_TOKEN (presumably the
# variable the HuggingFaceHub wrapper reads - confirm against the LangChain docs).
# os.environ only accepts str values, so assigning the result of .get() directly
# raises "TypeError: str expected, not NoneType" when HF_KEY is unset. Copy the key
# only when it is present; otherwise leave any already-exported token untouched.
_hf_key = os.environ.get("HF_KEY")
if _hf_key is not None:
    os.environ["HUGGINGFACEHUB_API_TOKEN"] = _hf_key


In [33]:
# Sentence-embedding model used to vectorise both the chunks and the queries.
model_name = "sentence-transformers/sentence-t5-xl"

# Extra normalisation is not requested at encode time. The model repr shown in the
# following cell's output already lists a Normalize() stage in its pipeline.
encode_kwargs = {'normalize_embeddings': False}

# Run the model on the CPU.
model_kwargs = {'device': 'cpu'}

hf_embeddings = HuggingFaceEmbeddings(
    encode_kwargs=encode_kwargs,
    model_kwargs=model_kwargs,
    model_name=model_name,
)


Downloading (…)4fd22/.gitattributes: 1.18kB [00:00, 1.57MB/s]

Downloading (…)_Pooling/config.json: 100%|██████████| 190/190 [00:00<00:00, 50.3kB/s]

Downloading (…)/2_Dense/config.json: 100%|██████████| 116/116 [00:00<00:00, 63.0kB/s]

[A
Downloading pytorch_model.bin: 100%|██████████| 3.15M/3.15M [00:07<00:00, 411kB/s]

Downloading (…)928c44fd22/README.md: 2.00kB [00:00, 2.56MB/s]

Downloading (…)8c44fd22/config.json: 1.39kB [00:00, 2.07MB/s]

Downloading (…)ce_transformers.json: 100%|██████████| 122/122 [00:00<00:00, 68.1kB/s]

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A


In [34]:
hf_embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: T5EncoderModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Dense({'in_features': 1024, 'out_features': 768, 'bias': False, 'activation_function': 'torch.nn.modules.linear.Identity'})
  (3): Normalize()
), model_name='sentence-transformers/sentence-t5-xl', cache_folder=None, model_kwargs={'device': 'cpu'}, encode_kwargs={'normalize_embeddings': False})

In [35]:
# Connect the Pinecone client.
#   api_key     - shown at app.pinecone.io
#   environment - listed next to the API key in the Pinecone console
pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_API_ENV)

# Name of the Pinecone index that will hold the document embeddings;
# replace with the name of your own index.
index_name = "nep-hf"

In [36]:
# Embed every chunk with hf_embeddings and store it in the Pinecone index named
# by index_name, returning a vector store that can be queried.
#
# from_documents is used instead of from_texts over page_content so that each
# chunk's metadata (the loader output shows 'source' and 'page') is stored with
# its vector; passing bare strings discarded it, leaving search hits untraceable
# to a page of the PDF.
#
# NOTE(review): re-running this cell presumably upserts all chunks again under
# fresh ids - confirm before re-executing against an already populated index.
docsearch = Pinecone.from_documents(texts, hf_embeddings, index_name=index_name)

In [19]:
# Semantic-search smoke test: fetch the three chunks closest to the question.
query = "How to curtail dropout rates?"
docs = docsearch.similarity_search(
    query,
    k=3,
)

In [21]:
# Display the Documents returned by the similarity search.
docs

[Document(page_content='National  Education  Policy  2020  \n10 \n health  check -ups especially  for 100%  immunization  in schools  and health  cards  will be issued  to \nmonitor  the same . \n3. Curtailing  Dropout  Rates  and Ensuring  Universal  Access  to Education  at All Levels  \n3.1. One of the primary  goals  of the schooling  system  must  be to ensure  that children  are enrolled  in \nand are attending  school.  Through  initiatives  such as the Sarva  Shiksha  Abhiyan  (now  the Samagra  \nShiksha ) and the Right  to Education  Act, India  has made  remarkable  strides  in recent years  in \nattaining  near-universal  enrolment  in elementary  education.  However,  the data for later grades  \nindicates  some  serious  issues  in retaining  children  in the schooling  system . The GER  for Grades  6-8 \nwas 90.9%,  while  for Grades  9-10 and 11-12 it was only 79.3%  and 56.5%,  respectively  - indicating  \nthat a significant  proportion  of enrolled  students  drop ou

In [20]:
# Preview the first returned document, truncated to its first 450 characters.
first_hit = docs[0]
print(first_hit.page_content[:450])

National  Education  Policy  2020  
10 
 health  check -ups especially  for 100%  immunization  in schools  and health  cards  will be issued  to 
monitor  the same . 
3. Curtailing  Dropout  Rates  and Ensuring  Universal  Access  to Education  at All Levels  
3.1. One of the primary  goals  of the schooling  system  must  be to ensure  that children  are enrolled  in 
and are attending  school.  Through  initiatives  such as the Sarva  Shiksha 


# Query those docs to get your answer back

In [22]:
# from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain

In [37]:
# Hosted LLM used to answer questions, reached through the Hugging Face Hub.
# The temperature is set to a near-zero value (presumably to keep generation as
# deterministic as possible - confirm).
llm = HuggingFaceHub(
    repo_id="google/flan-t5-xl",
    model_kwargs={"temperature": 1e-10},
)

In [38]:
# Build the question-answering chain around the LLM. Per the LangChain docs, the
# "stuff" chain type places all supplied documents into a single prompt.
chain = load_qa_chain(
    llm,
    chain_type="stuff",
)

In [40]:
# End-to-end question answering: retrieve the chunks most similar to the question,
# then let the chain produce an answer from them.
query = "How to have more capable faculty in schools and colleges? Give a detailed answer"
docs = docsearch.similarity_search(query)

# NOTE: in the recorded run this call failed with
# "ValueError: Error raised by inference API: Model google/flan-t5-xl time out".
chain.run(question=query, input_documents=docs)

ValueError: Error raised by inference API: Model google/flan-t5-xl time out