In [None]:
!pip install langchain
!pip install openai
!pip install PyPDF2
!pip install faiss-cpu
!pip install tiktoken

In [1]:
from PyPDF2 import PdfReader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS

In [2]:
import os
from getpass import getpass

# Never store the key in the notebook. Reuse OPENAI_API_KEY when the environment
# already provides a non-empty value; otherwise prompt for it without echoing.
# (Unconditionally assigning "" would wipe out a correctly configured key.)
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [3]:
# Open the PDF to be indexed. The path is relative to the notebook's working
# directory; replace 'leave.pdf' with the path to your own document.
pdfreader = PdfReader('leave.pdf')

In [4]:
# NOTE(review): Concatenate is not referenced in any visible cell; this looks
# like a stray auto-import.
from typing_extensions import Concatenate
# Concatenate the extracted text of every page into one string. Pages whose
# extraction yields nothing contribute an empty string.
raw_text = ''.join(page.extract_text() or '' for page in pdfreader.pages)

In [5]:
# Preview only the beginning of the extracted text; echoing the entire
# document floods the notebook output and bloats the saved file.
raw_text[:1000]

'F.R. & S.R. – PART III \n \nCENTRAL CIVIL SERVICES \n \nLEAVE RULES CENTRAL CIVIL SERVICES \n \nLEAVE RULES  \n \nCHAPTER I \n \nPreliminary  \n \n1. Short title and commencement    \n(1) These rules may be called the Central Civil Services (Leave) \nRules, 1972. \n \n (2)  They  shall come into force on the 1\nst day of June, 1972. \n \n2. Extent of application  \n \n Save as otherwise provided in thes e rules, these rules shall apply to \nGovernment servants appointed to the civil services and posts in \nconnection with the affairs of th e Union, but shall not apply to- \n \n(a) Railway servants; (b) persons in casual or daily -rated or part-time employment; \n(c) persons paid from contingencies; \n(d) workmen employed in industrial establishments; (e) persons employed in work-charged establishments; \n(f) members of the All India Services; \n(g) persons locally recruited for se rvice in Diplomatic, Consular or \nother Indian establishments in foreign countries; \n(h) persons employ

In [6]:
# Split the text into smaller chunks with CharacterTextSplitter so that each
# piece stays small and does not inflate the token count sent to the model.
#   separator       - split on newlines
#   chunk_size      - target chunk length of 800, measured by length_function
#   chunk_overlap   - 200 units shared between neighbouring chunks
#   length_function - len, i.e. sizes are counted in characters, not tokens
text_splitter = CharacterTextSplitter(
    separator = "\n",
    chunk_size = 800,
    chunk_overlap  = 200,
    length_function = len,
)
texts = text_splitter.split_text(raw_text)

In [7]:
# Number of text chunks produced by the splitter.
len(texts)

185

In [8]:
# Create the OpenAI embeddings client used to vectorise the text chunks.
# No API key is passed here, so it is presumably taken from the
# OPENAI_API_KEY environment variable.
embeddings = OpenAIEmbeddings()

In [9]:
# Embed every chunk and build a FAISS vector store over them; this store is
# what the query cells search for relevant passages.
document_search = FAISS.from_texts(texts, embeddings)

In [10]:
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI

In [11]:
# Question-answering chain backed by the OpenAI LLM. With chain_type="stuff"
# the supplied documents are placed together into a single prompt.
chain = load_qa_chain(OpenAI(), chain_type="stuff")

In [12]:
# Ask a question about the PDF: retrieve the chunks most similar to the query
# from the FAISS store, then let the QA chain answer from those chunks only.
query = "Regulation of claim to leave"
docs = document_search.similarity_search(query)
chain.run(input_documents=docs, question=query)

' A Government servant’s claim to leave is regulated by the rules in force at the time the leave is applied for and granted.'

In [13]:
# Second question against the same index: fetch the most similar chunks and
# pass them, together with the question, to the QA chain.
query = "Acceptance of service or employment while on leave"
docs = document_search.similarity_search(query)
chain.run(input_documents=docs, question=query)

' A Government servant (other than a Government servant who has been permitted a limited amount of private practice or who has been permitted to undertake casual literary work or service as an examiner or similar employment) while on leave, including leave preparatory to retirement, shall not take up any service or employment elsewhere, including the setting up of a private professional practice as accountant, consultant or legal or medical practitioner, without obtaining the previous sanction of the President, if the proposed services or employment lies elsewhere than in India, or the authority empowered to appoint him, if the proposed service or employment lies in India.'

In [None]:
from langchain.document_loaders import OnlinePDFLoader

In [None]:
# Loader for a PDF hosted online: the arXiv PDF at 1706.03762, which the
# final query cell refers to as "Attention is all you need".
loader = OnlinePDFLoader("https://arxiv.org/pdf/1706.03762.pdf")

In [None]:
!pip install unstructured

In [None]:
# Fetch the PDF from the URL and parse it; presumably returns a list of
# LangChain Document objects (TODO confirm against the loader's docs).
data = loader.load()

In [None]:
# Inspect what the loader returned.
# NOTE(review): this echoes the entire parsed paper into the cell output;
# consider displaying only a short slice before sharing the notebook.
data

In [None]:
# Create the OpenAI embeddings client for the online-PDF section.
# NOTE(review): an identical `embeddings` object is already created earlier
# in the notebook, so this cell looks redundant.
embeddings = OpenAIEmbeddings()

In [31]:
!pip install chromadb

Collecting chromadb
  Downloading chromadb-0.4.6-py3-none-any.whl (405 kB)
     -------------------------------------- 405.5/405.5 kB 1.6 MB/s eta 0:00:00
Collecting tqdm>=4.65.0
  Downloading tqdm-4.66.1-py3-none-any.whl (78 kB)
     ---------------------------------------- 78.3/78.3 kB 2.2 MB/s eta 0:00:00
Collecting pulsar-client>=3.1.0
  Downloading pulsar_client-3.2.0-cp310-cp310-win_amd64.whl (3.4 MB)
     ---------------------------------------- 3.4/3.4 MB 4.4 MB/s eta 0:00:00
Collecting tokenizers>=0.13.2
  Using cached tokenizers-0.13.3-cp310-cp310-win_amd64.whl (3.5 MB)
Collecting onnxruntime>=1.14.1
  Downloading onnxruntime-1.15.1-cp310-cp310-win_amd64.whl (6.7 MB)
     ---------------------------------------- 6.7/6.7 MB 2.5 MB/s eta 0:00:00
Collecting typing-extensions>=4.5.0
  Using cached typing_extensions-4.7.1-py3-none-any.whl (33 kB)
Collecting fastapi<0.100.0,>=0.95.2
  Downloading fastapi-0.99.1-py3-none-any.whl (58 kB)
     ---------------------------------------- 

  error: subprocess-exited-with-error
  
  Building wheel for chroma-hnswlib (pyproject.toml) did not run successfully.
  exit code: 1
  
  [5 lines of output]
  running bdist_wheel
  running build
  running build_ext
  building 'hnswlib' extension
  error: Microsoft Visual C++ 14.0 or greater is required. Get it with "Microsoft C++ Build Tools": https://visualstudio.microsoft.com/visual-cpp-build-tools/
  [end of output]
  
  note: This error originates from a subprocess, and is likely not a problem with pip.
  ERROR: Failed building wheel for chroma-hnswlib
ERROR: Could not build wheels for chroma-hnswlib, which is required to install pyproject.toml-based projects


In [None]:
from langchain.indexes import VectorstoreIndexCreator
# Build a queryable index straight from the online-PDF loader.
# The creator's default vector store is Chroma, which needs the chromadb
# package; that install failed in this environment (no C++ build tools).
# FAISS is already installed and imported at the top of the notebook, so use
# it as the backing store instead. The returned index wrapper is used the same way.
index = VectorstoreIndexCreator(vectorstore_cls=FAISS).from_loaders([loader])

In [None]:
# Ask a free-form question about the loaded paper through the index wrapper,
# which performs retrieval and answering in a single call.
query = "Explain me about Attention is all you need"
index.query(query)