In [1]:
!pip install langchain
!pip install openai
!pip install PyPDF2
!pip install faiss-cpu
!pip install tiktoken
!pip install chromadb
!pip install pypdf

Collecting langchain
  Downloading langchain-0.0.225-py3-none-any.whl (1.2 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.3/1.2 MB[0m [31m7.4 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.2/1.2 MB[0m [31m18.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json<0.6.0,>=0.5.7 (from langchain)
  Downloading dataclasses_json-0.5.9-py3-none-any.whl (26 kB)
Collecting langchainplus-sdk<0.0.21,>=0.0.20 (from langchain)
  Downloading langchainplus_sdk-0.0.20-py3-none-any.whl (25 kB)
Collecting openapi-schema-pydantic<2.0,>=1.2 (from langchain)
  Downloading openapi_schema_pydantic-1.2.4-py3-none-any.whl (90 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
from PyPDF2 import PdfReader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI
from langchain.vectorstores import ElasticVectorSearch, Pinecone, Weaviate, FAISS
from google.colab import drive
from typing import List

In [3]:
# Get your API key from OpenAI; you will need to create an account.
# Here is the link to get the keys: https://platform.openai.com/account/billing/overview
import os
from getpass import getpass

# Never paste the key into the notebook itself: reuse a key that is already
# set in the environment, otherwise prompt for it without echoing the input.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [4]:
# Connect your Google Drive (`drive` is already imported in the imports cell).
drive.mount('/content/gdrive', force_remount=True)
# NOTE(review): root_dir spells the folder "My Drive" while the PDF path below
# uses "MyDrive" - confirm which spelling the mount exposes and use one of them.
root_dir = "/content/gdrive/My Drive/"

# Location of the PDF file. If your file lives in another Google Drive folder,
# change "Documents" to that folder's name.
reader = PdfReader('/content/gdrive/MyDrive/Documents/2205.08847.pdf')

# Alternative PDF reader, kept as real comments so it is neither evaluated nor
# echoed as the cell's output:
# from langchain.document_loaders import PyPDFLoader
# loader = PyPDFLoader("your pdf")
# pages = loader.load()

# Show the reader object as the cell's result.
reader

Mounted at /content/gdrive


<PyPDF2._reader.PdfReader at 0x7f64948723e0>

In [5]:
# Extract the text of every page and concatenate the non-empty results into raw_text.
page_texts = (page.extract_text() for page in reader.pages)
raw_text = ''.join(extracted for extracted in page_texts if extracted)

# Preview the first 100 characters of the extracted text.
raw_text[:100]

'GPoeT-2: A GPT-2 Based Poem Generator\nKai-Ling Lo\nSchool of Computer Science\nCarnegie Mellon Univers'

In [6]:
# Break the extracted text into smaller, overlapping chunks so that
# information retrieval does not hit the token size limits.
splitter_settings = {
    "separator": "\n",
    "chunk_size": 1000,
    "chunk_overlap": 200,
    "length_function": len,
}
text_splitter = CharacterTextSplitter(**splitter_settings)
texts = text_splitter.split_text(raw_text)

In [7]:
# Inspect the first chunk produced by the splitter.
texts[0]

'GPoeT-2: A GPT-2 Based Poem Generator\nKai-Ling Lo\nSchool of Computer Science\nCarnegie Mellon University\nPittsburgh, PA 15213\nkailingl@cs.cmu.eduRami Ariss\nCivil and Environmental Engineering\nCarnegie Mellon University\nPittsburgh, PA 15213\nrariss@andrew.cmu.edu\nPhilipp Kurz\nSchool of Computer Science\nCarnegie Mellon University\nPittsburgh, PA 15213\nkurz@cmu.edu\nAbstract\nThis project aims to produce the next volume of machine-generated poetry, a\ncomplex art form that can be structured and unstructured, and carries depth in the\nmeaning between the lines. GPoeT-2 is based on ﬁne-tuning a state of the art\nnatural language model (i.e. GPT-2) to generate limericks, typically humorous\nstructured poems consisting of ﬁve lines with a AABBA rhyming scheme. With a\ntwo-stage generation system utilizing both forward and reverse language modeling,\nGPoeT-2 is capable of freely generating limericks in diverse topics while following'

In [8]:
# Create the OpenAI embeddings object that is handed to the vector store in the next cell.
# NOTE(review): presumably authenticates with the OPENAI_API_KEY set earlier - confirm.
embeddings = OpenAIEmbeddings()

In [9]:
# Build a FAISS vector store from the text chunks using the embeddings object.
docsearch = FAISS.from_texts(texts, embeddings)

100%|██████████| 1/1 [00:01<00:00,  1.17s/it]


### simple chain

In [11]:
# Build a question-answering chain around the OpenAI LLM with the "stuff" chain type.
# NOTE(review): the name `chain` is rebound to the memory-enabled chain later in the notebook.
chain = load_qa_chain(OpenAI(), chain_type="stuff")

In [13]:
# Look up the chunks in the vector store that match the question, then ask the
# chain to answer from those chunks.
query = "Data pre-processing and Encoding steps in format like 1) 2) and so on"
docs = docsearch.similarity_search(query)
# The value returned by run() is the cell's displayed result.
chain.run(input_documents=docs, question=query)

' 1. Insertion of the additional special tokens <BOS> at the beginning of a poem and <LINE> between the lines to help the model better identify the structure of the poems. \n2. Tokenization of poems that follows the GPT-2 format. \n3. For reverse language modeling, reverse the order of tokens of each line in the limericks, while the order of the lines is still maintained.'

### output with memory (memory chain)

In [14]:
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from langchain.memory import ConversationBufferMemory

In [15]:
# Prompt for the memory-enabled chain: the retrieved context comes first,
# followed by the chat history and the new human message.
template = """Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Keep the answer as concise as possible.


{context}

{chat_history}
Human: {human_input}
Chatbot:"""

prompt = PromptTemplate(
    template=template,
    input_variables=["chat_history", "human_input", "context"],
)

# The memory fills the {chat_history} placeholder; "human_input" names the
# input whose value is stored as the human turn.
memory = ConversationBufferMemory(
    input_key="human_input",
    memory_key="chat_history",
)

# Deterministic LLM (temperature 0) wired together with the prompt and memory.
qa_llm = OpenAI(temperature=0)
chain = load_qa_chain(
    qa_llm,
    chain_type="stuff",
    prompt=prompt,
    memory=memory,
)


In [16]:
query = "give me Data pre-processing and Encoding steps in format like 1). 2). and so on"
# Retrieve the chunks for this exact question rather than reusing the `docs`
# that an earlier cell fetched for a differently worded query.
docs = docsearch.similarity_search(query)
chain({"input_documents": docs, "human_input": query}, return_only_outputs=True)

{'output_text': ' 1) Insertion of the additional special tokens <BOS> at the beginning of a poem and <LINE> between the lines to help the model better identify the structure of the poems. 2) Tokenization of poems that follows the GPT-2 format. 3) Reverse the order of tokens of each line in the limericks, while the order of the lines is still maintained.'}

In [17]:
query = "Give me figure 1 title in format Figure 1:'answer here'"
# Retrieve the chunks for this question; reusing `docs` from the
# pre-processing query would hand the chain passages chosen for another topic.
docs = docsearch.similarity_search(query)
chain({"input_documents": docs, "human_input": query}, return_only_outputs=True)

{'output_text': ' Figure 1: Model training and automatic poem generation process for GPoeT-2.'}

In [22]:
query = "Conclusion of the papar in 2 lines and dont be oversmart give me what is written in papar no self writting "
# Retrieve the chunks for this question; reusing `docs` from the
# pre-processing query would hand the chain passages chosen for another topic.
docs = docsearch.similarity_search(query)
chain({"input_documents": docs, "human_input": query}, return_only_outputs=True)

{'output_text': ' The paper concludes that GPoeT-2 is a two-stage generation process that utilizes both forward and reverse language modeling to generate high-quality limerick structures. It also shows that pre-trained language models can be used to generate poems through enforcing a priori constraints.'}

### Checking that the memory captured the conversation

In [19]:
# Ask the chain to recall the first exchange, to check that the conversation
# memory is being used. `docs` is reused from an earlier retrieval cell.
# Note: the triple-quoted literal keeps its trailing newline and indentation,
# so that whitespace is part of the question text passed to the chain.
query = """give me my first query with answer
          """
chain({"input_documents": docs, "human_input": query}, return_only_outputs=True)

{'output_text': ' Give me the Data pre-processing and Encoding steps in format like 1). 2). and so on. \nAnswer: 1) Insertion of the additional special tokens <BOS> at the beginning of a poem and <LINE> between the lines to help the model better identify the structure of the poems. 2) Tokenization of poems that follows the GPT-2 format. 3) Reverse the order of tokens of each line in the limericks, while the order of the lines is still maintained.'}

### Full chat until now

In [20]:
# Print the conversation transcript held in the chain's memory buffer.
print(chain.memory.buffer)

Human: give me Data pre-processing and Encoding steps in format like 1). 2). and so on
AI:  1) Insertion of the additional special tokens <BOS> at the beginning of a poem and <LINE> between the lines to help the model better identify the structure of the poems. 2) Tokenization of poems that follows the GPT-2 format. 3) Reverse the order of tokens of each line in the limericks, while the order of the lines is still maintained.
Human: Give me figure 1 title in format Figure 1:'answer here'
AI:  Figure 1: Model training and automatic poem generation process for GPoeT-2.
Human: Conclusion of the papar in 2 lines 
AI:  The paper concludes that GPoeT-2 is a two-stage generation process that utilizes both forward and reverse language modeling to generate high-quality limerick structures. It also shows that pre-trained language models can be used to generate poems through enforcing a priori constraints.
Human: give me my first query with answer
          
AI:  Give me the Data pre-processing a