In [90]:
import sqlite3


In [104]:
# Read every stored arXiv link from the local SQLite database.
# NOTE: `with sqlite3.connect(...)` only wraps a transaction (commit/rollback);
# it does not close the connection, so it is closed explicitly in `finally`.
conn = sqlite3.connect("papers.db")
try:
    c = conn.cursor()
    c.execute("Select arxiv_link from papers")

    # Each row is a 1-tuple holding the selected `arxiv_link` value.
    all_rows = c.fetchall()
finally:
    conn.close()

# Report what was fetched, or that the query returned nothing.
if all_rows:
    print("Fetched records:")
    for row in all_rows:
        print(row)
else:
    print("No records found.")

Fetched records:
('https://arxiv.org/pdf/2402.13616v2.pdf',)
('https://arxiv.org/pdf/2402.15151v1.pdf',)
('https://arxiv.org/pdf/2402.03099v1.pdf',)
('https://arxiv.org/pdf/2402.07939v3.pdf',)
('https://arxiv.org/pdf/2402.13144v1.pdf',)
('https://arxiv.org/pdf/2212.09748v2.pdf',)
('https://arxiv.org/pdf/2205.05982v1.pdf',)
('https://arxiv.org/pdf/2402.14652v1.pdf',)
('https://arxiv.org/pdf/2402.04845v1.pdf',)
('https://arxiv.org/pdf/2402.08268v1.pdf',)


In [1]:
from langchain_community.document_loaders import PyPDFLoader
from get_data import get_paper_list

# Load the list of paper links via the project-local `get_data` helper.
# Judging by the `links[0]` output in the next cell, each entry is an arXiv PDF URL string.
links = get_paper_list()


In [42]:
# Peek at the first link to sanity-check the list.
links[0]

'https://arxiv.org/pdf/2402.13616v2.pdf'

In [None]:
## TODO: create a system to save each paper under its name (the sketch below is a generic file-download example)
# import requests

# URL = "https://td-cdn.pw/api.php?download=tikdown.org-42500282235.mp4"
# FILE_TO_SAVE_AS = "myvideo.mp4" # the name you want to save file as


# resp = requests.get(URL) # making requests to server

# with open(FILE_TO_SAVE_AS, "wb") as f: # opening a file handler to create new file 
#     f.write(resp.content) # writing content to file


In [22]:
# Fetch and parse every paper, accumulating the chunks returned by
# `load_and_split()` into a single flat list.
docs = []
for link in links:
    docs += PyPDFLoader(link).load_and_split()

In [26]:
# Total number of chunks produced by `load_and_split()` across all papers.
len(docs)

268

# RAG starts here (the loaded `docs` are the input)

## Estimated cost of embedding

In [24]:
import tiktoken

# Count the tokens in each chunk using the cl100k_base encoding.
encoder = tiktoken.get_encoding('cl100k_base')
tokens_per_docs = [len(encoder.encode(chunk.page_content)) for chunk in docs]

# Estimated cost = (total tokens / 1000) * price per 1000 tokens.
# NOTE(review): 0.0001 is assumed to be the embedding model's price per
# 1000 tokens — confirm against current pricing.
cost_per_1000_tokens = 0.0001
total_tokens = sum(tokens_per_docs)
cost = (total_tokens / 1000) * cost_per_1000_tokens
cost


0.017815

In [38]:
# from langchain.retrievers import ParentDocumentRetriever
# from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

In [105]:
# Embed the chunks and store the resulting vectors in Chroma.
# Supplying a persist directory stores the embeddings on disk.
# NOTE: the misspelled variable name is kept because a later cell reads it.
persist_direcory = 'data/vectordb'

embeddings = OpenAIEmbeddings()
vectordb = Chroma.from_documents(
    documents=docs,
    embedding=embeddings,
    persist_directory=persist_direcory,
)


In [106]:
# Persist DB to disk
# NOTE(review): newer Chroma releases may persist automatically and deprecate
# `persist()` — confirm for the installed version.
vectordb.persist()
# Drop the in-memory handle; the store is re-created from disk in the following cell.
vectordb = None

In [51]:
# The store can now be reopened from the persist directory and queried,
# reusing the same embedding function that was used to build it.
vectordb = Chroma(embedding_function=embeddings, persist_directory=persist_direcory)


In [89]:
# Smoke-test the reloaded store with a direct similarity search.
vectordb.similarity_search(query="Large Language Models")

[Document(page_content='pages 358–373.\nJordan Hoffmann, Sebastian Borgeaud, Arthur Mensch,\nElena Buchatskaya, Trevor Cai, Eliza Rutherford,\nDiego de Las Casas, Lisa Anne Hendricks, Johannes\nWelbl, Aidan Clark, et al. 2022. Training compute-\noptimal large language models. ArXiv preprint .\nChristian Kohlschütter, Peter Fankhauser, and Wolfgang\nNejdl. 2010. Boilerplate detection using shallow text\nfeatures. In Proceedings of the Third International\nConference on Web Search and Web Data Mining,\nWSDM 2010, New York, NY, USA, February 4-6,\n2010 , pages 441–450.\nJunlong Li, Yiheng Xu, Lei Cui, and Furu Wei. 2021.\nMarkuplm: Pre-training of text and markup language\nfor visually-rich document understanding. ArXiv\npreprint .\nStephen Merity, Caiming Xiong, James Bradbury, and\nRichard Socher. 2017. Pointer sentinel mixture mod-\nels. In Proceedings of ICLR .', metadata={'page': 4, 'source': 'https://arxiv.org/pdf/2402.14652v1.pdf'}),
 Document(page_content='Pengxiang Jin, Shenglin 

## Make a retriever

In [54]:
# Wrap the vector store as a retriever; `k=3` asks for three documents per query.
retriever = vectordb.as_retriever(search_kwargs={"k": 3})

In [56]:
# Check which search strategy the retriever is configured with.
retriever.search_type

'similarity'

In [55]:
# Keep the retrieved chunks under their own name: reusing `docs` here would
# overwrite the full corpus loaded earlier, so re-running the embedding cell
# afterwards would embed only these few results.
# `invoke` is the Runnable entry point (already used for the QA chain below).
retrieved_docs = retriever.invoke("Large Language Models")
len(retrieved_docs)


3

## Make a chain

In [62]:
from langchain.chains import RetrievalQAWithSourcesChain
# from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
# from langchain.chains.combine_documents import create_stuff_documents_chain


# Build a question-answering chain over the retriever; its response carries
# both an 'answer' and the 'sources' it drew on.
chat_model = ChatOpenAI()
qa_chain = RetrievalQAWithSourcesChain.from_chain_type(
    llm=chat_model,
    chain_type='stuff',
    retriever=retriever,
)

In [85]:
## Cite sources

def process_llm_resoonse(llm_response: dict) -> None:
    """Print the chain's answer followed by one cited source per line.

    Args:
        llm_response: Mapping with an 'answer' string and a 'sources' string
            holding comma-separated source identifiers.

    The sources string is split on commas. Surrounding whitespace is stripped
    from each entry and empty entries are skipped, so "a, b" prints "a" and
    "b" rather than "a" and " b", and an empty sources string prints nothing
    after the header.
    """
    print(llm_response['answer'])
    print('\n\nSources:')
    for source in llm_response['sources'].split(','):
        source = source.strip()
        if source:
            print(source)


# Correctly spelled alias; the original misspelled name is kept for existing callers.
process_llm_response = process_llm_resoonse


# Ask the chain a question over the indexed papers and print the answer
# together with the sources it reports.
query = "What do these documents say about large language Models"
llm_response = qa_chain.invoke(query)
process_llm_resoonse(llm_response)



These documents discuss training compute-optimal large language models, improving outage understanding with large language models, and using large language models for tasks such as following instructions with human feedback, character role-playing, and more. 



Sources:
https://arxiv.org/pdf/2402.14652v1.pdf
 https://arxiv.org/pdf/2402.07939v3.pdf
 https://arxiv.org/pdf/2402.08268v1.pdf


In [84]:
# Inspect the raw split result: entries after the first keep the space that
# follows each comma in the 'sources' string.
print(llm_response['sources'].split(','))

['https://arxiv.org/pdf/2402.07939v3.pdf', ' https://arxiv.org/pdf/2402.08268v1.pdf']


In [None]:
from langchain.storage import InMemoryStore

# The storage layer for the parent documents
store = InMemoryStore()
# NOTE(review): the sketch below is not runnable as-is — `vectorstore` and
# `child_splitter` are not defined in the cells shown here (the vector store is
# named `vectordb`), and the ParentDocumentRetriever import is commented out.
# retriever = ParentDocumentRetriever(
#     vectorstore=vectorstore,
#     docstore=store,
#     child_splitter=child_splitter,
# )