In [7]:
import os
import pandas as pd
from transformers import GPT2TokenizerFast
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI
from langchain.chains import ConversationalRetrievalChain

from dotenv import load_dotenv
# Load variables from a .env file into the process environment.
# NOTE(review): presumably this is where the OpenAI API key used by the
# embedding/LLM cells comes from — confirm; the key must never be hardcoded here.
load_dotenv()

True

In [8]:
import textract

# Pull the text out of the PDF; the result is bytes, hence the decode below.
doc = textract.process("./Arquivo.pdf")

# Dump the decoded text to disk as UTF-8.
with open('Arquivo.txt', mode='w', encoding="UTF-8") as txt_out:
    txt_out.write(str(doc, 'utf-8'))

# Load the dump back; `text` is the string that gets split into chunks below.
with open('Arquivo.txt', mode='r', encoding="UTF-8") as txt_in:
    text = txt_in.read()

# GPT-2 fast tokenizer loaded by name; its vocab/merges/config files are
# downloaded on first use (see the progress bars in this cell's output).
# NOTE(review): it is only used to measure chunk length; it may not match the
# tokenizer of the embedding/LLM models used later — confirm this is acceptable.
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

def count_tokens(text: str) -> int:
    """Return how many token ids the module-level `tokenizer` produces for `text`.

    Used as the `length_function` of the text splitter, so chunk sizes are
    measured with this function instead of in characters.
    """
    token_ids = tokenizer.encode(text)
    return len(token_ids)

# chunk_size and chunk_overlap are measured by `count_tokens` (the
# length_function), not in characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,
    chunk_overlap=24,
    length_function=count_tokens,
)

# Wrap the single extracted text into a list of document chunks.
chunks = text_splitter.create_documents([text])

Downloading (…)olve/main/vocab.json: 100%|██████████| 1.04M/1.04M [00:00<00:00, 3.02MB/s]
Downloading (…)olve/main/merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 1.37MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 1.36M/1.36M [00:00<00:00, 2.41MB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 665/665 [00:00<00:00, 495kB/s]


In [9]:
# Embedding client configured for the "text-embedding-ada-002" model.
# NOTE(review): presumably reads the API key from the environment — confirm.
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

# Build the FAISS vector store from the chunks, using `embeddings` to embed them.
db = FAISS.from_documents(chunks, embeddings)

In [14]:
# Question used for retrieval here and passed to the QA chain in the next cell.
query = "best practices to write function in javascript?"
# Fetch the stored chunks the vector store returns for this query
# (result count left at the library default).
docs = db.similarity_search(query)
# Display the first returned document to sanity-check retrieval.
# NOTE(review): in the captured output this is the book's title/copyright page,
# not content about functions — retrieval quality may need a look.
docs[0]

Document(page_content='JAVASCRIPT:\nBEST PRACTICE\n\nCLEAN, MAINTAINABLE, PERFORMANT CODE\n\n\x0cii\n\nJavaScript: Best Practice\n\nJavaScript: Best Practice\nCopyright © 2018 SitePoint Pty. Ltd.\n\nProduct Manager: Simon Mackie\nEnglish Editor: Ralph Mason\nProject Editor: James Hibbard\nCover Designer: Alex Walker\n\nNotice of Rights\nAll rights reserved. No part of this book may be reproduced, stored in a retrieval\nsystem or transmitted in any form or by any means, without the prior written\npermission of the publisher, except in the case of brief quotations embodied in\ncritical articles or reviews.\n\nNotice of Liability\nThe author and publisher have made every effort to ensure the accuracy of the\ninformation herein. However, the information contained in this book is sold\nwithout warranty, either express or implied. Neither the authors and SitePoint\nPty. Ltd., nor its dealers or distributors will be held liable for any damages to be\ncaused either directly or indirectly by th

In [15]:
# Question-answering chain over an OpenAI LLM with temperature=0.
# NOTE(review): per LangChain docs, chain_type="stuff" puts all input documents
# into a single prompt — confirm the retrieved chunks fit the model's context.
chain = load_qa_chain(OpenAI(temperature=0), chain_type="stuff")

# Run the chain on the retrieved documents; the returned answer string is the cell output.
chain.run(input_documents=docs, question=query)

' The best practices for writing functions in JavaScript include using const to define variables that cannot be rebound to new values, using arrow functions to provide a cleaner syntax for declaring anonymous functions, and using improved class syntax to emulate classical object-orientation with prototypes.'