#### **Pip Install**

In [1]:
%%capture
!pip install langchain
!pip install chromadb
!pip install openai
!pip install --upgrade google-api-python-client google-auth-httplib2 google-auth-oauthlib
#!pip install gpt_index==0.4.24
!pip install PyPDF2
!pip install PyCryptodome
!pip install gradio
!pip install tiktoken
!pip install faiss-cpu
!pip install python-magic
!pip install config

In [2]:
openai_api_key='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'

### **Cookbook**

##### **Chat**

In [3]:
from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage, SystemMessage, AIMessage

chat = ChatOpenAI(temperature=.2, openai_api_key=openai_api_key)

In [4]:
chat(
    [
        SystemMessage(content="You are a nice AI bot that helps a user figure out what to eat in one short sentence"),
        HumanMessage(content="I like tomatoes, what should I eat?")
    ]
)

AIMessage(content='You could try a caprese salad with fresh tomatoes, mozzarella, and basil.', additional_kwargs={})

##### **Documents**

In [5]:
from langchain.schema import Document

In [6]:
Document(page_content="This is my document. It is full of text that I've gathered from other places",
         metadata={
             'my_document_id' : 234234,
             'my_document_source' : "The LangChain Papers",
             'my_document_create_time' : 1680013019
         })

Document(page_content="This is my document. It is full of text that I've gathered from other places", metadata={'my_document_id': 234234, 'my_document_source': 'The LangChain Papers', 'my_document_create_time': 1680013019})

##### **Language Model**

In [7]:
from langchain.llms import OpenAI

llm = OpenAI(model_name="text-ada-001", openai_api_key=openai_api_key)

In [8]:
llm("What day comes after Friday?")

'\n\nSaturday.'

##### **Text Embedding Model**

In [9]:
from langchain.embeddings import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)

In [10]:
text = "Hi! It's time for the beach"

##### **Prompt**

In [11]:
from langchain.llms import OpenAI

llm = OpenAI(model_name="text-davinci-003", openai_api_key=openai_api_key)

# I like to use three double quotation marks for my prompts because it's easier to read
prompt = """
Today is Monday, tomorrow is Wednesday.

What is wrong with that statement?
"""

llm(prompt)

'\nThe statement is incorrect; tomorrow is Tuesday, not Wednesday.'

##### **Prompt Template**

In [12]:
from langchain.llms import OpenAI
from langchain import PromptTemplate

llm = OpenAI(model_name="text-davinci-003", openai_api_key=openai_api_key)

# Notice "location" below, that is a placeholder for another value later
template = """
I really want to travel to {location}. What should I do there?

Respond in one short sentence
"""

prompt = PromptTemplate(
    input_variables=["location"],
    template=template,
)

final_prompt = prompt.format(location='Rome')

print (f"Final Prompt: {final_prompt}")
print ("-----------")
print (f"LLM Output: {llm(final_prompt)}")

Final Prompt: 
I really want to travel to Rome. What should I do there?

Respond in one short sentence

-----------
LLM Output: Take in the historic sights, explore the ruins, and enjoy some delicious Italian cuisine.


##### **Document Loaders**

In [13]:
from langchain.document_loaders import HNLoader

In [14]:
loader = HNLoader("https://news.ycombinator.com/item?id=34422627")

In [15]:
data = loader.load()

In [16]:
print (f"Found {len(data)} comments")
print (f"Here's a sample:\n\n{''.join([x.page_content[:150] for x in data[:2]])}")

Found 76 comments
Here's a sample:

Ozzie_osman 84 days ago  
             | next [–] 

LangChain is awesome. For people not sure what it's doing, large language models (LLMs) are very pOzzie_osman 84 days ago  
             | parent | next [–] 

Also, another library to check out is GPT Index (https://github.com/jerryjliu/gpt_index) 


##### **Text Splitter**

In [17]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [24]:
# This is a long document we can split up.
with open('C:/Users/ferna/OneDrive/Documents/Morada Uno_tech/paul_graham_essay.txt') as f:
    pg_work = f.read()
    
print (f"You have {len([pg_work])} document")

You have 1 document


In [25]:
text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size = 150,
    chunk_overlap  = 20,
)

texts = text_splitter.create_documents([pg_work])

In [26]:
print (f"You have {len(texts)} documents")
"---------------------------------------"
print ("Preview:")
print (texts[0].page_content, "\n")
print (texts[1].page_content)

You have 27317 documents
Preview:
January 2023 

<i>(<a href="https://twitter.com/stef/status/1617222428727586816"><u>Someone</u></a> fed my essays into GPT to make something that could answer


##### **Retrievers**

In [28]:
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings

loader = TextLoader('C:/Users/ferna/OneDrive/Documents/Morada Uno_tech/paul_graham_essay.txt')
documents = loader.load()

In [29]:
# Get your splitter ready
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=50)

# Split your docs into texts
texts = text_splitter.split_documents(documents)

# Get embedding engine ready
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)

# Embedd your texts
db = FAISS.from_documents(texts, embeddings)

In [30]:
# Init your retriever. Asking for just 1 document back
retriever = db.as_retriever()

In [31]:
retriever

VectorStoreRetriever(vectorstore=<langchain.vectorstores.faiss.FAISS object at 0x000002519EC250C0>, search_type='similarity', search_kwargs={})

In [32]:
docs = retriever.get_relevant_documents("what types of things did the author want to build?")

In [33]:
print("\n\n".join([x.page_content[:200] for x in docs[:2]]))

because users were desperately waiting for what they were building.

Instead of telling kids that their treehouses could be on the path
to the work they do as adults, we tell them the path goes through
school. And unfortunately schoolwork tends to be very different fro


##### **Vector Stores**

In [35]:
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings

loader = TextLoader('C:/Users/ferna/OneDrive/Documents/Morada Uno_tech/paul_graham_essay.txt')
documents = loader.load()

# Get your splitter ready
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=50)

# Split your docs into texts
texts = text_splitter.split_documents(documents)

# Get embedding engine ready
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)

In [36]:
print (f"You have {len(texts)} documents")

You have 4063 documents


In [37]:
embedding_list = embeddings.embed_documents([text.page_content for text in texts])

In [38]:
print (f"You have {len(embedding_list)} embeddings")
print (f"Here's a sample of one: {embedding_list[0][:3]}...")

You have 4063 embeddings
Here's a sample of one: [0.011870243074141935, -0.013094191805474591, -0.004523136923396113]...


##### **Memory**

In [39]:
from langchain.memory import ChatMessageHistory
from langchain.chat_models import ChatOpenAI

chat = ChatOpenAI(temperature=0, openai_api_key=openai_api_key)

history = ChatMessageHistory()

history.add_ai_message("hi!")

history.add_user_message("what is the capital of france?")

In [40]:
history.messages

[AIMessage(content='hi!', additional_kwargs={}),
 HumanMessage(content='what is the capital of france?', additional_kwargs={})]

In [41]:
ai_response = chat(history.messages)
ai_response

AIMessage(content='The capital of France is Paris.', additional_kwargs={})

In [42]:
history.add_ai_message(ai_response.content)
history.messages

[AIMessage(content='hi!', additional_kwargs={}),
 HumanMessage(content='what is the capital of france?', additional_kwargs={}),
 AIMessage(content='The capital of France is Paris.', additional_kwargs={})]

##### **Chains**

##### **Agents**

### **Demo**

In [2]:
# # Install package
!pip install "unstructured[local-inference]"
!pip install "detectron2@git+https://github.com/facebookresearch/detectron2.git@v0.6#egg=detectron2"
!pip install layoutparser[layoutmodels,tesseract]
!pip install libmagic

Collecting detectron2@ git+https://github.com/facebookresearch/detectron2.git@v0.6#egg=detectron2
  Cloning https://github.com/facebookresearch/detectron2.git (to revision v0.6) to c:\users\ferna\appdata\local\temp\pip-install-ale6wuwb\detectron2_86f0608e1ed1401da70b531d86e4bf38
  Resolved https://github.com/facebookresearch/detectron2.git to commit d1e04565d3bec8719335b88be9e9b961bf3ec464
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: detectron2
  Building wheel for detectron2 (setup.py): started
  Building wheel for detectron2 (setup.py): still running...
  Building wheel for detectron2 (setup.py): finished with status 'error'
  Running setup.py clean for detectron2
Failed to build detectron2
Installing collected packages: detectron2
  Running setup.py install for detectron2: started
  Running setup.py install for detectron2: still running...
  Running setup.py install for detectron2: finis

  Running command git clone --filter=blob:none --quiet https://github.com/facebookresearch/detectron2.git 'C:\Users\ferna\AppData\Local\Temp\pip-install-ale6wuwb\detectron2_86f0608e1ed1401da70b531d86e4bf38'
  Running command git checkout -q d1e04565d3bec8719335b88be9e9b961bf3ec464
  error: subprocess-exited-with-error
  
  × python setup.py bdist_wheel did not run successfully.
  │ exit code: 1
  ╰─> [1039 lines of output]
      running bdist_wheel
      running build
      running build_py
      creating build
      creating build\lib.win-amd64-cpython-310
      creating build\lib.win-amd64-cpython-310\detectron2
      copying detectron2\__init__.py -> build\lib.win-amd64-cpython-310\detectron2
      creating build\lib.win-amd64-cpython-310\tools
      copying tools\analyze_model.py -> build\lib.win-amd64-cpython-310\tools
      copying tools\benchmark.py -> build\lib.win-amd64-cpython-310\tools
      copying tools\convert-torchvision-to-d2.py -> build\lib.win-amd64-cpython-310\tools




In [37]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import UnstructuredFileLoader
from langchain.chains import RetrievalQAWithSourcesChain
#import magic
import os
import nltk
import config

In [4]:
loader = DirectoryLoader('C:/Users/ferna/OneDrive/Documents/Morada Uno_tech/Demo_docs')

In [5]:
loader

<langchain.document_loaders.directory.DirectoryLoader at 0x297b67a7280>

In [6]:
loader1 = UnstructuredFileLoader('C:/Users/ferna/OneDrive/Documents/Morada Uno_tech/Demo_docs/eBook de Justicia Alternativa M1.pdf')

In [7]:
loader1

<langchain.document_loaders.unstructured.UnstructuredFileLoader at 0x297d8b5e7a0>

In [20]:
documents = loader.load()

detectron2 is not installed. Cannot use the hi_res partitioning strategy. Falling back to partitioning with the fast strategy.
detectron2 is not installed. Cannot use the hi_res partitioning strategy. Falling back to partitioning with the fast strategy.
detectron2 is not installed. Cannot use the hi_res partitioning strategy. Falling back to partitioning with the fast strategy.
detectron2 is not installed. Cannot use the hi_res partitioning strategy. Falling back to partitioning with the fast strategy.
detectron2 is not installed. Cannot use the hi_res partitioning strategy. Falling back to partitioning with the fast strategy.
detectron2 is not installed. Cannot use the hi_res partitioning strategy. Falling back to partitioning with the fast strategy.
detectron2 is not installed. Cannot use the hi_res partitioning strategy. Falling back to partitioning with the fast strategy.
detectron2 is not installed. Cannot use the hi_res partitioning strategy. Falling back to partitioning with the

In [41]:
print(len(documents))
documents[1]

11


Document(page_content='Descubre la renta perfecta\n\n¿Qué es\n\nMorada Uno?\n\nSomos una empresa mexicana de\n\ntecnología, con la misión de\n\nempoderar a los profesionales\n\ninmobiliarios para lograr\n\ntransacciones más rápidas y\n\nseguras.\n\nSolucionamos los problemas\n\nexistentes en el mundo de las\n\nrentas, fomentando confianza en\n\ncada operación y liquidez para\n\ntodo el mercado.\n\nNuestro compromiso es brindar a\n\nlos arrendadores, arrendatarios y\n\nprofesionales inmobiliarios la\n\nexperiencia de renta más confiable,\n\nflexible y segura.\n\nRentas\n\nPerfectas\n\nToda la protección\n\nde una póliza\n\njurídica + la certeza\n\nde recibir la renta\n\npuntual, siempre.\n\nTodas las garantías de Morada Uno incluyen:\n\nLa mejor investigación del mercado\n\nConsulta buró - (inquilino y fiador)\n\nConsulta antecedentes legales - (inquilino y fiador)\n\nConsulta bases criminales - (inquilino y fiador)\n\nInmueble en Registro Público Propiedad (RPP)\n\nUn contrato de arren

In [22]:
text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)

In [23]:
texts = text_splitter.split_documents(documents)

In [24]:
embeddings = OpenAIEmbeddings(openai_api_key=os.environ['OPENAI_API_KEY'])

In [44]:
#docsearch = Chroma.from_documents(texts, embeddings)
docsearch = Chroma.from_texts(texts, embeddings, metadatas=[{"source": f"{i}-pl"} for i in range(len(texts))])

Using embedded DuckDB without persistence: data will be transient


AttributeError: 'Document' object has no attribute 'replace'

In [43]:
qa = RetrievalQA.from_chain_type(llm=OpenAI(), chain_type='stuff', retriever=docsearch.as_retriever(), return_source_documents=False)

In [35]:
query = "cuáles son los precios de las garantías de Morada Uno?"
result = qa({"query": query})

In [36]:
result["result"]

' El costo mínimo de garantía es $3,600 + IVA. La garantía M3 cuesta 30% de un mes de renta + IVA, y la garantía M12 cuesta 60% de un mes de renta + IVA.'

In [30]:
result["source_documents"]

KeyError: 'source_documents'

### **Hide**

In [None]:

from gpt_index import SimpleDirectoryReader, GPTListIndex, GPTSimpleVectorIndex, LLMPredictor, PromptHelper
from langchain.chat_models import ChatOpenAI
import gradio as gr
import sys
import os

os.environ["OPENAI_API_KEY"] = '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'

def construct_index(directory_path):
    max_input_size = 4096
    num_outputs = 512
    max_chunk_overlap = 20
    chunk_size_limit = 600

    prompt_helper = PromptHelper(max_input_size, num_outputs, max_chunk_overlap, chunk_size_limit=chunk_size_limit)

    llm_predictor = LLMPredictor(llm=ChatOpenAI(temperature=0.7, model_name="gpt-3.5-turbo", max_tokens=num_outputs))

    documents = SimpleDirectoryReader(directory_path).load_data()

    index = GPTSimpleVectorIndex(documents, llm_predictor=llm_predictor, prompt_helper=prompt_helper)

    index.save_to_disk('index.json')

    return index

def chatbot(input_text):
    index = GPTSimpleVectorIndex.load_from_disk('index.json')
    response = index.query(input_text, response_mode="compact")
    return response.response

iface = gr.Interface(fn=chatbot,
                     inputs=gr.components.Textbox(lines=7, label="Enter your text"),
                     outputs="text",
                     title="Custom-trained AI Chatbot")

index = construct_index("docs")
iface.launch(share=True)

ModuleNotFoundError: ignored

In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain import OpenAI, VectorDBQA
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import GoogleDriveLoader
from langchain.prompts import PromptTemplate
from langchain.chains.question_answering import load_qa_chain
import logging
import chromadb.config

In [None]:
# Load documents from the specified directory using a DirectoryLoader object
loader = GoogleDriveLoader(folder_id="1t5TU1g33JuCxCAsH8LjjwiDmeZkkimLt")
#loader = DirectoryLoader('/content/Brochure Morada Uno - GDL.pdf', glob='*.pdf')
documents = loader.load()

# split the text to chuncks of of size 1000
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
# Split the documents into chunks of size 1000 using a CharacterTextSplitter object
texts = text_splitter.split_documents(documents)

# Create a vector store from the chunks using an OpenAIEmbeddings object and a Chroma object
embeddings = OpenAIEmbeddings(openai_api_key='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')
docsearch = Chroma.from_documents(texts, embeddings)