<a href="https://colab.research.google.com/github/frank-morales2020/MLxDL/blob/main/langchain_opensourceLLM_mistral7B_openai.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Embeddings, PostgreSQL, LangChain, OpenAI, Mistral Text Generation and RAG

# References

https://github.com/langchain-ai/langchain/issues/10454
https://platform.openai.com/docs/guides/text-generation
https://python.langchain.com/docs/integrations/vectorstores/pgembedding
https://www.datacamp.com/tutorial/introduction-to-text-embeddings-with-the-open-ai-api
https://journal.everypixel.com/2023-the-year-of-ai

# Dependencies

In [None]:
#Install Libraries to access Google Drive and OpenAI resources.
%pip install colab-env --upgrade --quiet --root-user-action=ignore
%pip install openai==0.28  --root-user-action=ignore
%pip install langchain
%pip install "unstructured[all-docs]"
%pip install tiktoken
!pip install -q -U sentence-transformers

# Environment Variables

In [None]:

import colab_env
import os
import openai
from openai.embeddings_utils import cosine_similarity

connection_string = os.getenv("DATABASE_URL")
openai.api_key = os.getenv("OPENAI_API_KEY")

Mounted at /content/gdrive


# Embedding settings with OpenAI


In [None]:
def get_embedding(text: str) -> list:
    """Return the embedding of ``text`` from the ``text-embedding-ada-002`` model.

    Issues one ``openai.Embedding.create`` request and returns the
    ``embedding`` field of the first entry in the response's ``data`` list.
    """
    api_reply = openai.Embedding.create(model="text-embedding-ada-002", input=text)
    first_entry = api_reply['data'][0]
    return first_entry['embedding']

good_ride = "good ride"
good_ride_embedding = get_embedding(good_ride)

len(good_ride_embedding)
# 1536

good_ride_review_1 = "I really enjoyed the trip! The ride was incredibly smooth, the pick-up location was convenient, and the drop-off point was right in front of the coffee shop."
good_ride_review_1_embedding = get_embedding(good_ride_review_1)
similary=cosine_similarity(good_ride_review_1_embedding, good_ride_embedding)
# 0.8300454513797334
similary

0.8300454513797334

# PostgreSQL Settings - PGVECTOR and PGEMBEDDINGS

In [None]:
# https://python.langchain.com/docs/integrations/vectorstores/pgembedding

# install PSQL WITH DEV Libraries AND PGVECTOR
!apt install postgresql postgresql-contrib &>log
!service postgresql restart
!sudo apt install postgresql-server-dev-all

%cd /content/gdrive/MyDrive/tools/pgvector
!cp -pr /content/gdrive/MyDrive/tools/pgvector /content/
%cd /content/pgvector/
print()
print('START: PG VECTOR COMPILATION')
!make
!make install # may need sudo
print('END: PG VECTOR COMPILATION')
print()

%cd /content/
!git clone https://github.com/neondatabase/pg_embedding.git
%cd /content/pg_embedding
print()
print('START: PG embedding COMPILATION')
!make
!make install # may need sudo
print('END: PG embedding COMPILATION')
print()

#!ls /usr/share/postgresql/14/extension/*control*

 * Restarting PostgreSQL 14 database server
   ...done.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  binfmt-support libffi-dev libpfm4 libz3-4 libz3-dev llvm-14 llvm-14-dev
  llvm-14-runtime llvm-14-tools postgresql-server-dev-14 python3-pygments
  python3-yaml
Suggested packages:
  llvm-14-doc python-pygments-doc ttf-bitstream-vera
The following NEW packages will be installed:
  binfmt-support libffi-dev libpfm4 libz3-4 libz3-dev llvm-14 llvm-14-dev
  llvm-14-runtime llvm-14-tools postgresql-server-dev-14
  postgresql-server-dev-all python3-pygments python3-yaml
0 upgraded, 13 newly installed, 0 to remove and 30 not upgraded.
Need to get 59.8 MB of archives.
After this operation, 361 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/main amd64 python3-yaml amd64 5.4.1-1ubuntu1 [129 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/main amd64 bi

In [None]:
import psycopg2 as ps

# PostGRES SQL Settings
%cd /content/
!sudo -u postgres psql -c "ALTER USER postgres PASSWORD 'postgres'"

#!sudo -u postgres psql -c "DROP EXTENSION embedding"
!sudo -u postgres psql -c "CREATE EXTENSION embedding"

!sudo -u postgres psql -c "DROP TABLE documents"
!sudo -u postgres psql -c "CREATE TABLE documents(id integer PRIMARY KEY, embedding real[])"

h="{0,1,2}"
hh= "INSERT INTO documents(id, embedding) VALUES (1,'%s'), (2,'{1,2,3}'),  (3,'{1,1,1}')"%h
print(hh)

def insert_document(id, embedding):
    """Insert one row into the ``documents`` table of the local PostgreSQL server.

    Args:
        id: Value for the integer primary-key column ``id``.
        embedding: Value for the ``real[]`` column ``embedding``, e.g. the
            PostgreSQL array literal ``"{0,1,2}"``.

    Raises:
        Whatever psycopg2 raises on connection or statement failure. The
        cursor and connection are closed in that case and nothing is committed.
    """
    DB_NAME = "postgres"
    DB_USER = "postgres"
    DB_PASS = "postgres"
    DB_HOST = "localhost"
    DB_PORT = "5432"
    conn = ps.connect(database=DB_NAME,
                      user=DB_USER,
                      password=DB_PASS,
                      host=DB_HOST,
                      port=DB_PORT)
    try:
        cur = conn.cursor()
        try:
            # Values are bound by the driver instead of being spliced into the
            # SQL text, so quoting/escaping is handled for us.
            cur.execute(
                "INSERT INTO documents (id, embedding) VALUES (%s, %s)",
                (id, embedding),
            )
        finally:
            cur.close()
        conn.commit()
    finally:
        # Always release the connection, even when the INSERT fails.
        conn.close()

    print("INSERT EMBEDDING %s successfully" % (embedding,))


insert_document(1,'{0,1,2}')
insert_document(2,"{1,2,3}")
insert_document(3,"{1,1,1}")


!sudo -u postgres psql -c "CREATE INDEX ON documents USING hnsw(embedding) WITH (dims=3, m=3, efconstruction=5, efsearch=5)"
!sudo -u postgres psql -c "SET enable_seqscan = off"

ARRAY = [3, 3, 3]

def select_document(HNSW_index):
    """Print the id of the ``documents`` row closest to the module-level ``ARRAY``.

    Args:
        HNSW_index: Distance operator placed between the ``embedding`` column
            and the query vector. Must be one of ``'<->'``, ``'<=>'`` or
            ``'<~>'``.

    Raises:
        ValueError: If ``HNSW_index`` is not one of the supported operators.
            An operator cannot be sent as a bound parameter, so it is checked
            against a fixed list before being placed in the SQL text.
    """
    supported_operators = ("<->", "<=>", "<~>")
    if HNSW_index not in supported_operators:
        raise ValueError(
            "Unsupported distance operator %r; expected one of %s"
            % (HNSW_index, ", ".join(supported_operators))
        )

    DB_NAME = "postgres"
    DB_USER = "postgres"
    DB_PASS = "postgres"
    DB_HOST = "localhost"
    DB_PORT = "5432"
    conn = ps.connect(database=DB_NAME,
                      user=DB_USER,
                      password=DB_PASS,
                      host=DB_HOST,
                      port=DB_PORT)
    try:
        cur = conn.cursor()
        try:
            # The whole query vector is bound as one parameter (psycopg2 sends
            # a Python list as a SQL ARRAY), so it may have any length.
            cur.execute(
                "SELECT id FROM documents ORDER BY embedding {} %s LIMIT 1".format(HNSW_index),
                (list(ARRAY),),
            )
            print(cur.fetchone())
        finally:
            cur.close()
    finally:
        # Read-only query: nothing to commit, just release the connection.
        conn.close()

# <->, <=>, and <~> operators define the distance metric, which calculates the distance between the query vector and each row of the dataset.
select_document('<->')
select_document('<=>')
select_document('<~>')

/content
ALTER ROLE
CREATE EXTENSION
ERROR:  table "documents" does not exist
CREATE TABLE
INSERT INTO documents(id, embedding) VALUES (1,'{0,1,2}'), (2,'{1,2,3}'),  (3,'{1,1,1}')
INSERT EMBEDDING {0,1,2} successfully
INSERT EMBEDDING {1,2,3} successfully
INSERT EMBEDDING {1,1,1} successfully
CREATE INDEX
SET
(2,)
(3,)
(2,)


# Documents loader

Postgres with the pg_embedding extension as a vector store.

pg_embedding uses a sequential scan by default, but you can create an HNSW index using the `create_hnsw_index` method.

State of the Union

In [None]:
#%pip install -q langchain
#%pip install -q "unstructured[all-docs]"

## Loading Environment Variables
from typing import List, Tuple

from langchain.docstore.document import Document
from langchain.document_loaders import TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import PGEmbedding
#import getpass

!git clone https://github.com/hwchase17/chat-your-data.git
from langchain.document_loaders import UnstructuredFileLoader

#loader = UnstructuredFileLoader("/content/chat-your-data/state_of_the_union.txt")
loader = TextLoader("/content/chat-your-data/state_of_the_union.txt")
documents = loader.load()
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs0 = text_splitter.split_documents(documents)

collection_name0 = "state_of_the_union"
print(f'# of Document Pages {len(documents)}')
print(f'# of Document Chunks: {len(docs0)}')

fatal: destination path 'chat-your-data' already exists and is not an empty directory.
# of Document Pages 1
# of Document Chunks: 42


AWS documents

In [None]:
import os
from urllib.request import urlretrieve

# Amazon shareholder letters, newest first; `filenames` and `metadata`
# are index-aligned with `urls`.
urls = [
    'https://s2.q4cdn.com/299287126/files/doc_financials/2023/ar/2022-Shareholder-Letter.pdf',
    'https://s2.q4cdn.com/299287126/files/doc_financials/2022/ar/2021-Shareholder-Letter.pdf',
    'https://s2.q4cdn.com/299287126/files/doc_financials/2021/ar/Amazon-2020-Shareholder-Letter-and-1997-Shareholder-Letter.pdf',
    'https://s2.q4cdn.com/299287126/files/doc_financials/2020/ar/2019-Shareholder-Letter.pdf'
]

filenames = [
    'AMZN-2022-Shareholder-Letter.pdf',
    'AMZN-2021-Shareholder-Letter.pdf',
    'AMZN-2020-Shareholder-Letter.pdf',
    'AMZN-2019-Shareholder-Letter.pdf'
]

# Metadata attached to every page loaded from the corresponding file.
metadata = [
    dict(year=2022, source=filenames[0]),
    dict(year=2021, source=filenames[1]),
    dict(year=2020, source=filenames[2]),
    dict(year=2019, source=filenames[3])]

# Keep the trailing slash: other cells build paths as `data_root + name`.
data_root = "/content/data/"

# urlretrieve does not create missing directories, so make sure the target
# exists (no-op when it already does).
os.makedirs(data_root, exist_ok=True)

for url, filename in zip(urls, filenames):
    file_path = os.path.join(data_root, filename)
    urlretrieve(url, file_path)

In [None]:
from pypdf import PdfReader, PdfWriter
import glob

local_pdfs = glob.glob(data_root + '*.pdf')

# Number of pages removed from the end of every downloaded letter.
TRAILING_PAGES_TO_DROP = 3
# Custom document-info key written into each rewritten file. Without it,
# every additional run of this cell would strip three more pages.
TRIM_MARKER = "/TrailingPagesTrimmed"

for local_pdf in local_pdfs:
    pdf_reader = PdfReader(local_pdf)

    # Already rewritten by an earlier run: leave the file alone.
    if (pdf_reader.metadata or {}).get(TRIM_MARKER):
        continue

    pages_to_keep = len(pdf_reader.pages) - TRAILING_PAGES_TO_DROP
    if pages_to_keep <= 0:
        # Too short to trim; rewriting would leave a PDF with no pages.
        continue

    pdf_writer = PdfWriter()
    for pagenum in range(pages_to_keep):
        pdf_writer.add_page(pdf_reader.pages[pagenum])
    pdf_writer.add_metadata({TRIM_MARKER: "true"})

    # 'wb' already truncates the file, so no seek()/truncate() is needed.
    with open(local_pdf, 'wb') as new_file:
        pdf_writer.write(new_file)


In [None]:
import numpy as np
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader, PyPDFDirectoryLoader

documents = []

# Load every letter page by page and replace each page's metadata with the
# year/source entry prepared for that file.
for file, file_metadata in zip(filenames, metadata):
    pages = PyPDFLoader(data_root + file).load()
    for page in pages:
        page.metadata = file_metadata
    documents.extend(pages)

# Recursive character splitting: chunks of at most 512 characters,
# overlapping by 100 characters.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=100)

docs = text_splitter.split_documents(documents)

print(f'# of Document Pages {len(documents)}')
print(f'# of Document Chunks: {len(docs)}')

collection_name = "AWS"

# of Document Pages 25
# of Document Chunks: 299


In [None]:
import os
from sqlalchemy import create_engine
from sqlalchemy.orm import scoped_session, sessionmaker


#!pip install tiktoken
%cd /content/

# https://supabase.com/blog/fewer-dimensions-are-better-pgvector
embeddings = OpenAIEmbeddings(model='text-embedding-ada-002')

#https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2

db = PGEmbedding.from_documents(
    embedding=embeddings,
    documents=docs,
    collection_name=collection_name,
    connection_string=connection_string,
)

#del query
query = "What did the president say about Ketanji Brown Jackson"
#query = "What did the president say about AWS"
query = "How has AWS evolved?"
#query = "What are the issues with AWS?"
print(query)

#docs_with_score: List[Tuple[Document, float]] = db.similarity_search_with_score(query)

#for doc, score in docs_with_score:
#    print("-" * 80)
#   print("Score: ", score)
#    print(doc.page_content)
#    print("-" * 80)

print()

results_with_scores = db.similarity_search_with_score(query)
for doc, score in results_with_scores:
    print(f"Content: {doc.page_content}\nMetadata: {doc.metadata}\nScore: {score}\n\n")


/content
How has AWS evolved?

Content: customersmuch more functionality in AWS than they can find anywhere else (which is a significant differentiator), butalso allowed us to arrive at the much more game-changing offering that AWS is today.
Metadata: {'year': 2021, 'source': 'AMZN-2021-Shareholder-Letter.pdf'}
Score: 0.52016145


Content: in AWS. Our new customer pipeline is robust, as are our active migrations. Many companies usediscontinuous periods like this to step back and determine what they strategically want to change, and wefind an increasing number of enterprises opting out of managing their own infrastructure, and preferring tomove to AWS to enjoy the agility, innovation, cost-efficiency, and security benefits. And most importantlyfor customers, AWS continues to deliver new capabilities rapidly (over 3,300 new features and
Metadata: {'year': 2022, 'source': 'AMZN-2022-Shareholder-Letter.pdf'}
Score: 0.5205847


Content: done innovating here,and this long-term investment sho

In [None]:
filter={"year": 2022}

results_with_scores = db.similarity_search_with_score(query,filter=filter)

for doc, score in results_with_scores:
    print(f"Content: {doc.page_content}\nMetadata: {doc.metadata}\nScore: {score}\n\n")

Content: in AWS. Our new customer pipeline is robust, as are our active migrations. Many companies usediscontinuous periods like this to step back and determine what they strategically want to change, and wefind an increasing number of enterprises opting out of managing their own infrastructure, and preferring tomove to AWS to enjoy the agility, innovation, cost-efficiency, and security benefits. And most importantlyfor customers, AWS continues to deliver new capabilities rapidly (over 3,300 new features and
Metadata: {'year': 2022, 'source': 'AMZN-2022-Shareholder-Letter.pdf'}
Score: 0.5205847


Content: done innovating here,and this long-term investment should prove fruitful for both customers and AWS. AWS is still in the earlystages of its evolution, and has a chance for unusual growth in the next decade.
Metadata: {'year': 2022, 'source': 'AMZN-2022-Shareholder-Letter.pdf'}
Score: 0.5220351


Content: We had a head start on potential competitors;and if anything, we wanted to accele

In [None]:
# Rebuild the collection from `docs`. The collection was already populated
# when `db` was first created, so the stored copy is dropped first
# (pre_delete_collection=True). With False every chunk is embedded and
# inserted a second time and searches return the same passage twice.
db = PGEmbedding.from_documents(
    embedding=embeddings,
    documents=docs,
    collection_name=collection_name,
    connection_string=connection_string,
    pre_delete_collection=True,
)

# https://github.com/langchain-ai/langchain/issues/10454

import sqlalchemy

# HNSW index parameters. `dims` matches the 1536-dimensional
# text-embedding-ada-002 vectors. These must be plain integers: a trailing
# comma (`m=8,`) makes the value a tuple and renders as `m = (8,)` in the SQL.
dims = 1536
m = 8
ef_construction = 16
ef_search = 16

# The statement is only built here; nothing in this cell executes it.
create_index_query = sqlalchemy.text(
    "CREATE INDEX IF NOT EXISTS langchain_pg_embedding_idx "
    "ON langchain_pg_embedding USING hnsw (embedding) "
    "WITH (dims = {}, m = {}, efconstruction = {}, efsearch = {});".format(
        dims, m, ef_construction, ef_search
    )
)

In [None]:
!sudo -u postgres psql -c "CREATE INDEX ON documents USING hnsw(embedding) WITH (dims=3, m=8, efconstruction=16, efsearch=16)"

CREATE INDEX


In [None]:
store = PGEmbedding(
    connection_string=connection_string,
    embedding_function=embeddings,
    collection_name=collection_name,
)

retriever = store.as_retriever()
retriever


db1 = PGEmbedding.from_existing_index(
    embedding=embeddings,
    collection_name=collection_name,
    pre_delete_collection=False,
    connection_string=connection_string,
)
#del query
#query = "What did the president say about Ketanji Brown Jackson"
#query = "What did the president say about AWS"
#query = "How has AWS evolved?"
#query = "Amazon inventions"

docs_with_score: List[Tuple[Document, float]] = db1.similarity_search_with_score(query)

print(query)
for doc, score in docs_with_score:
    print("-" * 80)
    print("Score: ", score)
    print(doc.page_content)
    print("-" * 80)
#VectorStoreRetriever(vectorstore=<langchain.vectorstores.pghnsw.HNSWVectoreStore object at 0x121d3c8b0>, search_type='similarity', search_kwargs={})

How has AWS evolved?
--------------------------------------------------------------------------------
Score:  0.5201462
customersmuch more functionality in AWS than they can find anywhere else (which is a significant differentiator), butalso allowed us to arrive at the much more game-changing offering that AWS is today.
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Score:  0.52016145
customersmuch more functionality in AWS than they can find anywhere else (which is a significant differentiator), butalso allowed us to arrive at the much more game-changing offering that AWS is today.
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Score:  0.52048385
in AWS. Our new customer pipeline is robust, as are our active migrations. Many companies usediscontinuous periods

In [None]:
import os
from sqlalchemy import create_engine
from sqlalchemy.orm import scoped_session, sessionmaker

engine = create_engine(os.getenv("DATABASE_URL"))
# Show which database is targeted without writing the password into the
# notebook's saved output.
print(engine.url.render_as_string(hide_password=True))
#!ls /usr/share/postgresql/14/extension/*control*

postgresql://postgres:postgres@localhost:5432/postgres


In [None]:
# https://towardsdatascience.com/4-ways-of-question-answering-in-langchain-188c6707cc5a


from langchain.chains import RetrievalQA
from langchain.indexes import VectorstoreIndexCreator
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma

from langchain.llms import OpenAI

# load document
#from langchain.document_loaders import PyPDFLoader
#loader = PyPDFLoader("materials/example.pdf")
#documents = loader.load()

# split the documents into chunks
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(documents)
# select which embeddings we want to use
embeddings = OpenAIEmbeddings()

# create the vectorestore to use as the index
#db = Chroma.from_documents(texts, embeddings)

# expose this index in a retriever interface
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k":2})
print(retriever)

# create a chain to answer questions
qa = RetrievalQA.from_chain_type(
    llm=OpenAI(), chain_type="stuff", retriever=retriever, return_source_documents=True)

query = "How AWS has evolved?"
#query = "How many AI publications in 2022?"
result = qa({"query": query})
print()
print(result['result'])
print()
#print(result['source_documents'])

tags=['PGEmbedding', 'OpenAIEmbeddings'] vectorstore=<langchain_community.vectorstores.pgembedding.PGEmbedding object at 0x7911ccec4ac0> search_kwargs={'k': 2}

 AWS has evolved by offering customers much more functionality than they can find anywhere else, which sets them apart from their competition and has led to their game-changing offering that they have today.



# LLM generation with Mistral-7B for Text Generation, Langchain

A GPU is recommended; this notebook was tested on a T4.

In [None]:
#https://platform.openai.com/docs/guides/text-generation

!pip install gradio --quiet
!pip install xformer --quiet
!pip install chromadb --quiet
!pip install langchain --quiet
!pip install accelerate --quiet
!pip install transformers --quiet
!pip install bitsandbytes --quiet
!pip install unstructured --quiet
!pip install sentence-transformers --quiet
!pip install pypdf

%pip install openai==0.28  --root-user-action=ignore
%pip install tiktoken

!pip install -U transformers

In [None]:
import torch
from textwrap import fill
from IPython.display import Markdown, display

from langchain.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate,
    )

from langchain import PromptTemplate
from langchain import HuggingFacePipeline

from langchain.vectorstores import Chroma
from langchain.schema import AIMessage, HumanMessage
from langchain.memory import ConversationBufferMemory
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredMarkdownLoader, UnstructuredURLLoader
from langchain.chains import LLMChain, SimpleSequentialChain, RetrievalQA, ConversationalRetrievalChain
from transformers import BitsAndBytesConfig, AutoModelForCausalLM, AutoTokenizer, GenerationConfig, pipeline
import warnings
warnings.filterwarnings('ignore')

### T4
# https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1
#MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.1"

### A100
# https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
MODEL_NAME = "mistralai/Mixtral-8x7B-Instruct-v0.1"

# https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu
#Quantization
#Quantization techniques reduces memory and computational costs by representing weights and activations
#with lower-precision data types like 8-bit integers (int8). This enables loading larger models you normally wouldn’t
#be able to fit into memory, and speeding up inference. Transformers supports the AWQ and GPTQ quantization
#algorithms and it supports 8-bit and 4-bit quantization with bitsandbytes.


#( load_in_8bit = Falseload_in_4bit = Falsellm_int8_threshold = 6.0llm_int8_skip_modules = Nonellm_int8_enable_fp32_cpu_offload = Falsellm_int8_has_fp16_weight = False
#bnb_4bit_compute_dtype = None
# bnb_4bit_quant_type = 'fp4'bnb_4bit_use_double_quant = False**kwargs )


# bitsandbytes 4-bit quantization: NF4 weights with double quantization and
# float16 compute.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    # Correct option name (see the parameter list quoted above). The former
    # `load_in_8bit_fp32_cpu_offload` is not a BitsAndBytesConfig parameter.
    llm_int8_enable_fp32_cpu_offload=True,
    bnb_4bit_use_double_quant=True,
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
# Reuse the end-of-sequence token for padding.
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
    device_map="auto",
    quantization_config=quantization_config
)

# Start from the model's published generation defaults, then override the
# sampling settings used in this notebook.
generation_config = GenerationConfig.from_pretrained(MODEL_NAME)
generation_config.max_new_tokens = 1024
generation_config.temperature = 0.8
generation_config.top_p = 0.95
generation_config.do_sample = True
generation_config.repetition_penalty = 1.15

# NOTE: this rebinds `pipeline` from the transformers factory function to the
# constructed pipeline object. The name is kept because a later cell passes
# `pipeline` to HuggingFacePipeline.
pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    return_full_text=True,
    generation_config=generation_config,
    pad_token_id=tokenizer.eos_token_id
)

tokenizer_config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/720 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/92.7k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/19 [00:00<?, ?it/s]

model-00001-of-00019.safetensors:   0%|          | 0.00/4.89G [00:00<?, ?B/s]

model-00002-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00004-of-00019.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00005-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00006-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00007-of-00019.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00008-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00009-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00010-of-00019.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00011-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00012-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00013-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00014-of-00019.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00015-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00016-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00017-of-00019.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00018-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00019-of-00019.safetensors:   0%|          | 0.00/4.22G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/19 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

HuggingFacePipeline definitions

Language generation pipeline using any ModelWithLMHead. This pipeline predicts the words that will follow a
specified text prompt.

In [None]:
llm = HuggingFacePipeline(pipeline=pipeline,)

In [None]:
query = "How AWS has evolved?"
result = llm(query)

display(Markdown(f"<b>{query}</b>"))
display(Markdown(f"<p>{result}</p>"))

<b>How AWS has evolved?</b>

<p>
        - First, it was a Web hosting service
        - Then EC2 came out and became the “Everything Compute Cloud”
        - S3 as data storage in 10 locations worldwide. Now its an entire data infrastructure for various purposes
        - RDS – Relational database support through MySQL
    - AWS Marketplace:
        - Offers many products/services that are available to use
- Google (Cloud Platform)
    - Infrastructure is all over the world but does not give you specific location like Amazon does
    - GCP allows us to manage the resources by ourselves or leave them to google themselves so we can focus on app development
    - Also offers App Engine where you build your application within their system and they host the solution
- Microsoft Azure:
    - Has been very good at offering services that allow you to run Windows applications seamlessly without having to worry about anything behind the scene
    - Has had more success with clients who have already invested money into using Microsoft software
    - Uses Docker which containers virtual servers similar to VMs
    - Is now being used much more due to the increasing number of companies switching from Windows OS to Linux because it is free
- IBM Bluemix:
    - Not mentioned in this episode but I just wanted to add it here
- What do developers look for when looking at cloud platforms?
    1. Cost: need to know what you will be charged for; most providers offer free trials
    2. Scalability: needs to easily scale up based on demand spikes without causing issues. This should also apply downwards
    3. Security: must ensure there’s no risk of losing sensitive data. You need encryption features built in by default
    4. Flexibility: ability to use preferred tools while keeping things easy and streamlined is important
    5. Geographic distribution: users want their apps hosted near them for best performance
    6. Ease of Use: how simple is the platform to get started with? Can newbies pick it up quickly
    7. Community Support & Documentation: if something goes wrong, great docs and community support helps out immensely
    8. Customizability: devs don’t always want to be locked into one way of doing things; customization options are key
    9. Performance: how fast does everything run? Good perf means happier customers
    10. Monitoring capabilities: developers should be able to monitor usage patterns
    11. Developer happiness: happy developers make better software! So choose something enjoyable to work with
    12. Automated backups: automatic daily backups and point in time recovery are crucial
    13. DevOps friendly: CI/CD pipelines must integrate smoothly with other tools such as GitHub
- Which platforms meet these requirements best? There isn't really a clear winner right now as each provider has strengths and weaknesses depending on what exactly you’re trying to accomplish. It may take some research before committing.</p>

RAG implementation

In [None]:
data_root = "/content/data/"

from pypdf import PdfReader, PdfWriter
import glob

local_pdfs = glob.glob(data_root + '*.pdf')

# Number of pages removed from the end of every downloaded letter.
TRAILING_PAGES_TO_DROP = 3
# Custom document-info key written into each rewritten file. Without it,
# every additional run of this cell would strip three more pages.
TRIM_MARKER = "/TrailingPagesTrimmed"

for local_pdf in local_pdfs:
    pdf_reader = PdfReader(local_pdf)

    # Already rewritten by an earlier run: leave the file alone.
    if (pdf_reader.metadata or {}).get(TRIM_MARKER):
        continue

    pages_to_keep = len(pdf_reader.pages) - TRAILING_PAGES_TO_DROP
    if pages_to_keep <= 0:
        # Too short to trim; rewriting would leave a PDF with no pages.
        continue

    pdf_writer = PdfWriter()
    for pagenum in range(pages_to_keep):
        pdf_writer.add_page(pdf_reader.pages[pagenum])
    pdf_writer.add_metadata({TRIM_MARKER: "true"})

    # 'wb' already truncates the file, so no seek()/truncate() is needed.
    with open(local_pdf, 'wb') as new_file:
        pdf_writer.write(new_file)

AWS DOCUMENTS - Shareholder-Letter - 2019:2022

In [None]:
# https://stackoverflow.com/questions/56081324/why-are-google-colab-shell-commands-not-working


import os
from urllib.request import urlretrieve

# Amazon shareholder letters, newest first; `filenames` and `metadata`
# are index-aligned with `urls`.
urls = [
    'https://s2.q4cdn.com/299287126/files/doc_financials/2023/ar/2022-Shareholder-Letter.pdf',
    'https://s2.q4cdn.com/299287126/files/doc_financials/2022/ar/2021-Shareholder-Letter.pdf',
    'https://s2.q4cdn.com/299287126/files/doc_financials/2021/ar/Amazon-2020-Shareholder-Letter-and-1997-Shareholder-Letter.pdf',
    'https://s2.q4cdn.com/299287126/files/doc_financials/2020/ar/2019-Shareholder-Letter.pdf'
]

filenames = [
    'AMZN-2022-Shareholder-Letter.pdf',
    'AMZN-2021-Shareholder-Letter.pdf',
    'AMZN-2020-Shareholder-Letter.pdf',
    'AMZN-2019-Shareholder-Letter.pdf'
]

# Metadata attached to every page loaded from the corresponding file.
metadata = [
    dict(year=2022, source=filenames[0]),
    dict(year=2021, source=filenames[1]),
    dict(year=2020, source=filenames[2]),
    dict(year=2019, source=filenames[3])]

# The trailing slash matters: paths are built elsewhere as `data_root + name`,
# so "/content/data" would produce "/content/dataAMZN-...pdf".
data_root = "/content/data/"

# urlretrieve does not create missing directories, so make sure the target
# exists (no-op when it already does).
os.makedirs(data_root, exist_ok=True)

for url, filename in zip(urls, filenames):
    file_path = os.path.join(data_root, filename)
    urlretrieve(url, file_path)

In [None]:
from pypdf import PdfReader, PdfWriter
import glob

local_pdfs = glob.glob(data_root + '*.pdf')

# Number of pages removed from the end of every downloaded letter.
TRAILING_PAGES_TO_DROP = 3
# Custom document-info key written into each rewritten file. Without it,
# every additional run of this cell would strip three more pages.
TRIM_MARKER = "/TrailingPagesTrimmed"

for local_pdf in local_pdfs:
    pdf_reader = PdfReader(local_pdf)

    # Already rewritten by an earlier run: leave the file alone.
    if (pdf_reader.metadata or {}).get(TRIM_MARKER):
        continue

    pages_to_keep = len(pdf_reader.pages) - TRAILING_PAGES_TO_DROP
    if pages_to_keep <= 0:
        # Too short to trim; rewriting would leave a PDF with no pages.
        continue

    pdf_writer = PdfWriter()
    for pagenum in range(pages_to_keep):
        pdf_writer.add_page(pdf_reader.pages[pagenum])
    pdf_writer.add_metadata({TRIM_MARKER: "true"})

    # 'wb' already truncates the file, so no seek()/truncate() is needed.
    with open(local_pdf, 'wb') as new_file:
        pdf_writer.write(new_file)

In [None]:
import numpy as np
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader, PyPDFDirectoryLoader

documents = []

# Load every letter page by page and replace each page's metadata with the
# year/source entry prepared for that file.
for file, file_metadata in zip(filenames, metadata):
    pages = PyPDFLoader(data_root + file).load()
    for page in pages:
        page.metadata = file_metadata
    documents.extend(pages)

# Recursive character splitting: chunks of at most 512 characters,
# overlapping by 100 characters.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=100)

docs = text_splitter.split_documents(documents)

print(f'# of Document Pages {len(documents)}')
print(f'# of Document Chunks: {len(docs)}')

# of Document Pages 25
# of Document Chunks: 299


# PSQL WITH DEV Libraries, PGVECTOR and PG Embedding

In [None]:
# https://python.langchain.com/docs/integrations/vectorstores/pgembedding

# install PSQL WITH DEV Libraries AND PGVECTOR
!apt install postgresql postgresql-contrib &>log
!service postgresql restart
!sudo apt install postgresql-server-dev-all

%cd /content/gdrive/MyDrive/tools/pgvector
!cp -pr /content/gdrive/MyDrive/tools/pgvector /content/
%cd /content/pgvector/
print()
print('START: PG VECTOR COMPILATION')
!make
!make install # may need sudo
print('END: PG VECTOR COMPILATION')
print()

%cd /content/
!git clone https://github.com/neondatabase/pg_embedding.git
%cd /content/pg_embedding
print()
print('START: PG embedding COMPILATION')
!make
!make install # may need sudo
print('END: PG embedding COMPILATION')
print()

 * Restarting PostgreSQL 14 database server
   ...done.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
postgresql-server-dev-all is already the newest version (238).
0 upgraded, 0 newly installed, 0 to remove and 30 not upgraded.
/content/gdrive/MyDrive/tools/pgvector
/content/pgvector

START: PG VECTOR COMPILATION
make: Nothing to be done for 'all'.
/bin/mkdir -p '/usr/lib/postgresql/14/lib'
/bin/mkdir -p '/usr/share/postgresql/14/extension'
/bin/mkdir -p '/usr/share/postgresql/14/extension'
/usr/bin/install -c -m 755  vector.so '/usr/lib/postgresql/14/lib/vector.so'
/usr/bin/install -c -m 644 .//vector.control '/usr/share/postgresql/14/extension/'
/usr/bin/install -c -m 644 .//sql/vector--0.1.0--0.1.1.sql .//sql/vector--0.1.1--0.1.3.sql .//sql/vector--0.1.3--0.1.4.sql .//sql/vector--0.1.4--0.1.5.sql .//sql/vector--0.1.5--0.1.6.sql .//sql/vector--0.1.6--0.1.7.sql .//sql/vector--0.1.7--0.1.8.sql .//sql/vector--0.1.8--0.2.0.sql .//sql/ve

In [None]:
import psycopg2 as ps

# PostGRES SQL Settings
%cd /content/
!sudo -u postgres psql -c "ALTER USER postgres PASSWORD 'postgres'"

#!sudo -u postgres psql -c "DROP EXTENSION embedding"
!sudo -u postgres psql -c "CREATE EXTENSION embedding"

!sudo -u postgres psql -c "DROP TABLE documents"
!sudo -u postgres psql -c "CREATE TABLE documents(id integer PRIMARY KEY, embedding real[])"

h="{0,1,2}"
hh= "INSERT INTO documents(id, embedding) VALUES (1,'%s'), (2,'{1,2,3}'),  (3,'{1,1,1}')"%h
print(hh)

def insert_document(id, embedding):
    """Insert one row into the ``documents`` table of the local PostgreSQL server.

    Args:
        id: Integer primary key of the new row.
        embedding: Vector for the ``real[]`` column, given either as a
            PostgreSQL array literal string such as ``'{0,1,2}'`` or as a
            Python list of numbers.

    Raises:
        psycopg2.Error: If connecting or inserting fails (for example on a
            duplicate ``id``). The transaction is rolled back in that case.
    """
    # Credentials of the throw-away local demo database configured in this cell.
    conn = ps.connect(database="postgres",
                      user="postgres",
                      password="postgres",
                      host="localhost",
                      port="5432")
    try:
        # `with conn` commits on success and rolls back on error; the cursor
        # context manager closes the cursor before the connection is closed.
        with conn, conn.cursor() as cur:
            # Values are bound by the driver instead of being formatted into
            # the SQL text, so quoting is handled correctly for any input.
            cur.execute(
                "INSERT INTO documents (id, embedding) VALUES (%s, %s)",
                (id, embedding),
            )
    finally:
        conn.close()

    # Wrapping in a 1-tuple keeps the message correct if `embedding` is itself a tuple.
    print("INSERT EMBEDDING %s successfully" % (embedding,))


# Seed the table with three 3-element example vectors.
insert_document(1,'{0,1,2}')
insert_document(2,"{1,2,3}")
insert_document(3,"{1,1,1}")


# Create an HNSW index on the embedding column; dims=3 matches the vectors inserted above.
!sudo -u postgres psql -c "CREATE INDEX ON documents USING hnsw(embedding) WITH (dims=3, m=3, efconstruction=5, efsearch=5)"
# NOTE(review): this SET runs in its own psql session, so it presumably does not
# carry over to the psycopg2 connections opened by select_document — confirm.
!sudo -u postgres psql -c "SET enable_seqscan = off"

# Query vector read by select_document.
ARRAY = [3, 3, 3]

def select_document(HNSW_index, query_vector=None):
    """Print the id of the ``documents`` row closest to a query vector.

    Args:
        HNSW_index: Distance operator placed in the ORDER BY clause; must be
            one of ``'<->'``, ``'<=>'`` or ``'<~>'``.
        query_vector: Sequence of numbers to search for. Defaults to the
            module-level ``ARRAY``.

    Raises:
        ValueError: If ``HNSW_index`` is not one of the supported operators.
        psycopg2.Error: If connecting or querying fails.
    """
    # An operator cannot be sent as a bound parameter, so it is checked against
    # a fixed allow-list before being placed into the SQL text.
    allowed_operators = ('<->', '<=>', '<~>')
    if HNSW_index not in allowed_operators:
        raise ValueError(
            "unsupported distance operator %r; expected one of %s"
            % (HNSW_index, ", ".join(allowed_operators))
        )

    if query_vector is None:
        query_vector = ARRAY

    # Credentials of the throw-away local demo database configured in this cell.
    conn = ps.connect(database="postgres",
                      user="postgres",
                      password="postgres",
                      host="localhost",
                      port="5432")
    try:
        with conn.cursor() as cur:
            # The vector is bound as a parameter (a Python list is sent as a
            # SQL ARRAY[...]) and cast to the column type, so any length or
            # numeric type works instead of exactly three integers.
            cur.execute(
                "SELECT id FROM documents "
                "ORDER BY embedding " + HNSW_index + " %s::real[] LIMIT 1",
                (list(query_vector),),
            )
            print(cur.fetchone())
    finally:
        # Read-only query: nothing to commit, just release the connection.
        conn.close()

# <->, <=>, and <~> operators define the distance metric, which calculates the distance between the query vector and each row of the dataset.
# Run the same nearest-neighbour lookup once per operator
# (per the pg_embedding README: <-> Euclidean, <=> cosine, <~> Manhattan).
for distance_operator in ('<->', '<=>', '<~>'):
    select_document(distance_operator)

#!sudo -u postgres psql -c "SELECT id FROM documents ORDER BY embedding <-> ARRAY[3,3,3] LIMIT 1"
#CREATE EXTENSION embedding;
#CREATE TABLE documents(id integer PRIMARY KEY, embedding real[]);
#INSERT INTO documents(id, embedding) VALUES (1, '{0,1,2}'), (2, '{1,2,3}'),  (3, '{1,1,1}');
#SELECT id FROM documents ORDER BY embedding <-> ARRAY[3,3,3] LIMIT 1;


/content
ALTER ROLE
ERROR:  extension "embedding" already exists
DROP TABLE
CREATE TABLE
INSERT INTO documents(id, embedding) VALUES (1,'{0,1,2}'), (2,'{1,2,3}'),  (3,'{1,1,1}')
INSERT EMBEDDING {0,1,2} successfully
INSERT EMBEDDING {1,2,3} successfully
INSERT EMBEDDING {1,1,1} successfully
CREATE INDEX
SET
(2,)
(3,)
(2,)


Postgres with the pg_embedding extension is used as the vector store.

pg_embedding uses a sequential scan by default, but you can create an HNSW index
using the `create_hnsw_index` method.

In [None]:
%cd /content/
%pip install colab-env --upgrade --quiet --root-user-action=ignore
import colab_env
import os
from langchain.docstore.document import Document
from langchain.document_loaders import TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import PGEmbedding
import openai

# Database URL is read from the environment.
connection_string = os.getenv("DATABASE_URL")
embeddings = OpenAIEmbeddings(model='text-embedding-ada-002')
collection_name = "AWS"

# Embed the document chunks and store them in the "AWS" collection.
# PGEmbedding is already imported at the top of this cell, so the second,
# identical import that used to sit here was dropped.
# NOTE(review): re-running this cell presumably inserts the same chunks again;
# check whether the collection should be cleared first — confirm.
db = PGEmbedding.from_documents(
    embedding=embeddings,
    documents=docs,
    collection_name=collection_name,
    connection_string=connection_string,
)

/content
Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


  warn_deprecated(


# Load chain from chain type

In [None]:
import torch
from textwrap import fill
from IPython.display import Markdown, display

from langchain.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate,
    )

from langchain import PromptTemplate
from langchain import HuggingFacePipeline

from langchain.vectorstores import Chroma
from langchain.schema import AIMessage, HumanMessage
from langchain.memory import ConversationBufferMemory
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredMarkdownLoader, UnstructuredURLLoader
from langchain.chains import LLMChain, SimpleSequentialChain, RetrievalQA, ConversationalRetrievalChain
from transformers import BitsAndBytesConfig, AutoModelForCausalLM, AutoTokenizer, GenerationConfig, pipeline
import warnings
warnings.filterwarnings('ignore')

In [None]:
#!pip install colab-env --upgrade --quiet --root-user-action=ignore
from langchain.llms import OpenAI
#import colab_env

# Similarity retriever over the vector store, returning k=2 documents per query.
retriever = db.as_retriever(search_kwargs={"k":2}, search_type="similarity")

# Question-answering chain that also returns the retrieved source documents.
# `llm` is expected to come from an earlier cell; OpenAI() (imported above)
# can be passed as `llm` instead.
qa = RetrievalQA.from_chain_type(
    retriever=retriever,
    chain_type="stuff",
    llm=llm,
    return_source_documents=True,
)

query = "How AWS has evolved?"
#query = "How many AI publications in 2022?"

# First ask the model directly, without any retrieved context.
result = llm(query)
for markup in (f"<b>{query}</b>", f"<p>{result}</p>"):
    display(Markdown(markup))

# Then ask the same question through the retrieval chain.
print('', 'chain to answer questions', "-" * 80, sep='\n')
result = qa({"query": query})
print(f'Query: {result["query"]}\n')
print(f'Result: {result["result"]}\n')
print('Context Documents: ')
for srcdoc in result["source_documents"]:
    print(f'{srcdoc}\n')

In [None]:
query = "Why is Amazon successful?"

# First ask the model directly, without any retrieved context.
result = llm(query)
for markup in (f"<b>{query}</b>", f"<p>{result}</p>"):
    display(Markdown(markup))

# Then ask the same question through the retrieval chain.
print('', 'chain to answer questions', "-" * 80, sep='\n')
result = qa({"query": query})
print(f'Query: {result["query"]}\n')
print(f'Result: {result["result"]}\n')
print('Context Documents: ')
for srcdoc in result["source_documents"]:
    print(f'{srcdoc}\n')

In [None]:
query = "What business challenges has Amazon experienced?"

# First ask the model directly, without any retrieved context.
result = llm(query)
for markup in (f"<b>{query}</b>", f"<p>{result}</p>"):
    display(Markdown(markup))

# Then ask the same question through the retrieval chain.
print('', 'chain to answer questions', "-" * 80, sep='\n')
result = qa({"query": query})
print(f'Query: {result["query"]}\n')
print(f'Result: {result["result"]}\n')
print('Context Documents: ')
for srcdoc in result["source_documents"]:
    print(f'{srcdoc}\n')