In [1]:
import os
import yaml
import google.generativeai as genai

In [2]:
# Load the API credentials from a YAML file under the user's home directory.
# The context manager closes the file handle as soon as parsing is done
# instead of leaving it open for the lifetime of the kernel.
with open(os.path.expanduser('~/.secrets/google_ai.yml')) as credentials_file:
    credentials = yaml.safe_load(credentials_file)

In [3]:
# Read the value stored under 'api_key' in the loaded credentials and
# hand it to the google.generativeai client configuration.
api_key = credentials['api_key']
genai.configure(api_key=api_key)

In [4]:
# Sample sports document: a section title plus a one-sentence body.
sports_news_text = dict(
    title='Sports Section',
    text='San Francisco 49ers are heading to the super bowl in a football showdown!',
)

In [5]:
# Sample finance document: a section title plus a one-sentence body.
finance_news_text = dict(
    title='Finance Section',
    text='Meta stock has reached all time high due to its big push in AI research',
)

In [6]:
# Print the name of each model returned by genai.list_models().
for model in genai.list_models():
    print(model.name)

models/chat-bison-001
models/text-bison-001
models/embedding-gecko-001
models/gemini-1.0-pro
models/gemini-1.0-pro-001
models/gemini-1.0-pro-latest
models/gemini-1.0-pro-vision-latest
models/gemini-1.5-flash-001
models/gemini-1.5-flash-latest
models/gemini-1.5-pro-001
models/gemini-1.5-pro-latest
models/gemini-pro
models/gemini-pro-vision
models/embedding-001
models/text-embedding-004
models/aqa


In [7]:
# Embed the sports document text with the 'models/embedding-001' model.
# The call returns a dict; the vector itself is the list stored under 'embedding'.
sports_embedding_vector = genai.embed_content(model='models/embedding-001',
                                              content=sports_news_text['text'],
                                              task_type='retrieval_document')

In [8]:
type(sports_embedding_vector)

dict

In [9]:
type(sports_embedding_vector['embedding'])

list

In [10]:
sports_embedding_vector['embedding']

[0.017034732,
 -0.002852212,
 -0.05543998,
 -0.02071096,
 0.04227543,
 -0.00046997407,
 -0.0052968594,
 0.0067575034,
 0.0108586075,
 0.03362889,
 -0.007843889,
 0.05088861,
 -0.014354059,
 0.025606241,
 -0.0055216746,
 -0.019924918,
 0.025058607,
 0.038961295,
 0.015219009,
 -0.012273454,
 0.04804724,
 0.010514121,
 -0.01890651,
 -0.0096841855,
 0.016060378,
 -0.040715527,
 0.07061795,
 -0.05924369,
 -0.02035509,
 0.025807017,
 -0.055238046,
 -0.000949542,
 -0.041503683,
 0.02579506,
 0.01932731,
 -0.07284029,
 -0.023513785,
 0.019454738,
 0.02661803,
 -0.0014038506,
 0.037862476,
 -0.033956554,
 -0.012977529,
 0.017805418,
 0.012731145,
 0.04063027,
 -0.01882565,
 0.021351354,
 0.045790687,
 -0.05137211,
 0.07544966,
 -0.0036192879,
 0.06400452,
 -0.004693121,
 -0.028665645,
 -0.019159509,
 0.051767167,
 -0.017864157,
 -0.008531156,
 -0.012861022,
 -0.0013564206,
 -0.026168708,
 0.018129522,
 0.008066004,
 -0.014225991,
 -0.035313647,
 -0.00829876,
 0.0133697195,
 0.038623534,
 0.008

In [11]:
len(sports_embedding_vector['embedding'])

768

In [12]:
# Embed the finance document text with the same model and task type
# that were used for the sports document.
finance_embedding_vector = genai.embed_content(
    model='models/embedding-001',
    content=finance_news_text['text'],
    task_type='retrieval_document',
)


In [13]:
len(finance_embedding_vector['embedding'])

768

In [14]:
def embed_text(text, model='models/embedding-001', task_type='retrieval_document'):
    """Return the embedding vector for ``text``.

    Parameters
    ----------
    text : str
        Content to embed.
    model : str, optional
        Embedding model name forwarded to ``genai.embed_content``.
        Defaults to ``'models/embedding-001'``.
    task_type : str, optional
        Task type forwarded to ``genai.embed_content``. Defaults to
        ``'retrieval_document'``; the Gemini embeddings documentation also
        lists e.g. ``'retrieval_query'`` for embedding search queries.

    Returns
    -------
    list
        The value stored under the ``'embedding'`` key of the API response.
    """
    response = genai.embed_content(model=model,
                                   content=text,
                                   task_type=task_type)
    # Only the vector is needed by callers; drop the surrounding response dict.
    return response['embedding']

In [15]:
# !pip install pandas

In [16]:
# store embedding
import pandas as pd  # or embedding Database chromaDb pinecone

In [17]:
documents = [finance_news_text, sports_news_text]

In [18]:
documents

[{'title': 'Finance Section',
  'text': 'Meta stock has reached all time high due to its big push in AI research'},
 {'title': 'Sports Section',
  'text': 'San Francisco 49ers are heading to the super bowl in a football showdown!'}]

In [19]:
df = pd.DataFrame(documents)

In [20]:
df

Unnamed: 0,title,text
0,Finance Section,Meta stock has reached all time high due to it...
1,Sports Section,San Francisco 49ers are heading to the super b...


In [21]:
df.columns = ['Title', 'Text']

In [22]:
df

Unnamed: 0,Title,Text
0,Finance Section,Meta stock has reached all time high due to it...
1,Sports Section,San Francisco 49ers are heading to the super b...


In [23]:
df['Embeddings'] = df['Text'].apply(embed_text)  # vector store

In [24]:
df

Unnamed: 0,Title,Text,Embeddings
0,Finance Section,Meta stock has reached all time high due to it...,"[0.040587973, -0.00784966, -0.017593645, 0.016..."
1,Sports Section,San Francisco 49ers are heading to the super b...,"[0.017034732, -0.002852212, -0.05543998, -0.02..."


In [25]:
import numpy as np

In [26]:
def query_similarity_score(query,vector):
    """Score how similar a text query is to an existing embedding.

    query  -- the user's question as a plain string,
              e.g. "tell me about football sport?"
    vector -- an embedding (list or numpy array) to compare against

    The query is embedded with embed_text and the dot product of the
    two vectors is returned.
    """
    encoded_query = embed_text(query)
    score = np.dot(encoded_query, vector)
    return score

In [27]:
query = "Any interesting news about Meta stock performance today"

In [28]:
# Embed the query a single time, then score each stored vector against it
# with a dot product. Going through query_similarity_score per row would
# re-embed the same query (one API call) for every document.
query_embedding = embed_text(query)
df['Similarity'] = df['Embeddings'].apply(lambda vector: np.dot(query_embedding, vector))

In [29]:
df

Unnamed: 0,Title,Text,Embeddings,Similarity
0,Finance Section,Meta stock has reached all time high due to it...,"[0.040587973, -0.00784966, -0.017593645, 0.016...",0.871182
1,Sports Section,San Francisco 49ers are heading to the super b...,"[0.017034732, -0.002852212, -0.05543998, -0.02...",0.660011


In [30]:
df.sort_values('Similarity',ascending=False)[['Title','Text']].iloc[0]

Title                                      Finance Section
Text     Meta stock has reached all time high due to it...
Name: 0, dtype: object

In [31]:
def most_similar_document(df, query):
    """Return the ``(title, text)`` of the row in ``df`` most similar to ``query``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Title', 'Text' and 'Embeddings'.
    query : str
        The user's question; it is embedded with ``embed_text``.

    Returns
    -------
    tuple
        ``(title, text)`` of the row with the highest dot-product similarity.

    Raises
    ------
    IndexError
        If ``df`` has no rows.

    Notes
    -----
    Side effect: the 'Similarity' column of ``df`` is created or overwritten.
    """
    # Embed the query once; embedding inside the per-row lambda would issue
    # one embedding request per document for the very same text.
    query_embedding = embed_text(query)
    df['Similarity'] = df['Embeddings'].apply(lambda vector: np.dot(query_embedding, vector))
    # Sort once and read both fields from the same top row.
    best_match = df.sort_values('Similarity', ascending=False).iloc[0]
    return best_match['Title'], best_match['Text']

In [32]:
title, text = most_similar_document(df, 'Any news about the San Francisco 49ers today?')

In [33]:
title

'Sports Section'

In [34]:
text

'San Francisco 49ers are heading to the super bowl in a football showdown!'

In [35]:
def RAG(df, query, model_name='gemini-pro'):
    """Answer ``query`` using the most similar document in ``df`` as context.

    Parameters
    ----------
    df : pandas.DataFrame
        Document store passed through to ``most_similar_document``.
    query : str
        The user's question.
    model_name : str, optional
        Name given to ``genai.GenerativeModel``. Defaults to ``'gemini-pro'``,
        the value that was previously hard-coded.

    Returns
    -------
    str
        The model's response text followed by the title of the document
        that was used as context.
    """
    # Retrieval: pick the document closest to the query.
    title, text = most_similar_document(df, query)
    # Generation model.
    model = genai.GenerativeModel(model_name)
    # Augmentation: the retrieved text is appended to the prompt as context.
    prompt = f"Answer this query:\n{query}.\nOnly use this context to answer but give me any extra info that you maybe aware of:\n{text}"
    response = model.generate_content(prompt)
    # Sourcing: report the title so the reader knows which document was used.
    return f"{response.text}\n\nSource Doc Title: {title}"

In [36]:
print(RAG(df, "Do you have any information about the 49ers?"))

I do not have any other information about the 49ers from the provided context.

Source Doc Title: Sports Section


# Test with real-world documents

In [37]:
#!pip install PyPDF2

In [38]:
import os
import pandas as pd
from PyPDF2 import PdfReader

def _read_pdf_text(pdf_path):
    """Return the text of all pages of the PDF at ``pdf_path`` as one string.

    Page texts are concatenated in page order without a separator and every
    newline is replaced by a space.
    """
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PdfReader(pdf_file)
        # `or ''` keeps the join safe should a page yield a falsy result
        # (e.g. None) instead of a string.
        page_texts = [page.extract_text() or '' for page in pdf_reader.pages]
    return ''.join(page_texts).replace('\n', ' ')


subdir = '03-Embeddings-and-RAG'
# Collect one record per PDF and build the DataFrame once at the end,
# rather than concatenating a one-row frame on every iteration.
records = []
# Sorted so the row order does not depend on the directory listing order.
for file_name in sorted(os.listdir(subdir)):
    if file_name.endswith('.pdf'):
        try:
            pdf_text = _read_pdf_text(os.path.join(subdir, file_name))
            records.append({'Title': file_name, 'Text': pdf_text})
        except Exception as e:
            # Best effort: report the unreadable file and continue with the rest.
            print(f"Error processing file {file_name}: {e}")

# Explicit columns keep 'Title' and 'Text' present even when no PDF was loaded.
df = pd.DataFrame(records, columns=['Title', 'Text'])

In [39]:
print(df['Text'].iloc[0])

Wonka Milk Chocolate Factory: Facility Safety Rules Preface: The Wonka Milk Chocolate Factory prioritizes the safety and well-being of its employees and visitors. These detailed safety rules are designed to ensure a safe and healthy work environment for everyone. General Rules: 1. Personal Protective Equipment (PPE): All employees must wear the designated PPE at all times while on the factory floor . This includes: ○ Hairnets or beard covers ○ Safety glasses or goggles ○ Disposable gloves ○ Protective clothing (aprons or overalls) ○ Closed-toe, non-slip shoes 2. Hygiene: Thorough handwashing with soap and water is required: ○ Before starting work ○ After using the restroom ○ After eating, drinking, or smoking ○ After handling trash or any unsanitary items ○ Before handling any food materials 3. Food Safety: Employees must adhere to strict food safety guidelines: ○ Do not eat, drink, or smoke in production areas. ○ Do not touch your face, hair, or clothing while handling food materials.

In [40]:
df.head(5)

Unnamed: 0,Title,Text
0,Wonka Chocolate Facility Rules.pdf,Wonka Milk Chocolate Factory: Facility Safety ...


In [41]:
# create vector embeddings and store it
df['Embeddings'] = df['Text'].apply(embed_text) 

In [42]:
df.head(1)

Unnamed: 0,Title,Text,Embeddings
0,Wonka Chocolate Facility Rules.pdf,Wonka Milk Chocolate Factory: Facility Safety ...,"[0.013815645, -0.057831157, -0.03580527, -0.02..."


In [43]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Title       1 non-null      object
 1   Text        1 non-null      object
 2   Embeddings  1 non-null      object
dtypes: object(3)
memory usage: 152.0+ bytes


In [44]:
query_doc = "What to do in case of injuries?"

In [45]:
# perform similarity search
print(RAG(df, query_doc))

In case of injuries, the following steps should be taken:

1. Report all accidents and injuries to supervisors immediately.
2. Thorough investigations will be conducted for all incidents.
3. Corrective actions will be taken to prevent future incidents.

Source Doc Title: Wonka Chocolate Facility Rules.pdf
