In [36]:
print('Hi, We will create a RAG below')

Hi, We will create a RAG below


In [37]:

import boto3

In [38]:
# Shared Bedrock runtime client; the functions below call invoke_model on it.
bedrock_runtime = boto3.client('bedrock-runtime', region_name='us-east-1')

In [39]:
import json
import string

In [40]:
def embed_text(text: str) -> list:
    """Return the Titan embedding for ``text``.

    Sends ``text`` as ``inputText`` to the ``amazon.titan-embed-text-v2:0``
    model through the module-level ``bedrock_runtime`` client and returns the
    ``embedding`` field of the decoded JSON response.

    Args:
        text: The text to embed. (Annotated as ``str``; the earlier
            ``string`` annotation referred to the stdlib module, not a type.)

    Returns:
        The value stored under ``'embedding'`` in the model's response
        (a list of floats in the recorded notebook output).
    """
    body = json.dumps({'inputText': text})
    response = bedrock_runtime.invoke_model(body=body, modelId='amazon.titan-embed-text-v2:0')
    # The response body is read in full before being decoded as JSON.
    return json.loads(response.get('body').read())['embedding']

In [41]:
embed_text("hello how are you")

[-0.06819879,
 0.0455736,
 -0.0032725723,
 0.017049698,
 -0.031836875,
 -0.03781639,
 -0.02488771,
 0.04686647,
 -0.011959029,
 -0.00073733885,
 -0.0031109639,
 0.021332324,
 0.008807664,
 -0.011797422,
 -0.034422614,
 0.02925114,
 -0.022948408,
 -0.0006211827,
 -0.015514417,
 0.012120638,
 -0.019231413,
 0.06658271,
 0.037978,
 0.022463582,
 -0.019877847,
 -0.03571548,
 -0.034099396,
 -0.031028833,
 0.024402885,
 -0.022948408,
 0.085329294,
 0.013332702,
 -0.052361157,
 -0.033129744,
 -0.026827013,
 0.05785585,
 -0.004242223,
 0.024726102,
 0.04880577,
 -0.0129286805,
 0.060441583,
 -0.023271626,
 -0.0029089532,
 0.07821852,
 0.030705618,
 0.034099396,
 -0.015676025,
 0.036523525,
 0.030705618,
 -0.016564872,
 0.040886953,
 -0.00033836783,
 -0.001292868,
 0.018100154,
 -0.052361157,
 0.038462825,
 0.017211307,
 0.040402126,
 0.018584978,
 0.03781639,
 -0.006989568,
 0.014140745,
 0.016645677,
 -0.03668513,
 -0.006383536,
 0.006060319,
 -0.0044240328,
 0.0010504553,
 0.015676025,
 -5.8

In [42]:
import os
import pandas as pd

directory_path = 'rag-input'
data = []

# Collect [filename, full text] for every .txt file directly inside the
# input directory; anything else is skipped.
for filename in os.listdir(directory_path):
    if not filename.endswith('.txt'):
        continue

    file_path = os.path.join(directory_path, filename)
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    data.append([filename, content])

In [43]:
# One row per loaded .txt file: its filename and its full text.
df = pd.DataFrame(data, columns=['filename','text'])

In [44]:
df

Unnamed: 0,filename,text
0,irs.txt,Different sources provide the authority for ta...
1,intuit.txt,Intuit Inc. is an American multinational busin...


In [45]:
# Embed each document's text; embed_text makes one invoke_model call per row.
df['embedding'] = df['text'].apply(embed_text)

In [46]:
df

Unnamed: 0,filename,text,embedding
0,irs.txt,Different sources provide the authority for ta...,"[-0.011584873, 0.011296575, 0.037186958, -0.02..."
1,intuit.txt,Intuit Inc. is an American multinational busin...,"[-0.0520132, -0.007097195, 0.01972329, -0.0099..."


In [47]:
import numpy as np

def cosine_similarity(vector1, vector2):
    """Return the cosine similarity between two vectors.

    Computed as ``dot(v1, v2) / (||v1|| * ||v2||)``.

    Args:
        vector1: Array-like of numbers.
        vector2: Array-like of numbers, same length as ``vector1``.

    Returns:
        The cosine similarity, or ``0.0`` when either vector has zero
        magnitude (the ratio is undefined there; previously this produced
        ``nan`` together with a NumPy RuntimeWarning).
    """
    vector1 = np.asarray(vector1)
    vector2 = np.asarray(vector2)

    denominator = np.linalg.norm(vector1) * np.linalg.norm(vector2)
    if denominator == 0:
        # A zero vector has no direction; report "no similarity" rather
        # than dividing 0 by 0.
        return 0.0

    return np.dot(vector1, vector2) / denominator

In [48]:
def most_similar_text(prompt):
    """Return the text of the document most similar to ``prompt``.

    Embeds ``prompt`` with ``embed_text``, scores it against every vector in
    ``df['embedding']`` with ``cosine_similarity`` and returns the ``text``
    value of the highest-scoring row.

    The scores are attached to a copy of ``df``; the module-level ``df`` is
    left unchanged, so a lookup no longer adds or overwrites a
    ``prompt_similarity`` column on the shared frame.

    Args:
        prompt: The query text.

    Returns:
        The ``text`` value of the best-matching row of ``df``.

    Raises:
        IndexError: If ``df`` has no rows.
    """
    if df.empty:
        # Same exception type the positional lookup below would raise on an
        # empty frame, but raised before spending an embedding call.
        raise IndexError("no documents loaded: `df` is empty")

    prompt_embedding = embed_text(prompt)
    similarities = df['embedding'].apply(
        lambda vector: cosine_similarity(vector, prompt_embedding)
    )
    # assign() returns a new frame carrying the score column.
    scored = df.assign(prompt_similarity=similarities)
    return scored.nlargest(1, 'prompt_similarity').iloc[0]['text']

In [49]:
most_similar_text("When was federal income tax enacted?")

'Different sources provide the authority for tax rules and procedures. Here are some sources that can be searched online for free.\n\nInternal Revenue Code\nThe Constitution gives Congress the power to tax. Congress typically enacts Federal tax law in the Internal Revenue Code of 1986 (IRC).\n\nThe sections of the IRC can be found in Title 26 of the United States Code (26 USC). An electronic version of the current United States Code is made available to the public by Congress.\n\nBrowse "Title 26—Internal Revenue Code" to see the table of contents for the IRC.\n"Jump To" to a specific section of Title 26 to find the text for that IRC provision. For example, you can "Jump To" Title 26 Section 24 to find the provision for the child tax credit in the IRC.\nUse the Advanced Search feature to search only in Title 26 for a specific term. For example, you can search for "child tax credit" in Title 26 to find section 24 of the IRC, shown as 26 USC 24.\nCaution: Before relying on any IRC sectio

In [50]:
def retrieval_augmented_generation(prompt, max_token_count=512):
    """Answer ``prompt`` using the most similar stored document as context.

    Looks up the best-matching document with ``most_similar_text``, places it
    ahead of the question in a single prompt, and sends that prompt to the
    ``amazon.titan-text-express-v1`` model through ``bedrock_runtime``.

    Args:
        prompt: The question to answer.
        max_token_count: Upper bound on generated tokens, sent as
            ``textGenerationConfig.maxTokenCount``. The request previously
            set no limit and a recorded answer ended mid-sentence.
            NOTE(review): presumably that run hit the service-side default
            limit; confirm against the Titan Text parameter documentation.

    Returns:
        The ``outputText`` of the first entry in the response's ``results``.
    """
    rag_text = most_similar_text(prompt)

    full_prompt = f"{rag_text} \n\n Answer the following question: \n {prompt}"

    body = json.dumps({
        'inputText': full_prompt,
        'textGenerationConfig': {'maxTokenCount': max_token_count},
    })

    response = bedrock_runtime.invoke_model(body=body, modelId='amazon.titan-text-express-v1')
    response_body = json.loads(response.get('body').read())
    return response_body['results'][0]['outputText']

In [51]:
retrieval_augmented_generation("When was federal income tax enacted?")

'\nFederal income tax was enacted in 1913.'

In [52]:
# Second query, about Intuit (prompt spelling corrected: "acquisitions").
retrieval_augmented_generation("What are some of the notable acquisitions by Intuit?")

'\nIn 1993, Intuit acquired Chipsoft, a tax preparation software company based in San Diego.\nIn 1994, the firm acquired the tax preparation software division of Best Programs of Reston, VA.\nIn 1996, it acquired GALT Technologies, Inc of Pittsburgh, PA.\nIn 1998, it acquired Lacerte Software Corp., which now operates as an Intuit subsidiary.\nOn March 2, 1999, Intuit acquired Computing Resources Inc. of Reno, Nevada for approximately $200 million.\nIn December'