In [None]:
# The purpose of this notebook is to become familiar with vector stores in general
# and with AWS Bedrock knowledge bases in particular.

# required setup
import boto3
import json
import matplotlib.pyplot as plt
# aws setup
# The commented-out line below would select a named AWS profile for the default
# boto3 session; per the original note it is not needed on conveyor.
# NOTE(review): presumably only useful when running outside conveyor — confirm, then remove.
#boto3.setup_default_session(profile_name='conveyor-demo-profile') # not needed on conveyor [remove]

# Client for the 'bedrock-agent-runtime' service, pinned to the us-east-1 region.
bedrock = boto3.client('bedrock-agent-runtime', region_name='us-east-1') # note the region
# Knowledge base identifier handed to the retrieval calls made with the client above.
knowledge_base_id ='ECZYEUIJ59' # this is the id of the decisions knowledge base in english

In [None]:
# thin convenience layer over the bedrock client's `retrieve` call
def retrieve(bedrock_client, knowledge_base_id, search_string, items=10):
    """Run a text query against a knowledge base and return the hits.

    Args:
        bedrock_client: Object exposing a ``retrieve`` method that accepts the
            ``knowledgeBaseId``, ``retrievalQuery`` and
            ``retrievalConfiguration`` keyword arguments.
        knowledge_base_id: Forwarded unchanged as ``knowledgeBaseId``.
        search_string: Query text, sent as ``retrievalQuery['text']``.
        items: Forwarded as ``numberOfResults`` of the vector search
            configuration (defaults to 10).

    Returns:
        Whatever the response stores under its ``'retrievalResults'`` key.

    Raises:
        KeyError: If the response has no ``'retrievalResults'`` entry.
    """
    query_payload = {'text': search_string}
    search_settings = {'vectorSearchConfiguration': {'numberOfResults': items}}
    response = bedrock_client.retrieve(
        knowledgeBaseId=knowledge_base_id,
        retrievalQuery=query_payload,
        retrievalConfiguration=search_settings,
    )
    return response['retrievalResults']

In [None]:
# Baseline: a plain-language statement, retrieved with the wrapper's default result count.
results = retrieve(
    bedrock,
    knowledge_base_id,
    search_string='the covid 19 crises had an impact on mental health',
)

# First show which fields a single hit carries, then dump the full payload.
first_hit = results[0]
print(first_hit.keys())
pretty_results = json.dumps(results, indent=2)
print(pretty_results)

In [None]:
# what happens if we search for an exact string we know is in the knowledge base?
# The passage is kept in a named variable so the retrieve call itself stays readable.
exact_passage = "COVID-19: mental wellbeing action plan\n\n\nThe coronavirus does not only affect physical health,\nbut also has a major impact on the mental well-being of the\npopulation. The Flemish Government therefore approves the action plan mentally.\nwell-being or \u201cCaring for tomorrow\u201d is good. With that plan, she wants a strong signal.\nindicate to the population that they are aware of the psychosocial consequences of the\ntakes corona measures seriously and wants to implement adequate solutions. The\nmeasures in the plan not only want the general population, but also\nspecific target groups such as care providers, children, young people, families,\npeople with a psychological or social vulnerability, people\nwho have been seriously ill due to covid-19 and their loved ones, and the relatives of\nsupport and strengthen deceased patients."
results = retrieve(bedrock, knowledge_base_id, search_string=exact_passage)
serialized_results = json.dumps(results, indent=2)
print(serialized_results)

In [None]:
# lets take a look at scores between queries
# Upper bound on the number of hits requested per query.
num_results = 85

query_nonsense = "par gert fre fdv dew"
query_simple = "the covid 19 crises had an impact on mental health"
query_exact = "COVID-19: mental wellbeing action plan\n\n\nThe coronavirus does not only affect physical health,\nbut also has a major impact on the mental well-being of the\npopulation. The Flemish Government therefore approves the action plan mentally.\nwell-being or \u201cCaring for tomorrow\u201d is good. With that plan, she wants a strong signal.\nindicate to the population that they are aware of the psychosocial consequences of the\ntakes corona measures seriously and wants to implement adequate solutions. The\nmeasures in the plan not only want the general population, but also\nspecific target groups such as care providers, children, young people, families,\npeople with a psychological or social vulnerability, people\nwho have been seriously ill due to covid-19 and their loved ones, and the relatives of\nsupport and strengthen deceased patients."
query_simple_rephrased = "emotional wellbeing was affected as a result of the coronavirus pandemic"

# Keep every legend label next to its query so the two cannot drift apart
# when a query is added, removed or reordered.
labelled_queries = {
    'nonsense': query_nonsense,
    'simple': query_simple,
    'exact': query_exact,
    'rephrased': query_simple_rephrased,
}

# for each query we will retrieve the top num_results results and plot the scores
fig, ax = plt.subplots()
for label, query in labelled_queries.items():
    scores = [item['score'] for item in retrieve(bedrock, knowledge_base_id, query, num_results)]
    # num_results is only an upper bound on what comes back: size the x axis
    # from the actual number of scores, otherwise a shorter result list makes
    # matplotlib raise a shape-mismatch ValueError.
    ax.plot(range(len(scores)), scores, label=label)
ax.set_xlabel('result position')
ax.set_ylabel('score')
ax.set_title('Retrieval scores per query type')
ax.legend()
plt.show()

In [None]:
# lets take a look at different query lengths
# Upper bound on the number of hits requested per query.
num_results = 85

query_short = 'climate change'
query_medium = 'the impacts of climate change in belgium'
query_long = 'what can we expect the long term impact of climate change on the belgian economy to be and what are the best strategies to mitigate these impacts?'

# Keep every legend label next to its query so the two cannot drift apart.
labelled_queries = {
    'short': query_short,
    'medium': query_medium,
    'long': query_long,
}

# for each query we will retrieve the top num_results results and plot the scores
fig, ax = plt.subplots()
for label, query in labelled_queries.items():
    scores = [item['score'] for item in retrieve(bedrock, knowledge_base_id, query, num_results)]
    # num_results is only an upper bound on what comes back: size the x axis
    # from the actual number of scores, otherwise a shorter result list makes
    # matplotlib raise a shape-mismatch ValueError.
    ax.plot(range(len(scores)), scores, label=label)
ax.set_xlabel('result position')
ax.set_ylabel('score')
ax.set_title('Retrieval scores per query length')
ax.legend()

# Print the text of the first hit for each query so the plotted scores can be
# compared with what was actually matched.
for query in labelled_queries.values():
    top_hits = retrieve(bedrock, knowledge_base_id, query, 1)
    # An empty result list would make [0] raise IndexError; report it instead
    # so the remaining queries are still shown.
    text = top_hits[0]['content']['text'] if top_hits else '(no results returned)'
    print(query)
    print(text)
    print('_________________________')

## where things start to break down

In [None]:
# exact lexical matching: query a proper name and print the text of the top 5 hits
results = retrieve(bedrock, knowledge_base_id, "Nelson Mandela", items=5)
separator = '_________________________'
for hit in results:
    # one print call per hit: the chunk text, then the divider on its own line
    print(hit['content']['text'], separator, sep='\n')

In [None]:
# exact lexical matching, second example: another proper name, top 5 hits
name_query = "Alexander De Croo"
results = retrieve(bedrock, knowledge_base_id, name_query, 5)
for hit in results:
    chunk_text = hit['content']['text']
    print(chunk_text)
    print('_________________________')

In [None]:
# relative time is not searchable: ask for the "newest" decisions and print
# the text of the top 5 hits to inspect what comes back
query = "what are the newest decisions"
results = retrieve(bedrock, knowledge_base_id, search_string=query, items=5)
divider = '_________________________'
for hit in results:
    print(hit['content']['text'])
    print(divider)