In [1]:
# List all the models aavailable in Amazon Bedrock

import boto3
import pandas as pd

# Create a Bedrock client
bedrock_client = boto3.client('bedrock')

# Create a Bedrock Runtime client
bedrock_runtime_client = boto3.client('bedrock-runtime')

In [2]:
# store all the models in a variable
all_models = bedrock_client.list_foundation_models()
all_models_summaries = all_models['modelSummaries'] 

# Convert the model summaries to a DataFrame with only the columns we want
df = pd.DataFrame(all_models_summaries, columns=['modelName', 'providerName', 'modelId', 'modelArn', 'inputModalities', 'outputModalities'])

# Show the DataFrame without truncating the output and display all rows
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

# Print the models that are provided by Amazon and show the first 10 rows and select the columns modelName, providerName, modelId
df[df['providerName'] == 'Amazon'][:10][['modelName', 'providerName', 'modelId']]

Unnamed: 0,modelName,providerName,modelId
0,Titan Text Large,Amazon,amazon.titan-tg1-large
1,Titan Image Generator G1,Amazon,amazon.titan-image-generator-v1:0
2,Titan Image Generator G1,Amazon,amazon.titan-image-generator-v1
3,Titan Image Generator G1 v2,Amazon,amazon.titan-image-generator-v2:0
4,Nova Premier,Amazon,amazon.nova-premier-v1:0:8k
5,Nova Premier,Amazon,amazon.nova-premier-v1:0:20k
6,Nova Premier,Amazon,amazon.nova-premier-v1:0:1000k
7,Nova Premier,Amazon,amazon.nova-premier-v1:0:mm
8,Nova Premier,Amazon,amazon.nova-premier-v1:0
9,Titan Text G1 - Premier,Amazon,amazon.titan-text-premier-v1:0


In [3]:
# Print all the embedding models
df[df['modelId'].str.contains('emb')][['modelName', 'providerName', 'modelId', 'inputModalities', 'outputModalities']]


Unnamed: 0,modelName,providerName,modelId,inputModalities,outputModalities
28,Titan Text Embeddings v2,Amazon,amazon.titan-embed-g1-text-02,[TEXT],[EMBEDDING]
33,Titan Embeddings G1 - Text,Amazon,amazon.titan-embed-text-v1:2:8k,[TEXT],[EMBEDDING]
34,Titan Embeddings G1 - Text,Amazon,amazon.titan-embed-text-v1,[TEXT],[EMBEDDING]
35,Titan Text Embeddings V2,Amazon,amazon.titan-embed-text-v2:0:8k,[TEXT],[EMBEDDING]
36,Titan Text Embeddings V2,Amazon,amazon.titan-embed-text-v2:0,[TEXT],[EMBEDDING]
37,Titan Multimodal Embeddings G1,Amazon,amazon.titan-embed-image-v1:0,"[TEXT, IMAGE]",[EMBEDDING]
38,Titan Multimodal Embeddings G1,Amazon,amazon.titan-embed-image-v1,"[TEXT, IMAGE]",[EMBEDDING]
72,Embed English,Cohere,cohere.embed-english-v3:0:512,[TEXT],[EMBEDDING]
73,Embed English,Cohere,cohere.embed-english-v3,[TEXT],[EMBEDDING]
74,Embed Multilingual,Cohere,cohere.embed-multilingual-v3:0:512,[TEXT],[EMBEDDING]


In [80]:
from langchain_community.document_loaders import BSHTMLLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_aws import BedrockEmbeddings
from langchain_community.vectorstores import FAISS
import os

def create_vector_store(index_name="2023WC"):
    # Create output directory if it doesn't exist
    os.makedirs("data", exist_ok=True)
    
    # Load the HTML file
    loader = BSHTMLLoader("data/2023WC.html")
    data = loader.load()
    
    # Split the text into chunks
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = text_splitter.split_documents(data)
    
    # Initialize embeddings
    embeddings = BedrockEmbeddings(model_id='amazon.titan-embed-text-v2:0')
    
    # Create FAISS vector store
    db = FAISS.from_documents(chunks, embeddings)
    
    # Save the vector store
    db.save_local("data", index_name)
    print("Vector store created successfully!")
    
    return db
    

index = create_vector_store(index_name="2023WC")

Created a chunk of size 1599, which is longer than the specified 1000
Created a chunk of size 1249, which is longer than the specified 1000
Created a chunk of size 1296, which is longer than the specified 1000
Created a chunk of size 1255, which is longer than the specified 1000
Created a chunk of size 1170, which is longer than the specified 1000
Created a chunk of size 1463, which is longer than the specified 1000
Created a chunk of size 3698, which is longer than the specified 1000


Vector store created successfully!


In [77]:
import re
from langchain_aws import BedrockEmbeddings, ChatBedrock
from langchain_community.vectorstores import FAISS

In [93]:
# Function to clean and standardize text by removing unwanted elements
def clean_text(text):
    # Convert non-breaking spaces to regular spaces
    text = text.replace('\xa0', ' ')
    
    # Remove HTML tags
    text = re.sub(r'<[^>]+>', '', text)
    
    # Remove citation references like [1], [2,3], etc.
    text = re.sub(r'\[.*?\]', '', text)
    
    # Normalize whitespace by removing extra spaces and line breaks
    text = ' '.join(text.split())
    
    return text

def rag_function(query, index):
    # Initialize Bedrock Embeddings
    embeddings = BedrockEmbeddings(model_id='amazon.titan-embed-text-v2:0')

    # Retrieve relevant documents
    retrieved_docs = index.similarity_search(query, k=2)
    retrieved_context = [clean_text(retrieved_docs[0].page_content + retrieved_docs[1].page_content)]

    # Create augmented prompt
    augmented_prompt = f"""
    Given the context below answer the question.

    Question: {query} 

    Context: {retrieved_context}

    Remember to answer only based on the context provided and not from any other source. 

    If the question cannot be answered based on the provided context, say I don't know.
    """

    # Initialize Bedrock Claude model
    llm = ChatBedrock(
        model_id="anthropic.claude-3-sonnet-20240229-v1:0",
        model_kwargs={
            "temperature": 0,
            "max_tokens": 2048
        }
    )

    # Generate response
    response = llm.invoke(augmented_prompt)

    return retrieved_context, response.content

In [108]:
def display_rag_results(context, answer):
    # Display the context and answer with markdown
    from IPython.display import Markdown

    md_text = f"""
    ### Context:
    {context}

    ### Answer:
    {answer}
    """

    display(Markdown(md_text))


In [109]:
query = "What RAG?"
context, answer = rag_function(query, index)


display_rag_results(context, answer)


    ### Context:
    ['All-rounder KL Rahul India Batter Glenn Maxwell Australia All-rounder Ravindra Jadeja India All-rounder Jasprit Bumrah India Bowler Dilshan Madushanka Sri Lanka Bowler Adam Zampa Australia Bowler Mohammed Shami India Bowler Gerald Coetzee South Africa Twelfth^ Malhotra, Sahil (18 November 2023). "Salute from the Skies, Parade of Champions, Drone Show and More… : World Cup Final Set to be a Grand Spectacle". News18. Archived from the original on 18 November 2023. Retrieved 18 November 2023. ^ a b Witney, Katya (6 July 2023). "CWC Qualifier 2023: Netherlands qualify for World Cup at Scotland\'s expense after stunning Bas de Leede heist". Wisden. London: Bloomsbury. Archived from the original on 9 July 2023. Retrieved 9 July 2023. ^ "Sri Lanka qualifies for Cricket World Cup; Zimbabwe, Scotland to scrap it out for final place". AP News. New York: Associated Press. 2 July 2023. Archived from the original on 6 July 2023. Retrieved 6 July 2023. ^ "West Indies Officially Eliminated from 2023 World Cup Race After Thumping Loss to Scotland in Historic Low". Wisden. London: Bloomsbury. 1 July 2023. Archived from the original on 4 July 2023. Retrieved 4 July 2023.']

    ### Answer:
    I don't know what "RAG" refers to based on the given context. The context appears to be about cricket players, teams, and news related to the 2023 Cricket World Cup qualifiers and preparations, but does not mention anything about "RAG".
    

In [110]:
query = "Who won the world cup?"
context, answer = rag_function(query, index)


display_rag_results(context, answer)


    ### Context:
    ['Format This was the first ICC World Cup in which penalties for slow over-rates were given to bowling sides if they did not complete their 50 overs in the stipulated time. On-field umpires could penalise the bowling team by not allowing more than four fielders outside the 30-yard circle. Pakistan\'s participation The Pakistan Cricket Board (PCB) had threatened to boycott the tournament after the Board of Control for Cricket in India (BCCI) refused to send a team to the 2023 Asia Cup scheduled in Pakistan. This issue was resolved in June 2023 after the Asian Cricket Council announced that the tournament would be hosted using a hybrid model proposed by the PCB, with nine of the 13 matches in the competition played in Sri Lanka.See also List of Cricket World Cup finals References ^ "2023 ODI World Cup shatters viewership records". ESPNcricinfo. Retrieved 15 February 2024. ^ "India to host 2023 World Cup, Champions Trophy in 2021". The Indian Express. 11 December 2017. Retrieved 15 February 2024. ^ "Outcomes from ICC Annual Conference week in London". ICC. Dubai: International Cricket Council. 13 June 2013. Archived from the original on 14 October 2017. Retrieved 22 June 2017. ^ "IPL now has window in ICC Future Tours Programme". ESPNcricinfo. 12 December 2017. Archived from the original on 25 December 2018. Retrieved 12 December 2017. ^ "ICC postpones T20 World Cup due to Covid-19 pandemic". ESPNcricinfo. 20 July 2020. Archived from the original on 23 October 2022. Retrieved 20 July 2020. ^ "Men\'s T20 World Cup postponed" (Press release). Dubai: ICC. 20 July 2020. Archived from the original on 20 July 2020. Retrieved 20 July 2020.']

    ### Answer:
    I don't know who won the 2023 Cricket World Cup based on the given context. The context provides information about the format, Pakistan's participation, and some general details about the 2023 World Cup, but does not mention the winner.
    

In [111]:
query = "What was Virat Kohli's achievement in the Cup?"
context, answer = rag_function(query, index)


display_rag_results(context, answer)


    ### Context:
    ['Cricket tournament 2023 ICC Men\'s Cricket World CupDates5 October – 19 November 2023Administrator(s)International Cricket CouncilCricket formatOne Day International (ODI)Tournament format(s)Round-robin and knockoutHost(s)IndiaChampions Australia (6th title)Runners-up IndiaParticipants10Matches48Attendance1,250,307 (26,048 per match)Player of the series Virat KohliMost runs Virat Kohli (765)Most wickets Mohammed Shami (24)Official websitecricketworldcup.com← 20192027 → The 2023 ICC Men\'s Cricket World Cup was the 13th edition of the ICC Men\'s Cricket World Cup, a quadrennial One Day International (ODI) cricket tournament organized by the International Cricket Council (ICC). It was hosted from 5 October to 19 November 2023 across ten venues in India. This was the fourth World Cup held in India, but the first where India was the sole host. The tournament was contested by ten national teams, maintaining the same format used in 2019. After six weeks of round-robin matches, India, South Africa, Australia, and New Zealand finished as the top four and qualified for the knockout stage. In the knockout stage, India and Australia beat New Zealand and South Africa, respectively, to advance to the final, played on 19 November at the Narendra Modi Stadium in Ahmedabad. Australia won the final by six wickets, winning their sixth Cricket World Cup title. A total of 1,250,307 spectators attended the matches, the highest number in any Cricket World Cup to date. The tournament final set viewership records in India, drawing 518 million viewers, with a peak of 57 million streaming viewers.Marketing The ICC hosted a trophy tour for 100 days prior to the tournament beginning 27 June, with the Cricket World Cup Trophy being taken to various locations around the world. The event began with the launching of the trophy into the stratosphere by Sent Into Space and landing at Modi Stadium—becoming the first sports trophy to have ever been sent into space. The ICC officially announced the mascots for the World Cup in August. The mascots were a male and female duo named "Tonk" and "Blaze" from the fictional cricketing utopia "Crictoverse". Ahead of the tournament, it was reported that an opening ceremony would take place on 4 October 2023 at the Narendra Modi Stadium in Ahmedabad, a day before the opening match at the same venue. The official theme song of the 2023 Cricket World Cup titled "Dil Jashn Bole" (transl. Heart say celebrate) was released on 20 September. The song was composed by Pritam, and was sung by Pritam, Nakash Aziz, Sreerama Chandra, Amit Mishra, Jonita Gandhi, Akasa Singh and S. P. Charan. However, the song was subject to backlash and bad reviews. The opening ceremony was cancelled and replaced by a closing ceremony ahead of the final. During this a drone show was held.']

    ### Answer:
    According to the context provided, Virat Kohli was the Player of the series in the 2023 ICC Men's Cricket World Cup, scoring the most runs (765) in the tournament.
    