# Connecting to AWS

In [43]:
# boto3 is the AWS SDK for Python; the cells below use it to call the Amazon Bedrock APIs.
import boto3

In [44]:
# List the Bedrock knowledge bases visible to the current AWS credentials/region
# (a single page of at most 100 summaries) and display the raw response.
kb_client = boto3.client('bedrock-agent')
kb_list_response = kb_client.list_knowledge_bases(maxResults=100)
kb_list_response

{'ResponseMetadata': {'RequestId': '4a005f1f-7b37-4f94-92f5-c19a16c1f233',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'date': 'Wed, 30 Oct 2024 05:00:21 GMT',
   'content-type': 'application/json',
   'content-length': '160',
   'connection': 'keep-alive',
   'x-amzn-requestid': '4a005f1f-7b37-4f94-92f5-c19a16c1f233',
   'x-amz-apigw-id': 'AcqH5HtAPHcEWKA=',
   'x-amzn-trace-id': 'Root=1-6721bd65-6af7cfab692a80333bc7546f'},
  'RetryAttempts': 0},
 'knowledgeBaseSummaries': [{'knowledgeBaseId': '1JORBFKUYS',
   'name': 'aerospace-chatbot-ams-demo',
   'status': 'ACTIVE',
   'updatedAt': datetime.datetime(2024, 10, 29, 17, 47, 24, 456721, tzinfo=tzutc())}]}

In [45]:
# ID of the knowledge base to query. Console page listing knowledge bases:
# https://us-west-2.console.aws.amazon.com/bedrock/home?region=us-west-2#/knowledge-bases
knowledgeBaseId = '1JORBFKUYS'

# Model used to generate the answer (a bare model ID is supplied here, not a full ARN).
# Console page listing models:
# https://us-west-2.console.aws.amazon.com/bedrock/home?region=us-west-2#/models
modelArn = 'anthropic.claude-3-5-sonnet-20241022-v2:0'

# Retrieval parameters: how many results the vector search should return
numberOfResults = 8

In [46]:
# Build the RetrieveAndGenerate request from the inside out so each nesting
# level has a name.

# Vector search settings: how many results to retrieve.
retrieval_configuration = {
    "vectorSearchConfiguration": {"numberOfResults": numberOfResults},
}

# Which knowledge base to search and which model generates the answer.
knowledge_base_configuration = {
    "knowledgeBaseId": knowledgeBaseId,
    "modelArn": modelArn,
    "retrievalConfiguration": retrieval_configuration,
}

# Top-level keys match the keyword arguments of retrieve_and_generate().
config = {
    "input": {
        "text": "What can you tell me about the spacecraft mechanism testing in the Molsink facility",
    },
    "retrieveAndGenerateConfiguration": {
        "knowledgeBaseConfiguration": knowledge_base_configuration,
        "type": "KNOWLEDGE_BASE",
    },
}

# Display the retrieval/generation half of the request for inspection.
config['retrieveAndGenerateConfiguration']

{'knowledgeBaseConfiguration': {'knowledgeBaseId': '1JORBFKUYS',
  'modelArn': 'anthropic.claude-3-5-sonnet-20241022-v2:0',
  'retrievalConfiguration': {'vectorSearchConfiguration': {'numberOfResults': 8}}},
 'type': 'KNOWLEDGE_BASE'}

In [47]:
# Query the knowledge base and generate a cited answer in a single call.
# `config` holds exactly the two keyword arguments the API call takes
# ('input' and 'retrieveAndGenerateConfiguration'), so it is unpacked directly.
rag_client = boto3.client('bedrock-agent-runtime')
rag_response = rag_client.retrieve_and_generate(**config)
rag_response

{'ResponseMetadata': {'RequestId': '93698110-c00d-4619-bcf7-5c39f3ab0211',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'date': 'Wed, 30 Oct 2024 05:00:39 GMT',
   'content-type': 'application/json',
   'content-length': '11375',
   'connection': 'keep-alive',
   'x-amzn-requestid': '93698110-c00d-4619-bcf7-5c39f3ab0211'},
  'RetryAttempts': 0},
 'citations': [{'generatedResponsePart': {'textResponsePart': {'span': {'end': 349,
      'start': 0},
     'text': 'The Molsink facility at the Jet Propulsion Laboratory is a 10-foot diameter triple-walled extreme-high-vacuum chamber designed to test spacecraft mechanisms. It features walls that use both cryogenic and chemical pumping to handle gases produced during testing. The facility can achieve vacuum levels of 10^-8 torr and has inner walls cooled to 14°K.'}},
   'retrievedReferences': [{'content': {'text': 'Within the next year, it is planned to complete the experiments above and to perform calibration tests on various lunar atmosphere ma

In [49]:
# Print every retrieved reference, grouped by the citation it supports, and
# count them; the cell's final expression displays the total.
i = 0  # running total of retrieved references across all citations
for citation in rag_response['citations']:
    print("Retrieved References:")
    for ref in citation['retrievedReferences']:
        print(f"- {ref}")
    i += len(citation['retrievedReferences'])
    print("\n")
i

Retrieved References:
- {'content': {'text': 'Within the next year, it is planned to complete the experiments above and to perform calibration tests on various lunar atmosphere mass spectrometers and development tests of several Mariner Mars 1971 mechanisms. A Molsink calibration and evaluation program, which was started several months ago but deferred because of the press of flight project tests, is now planned to be completed by July 1969. This program is primarily to make an experimental verification of the Molsink Factor (see Section III) by allowing various gases to leave the center of the chamber under controlled conditions and measuring the amount that returns. ## The Facility The Molsink facility, illustrated in Fig. 1, is a 10-ft-diam, triple-walled, extreme-high-vacuum chamber with walls that cryogenically and chemically pump gases produced by the test item. The cryopumping is accomplished by a spherical molecular trap (Moltrap) wedge fin array (Figs. 2 and 3).'}, 'location':

6

In [51]:
# The generated answer is stored under the 'text' key of the 'output' dict;
# print the text itself so it is readable instead of a dict repr.
print(rag_response['output']['text'])

{'text': 'The Molsink facility at the Jet Propulsion Laboratory is a 10-foot diameter triple-walled extreme-high-vacuum chamber designed to test spacecraft mechanisms. It features walls that use both cryogenic and chemical pumping to handle gases produced during testing. The facility can achieve vacuum levels of 10^-8 torr and has inner walls cooled to 14°K. A key feature of the Molsink is its ability to simulate the molecular sink effect of space. This is measured by the Molsink Factor, which represents the ratio of molecules leaving a test item compared to those returning to it. The facility is highly efficient - for every 10,000 molecules that leave a test item, only a few return to it. The facility uses a Molecular Trap (Moltrap) constructed of aluminum sheets welded to aluminum tubes, cooled by a 1200-W helium refrigerator. It also employs an electron-beam titanium sublimator for chemical pumping. The facility has been used to test Mariner Mars 1969 mechanisms, conduct friction ph

# Pinecone database management

In [14]:
import os
from dotenv import load_dotenv
import pinecone

# Exception the Pinecone client raises for HTTP 404 responses
from pinecone.exceptions import NotFoundException

# Load environment variables (including PINECONE_API_KEY) from the .env file
load_dotenv()

# Initialize Pinecone client
pc = pinecone.Pinecone(api_key=os.getenv('PINECONE_API_KEY'))

index_name = 'aws-bedrock-ams-demo'

# Delete all vectors from the index but keep the index itself
index = pc.Index(index_name)
try:
    index.delete(delete_all=True)
except NotFoundException as e:
    # Only the 404 case is treated as benign: in the recorded run the service
    # answered "Namespace not found" when there was nothing to delete.
    # Any other failure (authentication, network, ...) is left to propagate.
    print(f"No vectors to delete in index '{index_name}' (namespace not found, probably empty already): {e}")

Error deleting index (probably empty already): (404)
Reason: Not Found
HTTP response headers: HTTPHeaderDict({'Date': 'Wed, 30 Oct 2024 02:48:38 GMT', 'Content-Type': 'application/json', 'Content-Length': '55', 'Connection': 'keep-alive', 'x-pinecone-request-latency-ms': '88', 'x-pinecone-request-id': '3920809686156093469', 'x-envoy-upstream-service-time': '89', 'server': 'envoy'})
HTTP response body: {"code":5,"message":"Namespace not found","details":[]}



# Process PDFs

In [17]:
import os
from PyPDF2 import PdfReader, PdfWriter

def split_pdf(input_path, pages_per_chunk=25, output_dir=None):
    """Split a PDF into consecutive chunks of at most ``pages_per_chunk`` pages.

    Each chunk is written as ``<input base name>_chunk_<n>.pdf`` with ``n``
    starting at 1. The last chunk holds the remaining pages and may be shorter.

    Args:
        input_path: Path of the PDF to split.
        pages_per_chunk: Maximum number of pages per output file; must be >= 1.
        output_dir: Directory to write the chunks into. It is created if
            missing. ``None`` (the default) writes to the current working
            directory.

    Returns:
        List of the output file paths, in page order (empty for a PDF with
        no pages).

    Raises:
        ValueError: If ``pages_per_chunk`` is less than 1.
    """
    # Validate before touching the file: a step of 0 would only fail later
    # inside range(), and a negative step would silently produce no output.
    if pages_per_chunk < 1:
        raise ValueError(f"pages_per_chunk must be >= 1, got {pages_per_chunk!r}")

    reader = PdfReader(input_path)
    total_pages = len(reader.pages)

    # Get base filename without directory or extension
    base_name = os.path.splitext(os.path.basename(input_path))[0]

    if output_dir is not None:
        os.makedirs(output_dir, exist_ok=True)

    output_paths = []
    chunk_starts = range(0, total_pages, pages_per_chunk)
    for chunk_number, chunk_start in enumerate(chunk_starts, start=1):
        chunk_end = min(chunk_start + pages_per_chunk, total_pages)

        # Copy this chunk's pages into a fresh writer
        writer = PdfWriter()
        for page in reader.pages[chunk_start:chunk_end]:
            writer.add_page(page)

        # Save chunk to new PDF
        output_path = f"{base_name}_chunk_{chunk_number}.pdf"
        if output_dir is not None:
            output_path = os.path.join(output_dir, output_path)
        with open(output_path, "wb") as output_file:
            writer.write(output_file)
        output_paths.append(output_path)

    return output_paths

# Split the source PDF using the default chunk size.
split_pdf(input_path='./AMS_1969_reocr.pdf')