<a href="https://colab.research.google.com/github/frank-morales2020/MLxDL/blob/main/LLAMA4_AWS_APRIL2025.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Model Information

* The Llama 4 collection comprises natively multimodal AI models that enable text and multimodal experiences. These models leverage a mixture-of-experts architecture to offer industry-leading performance in text and image understanding.

* These Llama 4 models mark the beginning of a new era for the Llama ecosystem. We are launching two efficient models in the Llama 4 series, Llama 4 Scout, a 17 billion parameter model with 16 experts, and Llama 4 Maverick, a 17 billion parameter model with 128 experts.

* Model developer: Meta

* Model Architecture: The Llama 4 models are auto-regressive language models that use a mixture-of-experts (MoE) architecture and incorporate early fusion for native multimodality.

| Model Name | Training Data | Params | Input modalities | Output modalities | Context length | Token count | Knowledge cutoff |
|---|---|---|---|---|---|---|---|
| Llama 4 Scout (17Bx16E) | A mix of publicly available, licensed data and information from Meta’s products and services. This includes publicly shared posts from Instagram and Facebook and people’s interactions with Meta AI. Learn more in our Privacy Center. | 17B (Activated), 109B (Total) | Multilingual text and image | Multilingual text and code | 10M | ~40T | August 2024 |
| Llama 4 Maverick (17Bx128E) | (same as Llama 4 Scout) | 17B (Activated), 400B (Total) | Multilingual text and image | Multilingual text and code | 1M | ~22T | August 2024 |

* Supported languages: Arabic, English, French, German, Hindi, Indonesian, Italian, Portuguese, Spanish, Tagalog, Thai, and Vietnamese.

* Model Release Date: April 5, 2025

* Status: This is a static model trained on an offline dataset. Future versions of the tuned models may be released as we improve model behavior with community feedback.

https://studio-d-yesr9g64bv2p.studio.us-east-1.sagemaker.aws/jumpstart/SageMakerPublicHub/Model/meta-vlm-llama-4-scout-17b-16e-instruct

In [None]:
!pip install colab-env --quiet

!pip install sagemaker boto3 --quiet

%pip install langchain --quiet

In [None]:
import colab_env
import os
# Read the AWS settings from the process environment in one pass.
# Each name is bound to None when its variable is not set.
# NOTE(review): presumably `colab_env` (imported above) is what loads these
# variables into the environment — confirm.
aws_access_key_id, aws_secret_access_key, region, output = (
    os.environ.get(variable)
    for variable in (
        "AWS_ACCESS_KEY_ID",
        "AWS_SECRET_ACCESS_KEY",
        "AWS_DEFAULT_REGION",
        "AWS_DEFAULT_OUTPUT",
    )
)

# Debugging hint: inspect `region` if needed, but never print the access key
# or the secret key — notebook output is saved and shared with the file.

In [None]:
import colab_env
import boto3
import os
import sagemaker
from sagemaker.jumpstart.model import JumpStartModel

# --- Resolve the IAM role that SageMaker will use ---------------------------
# ROLENAME must be present in the environment. Fail early with a readable
# message instead of handing `RoleName=None` to boto3, which would reject it
# with a parameter-validation error that does not mention the variable.
role_name = os.getenv("ROLENAME")
if not role_name:
    raise RuntimeError(
        "Environment variable ROLENAME is not set; it must contain the name "
        "of the IAM role to use for the SageMaker deployment."
    )

iam_client = boto3.client("iam")
role = iam_client.get_role(RoleName=role_name)
ROLE_ARN = role["Role"]["Arn"]

# --- Deploy Llama 4 Scout from SageMaker JumpStart --------------------------
# (`JumpStartModel` is already imported at the top of this cell.)
# Original author's note: ml.p5.48xlarge is the instance type for endpoint usage.
llm_model_id = "meta-vlm-llama-4-scout-17b-16e-instruct"
llm_model = JumpStartModel(
    model_id=llm_model_id,
    env={"MAX_SEQ_LEN": "1048576"},
    role=ROLE_ARN,
    region="us-east-1",
)

# accept_eula=True is passed explicitly to the deploy call; the returned
# predictor is used by the next cell to read the endpoint name.
llm_predictor = llm_model.deploy(accept_eula=True)


In [None]:
# This is the model endpoint NAME, not the ARN. Later cells pass it as
# `EndpointName` when invoking the endpoint.
llm_model_endpoint_name = llm_predictor.endpoint_name
llm_model_endpoint_name  # bare expression so the notebook displays the value

In [None]:
####  CASE#1 — code-generation prompt
import json

# Prompt sent to the model. An alternative that was tried earlier:
#   "who is the best French Poet?"
query = "Write a program to compute factorial in python:"

# boto3 client for the SageMaker runtime (used to invoke the endpoint).
sm_client = boto3.client('runtime.sagemaker')

# Generation parameters: the new-token budget is 512 * n.
n = 5
MNT = n * 512
model_kwargs = dict(max_new_tokens=MNT, temperature=0.9)
input_data = {"inputs": query, "parameters": dict(model_kwargs)}

response = sm_client.invoke_endpoint(
    EndpointName=llm_model_endpoint_name,
    ContentType="application/json",
    Body=json.dumps(input_data),
)

# Read the response body, decode it as UTF-8 and parse the JSON payload.
response_body = json.loads(response['Body'].read().decode('utf-8'))

print('Query #1:', query)
print('\n')

# Two payload shapes are accepted: something keyed directly by
# 'generated_text', or a non-empty list whose first item carries that key.
if 'generated_text' in response_body:
    print('Response #1:', response_body['generated_text'])
    print('\n')
elif isinstance(response_body, list) and response_body and 'generated_text' in response_body[0]:
    print('Response #2:', response_body[0]['generated_text'])
    print('\n')
else:
    print("Unexpected response format:", response_body)

In [None]:
#### CASE#2 — arithmetic word problem
# Uses `json`, `boto3` and `llm_model_endpoint_name` from earlier cells.

# Prompt sent to the model. An alternative that was tried earlier:
#   "who is the best French Poet?"
query = "I bought an ice cream for 6 kids. Each cone was $1.25 and I paid with a $10 bill. How many dollars did I get back? Explain first before answering."

# boto3 client for the SageMaker runtime (used to invoke the endpoint).
sm_client = boto3.client('runtime.sagemaker')

# Generation parameters: the new-token budget is 512 * n.
# (A parameter-free payload would simply be {"inputs": query}.)
n = 5
MNT = n * 512
model_kwargs = dict(max_new_tokens=MNT, temperature=0.9)
input_data = {"inputs": query, "parameters": dict(model_kwargs)}

response = sm_client.invoke_endpoint(
    EndpointName=llm_model_endpoint_name,
    ContentType="application/json",
    Body=json.dumps(input_data),
)

# Read the response body, decode it as UTF-8 and parse the JSON payload.
response_body = json.loads(response['Body'].read().decode('utf-8'))

print('Query #2:', query)
print('\n')

# Two payload shapes are accepted: something keyed directly by
# 'generated_text', or a non-empty list whose first item carries that key.
# NOTE(review): the first branch prints "Response #1" although this is
# Query #2 — looks like a copy/paste label from CASE#1; confirm the intent.
if 'generated_text' in response_body:
    print('Response #1:', response_body['generated_text'])
elif isinstance(response_body, list) and response_body and 'generated_text' in response_body[0]:
    print('Response #2:', response_body[0]['generated_text'])
else:
    print("Unexpected response format or empty response:", response_body)

In [None]:
## CASE#3

import colab_env
import os
import matplotlib.pyplot as plt
import cv2
import urllib.request
import urllib
from matplotlib.pyplot import figure, imshow, axis
from matplotlib.image import imread
import IPython
import pytz
from datetime import datetime
import json
from pathlib import Path


def chat_gpt_vision(image_url):
    """Ask the deployed SageMaker endpoint to describe the image at *image_url*.

    The request is sent through the module-level ``sm_client`` to the endpoint
    named by the module-level ``llm_model_endpoint_name``; both must have been
    created by earlier cells.

    NOTE(review): only the URL *text* is interpolated into the prompt; no
    image bytes or structured image content are sent. Whether the model ever
    sees the actual image cannot be established from this code — confirm
    against the payload format documented for this JumpStart model.

    Args:
        image_url: URL of the image; embedded verbatim in the text prompt.

    Returns:
        The ``generated_text`` value from the response, or ``''`` when the
        response does not carry one.
    """
    # Prepare the input for the model: a text prompt containing the image URL.
    input_data = {
        "inputs": f"Describe the image: {image_url}",
        "parameters": {
            "max_new_tokens": 256,  # Adjust as needed
            "temperature": 0.7      # Adjust as needed
        }
    }

    # Send the request to the SageMaker endpoint.
    response = sm_client.invoke_endpoint(
        EndpointName=llm_model_endpoint_name,
        Body=json.dumps(input_data),
        ContentType="application/json"
    )

    # Read the response body, decode it as UTF-8 and parse the JSON payload.
    response_body = json.loads(response['Body'].read().decode('utf-8'))

    # The CASE#1/CASE#2 cells accept either a dict or a list of dicts from
    # this endpoint; accept both here too instead of failing with
    # AttributeError on `list.get`.
    if isinstance(response_body, list):
        response_body = response_body[0] if response_body else {}
    if isinstance(response_body, dict):
        return response_body.get('generated_text', '')
    return ''


def _show_local_image(path):
    """Display the image file at *path* with matplotlib and return the pixels.

    Returns the array exactly as read by ``cv2.imread`` (BGR channel order),
    or ``None`` when the file could not be read; in that case a message is
    printed and nothing is displayed.
    """
    bgr_image = cv2.imread(path)
    if bgr_image is None:
        # cv2.imread reports a missing/unreadable file by returning None
        # rather than raising; passing None on to imshow would crash.
        print('Could not read image file: %s' % path)
        return None
    # OpenCV decodes to BGR while matplotlib interprets arrays as RGB;
    # convert so the colours are not swapped on screen.
    plt.imshow(cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB))
    plt.show()
    return bgr_image


# Nature
# NOTE: `prompt` is not passed to chat_gpt_vision(), which builds its own
# "Describe the image: ..." prompt; it is kept only as a module-level variable.
prompt =  "What’s in this image?"
image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"

print()
print('The image url is here: %s'%image_url)

# Local copy of the same picture, shown for reference.
# NOTE(review): the path presumably requires Google Drive to be mounted — confirm.
nature="/content/gdrive/MyDrive/datasets/Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
print()

testim = _show_local_image(nature)
print()

description = chat_gpt_vision(image_url)
print(description)


## turing award recipient ########
prompt =  "Describe the image?"
image_url = "https://awards.acm.org/binaries/content/gallery/acm/ctas/awards/turing-2018-bengio-hinton-lecun.jpg"

print()
print('The image of the 2018 Turing Award recipients is here: %s'%image_url)
print()

award="/content/gdrive/MyDrive/datasets/turing-2018-bengio-hinton-lecun.jpg"
testim = _show_local_image(award)
print()

description = chat_gpt_vision(image_url)
print(description)


# CLEAN UP

In [None]:
# Frank Morales created this cell on December 14, 2023. It automatically deletes SageMaker endpoints, models, and endpoint configurations.

import colab_env
import os

# Read the AWS settings from the process environment in one pass.
# Each name is bound to None when its variable is not set.
aws_access_key_id, aws_secret_access_key, aws_region, aws_output = map(
    os.environ.get,
    (
        "AWS_ACCESS_KEY_ID",
        "AWS_SECRET_ACCESS_KEY",
        "AWS_DEFAULT_REGION",
        "AWS_DEFAULT_OUTPUT",
    ),
)

import boto3

# SageMaker client used by the cleanup function below; it is pinned to the
# region read from AWS_DEFAULT_REGION.
sagemaker_client = boto3.client('sagemaker', region_name=aws_region)

def cleanup_sagemaker_resources(resource_name,resourceid):
    """Delete every SageMaker resource of one kind visible to ``sagemaker_client``.

    WARNING: this is destructive. It deletes *all* resources of the selected
    kind that the list operation returns, not only those created by this
    notebook.

    All result pages are walked, so resources beyond the first page of the
    list call are deleted as well. Names are collected before any deletion
    starts, so the listing is never modified while it is being paged through.

    Args:
        resource_name: 'Endpoints', 'Models' or 'EndpointConfigs'. Must be
            the kind selected by ``resourceid``.
        resourceid: 0 for endpoints, 1 for models, 2 for endpoint configs.

    Raises:
        ValueError: if ``resourceid`` is not 0, 1 or 2, or if
            ``resource_name`` does not match it.
    """
    # resourceid -> (list operation, response key, name field, delete operation)
    # The name field doubles as the keyword argument of the delete operation.
    operations = {
        0: ('list_endpoints', 'Endpoints', 'EndpointName', 'delete_endpoint'),
        1: ('list_models', 'Models', 'ModelName', 'delete_model'),
        2: ('list_endpoint_configs', 'EndpointConfigs', 'EndpointConfigName',
            'delete_endpoint_config'),
    }
    if resourceid not in operations:
        raise ValueError('resourceid must be 0, 1 or 2, got %r' % (resourceid,))

    list_operation, response_key, name_key, delete_operation = operations[resourceid]
    if resource_name != response_key:
        raise ValueError(
            'resource_name %r does not match resourceid %r (expected %r)'
            % (resource_name, resourceid, response_key)
        )

    # Collect the names from every page of the listing.
    paginator = sagemaker_client.get_paginator(list_operation)
    names = [
        item[name_key]
        for page in paginator.paginate()
        for item in page[response_key]
    ]

    print(resource_name)

    delete = getattr(sagemaker_client, delete_operation)
    for name in names:
        print(name_key)
        print(name)
        delete(**{name_key: name})

    print("\n==================================\n")


# Run the cleanup for each resource kind: endpoints first, then models,
# then endpoint configurations.
for _kind, _kind_id in (('Endpoints', 0), ('Models', 1), ('EndpointConfigs', 2)):
    cleanup_sagemaker_resources(_kind, _kind_id)