## Set up

In [99]:
# Install required libraries
%pip install --upgrade pip
%pip install boto3 datasets pillow tqdm --upgrade --quiet
%pip install langchain 

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [100]:
# Restart kernel to ensure updated packages take effect
# NOTE(review): this relies on the front end exposing a `Jupyter` JavaScript object;
# presumably only the classic Notebook UI does — confirm for JupyterLab / VS Code and
# restart the kernel manually if nothing happens.
from IPython.core.display import HTML
HTML("<script>Jupyter.notebook.kernel.restart()</script>")

In [101]:
import boto3
import os
import json
import time
import shutil
from tqdm import tqdm
from datasets import load_dataset
from PIL import Image
import io
import uuid
import warnings
import pprint
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.function_step import step
from sagemaker.s3 import S3Uploader
from langchain.prompts import PromptTemplate

warnings.filterwarnings('ignore')

In [102]:
# Region used for every AWS client created in this notebook
region = "us-west-2"  # Llama 3.2 fine-tuning is currently only available in us-west-2

# One boto3 session, from which the Bedrock / STS / S3 clients are derived
session = boto3.session.Session(region_name=region)
bedrock = session.client(service_name="bedrock")
sts_client = session.client("sts")
s3_client = session.client("s3")

# The caller's account ID is embedded in the bucket name to keep it unique
account_id = sts_client.get_caller_identity()["Account"]
bucket_name = "vlm-accessibility-{}-{}".format(account_id, region)

print("Account ID: " + account_id)
print("Bucket name: " + bucket_name)

INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


Account ID: 957849199024
Bucket name: vlm-accessibility-957849199024-us-west-2


In [103]:
# Create the working bucket. Only "I already own this bucket" is a benign outcome;
# every other failure is re-raised so later cells do not run against a missing bucket.
try:
    if region == 'us-east-1':
        # us-east-1 is the default location and must not receive a LocationConstraint
        s3_client.create_bucket(
            Bucket=bucket_name
        )
    else:
        # For all other regions, specify the LocationConstraint
        s3_client.create_bucket(
            Bucket=bucket_name,
            CreateBucketConfiguration={'LocationConstraint': region}
        )
    print(f"Bucket {bucket_name} created successfully")
except s3_client.exceptions.BucketAlreadyOwnedByYou:
    print(f"Bucket {bucket_name} already owned by you")
except s3_client.exceptions.BucketAlreadyExists:
    # The global bucket namespace is shared: this error means somebody else holds the
    # name, so uploads to it would fail. Surface it instead of continuing.
    print(f"Bucket name {bucket_name} is already taken by another AWS account")
    raise
except Exception as e:
    print(f"Error creating bucket: {e}")
    raise

Bucket vlm-accessibility-957849199024-us-west-2 already owned by you


In [104]:
# IAM role passed as `role=` to every @step below, and the instance type those steps run on.
# NOTE(review): the ARN embeds a specific account ID and role suffix — replace it with a
# role from your own account before running.
ROLE_ARN   = "arn:aws:iam::957849199024:role/Qwen7bVisionFineTuningRole-1753405295"
INST_TYPE  = "ml.m5.xlarge"

## Image-upload-to-s3-step

- Load the data from Hugging Face.
- Extract only the image files from the dataset and upload them to S3 at s3://{bucket_name}/images/{subset}/{image_id}.

In [105]:
@step(
    name="upload-images-to-s3-step",
    role=ROLE_ARN,
    instance_type=INST_TYPE,
    keep_alive_period_in_seconds=300
)
def upload_images_to_s3(subset):
    """
    Upload the image of every example in one dataset split to S3 as PNG.

    Images are written to ``s3://{bucket_name}/images/{subset}/{index:03d}.png``,
    where ``index`` is the position of the example inside the split. The same
    naming scheme is used by ``write_jsonl_file`` to reference the images.

    Args:
        subset: Name of the Hugging Face dataset split to upload (e.g. "train").

    Returns:
        The S3 prefix (without trailing slash) under which the images were stored.
    """
    # Load only the requested split. Without `split=` the call returns a mapping of
    # split names, and iterating it would yield the names instead of the examples.
    dataset = load_dataset("doodoo77/For_VLM_accessibility", split=subset)

    failed_uploads = 0
    for i, example in enumerate(tqdm(dataset)):
        # Build the target name before the try-block so the error message below can
        # always refer to it. (An alternative key would be example['output']['image_id'].)
        path_name = f"{i:03d}.png"
        s3_uri = f"s3://{bucket_name}/images/{subset}/{path_name}"
        try:
            # 1) PIL image -> PNG bytes held in memory
            buffer = io.BytesIO()
            example["image"].save(buffer, format="PNG")

            # 2) upload the bytes directly, without a temporary file
            S3Uploader.upload_bytes(
                buffer.getvalue(),
                s3_uri,
                kms_key=None
            )
        except Exception as e:
            # Best effort: report the failure and continue with the next image.
            failed_uploads += 1
            print(f"[Error] {path_name} 업로드 실패: {e}")

    if failed_uploads:
        print(f"[Warning] {failed_uploads} image(s) could not be uploaded")

    return f"s3://{bucket_name}/images/{subset}"

## Format Data for Fine-tuning

- Convert data into the Amazon Bedrock Conversation JSONL format

In [106]:
# NOTE: the prompts below are provisional and are expected to be revised.
# The tag names announced in the system prompt must match the ones used in
# `assisstant_prompt`, so that the instructions and the training targets agree.
system_prompt = """
    A conversation between User and Assistant. The user asks a question, and the Assistant solves it.
    The assistant first thinks about the reasoning process in the mind and then provides the user with the answer.
    The reasoning process and answer are enclosed within <|begin_of_thought|><|end_of_thought|> and <|begin_of_solution|><|end_of_solution|> tags,
    respectively, i.e., <|begin_of_thought|>reasoning process here<|end_of_thought|><|begin_of_solution|>answer here<|end_of_solution|>.
    """

# The template now actually contains the declared `html` variable, so the HTML
# snippet mentioned in the instruction is part of the prompt.
user_prompt = PromptTemplate(
    input_variables=["html"],
    template="""
    Please conduct an accessibility evaluation of the provided image and related HTML snippet.

    [HTML]
    {html}
    """)

# (The variable name keeps its original spelling because other cells may refer to it.)
assisstant_prompt = PromptTemplate(
    input_variables=["rationale", "eval_items", "eval_reason", "plan"],
    template="""
    <|begin_of_thought|>
    {rationale}
    <|end_of_thought|>
    <|begin_of_solution|>
    [평가 요소]: {eval_items}
    [평가 근거]: {eval_reason}
    [개선 방안]:
    {plan} 
    <|end_of_solution|>
    """)

def assistant_prompts(outputs):
    """
    Render the assistant turn for one training example.

    Args:
        outputs: Iterable of evaluation records, each providing the keys
            "rationale", "eval_items", "eval_reason" and "plan". A single
            record given as a dict is accepted as well.
            NOTE(review): assumes a dict is one record — confirm against the
            dataset schema of the "output" column.

    Returns:
        The formatted records joined by a blank line, with surrounding
        whitespace removed. No chat-role markers are added: the conversation
        JSON already labels this text as the assistant turn.
    """
    if isinstance(outputs, dict):
        outputs = [outputs]

    sections = [
        assisstant_prompt.format(
            rationale=output["rationale"],
            eval_items=output["eval_items"],
            eval_reason=output["eval_reason"],
            plan=output["plan"]
        )
        for output in outputs
    ]
    return "\n\n".join(sections).strip()

In [107]:
def convert_to_bedrock_jsonl(html, output, s3_uri, account_id):
    """
    Build one training record in the Bedrock conversation schema.

    Args:
        html: HTML snippet of the example; passed to the user prompt template.
        output: Evaluation record(s) of the example; rendered by `assistant_prompts`.
        s3_uri: S3 URI of the PNG image that belongs to the example.
        account_id: AWS account ID recorded as owner of the image bucket.

    Returns:
        A dict with the system prompt, a user turn (text + image reference)
        and the assistant turn, ready to be serialised as one JSONL line.
    """
    return {
        "schemaVersion": "bedrock-conversation-2024",
        "system": [
            {
                "text": system_prompt
            }
        ],
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        # PromptTemplate.format only accepts keyword arguments
                        "text": user_prompt.format(html=html)
                    },
                    {
                        "image": {
                            "format": "png",
                            "source": {
                                "s3Location": {
                                    "uri": s3_uri,
                                    "bucketOwner": account_id
                                }
                            }
                        }
                    }
                ]
            },
            {
                "role": "assistant",
                "content": [
                    {
                        "text": assistant_prompts(output)
                    }
                ]
            }
        ]
    }

In [108]:
def write_jsonl_file(subset, image_folder):
    """
    Write one dataset split as a local JSONL file in Bedrock conversation format.

    Args:
        subset: Name of the dataset split to convert (e.g. "train").
        image_folder: S3 prefix that holds the images of this split, named
            ``{index:03d}.png`` by position in the split.

    Returns:
        The name of the written file, ``"{subset}.jsonl"``.
    """
    output_file = f"{subset}.jsonl"
    dataset = load_dataset("doodoo77/For_VLM_accessibility", split=subset)

    # The records are dumped with ensure_ascii=False and contain non-ASCII text,
    # so the file encoding is fixed to UTF-8 instead of the platform default.
    with open(output_file, "w", encoding="utf-8") as f:
        for i, example in enumerate(dataset):
            # build the S3 URI for this example's image
            # (an alternative key would be example['output']['image_id'])
            image_id = f"{i:03d}.png"
            s3_uri = f"{image_folder}/{image_id}"

            # convert just this example (not the whole dataset!)
            record = convert_to_bedrock_jsonl(example["html"], example["output"], s3_uri, account_id)
            f.write(json.dumps(record, ensure_ascii=False) + "\n")

    return output_file

In [109]:
def upload_jsonl_to_s3(output_file, subset):
    """Upload a local JSONL file to ``data/{subset}.jsonl`` in the notebook's bucket.

    NOTE(review): a later cell defines a pipeline step with this same name, which
    replaces this function in the notebook namespace — confirm which of the two
    should keep the name.
    """
    import boto3

    # Llama 3.2 fine-tuning is currently only available in us-west-2
    uploader = boto3.session.Session(region_name="us-west-2").client('s3')
    object_key = f"data/{subset}.jsonl"
    uploader.upload_file(Filename=output_file, Bucket=bucket_name, Key=object_key)


In [110]:
@step(
    name="upload-jsonl-to-s3-step",
    role=ROLE_ARN,
    instance_type=INST_TYPE,
    keep_alive_period_in_seconds=300
)
def upload_jsonl_to_s3(image_folder, subset):
    """
    Convert each example in the dataset to Bedrock JSONL format (pointing to its
    image in S3) and upload the resulting .jsonl file back to S3.

    Args:
        image_folder: S3 prefix that holds the images of the split.
        subset: Name of the dataset split to convert (e.g. "train").

    Returns:
        The S3 URI of the uploaded file, ``s3://{bucket_name}/data/{subset}.jsonl``.
    """
    output_file = write_jsonl_file(subset, image_folder)

    # Upload directly with an S3 client. Calling `upload_jsonl_to_s3(...)` here would
    # resolve to this step itself, because this definition owns that name.
    object_key = f"data/{subset}.jsonl"
    upload_client = boto3.session.Session(region_name=region).client("s3")
    upload_client.upload_file(
        Filename=output_file,
        Bucket=bucket_name,
        Key=object_key
    )

    return f"s3://{bucket_name}/{object_key}"

## Create IAM Role for Model Fine-tuning

In [111]:
# ───────────────────────────────────────────
# 1) Session-level variables
# ───────────────────────────────────────────
#kms_key_arn  = "arn:aws:kms:us-west-2:957849199024:key/d416e0ae-4253-4dfd-93a9-af3776ab5feb"     # (optional) SSE-KMS key
timestamp    = int(time.time())  # suffix that makes the role / policy names unique per run

# ───────────────────────────────────────────
# 2) Execution role (trust policy and permission policy)
# ───────────────────────────────────────────
# Only Bedrock model-customization jobs of this account and region may assume the role.
trust_policy_doc = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Principal": {
                "Service": "bedrock.amazonaws.com"
            },
            "Action": "sts:AssumeRole",
            "Condition": {
                "StringEquals": {
                    "aws:SourceAccount": account_id
                },
                "ArnEquals": {
                    # built from `region` so it follows the region configured at the top
                    "aws:SourceArn": f"arn:aws:bedrock:{region}:{account_id}:model-customization-job/*"
                }
            }
        }
    ] 
}

# Read/write access to the notebook's bucket (training data in, job output out).
access_policy_doc = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Action": [
                "s3:GetObject",
                "s3:PutObject",
                "s3:ListBucket",
                "s3:GetBucketLocation"
            ],
            "Resource": [
                f"arn:aws:s3:::{bucket_name}",
                f"arn:aws:s3:::{bucket_name}/*"
            ]
        }
    ]
}

iam = session.client("iam")
s3  = session.client("s3")
kms = session.client("kms")

role_name   = f"Qwen7bVisionFineTuningRole-{timestamp}"
policy_name = f"Qwen7bVisionFineTuningPolicy-{timestamp}"

# ───────────────────────────────────────────
# 3) Create the role and the policy, then attach them
# ───────────────────────────────────────────
try:
    response = iam.create_role(
        RoleName=role_name,
        AssumeRolePolicyDocument=json.dumps(trust_policy_doc),
        Description="Role for Qlora Qwen7b vision model with Amazon Bedrock"
    )
    
    role_arn = response["Role"]["Arn"]
    print(f"Created role: {role_arn}")
    
    # Create policy
    response = iam.create_policy(
        PolicyName=policy_name,
        PolicyDocument=json.dumps(access_policy_doc)
    )
    
    policy_arn = response["Policy"]["Arn"]
    print(f"Created policy: {policy_arn}")
    
    # Attach policy to role
    iam.attach_role_policy(
        RoleName=role_name,
        PolicyArn=policy_arn
    )
    
    print(f"Attached policy to role")
    
except Exception as e:
    # Later cells need `role_arn` with the policy attached; stop here instead of
    # continuing with a missing or half-configured role.
    print(f"Error creating IAM resources: {e}")
    raise

# Allow time for IAM role propagation
print("Waiting for IAM role to propagate...")
time.sleep(10)




Created role: arn:aws:iam::957849199024:role/Qwen7bVisionFineTuningRole-1753774560
Created policy: arn:aws:iam::957849199024:policy/Qwen7bVisionFineTuningPolicy-1753774560
Attached policy to role
Waiting for IAM role to propagate...


## Fine-tuning VLM

In [112]:
# Define step for custom training the model
@step(
    name="model-training-step",
    role=ROLE_ARN,
    instance_type=INST_TYPE,
    keep_alive_period_in_seconds=300
)
def train(
    custom_model_name: str, 
    training_job_name: str, 
    base_model_id: str, 
    upload_jsonl_to_s3_step_result: str
) -> str:
    """
    Run a Bedrock fine-tuning job and wait until it has finished.

    Args:
        custom_model_name: Name given to the resulting custom model.
        training_job_name: Name of the model-customization job.
        base_model_id: Identifier of the base model to fine-tune.
        upload_jsonl_to_s3_step_result: S3 URI of the training JSONL file
            (the value returned by the JSONL upload step).

    Returns:
        The ARN of the custom model.

    Raises:
        RuntimeError: If the job ends in any status other than "Completed".
    """
    # Hyperparameters for the fine-tuning job (Bedrock expects string values)
    hyper_parameters = {
        "epochCount": "2",       # Number of training epochs
        "batchSize": "1",        # Batch size for training
        "learningRate": "0.00001"  # Learning rate
    }
    
    # Training data location; no validation data is configured for this job
    training_data_config = {"s3Uri": upload_jsonl_to_s3_step_result}
    print(f"Training data config: {training_data_config}")

    output_data_config = { 
        "s3Uri": f"s3://{bucket_name}/output/"
    }

    # Pin the region explicitly, like the other clients in this notebook
    bedrock = boto3.client(service_name="bedrock", region_name=region)

    print("Start training....")

    # Create the customization job; `role_arn` is the Bedrock role created earlier
    training_job_response = bedrock.create_model_customization_job(
        customizationType="FINE_TUNING",
        jobName=training_job_name,
        customModelName=custom_model_name,
        roleArn=role_arn,
        baseModelIdentifier=base_model_id,
        hyperParameters=hyper_parameters,
        trainingDataConfig=training_data_config,
        outputDataConfig=output_data_config,
    )
    print(training_job_response)

    # Poll once a minute until the job leaves its transitional states
    while True:
        fine_tune_job = bedrock.get_model_customization_job(jobIdentifier=training_job_name)
        job_status = fine_tune_job["status"]
        print(job_status)
        if job_status not in ("InProgress", "Stopping"):
            break
        time.sleep(60)

    pprint.pp(fine_tune_job)

    # A failed or stopped job produces no custom model; report that directly
    # instead of failing later on the model lookup.
    if job_status != "Completed":
        reason = fine_tune_job.get("failureMessage", "no failure message provided")
        raise RuntimeError(
            f"Model customization job {training_job_name} ended with status {job_status}: {reason}"
        )

    output_job_name = "model-customization-job-" + fine_tune_job["jobArn"].split("/")[-1]
    print(f"output_job_name: {output_job_name}")

    model_id = bedrock.get_custom_model(modelIdentifier=custom_model_name)["modelArn"]

    print(f"Model id: {model_id}")
    return model_id

## Create provisioned throughput

In [113]:
# Define step for creating Provisioned throughput for the custom model
@step(
    name="create-provisioned-throughput-step",
    role=ROLE_ARN,
    instance_type=INST_TYPE,
    keep_alive_period_in_seconds=300,
)
def create_prov_thruput(model_id: str, provisioned_model_name: str) -> str:
    """
    Create provisioned throughput (1 model unit) for a custom model and wait for it.

    Args:
        model_id: ARN or ID of the custom model to serve.
        provisioned_model_name: Name for the provisioned throughput.

    Returns:
        The ARN of the provisioned model.

    Raises:
        RuntimeError: If provisioning ends in status "Failed".
    """
    # Pin the region explicitly, like the other clients in this notebook
    bedrock = boto3.client(service_name="bedrock", region_name=region)

    provisioned_model_id = bedrock.create_provisioned_model_throughput(
        modelUnits=1, provisionedModelName=provisioned_model_name, modelId=model_id
    )["provisionedModelArn"]

    # Poll once a minute while the resource is being created. The sleep happens only
    # between polls, so the function returns as soon as the status changes.
    while True:
        provisioned = bedrock.get_provisioned_model_throughput(
            provisionedModelId=provisioned_model_id
        )
        status = provisioned["status"]
        print(status)
        if status != "Creating":
            break
        time.sleep(60)

    if status == "Failed":
        reason = provisioned.get("failureMessage", "no failure message provided")
        raise RuntimeError(
            f"Provisioned throughput {provisioned_model_name} failed: {reason}"
        )

    return provisioned_model_id

## Test the custom model

In [114]:
# Use the native inference API to send an image + text prompt to the fine-tuned model.
@step(
    name="model-testing-step",
    role=ROLE_ARN,
    instance_type=INST_TYPE,
    keep_alive_period_in_seconds=300,
)
def test_model(provisioned_model_id: str) -> str:
    """
    Test the fine-tuned model with a sample input.

    The first example of the "train" split is sent to the provisioned model
    through the Bedrock runtime `invoke_model` API.

    Args:
        provisioned_model_id: ARN of the provisioned model to invoke.

    Returns:
        The generated text (the "generation" field of the response).
    """
    import base64  # not part of the notebook's import cell

    # Create a Bedrock Runtime client in the AWS Region of your choice.
    client = boto3.client("bedrock-runtime", region_name="us-west-2")

    # 1. Load the dataset and pick one example (use another split/index if desired)
    dataset = load_dataset("doodoo77/For_VLM_accessibility")
    example = dataset["train"][0]

    # The model to invoke is the provisioned throughput created by the previous step.
    model_id = provisioned_model_id
    html = example["html"]
    # Embed the prompt in Llama 3's instruction format.
    # PromptTemplate.format only accepts keyword arguments.
    # NOTE(review): the prompt contains no image placeholder token although an image
    # is sent — confirm the native prompt format required for image inputs.
    formatted_prompt = f"""
    <|begin_of_text|>
    <|start_header_id|>system<|end_header_id|>
    {system_prompt}<|eot_id|>
    <|start_header_id|>user<|end_header_id|>
    {user_prompt.format(html=html)}<|eot_id|>
    <|start_header_id|>assistant<|end_header_id|>
    """      

    # 2. Image -> PNG bytes -> Base64 string
    buffer = io.BytesIO()
    example["image"].save(buffer, format="PNG")
    png_data = buffer.getvalue()
    base64_str = base64.b64encode(png_data).decode("utf-8")

    # 3. The request expects the images as a list of Base64 strings
    images = [base64_str]

    # Format the request payload using the model's native structure.
    native_request = {
        "prompt": formatted_prompt,
        "max_gen_len": 512,
        "temperature": 0.5,
        "images": images,
    }

    # Convert the native request to JSON.
    request = json.dumps(native_request)
    response = client.invoke_model(modelId=model_id, body=request)

    # Decode the response body.
    model_response = json.loads(response["body"].read()) 

    # Extract and print the response text.
    response_text = model_response["generation"]
    print(response_text)
    return response_text




## Create the sagemaker pipeline

In [None]:
pipeline_name = "VLM-fine-tune-pipeline"

path = "doodoo77/For_VLM_accessibility"  # Hugging Face dataset path
timestamp = time.strftime("%Y-%m-%d-%H-%M-%S")  # makes the Bedrock resource names unique per run

custom_model_name = f"llama32-multimodel-{timestamp}"
training_job_name = f"llama32-multimodal-ft-{timestamp}"
base_model_id = "meta.llama3-2-11b-instruct-v1:0"  # Llama 3.2 vision model ID
provisioned_model_name = f"llama32-multimodal-prov-{timestamp}"

# Calling a @step function does not run it; it returns a placeholder for the step's
# result. Passing that placeholder to the next step defines the execution order.
image_folder = upload_images_to_s3("train")
upload_jsonl_to_s3_step_result = upload_jsonl_to_s3(image_folder, "train")
model_id = train(
    custom_model_name=custom_model_name, 
    training_job_name=training_job_name, 
    base_model_id=base_model_id, 
    upload_jsonl_to_s3_step_result=upload_jsonl_to_s3_step_result
)
create_prov_thruput_response = create_prov_thruput(
    model_id=model_id,
    provisioned_model_name=provisioned_model_name
)

test_model_response = test_model(
    provisioned_model_id=create_prov_thruput_response
)   

# Create the SageMaker pipeline.
# `parameters` expects pipeline Parameter objects; the dataset path is a plain string
# that no step reads as a parameter, so it is not passed here.
pipeline = Pipeline(
    name=pipeline_name,
    steps=[
        image_folder,
        upload_jsonl_to_s3_step_result,
        model_id,
        create_prov_thruput_response,
        test_model_response
    ]
)

INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


In [116]:
# Create or update the pipeline definition in SageMaker.
# NOTE(review): `role_arn` is the role created earlier whose trust policy only allows
# bedrock.amazonaws.com; the pipeline presumably needs a role SageMaker itself can
# assume (e.g. ROLE_ARN) — confirm which role is intended here.
pipeline.upsert(role_arn)

2025-07-29 07:36:16,145 sagemaker.remote_function INFO     Uploading serialized function code to s3://sagemaker-us-west-2-957849199024/VLM-fine-tune-pipeline/upload-images-to-s3-step/2025-07-29-07-36-13-926/function
2025-07-29 07:36:16,674 sagemaker.remote_function INFO     Uploading serialized function arguments to s3://sagemaker-us-west-2-957849199024/VLM-fine-tune-pipeline/upload-images-to-s3-step/2025-07-29-07-36-13-926/arguments
2025-07-29 07:36:18,455 sagemaker.remote_function INFO     Uploading serialized function code to s3://sagemaker-us-west-2-957849199024/VLM-fine-tune-pipeline/upload-jsonl-to-s3-step/2025-07-29-07-36-13-926/function
2025-07-29 07:36:19,055 sagemaker.remote_function INFO     Uploading serialized function arguments to s3://sagemaker-us-west-2-957849199024/VLM-fine-tune-pipeline/upload-jsonl-to-s3-step/2025-07-29-07-36-13-926/arguments
2025-07-29 07:36:19,841 sagemaker.remote_function INFO     Uploading serialized function code to s3://sagemaker-us-west-2-9578

{'PipelineArn': 'arn:aws:sagemaker:us-west-2:957849199024:pipeline/VLM-fine-tune-pipeline',
 'PipelineVersionId': 3,
 'ResponseMetadata': {'RequestId': '7c72cebd-a5c3-4238-bdc7-132178f3246c',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '7c72cebd-a5c3-4238-bdc7-132178f3246c',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '112',
   'date': 'Tue, 29 Jul 2025 07:36:34 GMT'},
  'RetryAttempts': 0}}

In [117]:
# Start a new execution of the pipeline; the returned handle is used by the cells below.
execution = pipeline.start()

In [118]:
# Show the current state of the execution (status, timestamps, creator).
execution.describe()

{'PipelineArn': 'arn:aws:sagemaker:us-west-2:957849199024:pipeline/VLM-fine-tune-pipeline',
 'PipelineExecutionArn': 'arn:aws:sagemaker:us-west-2:957849199024:pipeline/VLM-fine-tune-pipeline/execution/fubaupev2olz',
 'PipelineExecutionDisplayName': 'execution-1753774609344',
 'PipelineExecutionStatus': 'Executing',
 'PipelineExperimentConfig': {'ExperimentName': 'vlm-fine-tune-pipeline',
  'TrialName': 'fubaupev2olz'},
 'CreationTime': datetime.datetime(2025, 7, 29, 16, 36, 49, 277000, tzinfo=tzlocal()),
 'LastModifiedTime': datetime.datetime(2025, 7, 29, 16, 36, 49, 277000, tzinfo=tzlocal()),
 'CreatedBy': {'IamIdentity': {'Arn': 'arn:aws:iam::957849199024:user/doodoo77',
   'PrincipalId': 'AIDA56BCC4GYA5YKXUHTR'}},
 'LastModifiedBy': {'IamIdentity': {'Arn': 'arn:aws:iam::957849199024:user/doodoo77',
   'PrincipalId': 'AIDA56BCC4GYA5YKXUHTR'}},
 'PipelineVersionId': 3,
 'ResponseMetadata': {'RequestId': 'ba04d8d5-33d8-4aaf-b701-7e6f0b2d35b3',
  'HTTPStatusCode': 200,
  'HTTPHeaders': 

Wait for the pipeline to finish execution.

In [121]:
# Wait for the execution started above and report its final status.
sm = boto3.client("sagemaker", region_name="us-west-2")
# Use the ARN of the running execution instead of a hand-pasted placeholder.
arn = execution.arn

# Poll the execution directly once a minute; "Executing" and "Stopping" are the
# only states in which it has not finished yet.
while True:
    resp = sm.describe_pipeline_execution(PipelineExecutionArn=arn)
    status = resp["PipelineExecutionStatus"]
    if status not in ("Executing", "Stopping"):
        break
    time.sleep(60)

if status == "Succeeded":
    print("✅ 파이프라인 완료")
else:
    print("❌ 상태:", status)
    print("실패 사유:", resp.get("FailureReason", "제공된 실패 사유 없음"))


INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


In [120]:
%%time
# Block until the execution finishes, with delay=60 between polls and at most 250 attempts.
execution.wait(delay=60, max_attempts=250)

In [None]:
# List the steps of this execution.
execution.list_steps()

In [None]:
# Print the value returned by the "model-testing-step" step (the text returned by test_model).
print(execution.result(step_name="model-testing-step"))