<a href="https://colab.research.google.com/github/frank-morales2020/MLxDL/blob/main/Fine_Tune_squad2_Mistral_Sagemaker_Jumpstart.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install colab-env --quiet
#!pip install ipywidgets==7.0.0 --quiet
!pip install sagemaker boto3 --quiet

#%pip install langchain==0.0.309 --quiet --root-user-action=ignore
%pip install langchain --quiet


In [None]:
import boto3
import colab_env
import os
import sagemaker
from sagemaker.jumpstart.model import JumpStartModel

aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID")
aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY")
region=os.getenv("AWS_DEFAULT_REGION")
output=os.getenv("AWS_DEFAULT_OUTPUT")

print(aws_access_key_id)
print(aws_secret_access_key)
print(region)
print(output)

## ANALYTICS

In [None]:
import sagemaker
from sagemaker.analytics import TrainingJobAnalytics
import pandas as pd
from sagemaker.jumpstart.estimator import JumpStartEstimator
import boto3
import os
from sagemaker.jumpstart.model import hyperparameters
import matplotlib.pyplot as plt

def analyze_eval_loss_and_plot(estimator, eval_loss_metric_name=None):
    """
    Fetches and plots evaluation loss from a SageMaker training job started by a JumpStartEstimator.

    Args:
        estimator (JumpStartEstimator): The trained JumpStartEstimator object.
        eval_loss_metric_name (str, optional): The name of the evaluation loss metric.
                                             If None, it will attempt to find a common
                                             evaluation loss metric name.
                                             Defaults to None.
    """
    try:
        # 1. Get the training job name
        if estimator.latest_training_job:
            training_job_name = estimator.latest_training_job.job_name
        else:
            print("Error: No training job found for this estimator.")
            return

        # 2. Determine the evaluation loss metric name
        if eval_loss_metric_name is None:
            # Default evaluation loss metric names to try
            default_names = ['validation:loss', 'eval_loss', 'eval:loss']
            analyzer = TrainingJobAnalytics(training_job_name=training_job_name)
            available_metrics = analyzer.dataframe()['metric_name'].unique()
            eval_loss_metric_name = None
            for name in default_names:
                if name in available_metrics:
                    eval_loss_metric_name = name
                    break
            if eval_loss_metric_name is None:
                print("Warning: Could not find a standard evaluation loss metric name.")
                print("Available metrics:", available_metrics)
                return

        # 3. Get the evaluation loss metric
        analyzer = TrainingJobAnalytics(training_job_name=training_job_name,
                                        metric_names=[eval_loss_metric_name])
        df = analyzer.dataframe()

        if df.empty:
            print(f"Warning: No data found for metric '{eval_loss_metric_name}'.")
            return

        # 4. Plotting the evaluation loss over time
        plt.plot(df['timestamp'], df['value'])
        plt.xlabel('Timestamp')
        plt.ylabel('Evaluation Loss')
        plt.title('Evaluation Loss Over Time')
        plt.show()

    except Exception as e:
        print(f"Error analyzing training job: {e}")
        return

# --- Complete Example Usage with JumpStartEstimator ---

# 1. Instantiate and train your JumpStartEstimator

# Get IAM role ARN
iam_client = boto3.client("iam")
role = iam_client.get_role(RoleName=os.getenv("ROLENAME"))
ROLE_ARN = role['Role']['Arn']

model_id = 'huggingface-llm-mixtral-8x7b'
model_version = '1.2.0'

# Retrieve default hyperparameters and then override
my_hyperparameters = hyperparameters.retrieve_default(model_id=model_id, model_version=model_version)
my_hyperparameters["epoch"] = "2"
my_hyperparameters["per_device_train_batch_size"] = "2"
my_hyperparameters["gradient_accumulation_steps"] = "2"
my_hyperparameters["instruction_tuned"] = "True"

instruction_tuned_estimator = JumpStartEstimator(
    model_id=model_id,
    model_version=model_version,
    hyperparameters=my_hyperparameters,
    instance_type="ml.g5.12xlarge",  # Or your desired instance type
    role=ROLE_ARN,
)

instruction_tuned_estimator.fit()  # Train the estimator

# 2. Analyze and plot the evaluation loss
analyze_eval_loss_and_plot(instruction_tuned_estimator)

In [None]:
import sagemaker
from sagemaker.analytics import TrainingJobAnalytics
import pandas as pd
from sagemaker.jumpstart.estimator import JumpStartEstimator
import boto3
import os
from sagemaker.jumpstart.model import hyperparameters  # Import hyperparameters

def analyze_jumpstart_training_job_metrics(estimator, metrics_to_analyze=None):
    """
    Fetches and analyzes metrics from a SageMaker training job started by a JumpStartEstimator.

    Args:
        estimator (JumpStartEstimator): The trained JumpStartEstimator object.
        metrics_to_analyze (list, optional): A list of specific metric names to retrieve.
                                             If None, all available metrics are fetched.
                                             Defaults to None.

    Returns:
        pandas.DataFrame: A DataFrame containing the training job metrics, or None
                          if there's an error or if the estimator hasn't trained.
    """
    try:
        # 1. Get the training job name from the JumpStartEstimator
        # Check if a training job has been run
        if estimator.latest_training_job:
            training_job_name = estimator.latest_training_job.job_name
        else:
            print("Error: No training job found for this estimator.")
            return None  # Indicate no data available

        # 2. Get the training job analytics
        if metrics_to_analyze:
            analyzer = TrainingJobAnalytics(
                training_job_name=training_job_name,
                metric_names=metrics_to_analyze
            )
        else:
            analyzer = TrainingJobAnalytics(training_job_name=training_job_name)

        df = analyzer.dataframe()

        # 3. Basic Data Inspection (Adapt as needed)
        print(f"Successfully retrieved metrics for job: {training_job_name}")
        print("First 5 rows of the data:")
        print(df.head())
        print("\nColumn information:")
        print(df.info())

        return df

    except Exception as e:
        print(f"Error analyzing training job: {e}")
        return None

# --- Example Usage with JumpStartEstimator ---

# 1. Instantiate and train your JumpStartEstimator (using your values)
# Get IAM role ARN
iam_client = boto3.client("iam")
role = iam_client.get_role(RoleName=os.getenv("ROLENAME"))
ROLE_ARN = role['Role']['Arn']

model_id = 'huggingface-llm-mixtral-8x7b'
model_version = '1.2.0'

# Retrieve default hyperparameters and then override
my_hyperparameters = hyperparameters.retrieve_default(model_id=model_id, model_version=model_version)
my_hyperparameters["epoch"] = "2"
my_hyperparameters["per_device_train_batch_size"] = "2"
my_hyperparameters["gradient_accumulation_steps"] = "2"
my_hyperparameters["instruction_tuned"] = "True"

instruction_tuned_estimator = JumpStartEstimator(
    model_id=model_id,
    model_version=model_version,
    hyperparameters=my_hyperparameters,
    instance_type="ml.g5.12xlarge",  # Or your desired instance type
    role=ROLE_ARN,
)

instruction_tuned_estimator.fit() # Train the estimator

# 2.  Optionally, specify metrics to analyze
metrics_of_interest = ['loss', 'accuracy', 'validation_loss']

# 3.  Analyze the training job metrics
metrics_df = analyze_jumpstart_training_job_metrics(instruction_tuned_estimator, metrics_of_interest)

# 4.  Further Analysis (example)
if metrics_df is not None:
    # Example: Calculate the average loss
    if 'loss' in metrics_df.columns:
        average_loss = metrics_df['loss'].mean()
        print(f"\nAverage loss: {average_loss}")

    # You can add more analysis here, like plotting, filtering, etc.

In [None]:
from sagemaker.analytics import TrainingJobAnalytics

# https://github.com/aws-samples/amazon-sagemaker-tuneranalytics-samples/blob/master/SageMaker-Tuning-Job-Analytics.ipynb

def job_metric(jobname, metric_name):
    jobdf = TrainingJobAnalytics(
        training_job_name=jobname,
        metric_names=[metric_name]).dataframe()

    jobdf['TrainingJobName'] = [jobname]*len(jobdf)

    return jobdf

print('Pick one of the following metrics for the next cell:')

#training_job_name = instruction_tuned_estimator.latest_training_job.job_name

#amplejob = TrainingJobAnalytics(training_job_name=results['TrainingJobName'][0])


#metrics = samplejob.dataframe()['metric_name'].unique()
#print('\n'.join([f'   - {metric}' for metric in metrics if metric != 'ObjectiveMetric']))

#Pick one of the following metrics for the next cell:
#   - validation:pixel_accuracy
#   - validation:throughput
#   - train:loss
#   - validation:mIOU
#   - train:throughput

In [None]:
import sagemaker
from sagemaker.analytics import TrainingJobAnalytics
import pandas as pd

def analyze_training_job_metrics(training_job_name, metrics_to_analyze=None):
    """
    Fetches and analyzes metrics from a SageMaker training job.

    Args:
        training_job_name (str): The name of the SageMaker training job.
        metrics_to_analyze (list, optional): A list of specific metric names to retrieve.
                                             If None, all available metrics are fetched.
                                             Defaults to None.

    Returns:
        pandas.DataFrame: A DataFrame containing the training job metrics, or None
                          if there's an error.
    """

    try:
        # 1.  Get the training job analytics
        if metrics_to_analyze:
            analyzer = TrainingJobAnalytics(
                training_job_name=training_job_name,
                metric_names=metrics_to_analyze
            )
        else:
            analyzer = TrainingJobAnalytics(training_job_name=training_job_name)

        df = analyzer.dataframe()

        # 2. Basic Data Inspection (Adapt as needed)
        print(f"Successfully retrieved metrics for job: {training_job_name}")
        print("First 5 rows of the data:")
        print(df.head())
        print("\nColumn information:")
        print(df.info())

        return df

    except Exception as e:
        print(f"Error analyzing training job '{training_job_name}': {e}")
        return None

# --- Example Usage with estimator ---

# Assuming you have a trained estimator object named 'instruction_tuned_estimator'
# 1.  Get the training job name from the estimator
try:
    job_name = instruction_tuned_estimator.latest_training_job.job_name
except AttributeError:
    print("Error: 'instruction_tuned_estimator' does not have the attribute 'latest_training_job'.")
    print("Please ensure your estimator object is correctly defined and has run a training job.")
    exit()

# 2.  Optionally, specify metrics to analyze
metrics_of_interest = ['loss', 'accuracy', 'validation_loss']

# 3.  Analyze the training job
metrics_df = analyze_training_job_metrics(job_name, metrics_of_interest)

# 4.  Further Analysis (example)
if metrics_df is not None:
    # Example: Calculate the average loss
    if 'loss' in metrics_df.columns:
        average_loss = metrics_df['loss'].mean()
        print(f"\nAverage loss: {average_loss}")

    # You can add more analysis here, like plotting, filtering, etc.

## FINE TUNING

In [3]:


iam_client = boto3.client("iam")

role = iam_client.get_role(
    RoleName=os.getenv("ROLENAME")
)

ROLE_ARN = role['Role']['Arn']

In [4]:
iam_client = boto3.client("iam")

role = iam_client.get_role(
    RoleName=os.getenv("ROLENAME")
)

ROLE_ARN = role['Role']['Arn']

In [5]:
sess = sagemaker.Session()
sagemaker_session_bucket=None
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

#print(f"sagemaker role arn: {role}")
#print(f"sagemaker bucket: {sess.default_bucket()}")
#print(f"sagemaker session region: {sess.boto_region_name}")

In [6]:
model_id = 'huggingface-llm-mixtral-8x7b'
model_version = '1.2.0'

In [7]:
%%time
from sagemaker.jumpstart.model import JumpStartModel

my_model = JumpStartModel(model_id=model_id, model_version=model_version, role=ROLE_ARN, region='us-east-1')

#my_model = JumpStartModel(model_id=model_id)
predictor = my_model.deploy()

Using vulnerable JumpStart model 'huggingface-llm-mixtral-8x7b' and version '1.2.0'.


-----------!CPU times: user 1.65 s, sys: 167 ms, total: 1.81 s
Wall time: 6min 4s


In [8]:
%%time

prompt = "Tell me about Amazon SageMaker."

payload = {
    "inputs": prompt,
    "parameters": {
        "do_sample": True,
        "top_p": 0.9,
        "temperature": 0.8,
        "max_new_tokens": 1024,
        "stop": ["<|endoftext|>", "</s>"]
    }
}

response = predictor.predict(payload)
print(response[0]["generated_text"])



Amazon SageMaker is a managed service for developers and data scientists to build, train and deploy machine learning models. It removes the heavy lifting from each step of the machine learning process to make it easier to develop high quality models. Amazon SageMaker enables developers and data scientists to easily connect to their data, use popular libraries to explore their data, train and tune models, and then seamlessly deploy these models with a single click.

Why should I consider using Amazon SageMaker?

Building a machine learning model today can be a complex process, requiring many different technologies and tools. Amazon SageMaker has machine learning capabilities built in and is designed to make it easier and faster to build machine learning models. Amazon SageMaker takes care of the heavy lifting required to build, train, and deploy your machine learning models, so you can focus on your business.

How does Amazon SageMaker help me build machine learning models?

Amazon Sa

In [9]:
def query_endpoint(payload):
    """Query endpoint and print the response"""
    response = predictor.predict(payload)
    print(f"\033[1m Input:\033[0m {payload['inputs']}")
    print(f"\033[1m Output:\033[0m {response[0]['generated_text']}")

In [10]:
# Code generation
payload = {"inputs": "Write a program to compute factorial in python:", "parameters":{"max_new_tokens": 200}}
query_endpoint(payload)

[1m Input:[0m Write a program to compute factorial in python:
[1m Output:[0m 

```
#!/usr/bin/python

def factorial(n):
    if n == 0:
        return 1
    else:
        return n * factorial(n-1)

print(factorial(5))
```

Output:

```
120
```

## Explanation

The factorial of a number is the product of all positive integers less than or equal to that number. For example, the factorial of 5 is 5 * 4 * 3 * 2 * 1, which is 120.

In this program, we define a function called factorial that takes a single argument n. If n is 0, we return 1. Otherwise, we return n multiplied by the factorial of n-1.

We then call the factorial function with the argument 5


In [11]:
payload = {
    "inputs": "Building a website can be done in 10 simple steps:",
    "parameters":{
        "max_new_tokens": 110,
        "no_repeat_ngram_size": 3
        }
}
query_endpoint(payload)

[1m Input:[0m Building a website can be done in 10 simple steps:
[1m Output:[0m 

1. Choose a domain name
2. Choose a web hosting company
3. Choose a website builder
4. Choose a template
5. Add your content
6. Add your images
7. Add your videos
8. Add your social media links
9. Add your contact information
10. Publish your website

## 1. Choose a domain name

The first step in building a website is to choose a domain name. This is the address that people will use to find your website. It should


In [12]:
# Translation
payload = {
    "inputs": """Translate English to French:

    sea otter => loutre de mer

    peppermint => menthe poivrée

    plush girafe => girafe peluche

    cheese =>""",
    "parameters":{
        "max_new_tokens": 3
    }
}

query_endpoint(payload)

[1m Input:[0m Translate English to French:

    sea otter => loutre de mer

    peppermint => menthe poivrée

    plush girafe => girafe peluche

    cheese =>
[1m Output:[0m  fromage



In [13]:
# Sentiment-analysis
payload = {
    "inputs": """"I hate it when my phone battery dies."
                Sentiment: Negative
                ###
                Tweet: "My day has been :+1:"
                Sentiment: Positive
                ###
                Tweet: "This is the link to the article"
                Sentiment: Neutral
                ###
                Tweet: "This new music video was incredibile"
                Sentiment:""",
    "parameters": {
        "max_new_tokens":2
    }
}
query_endpoint(payload)

[1m Input:[0m "I hate it when my phone battery dies."
                Sentiment: Negative
                ###
                Tweet: "My day has been :+1:"
                Sentiment: Positive
                ###
                Tweet: "This is the link to the article"
                Sentiment: Neutral
                ###
                Tweet: "This new music video was incredibile"
                Sentiment:
[1m Output:[0m  Positive


In [14]:
# Question answering
payload = {
    "inputs": "Could you remind me when was the C programming language invented?",
    "parameters":{
        "max_new_tokens": 50
    }
}
query_endpoint(payload)

[1m Input:[0m Could you remind me when was the C programming language invented?
[1m Output:[0m 

C was invented in 1972 by Dennis Ritchie at Bell Labs.

C is a general-purpose programming language. It was invented to write the UNIX operating system.

C is a structured


In [15]:
# Recipe generation
payload = {"inputs": "What is the recipe for a delicious lemon cheesecake?", "parameters":{"max_new_tokens": 400}}
query_endpoint(payload)

[1m Input:[0m What is the recipe for a delicious lemon cheesecake?
[1m Output:[0m 

- Instructions Preheat the oven to 350 degrees Fahrenheit (175 degrees C). In a medium mixing bowl, combine the graham cracker crumbs, 1/4 cup sugar, and melted butter. Mix thoroughly. Bake for 10 minutes in the preheated oven, or until the edges are barely beginning to brown. Allow to cool fully before serving.

## How do you make a cheesecake from scratch?

Ingredients

1. 1 1/2 cups graham cracker crumbs
2. 1/3 cup melted butter
3. 1/4 cup sugar
4. 3 (8 ounce) packages cream cheese
5. 1 cup sugar
6. 3 eggs
7. 1 cup sour cream
8. 1 teaspoon vanilla extract

## How do you make a cheesecake from scratch without a springform pan?

If you don’t have a springform pan, you may use a regular cake pan instead.

1. Prepare the cake pan by lining it with parchment paper.
2. Prepare the cake pan by lining it with parchment paper.
3. Prepare the cake pan by lining it with parchment paper.
4. Prepare the cake 

In [16]:
# Summarization

payload = {
    "inputs":"""Starting today, the state-of-the-art Falcon 40B foundation model from Technology
    Innovation Institute (TII) is available on Amazon SageMaker JumpStart, SageMaker's machine learning (ML) hub
    that offers pre-trained models, built-in algorithms, and pre-built solution templates to help you quickly get
    started with ML. You can deploy and use this Falcon LLM with a few clicks in SageMaker Studio or
    programmatically through the SageMaker Python SDK.
    Falcon 40B is a 40-billion-parameter large language model (LLM) available under the Apache 2.0 license that
    ranked #1 in Hugging Face Open LLM leaderboard, which tracks, ranks, and evaluates LLMs across multiple
    benchmarks to identify top performing models. Since its release in May 2023, Falcon 40B has demonstrated
    exceptional performance without specialized fine-tuning. To make it easier for customers to access this
    state-of-the-art model, AWS has made Falcon 40B available to customers via Amazon SageMaker JumpStart.
    Now customers can quickly and easily deploy their own Falcon 40B model and customize it to fit their specific
    needs for applications such as translation, question answering, and summarizing information.
    Falcon 40B are generally available today through Amazon SageMaker JumpStart in US East (Ohio),
    US East (N. Virginia), US West (Oregon), Asia Pacific (Tokyo), Asia Pacific (Seoul), Asia Pacific (Mumbai),
    Europe (London), Europe (Frankfurt), Europe (Ireland), and Canada (Central),
    with availability in additional AWS Regions coming soon. To learn how to use this new feature,
    please see SageMaker JumpStart documentation, the Introduction to SageMaker JumpStart –
    Text Generation with Falcon LLMs example notebook, and the blog Technology Innovation Institute trainsthe
    state-of-the-art Falcon LLM 40B foundation model on Amazon SageMaker. Summarize the article above:""",
    "parameters":{
        "max_new_tokens":200
        }
    }
query_endpoint(payload)

[1m Input:[0m Starting today, the state-of-the-art Falcon 40B foundation model from Technology
    Innovation Institute (TII) is available on Amazon SageMaker JumpStart, SageMaker's machine learning (ML) hub
    that offers pre-trained models, built-in algorithms, and pre-built solution templates to help you quickly get
    started with ML. You can deploy and use this Falcon LLM with a few clicks in SageMaker Studio or
    programmatically through the SageMaker Python SDK.
    Falcon 40B is a 40-billion-parameter large language model (LLM) available under the Apache 2.0 license that
    ranked #1 in Hugging Face Open LLM leaderboard, which tracks, ranks, and evaluates LLMs across multiple
    benchmarks to identify top performing models. Since its release in May 2023, Falcon 40B has demonstrated
    exceptional performance without specialized fine-tuning. To make it easier for customers to access this
    state-of-the-art model, AWS has made Falcon 40B available to customers via Amaz

### Preparing training data
We will use a subset of SQuAD2.0 for supervised fine-tuning. This dataset contains questions posed by human annotators on a set of Wikipedia articles. In addition to questions with answers, SQuAD2.0 contains about 50k unanswerable questions. Such questions are plausible, but cannot be directly answered from the articles' content. We only use unanswerable questions for our task.

Citation: @article{rajpurkar2018know, title={Know what you don't know: Unanswerable questions for SQuAD}, author={Rajpurkar, Pranav and Jia, Robin and Liang, Percy}, journal={arXiv preprint arXiv:1806.03822}, year={2018} }

License: Creative Commons Attribution-ShareAlike License (CC BY-SA 4.0)

In [17]:
import colab_env
import boto3
import os
import sagemaker
from sagemaker.jumpstart.model import JumpStartModel

iam = boto3.client("iam")

role = iam.get_role(
    RoleName=os.getenv("ROLENAME")
)

ROLE_ARN = role['Role']['Arn']

In [18]:
sess = sagemaker.Session()
sagemaker_session_bucket=None
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

sagemaker role arn: {'Role': {'Path': '/', 'RoleName': 'POC_GAI', 'RoleId': 'AROAVXV6JI27GVP4V6HF2', 'Arn': 'arn:aws:iam::394462316222:role/POC_GAI', 'CreateDate': datetime.datetime(2023, 12, 10, 22, 15, 54, tzinfo=tzlocal()), 'AssumeRolePolicyDocument': {'Version': '2012-10-17', 'Statement': [{'Sid': '', 'Effect': 'Allow', 'Principal': {'Service': 'sagemaker.amazonaws.com'}, 'Action': 'sts:AssumeRole'}]}, 'Description': 'Allows SageMaker notebook instances, training jobs, and models to access S3, ECR, and CloudWatch on your behalf.', 'MaxSessionDuration': 3600, 'RoleLastUsed': {'LastUsedDate': datetime.datetime(2025, 4, 7, 11, 33, 23, tzinfo=tzlocal()), 'Region': 'us-east-1'}}, 'ResponseMetadata': {'RequestId': '259a383b-b665-4bf2-b658-af1c079d829a', 'HTTPStatusCode': 200, 'HTTPHeaders': {'date': 'Mon, 07 Apr 2025 11:38:03 GMT', 'x-amzn-requestid': '259a383b-b665-4bf2-b658-af1c079d829a', 'content-type': 'text/xml', 'content-length': '1087'}, 'RetryAttempts': 0}}
sagemaker bucket: sage

In [19]:
import boto3
import sagemaker
import json

# Get current region, role, and default bucket
aws_region = boto3.Session().region_name
aws_role = sagemaker.session.Session().get_caller_identity_arn()
output_bucket = sagemaker.Session().default_bucket()

# This will be useful for printing
newline, bold, unbold = "\n", "\033[1m", "\033[0m"

print(f"{bold}aws_region:{unbold} {aws_region}")
print(f"{bold}aws_role:{unbold} {aws_role}")
print(f"{bold}output_bucket:{unbold} {output_bucket}")



[1maws_region:[0m us-east-1
[1maws_role:[0m arn:aws:iam::394462316222:root
[1moutput_bucket:[0m sagemaker-us-east-1-394462316222


In [20]:
!ls -alh /content/gdrive/MyDrive/datasets/train-v2.0.json

-rw------- 1 root root 41M Mar  6  2024 /content/gdrive/MyDrive/datasets/train-v2.0.json


In [21]:
from sagemaker.s3 import S3Downloader

# /content/gdrive/MyDrive/datasets/train-v2.0.json
# We will use the train split of SQuAD2.0
original_data_file = "train-v2.0.json"


# The data was mirrored in the following bucket
original_data_location = (
    f"s3://{sess.default_bucket()}/{original_data_file}"
)

print(original_data_location)


S3Downloader.download(original_data_location, ".")

s3://sagemaker-us-east-1-394462316222/train-v2.0.json


['./train-v2.0.json']

In [22]:
template = {
    "prompt": "Ask a question which is related to the following text, but cannot be answered based on the text. Text: {context}",
    "completion": "{question}",
}

with open("template.json", "w") as f:
    json.dump(template, f)

In [23]:
local_data_file = "task-data.jsonl"  # any name with .jsonl extension

with open(original_data_file) as f:
    data = json.load(f)

def preprocess_data(local_data_file, data, num_maximum_example):
    num_example_idx = 0
    with open(local_data_file, "w") as f:
        for article in data["data"]:
            for paragraph in article["paragraphs"]:
                # iterate over questions for a given paragraph
                for qas in paragraph["qas"]:
                    if qas["is_impossible"]:
                        # the question is relevant, but cannot be answered
                        example = {"context": paragraph["context"], "question": qas["question"]}
                        json.dump(example, f)
                        f.write("\n")
                        num_example_idx += 1
                        if num_example_idx >= num_maximum_example:
                            return

preprocess_data(local_data_file=local_data_file, data=data, num_maximum_example=10000)

In [24]:
from sagemaker.s3 import S3Uploader

training_dataset_s3_path = f"s3://{output_bucket}/train_data"
S3Uploader.upload(local_data_file, training_dataset_s3_path)
S3Uploader.upload("template.json", training_dataset_s3_path)
print(f"{bold}training data:{unbold} {training_dataset_s3_path}")

[1mtraining data:[0m s3://sagemaker-us-east-1-394462316222/train_data


### Prepare training parameters

In [25]:
from sagemaker import hyperparameters


model_id = 'huggingface-llm-mixtral-8x7b'
model_version = '*'

my_hyperparameters = hyperparameters.retrieve_default(model_id=model_id, model_version=model_version)
print(my_hyperparameters)

Using model 'huggingface-llm-mixtral-8x7b' with wildcard version identifier '*'. You can pin to version '1.18.2' for more stable results. Note that models may have different input/output signatures after a major version upgrade.


{'peft_type': 'lora', 'instruction_tuned': 'False', 'chat_dataset': 'False', 'epoch': '3', 'learning_rate': '0.0001', 'lora_r': '64', 'lora_alpha': '16', 'lora_dropout': '0', 'bits': '4', 'double_quant': 'True', 'quant_type': 'nf4', 'per_device_train_batch_size': '2', 'per_device_eval_batch_size': '8', 'add_input_output_demarcation_key': 'True', 'warmup_ratio': '0.1', 'train_from_scratch': 'False', 'fp16': 'False', 'bf16': 'True', 'evaluation_strategy': 'steps', 'eval_steps': '20', 'gradient_accumulation_steps': '8', 'logging_steps': '8', 'weight_decay': '0.2', 'load_best_model_at_end': 'True', 'max_train_samples': '-1', 'max_val_samples': '-1', 'seed': '10', 'max_input_length': '-1', 'validation_split_ratio': '0.2', 'train_data_split_seed': '0', 'preprocessing_num_workers': 'None', 'max_steps': '-1', 'gradient_checkpointing': 'False', 'early_stopping_patience': '3', 'early_stopping_threshold': '0.0', 'adam_beta1': '0.9', 'adam_beta2': '0.999', 'adam_epsilon': '1e-08', 'max_grad_norm':

Overwrite the hyperparameters

In [26]:
my_hyperparameters["epoch"] = "2"
my_hyperparameters["per_device_train_batch_size"] = "2"
my_hyperparameters["gradient_accumulation_steps"] = "2"
my_hyperparameters["instruction_tuned"] = "True"
print(my_hyperparameters)

{'peft_type': 'lora', 'instruction_tuned': 'True', 'chat_dataset': 'False', 'epoch': '2', 'learning_rate': '0.0001', 'lora_r': '64', 'lora_alpha': '16', 'lora_dropout': '0', 'bits': '4', 'double_quant': 'True', 'quant_type': 'nf4', 'per_device_train_batch_size': '2', 'per_device_eval_batch_size': '8', 'add_input_output_demarcation_key': 'True', 'warmup_ratio': '0.1', 'train_from_scratch': 'False', 'fp16': 'False', 'bf16': 'True', 'evaluation_strategy': 'steps', 'eval_steps': '20', 'gradient_accumulation_steps': '2', 'logging_steps': '8', 'weight_decay': '0.2', 'load_best_model_at_end': 'True', 'max_train_samples': '-1', 'max_val_samples': '-1', 'seed': '10', 'max_input_length': '-1', 'validation_split_ratio': '0.2', 'train_data_split_seed': '0', 'preprocessing_num_workers': 'None', 'max_steps': '-1', 'gradient_checkpointing': 'False', 'early_stopping_patience': '3', 'early_stopping_threshold': '0.0', 'adam_beta1': '0.9', 'adam_beta2': '0.999', 'adam_epsilon': '1e-08', 'max_grad_norm': 

Validate hyperparameters

In [27]:
hyperparameters.validate(model_id=model_id, model_version=model_version, hyperparameters=my_hyperparameters)

### Starting training

Note. The parameter `load_best_model_at_end` (Whether or not to load the best model found during training at the end of training. When this option is enabled, the best checkpoint will always be saved) is set as "True" by default. During loading the best model checkpoints at the end of training (HuggingFace will load the best model checkpoints before saving it), there is overhead of memory usage which can lead to Out-Of-Memory error.

If setting `load_best_model_at_end`, we recommend to use `ml.g5.48xlarge`; if not, we recommend to use `ml.g5.12xlarge`.

In [None]:
from sagemaker.jumpstart.estimator import JumpStartEstimator

instruction_tuned_estimator = JumpStartEstimator(
    model_id=model_id,
    hyperparameters=my_hyperparameters,
    #instance_type="ml.g5.48xlarge",
    instance_type="ml.g5.12xlarge",
    role=ROLE_ARN,
)
instruction_tuned_estimator.fit(
    {"train": training_dataset_s3_path}, logs=True
)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[34m18%|█▊        | 45/250 [02:33<12:04,  3.53s/it]#033[A[0m
[34m18%|█▊        | 46/250 [02:36<11:18,  3.33s/it]#033[A[0m
[34m19%|█▉        | 47/250 [02:40<11:54,  3.52s/it]#033[A[0m
[34m19%|█▉        | 48/250 [02:43<11:43,  3.48s/it]#033[A[0m
[34m20%|█▉        | 49/250 [02:48<12:28,  3.72s/it]#033[A[0m
[34m20%|██        | 50/250 [02:51<12:02,  3.61s/it]#033[A[0m
[34m20%|██        | 51/250 [02:54<11:35,  3.49s/it]#033[A[0m
[34m21%|██        | 52/250 [02:58<11:56,  3.62s/it]#033[A[0m
[34m21%|██        | 53/250 [03:02<12:11,  3.71s/it]#033[A[0m
[34m22%|██▏       | 54/250 [03:06<12:09,  3.72s/it]#033[A[0m
[34m22%|██▏       | 55/250 [03:09<11:20,  3.49s/it]#033[A[0m
[34m22%|██▏       | 56/250 [03:13<11:43,  3.62s/it]#033[A[0m
[34m23%|██▎       | 57/250 [03:15<10:46,  3.35s/it]#033[A[0m
[34m23%|██▎       | 58/250 [03:19<10:46,  3.37s/it]#033[A[0m
[34m24%|██▎       | 59/250 [03:22<10:25,  3.28s/it]#

Extract Training performance metrics. Performance metrics such as training loss and validation accuracy/loss can be accessed through cloudwatch while the training. We can also fetch these metrics and analyze them within the notebook.

In [None]:
from sagemaker import TrainingJobAnalytics

training_job_name = instruction_tuned_estimator.latest_training_job.job_name

df = TrainingJobAnalytics(training_job_name=training_job_name).dataframe()
df.head(10)

### Deploying inference endpoints

In [None]:
instruction_tuned_predictor = instruction_tuned_estimator.deploy()

### Running inference queries and compare model performances

We examine three examples as listed in variable `test_paragraphs`. The prompt as defined in variable `prompt` asks the model to ask a question based on the context and make sure the question **cannot** be answered from the context.


In [None]:
prompt = "Ask a question which is related to the following text, but cannot be answered based on the text. Text: {context}"

# Sources: Wikipedia, AWS Documentation
test_paragraphs = [
    """
Adelaide is the capital city of South Australia, the state's largest city and the fifth-most populous city in Australia. "Adelaide" may refer to either Greater Adelaide (including the Adelaide Hills) or the Adelaide city centre. The demonym Adelaidean is used to denote the city and the residents of Adelaide. The Traditional Owners of the Adelaide region are the Kaurna people. The area of the city centre and surrounding parklands is called Tarndanya in the Kaurna language.
Adelaide is situated on the Adelaide Plains north of the Fleurieu Peninsula, between the Gulf St Vincent in the west and the Mount Lofty Ranges in the east. Its metropolitan area extends 20 km (12 mi) from the coast to the foothills of the Mount Lofty Ranges, and stretches 96 km (60 mi) from Gawler in the north to Sellicks Beach in the south.
""",
    """
Amazon Elastic Block Store (Amazon EBS) provides block level storage volumes for use with EC2 instances. EBS volumes behave like raw, unformatted block devices. You can mount these volumes as devices on your instances. EBS volumes that are attached to an instance are exposed as storage volumes that persist independently from the life of the instance. You can create a file system on top of these volumes, or use them in any way you would use a block device (such as a hard drive). You can dynamically change the configuration of a volume attached to an instance.
We recommend Amazon EBS for data that must be quickly accessible and requires long-term persistence. EBS volumes are particularly well-suited for use as the primary storage for file systems, databases, or for any applications that require fine granular updates and access to raw, unformatted, block-level storage. Amazon EBS is well suited to both database-style applications that rely on random reads and writes, and to throughput-intensive applications that perform long, continuous reads and writes.
""",
    """
Amazon Comprehend uses natural language processing (NLP) to extract insights about the content of documents. It develops insights by recognizing the entities, key phrases, language, sentiments, and other common elements in a document. Use Amazon Comprehend to create new products based on understanding the structure of documents. For example, using Amazon Comprehend you can search social networking feeds for mentions of products or scan an entire document repository for key phrases.
You can access Amazon Comprehend document analysis capabilities using the Amazon Comprehend console or using the Amazon Comprehend APIs. You can run real-time analysis for small workloads or you can start asynchronous analysis jobs for large document sets. You can use the pre-trained models that Amazon Comprehend provides, or you can train your own custom models for classification and entity recognition.
All of the Amazon Comprehend features accept UTF-8 text documents as the input. In addition, custom classification and custom entity recognition accept image files, PDF files, and Word files as input.
Amazon Comprehend can examine and analyze documents in a variety of languages, depending on the specific feature. For more information, see Languages supported in Amazon Comprehend. Amazon Comprehend's Dominant language capability can examine documents and determine the dominant language for a far wider selection of languages.
""",
]

In [None]:
parameters = {
    "max_new_tokens": 50,
    "do_sample": True,
    "top_k": 50,
    "top_p": 0.8,
    "do_sample": True,
    "temperature": 0.01,
}

def query_endpoint_with_json_payload(encoded_json, endpoint_name):
    client = boto3.client("runtime.sagemaker")
    response = client.invoke_endpoint(
        EndpointName=endpoint_name, ContentType="application/json", Body=encoded_json
    )
    return response

def parse_response(query_response):
    model_predictions = json.loads(query_response["Body"].read())
    return model_predictions[0]["generated_text"]

def generate_question(endpoint_name, text):
    expanded_prompt = prompt.replace("{context}", text)
    payload = {"inputs": expanded_prompt, "parameters": parameters}
    query_response = query_endpoint_with_json_payload(json.dumps(payload).encode("utf-8"), endpoint_name=endpoint_name)
    generated_texts = parse_response(query_response)
    print(f"Response: {generated_texts}{newline}")

In [None]:
print(f"{bold}Prompt:{unbold} {repr(prompt)}")
for paragraph in test_paragraphs:
    print("-" * 80)
    print(paragraph)
    print("-" * 80)
    print(f"{bold}pre-trained{unbold}")
    generate_question(predictor.endpoint_name, paragraph)
    print(f"{bold}fine-tuned{unbold}")
    generate_question(instruction_tuned_predictor.endpoint_name, paragraph)

### Clean up the endpoint

In [None]:
#  Frank Morales created this cell on December 14, 2023; it fully allows automatically the deletion of endpoints, models, and endpoint configurations.

import colab_env
import os

aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID")
aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY")
aws_region=os.getenv("AWS_DEFAULT_REGION")
aws_output=os.getenv("AWS_DEFAULT_OUTPUT")

import boto3

sagemaker_client = boto3.client('sagemaker', region_name=aws_region)

def cleanup_sagemaker_resources(resource_name,resourceid):

    if resourceid==0:
       response=sagemaker_client.list_endpoints()
    elif resourceid==1:
         response=sagemaker_client.list_models()
    elif resourceid==2:
         response=sagemaker_client.list_endpoint_configs()

    print(resource_name)

    number_of_endpoints=len(response['%s'%resource_name])
    for i in range(number_of_endpoints):
        resource_nametmp='%s'%resource_name[0:len(resource_name)-1]
        print('%sName'%resource_nametmp)
        print(response['%s'%resource_name][i]['%sName'%resource_nametmp])

        if resourceid==0:
           endpoint_name=response['%s'%resource_name][i]['%sName'%resource_nametmp]
           sagemaker_client.delete_endpoint(EndpointName=endpoint_name)
        elif resourceid==1:
           sagemaker_client.delete_model(ModelName=response['Models'][i]['ModelName'])
        elif resourceid==2:
           sagemaker_client.delete_endpoint_config(EndpointConfigName=response['EndpointConfigs'][i]['EndpointConfigName'])

    print("\n==================================\n")


cleanup_sagemaker_resources('Endpoints',0)
cleanup_sagemaker_resources('Models',1)
cleanup_sagemaker_resources('EndpointConfigs',2)

In [None]:
# Delete the SageMaker endpoint
predictor.delete_model()
predictor.delete_endpoint()
instruction_tuned_predictor.delete_model()
instruction_tuned_predictor.delete_endpoint()