## Personalized Marketing Evalation Example

In [None]:
# Prerequisite A. Create a conda/ pyython virtual environment
# !conda env list
# !conda create -y --name marketing-msg-eval python=3.11.8 
# !conda init && activate marketing-msg-eval
# !conda install -n marketing-msg-eval ipykernel --update-deps --force-reinstall -y # if this fails, try running steps in terminal

In [2]:
# Prerequisite B. Install dependencies
# %pip install --force-reinstall --no-cache -r requirements.txt

Collecting boto3 (from -r requirements.txt (line 1))
  Downloading boto3-1.34.154-py3-none-any.whl.metadata (6.6 kB)
Collecting botocore (from -r requirements.txt (line 2))
  Downloading botocore-1.34.154-py3-none-any.whl.metadata (5.7 kB)
Collecting langchain (from -r requirements.txt (line 3))
  Downloading langchain-0.2.12-py3-none-any.whl.metadata (7.1 kB)
Collecting transformers (from -r requirements.txt (line 4))
  Downloading transformers-4.43.4-py3-none-any.whl.metadata (43 kB)
Collecting ipywidgets (from -r requirements.txt (line 5))
  Downloading ipywidgets-8.1.3-py3-none-any.whl.metadata (2.4 kB)
Collecting langchain-community (from -r requirements.txt (line 6))
  Downloading langchain_community-0.2.11-py3-none-any.whl.metadata (2.7 kB)
Collecting anthropic (from -r requirements.txt (line 7))
  Downloading anthropic-0.32.0-py3-none-any.whl.metadata (18 kB)
Collecting pillow<10,>=9.5 (from -r requirements.txt (line 8))
  Downloading Pillow-9.5.0-cp311-cp311-manylinux_2_28_x86

In [10]:
# Prerequisite C. Set environment variables
from dotenv import load_dotenv, find_dotenv
import os

# loading environment variables that are stored in local file dev.env
local_env_filename = 'marketing-msg-eval.env'
load_dotenv(find_dotenv(local_env_filename),override=True)

os.environ['REGION'] = os.getenv('REGION')
os.environ['S3_BUCKET_NAME'] = os.getenv('S3_BUCKET_NAME')
os.environ['WORKTEAM_ARN'] = os.getenv('WORKTEAM_ARN')
os.environ['SAGEMAKER_ROLE_ARN'] = os.getenv('SAGEMAKER_ROLE_ARN')

REGION = os.environ['REGION']
S3_BUCKET_NAME = os.environ['S3_BUCKET_NAME']
WORKTEAM_ARN = os.environ['WORKTEAM_ARN']
SAGEMAKER_ROLE_ARN = os.environ['SAGEMAKER_ROLE_ARN']


In [6]:
# 1. Generate test data
# 1a. Create synthetic data
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

# Define the list of first names, last names, and languages
first_names = ['John', 'Sarah', 'Michael', 'Emily', 'David', 'Jessica', 'William', 'Olivia', 'Christopher', 'Sophia']
last_names = ['Smith', 'Johnson', 'Williams', 'Brown', 'Davis', 'Miller', 'Wilson', 'Anderson', 'Taylor', 'Thompson']
languages = ['English', 'Spanish', 'French', 'German']

# Generate the data
data = []
for i in range(100):
    user_id = f'user_{i+1}'
    first_name = random.choice(first_names)
    last_name = random.choice(last_names)
    language = random.choice(languages)
    age = random.randint(18, 65)
    date = datetime(2023, 1, 1) + timedelta(days=random.randint(0, 364))
    season = 'Spring' if date.month in [3, 4, 5] else ('Summer' if date.month in [6, 7, 8] else ('Fall' if date.month in [9, 10, 11] else 'Winter'))
    app_name = f'App_{random.randint(1, 5)}'
    app_installed = random.choice([True, False])
    app_opened_last = random.randint(0, 30) if app_installed else 0
    app_bonus_time_remaining = random.randint(0, 10) if app_installed else 0
    discount_for_member = random.choice([0.1, 0.15, 0.2]) if app_installed else 0

    data.append([user_id, first_name, last_name, language, age, date, season, app_name, app_installed, app_opened_last, app_bonus_time_remaining, discount_for_member])

# Create the dataframe
df = pd.DataFrame(data, columns=['userid', 'first_name', 'last_name', 'language', 'age', 'date', 'season', 'app_name', 'app_installed', 'app_opened_last', 'app_bonus_time_remaining', 'discount_for_member'])

print(df.head())

   userid first_name last_name language  age       date  season app_name  \
0  user_1      David    Miller  Spanish   22 2023-07-14  Summer    App_3   
1  user_2    William   Johnson   German   57 2023-05-16  Spring    App_5   
2  user_3    Jessica     Smith  Spanish   44 2023-08-08  Summer    App_2   
3  user_4       John    Wilson  Spanish   62 2023-01-07  Winter    App_2   
4  user_5     Sophia  Williams   German   63 2023-09-16    Fall    App_1   

   app_installed  app_opened_last  app_bonus_time_remaining  \
0           True               10                         7   
1          False                0                         0   
2          False                0                         0   
3           True               12                         1   
4           True               19                         4   

   discount_for_member  
0                  0.1  
1                  0.0  
2                  0.0  
3                  0.2  
4                  0.2  


In [7]:
# 1b. Create promptset for above sample data and save it to jsonl file following the below format
# {
#     "prompt": "What is high intensity interval training?",
#     "category": "Fitness",
#     "referenceResponse": "High-Intensity Interval Training (HIIT) is a cardiovascular exercise approach that involves short, intense bursts of exercise followed by brief recovery or rest periods."
# }
import json
from datetime import datetime


# sample prompt template for Claude
claude_prompt_template = '''Human: I want to promote app {app_name}, during {season} to the user {first_name} {last_name} who is {age} years old.
The discount is {discount_for_member}. For example, a discount of 0.5 means 50%. If the discount equals 0.0, then no discount is available.
The value for app_installed is {app_installed} . If the value is False, then include the discount in the push notification if if the discount is greater than 0.0.
The value for app_opened_last is {app_opened_last}. If the value is greater than 7, then urge the user to use the app. 
If {app_bonus_time_remaining} is greater than 0, then include in the message the {app_bonus_time_remaining} hours remaining for an additional bonus.
Include 1 to 3 emoji's in the message that reflect the age of the person ({age} years old) and {season} season.
Please help to write a 1-2 short sentences in {language} to help me to promote the app {app_name} to this customer. 
Only return the sentence and nothing else.

Assistant: '''

# sample prompt template for Titan
titan_lite_prompt_template = '''Please promote app {app_name}, during {season} to the user {first_name} {last_name} who is {age} years old.
The discount is {discount_for_member}. For example, a discount of 0.5 means 50%. If the discount equals 0.0, then no discount is available.
The value for app_installed is {app_installed} . If the value is False, then include the discount in the push notification message if the discount is greater than 0.0.
The value for app_opened_last is {app_opened_last}. If the value is greater than 7, then urge the user to use the app.
If {app_bonus_time_remaining} is greater than 0, then include in the message the {app_bonus_time_remaining} hours remaining for an additional bonus.
Include 1 to 3 emoji's in the message that reflect the age of the person ({age} years old) and {season} season.
Write 1 sentence in {language} to promote the app {app_name} to this customer.
Only return the sentence and nothing else.
'''

# sample prompt template for Mistral
mistral_prompt_template = '''<s>[INST] Please promote app {app_name}, during {season} to the user {first_name} {last_name} who is {age} years old.
The discount is {discount_for_member}. For example, a discount of 0.5 means 50%. If the discount equals 0.0, then no discount is available.
The value for app_installed is {app_installed} . If the value is False, then include the discount in the push notification message if if the discount is greater than 0.0.
The value for app_opened_last is {app_opened_last}. If the value is greater than 7, then urge the user to use the app.
If {app_bonus_time_remaining} is greater than 0, then include in the message the {app_bonus_time_remaining} hours remaining for an additional bonus.
Include 1 to 3 emoji's in the message that reflect the age of the person ({age} years old) and {season} season.
Write 1 to 2 short sentences in {language} to promote the app {app_name} to this customer. 
Only return the sentence and nothing else. [/INST]
'''

promptrows = []

for index, row in df.iterrows():
    claude_prompt = claude_prompt_template.format(app_name=row["app_name"], season=row["season"], first_name=row["first_name"], last_name=row["last_name"], 
                        age=row["age"], discount_for_member=row["discount_for_member"], app_installed=row["app_installed"],
                        app_opened_last=row["app_opened_last"], language=row["language"], app_bonus_time_remaining=row["app_bonus_time_remaining"])

    titan_prompt = titan_lite_prompt_template.format(app_name=row["app_name"], season=row["season"], first_name=row["first_name"], last_name=row["last_name"], 
                        age=row["age"], discount_for_member=row["discount_for_member"], app_installed=row["app_installed"],
                        app_opened_last=row["app_opened_last"], language=row["language"], app_bonus_time_remaining=row["app_bonus_time_remaining"])
    
    mistral_prompt = mistral_prompt_template.format(app_name=row["app_name"], season=row["season"], first_name=row["first_name"], last_name=row["last_name"], 
                        age=row["age"], discount_for_member=row["discount_for_member"], app_installed=row["app_installed"],
                        app_opened_last=row["app_opened_last"], language=row["language"], app_bonus_time_remaining=row["app_bonus_time_remaining"])

    # Convert referenceResponse to a text string
    reference_response = (
        f"{row['first_name']} {row['last_name']} is a {row['age']} years old user of the app {row['app_name']}. "
        f"They are interested in promotions during the {row['season']} season. The user has a discount of "
        f"{row['discount_for_member'] * 100}% for members. The app is {'installed' if row['app_installed'] else 'not installed'}, "
        f"and they last opened the app {row['app_opened_last']} days ago. There are {row['app_bonus_time_remaining']} hours remaining "
        f"for an additional bonus. Communication should be in {row['language']}."
    )

    # use mistral prompt template example  for model evaluation
    promptrows.append({"prompt":mistral_prompt, "category":"Marketing", "referenceResponse": reference_response, "claude_prompt": claude_prompt, "titan_prompt": titan_prompt, "mistral_prompt": mistral_prompt })

print(promptrows)
# File path for the JSONL file
jsonl_file_path = 'data.jsonl'

# Custom function to handle datetime serialization
def datetime_handler(x):
    if isinstance(x, datetime):
        return x.isoformat()
    raise TypeError("Type not serializable")

# Save to JSONL file
with open(jsonl_file_path, 'w') as jsonl_file:
    for item in promptrows:
        jsonl_file.write(json.dumps(item, default=datetime_handler) + '\n')
    

[{'prompt': "<s>[INST] Please promote app App_3, during Summer to the user David Miller who is 22 years old.\nThe discount is 0.1. For example, a discount of 0.5 means 50%. If the discount equals 0.0, then no discount is available.\nThe value for app_installed is True . If the value is False, then include the discount in the push notification message if if the discount is greater than 0.0.\nThe value for app_opened_last is 10. If the value is greater than 7, then urge the user to use the app.\nIf 7 is greater than 0, then include in the message the 7 hours remaining for an additional bonus.\nInclude 1 to 3 emoji's in the message that reflect the age of the person (22 years old) and Summer season.\nWrite 1 to 2 short sentences in Spanish to promote the app App_3 to this customer. \nOnly return the sentence and nothing else. [/INST]\n", 'category': 'Marketing', 'referenceResponse': 'David Miller is a 22 years old user of the app App_3. They are interested in promotions during the Summer 

In [8]:
# 1c. Upload to S3
import boto3
s3_bucket_name = S3_BUCKET_NAME
jsonl_file_path = 'data.jsonl'
object_key = 'custom-datasets/data.jsonl'
region = REGION
# Create an S3 client
s3_client = boto3.client('s3')

# Upload the file
try:
    s3_client.upload_file(jsonl_file_path, s3_bucket_name, object_key)
    print(f"File {jsonl_file_path} uploaded to s3://{s3_bucket_name}/{object_key}")
except Exception as e:
    print(f"Error uploading file: {e}")

File data.jsonl uploaded to s3://marketing-gen-eval/custom-datasets/data.jsonl


In [11]:
# 2a. LLM Bedrock EVAL (only good for Toxicity) 
import boto3
client = boto3.client('bedrock')
s3_bucket_name = S3_BUCKET_NAME
region = REGION
jobName = 'mistralevalv2'
roleArn = SAGEMAKER_ROLE_ARN

job_request = client.create_evaluation_job(
    jobName=jobName,
    jobDescription="evaluate marketing text",
    roleArn=roleArn,
    inferenceConfig={
        "models": [
            {
                "bedrockModel": {
                    "modelIdentifier":f"arn:aws:bedrock:{region}::foundation-model/mistral.mixtral-8x7b-instruct-v0:1",
                    "inferenceParams":"{\"temperature\":\"0.0\", \"topP\":\"1\", \"maxTokenCount\":\"512\"}"
                }

            }
        ]

    },
    outputDataConfig={
        "s3Uri":f"s3://{s3_bucket_name}/outputs/"
    },
    evaluationConfig={
        "automated": {
            "datasetMetricConfigs": [
                {
                    "taskType": "Generation",
                    "dataset": {
                        "name": "Custom_Dataset1",
                        "datasetLocation": {
                            "s3Uri": f"s3://{s3_bucket_name}/custom-datasets/data.jsonl"
                        }
                    },
                    "metricNames": [
                        # "Builtin.Accuracy", # not useful due to lack of groundtruth
                        # "Builtin.Robustness", # not useful due to lack of groundtruth
                        "Builtin.Toxicity",
                    ]
                }
            ]
        }
    }
)

print(job_request)

{'ResponseMetadata': {'RequestId': '18adcb11-f856-4f2d-b21d-cd2161bf8f5c', 'HTTPStatusCode': 202, 'HTTPHeaders': {'date': 'Tue, 06 Aug 2024 00:37:30 GMT', 'content-type': 'application/json', 'content-length': '79', 'connection': 'keep-alive', 'x-amzn-requestid': '18adcb11-f856-4f2d-b21d-cd2161bf8f5c'}, 'RetryAttempts': 0}, 'jobArn': 'arn:aws:bedrock:us-east-1:026459568683:evaluation-job/w7tn3hrvp7y4'}


In [24]:
# 2b. Check if eval job with the name jobName is complete. If yes download from s3://{s3_bucket_name}/outputs/ the results and load them into a dataframe
from datetime import datetime
import pandas as pd
import boto3
# Create an S3 client
s3 = boto3.client('s3')

response = client.list_evaluation_jobs(
    creationTimeAfter=datetime(2015, 1, 1),
    # creationTimeBefore=datetime(2015, 1, 1),
    statusEquals= 'Completed', #'InProgress'|'Completed'|'Failed'|'Stopping'|'Stopped',
    nameContains=jobName,
    maxResults=123,
    # nextToken='string',
    sortBy='CreationTime',
    sortOrder='Descending'
)

if response['jobSummaries']:
    job_summary = response['jobSummaries'][0]
    if job_summary['jobName'] == jobName:
        print(f'job_summary: {job_summary}')
        file_key = f'outputs/{jobName}/w7tn3hrvp7y4/models/mistral.mixtral-8x7b-instruct-v0:1/taskTypes/Generation/datasets/Custom_Dataset1/f8a63f91-3a22-45e6-a84a-3964f830ad13_output.jsonl'
        
        # Read the JSONL file from S3 into a list of dictionaries
        response = s3.get_object(Bucket=s3_bucket_name, Key=file_key)
        data = response['Body'].read().decode('utf-8').split('\n')
        json_data = [json.loads(line) for line in data if line]

        df = pd.DataFrame(json_data)
        print(f"Loaded results of job {jobName} into a DataFrame.")

        from IPython.display import display, HTML

        # Convert the dataframe to an HTML table
        table_html = df.to_html(index=False, classes='table table-striped')

        # Display the HTML table
        display(HTML(table_html))

    else:
        print(f"Job {jobName} is not complete yet.")


job_summary: {'jobArn': 'arn:aws:bedrock:us-east-1:026459568683:evaluation-job/w7tn3hrvp7y4', 'jobName': 'mistralevalv2', 'status': 'Completed', 'creationTime': datetime.datetime(2024, 8, 6, 0, 37, 30, 304000, tzinfo=tzlocal()), 'jobType': 'Automated', 'evaluationTaskTypes': ['Generation'], 'modelIdentifiers': ['arn:aws:bedrock:us-east-1::foundation-model/mistral.mixtral-8x7b-instruct-v0:1']}
Loaded results of job mistralevalv2 into a DataFrame.


automatedEvaluationResult,inputRecord,modelResponses
"{'scores': [{'metricName': 'Toxicity', 'result': 0.00165}]}","{'prompt': '<s>[INST] Please promote app App_3, during Summer to the user David Miller who is 22 years old. The discount is 0.1. For example, a discount of 0.5 means 50%. If the discount equals 0.0, then no discount is available. The value for app_installed is True . If the value is False, then include the discount in the push notification message if if the discount is greater than 0.0. The value for app_opened_last is 10. If the value is greater than 7, then urge the user to use the app. If 7 is greater than 0, then include in the message the 7 hours remaining for an additional bonus. Include 1 to 3 emoji's in the message that reflect the age of the person (22 years old) and Summer season. Write 1 to 2 short sentences in Spanish to promote the app App_3 to this customer. Only return the sentence and nothing else. [/INST] ', 'referenceResponse': 'David Miller is a 22 years old user of the app App_3. They are interested in promotions during the Summer season. The user has a discount of 10.0% for members. The app is installed, and they last opened the app 10 days ago. There are 7 hours remaining for an additional bonus. Communication should be in Spanish.'}","[{'response': ' Hey David! Your favorite App_3 is here with a sweet summer offer of 10% off!☀️ Hurry, only 7 hours left for an extra bonus. Use it or lose it! Andale, ¡aprovéchalo ya! #App3SummerFun 🌴🕶️🏖️', 'modelIdentifier': 'mistral.mixtral-8x7b-instruct-v0:1'}]"
"{'scores': [{'metricName': 'Toxicity', 'result': 0.000863}]}","{'prompt': '<s>[INST] Please promote app App_5, during Spring to the user William Johnson who is 57 years old. The discount is 0.0. For example, a discount of 0.5 means 50%. If the discount equals 0.0, then no discount is available. The value for app_installed is False . If the value is False, then include the discount in the push notification message if if the discount is greater than 0.0. The value for app_opened_last is 0. If the value is greater than 7, then urge the user to use the app. If 0 is greater than 0, then include in the message the 0 hours remaining for an additional bonus. Include 1 to 3 emoji's in the message that reflect the age of the person (57 years old) and Spring season. Write 1 to 2 short sentences in German to promote the app App_5 to this customer. Only return the sentence and nothing else. [/INST] ', 'referenceResponse': 'William Johnson is a 57 years old user of the app App_5. They are interested in promotions during the Spring season. The user has a discount of 0.0% for members. The app is not installed, and they last opened the app 0 days ago. There are 0 hours remaining for an additional bonus. Communication should be in German.'}","[{'response': ' ""Schönen Frühling, William! Entdecke App_5 mit tollen Frühlings-Emojis 🌸🐥🌳. Neue Funktionen warten auf dich! Kein Rabatt verfügbar, also jetzt herunterladen und loslegen."" (Happy Spring, William! Discover App\_5 with lovely Spring emojis 🌸🐥🌳. New features are waiting for you! No discount available, so download and start now.)', 'modelIdentifier': 'mistral.mixtral-8x7b-instruct-v0:1'}]"
"{'scores': [{'metricName': 'Toxicity', 'result': 0.00427}]}","{'prompt': '<s>[INST] Please promote app App_2, during Summer to the user Jessica Smith who is 44 years old. The discount is 0.0. For example, a discount of 0.5 means 50%. If the discount equals 0.0, then no discount is available. The value for app_installed is False . If the value is False, then include the discount in the push notification message if if the discount is greater than 0.0. The value for app_opened_last is 0. If the value is greater than 7, then urge the user to use the app. If 0 is greater than 0, then include in the message the 0 hours remaining for an additional bonus. Include 1 to 3 emoji's in the message that reflect the age of the person (44 years old) and Summer season. Write 1 to 2 short sentences in Spanish to promote the app App_2 to this customer. Only return the sentence and nothing else. [/INST] ', 'referenceResponse': 'Jessica Smith is a 44 years old user of the app App_2. They are interested in promotions during the Summer season. The user has a discount of 0.0% for members. The app is not installed, and they last opened the app 0 days ago. There are 0 hours remaining for an additional bonus. Communication should be in Spanish.'}","[{'response': ' ""Hey Jessica! Enjoy the sun with App\_2 and unlock new features. No discount available this summer. ¡Vive el verano con App\_2 y descubre cosas nuevas!¡Disfruta del sol con App\_2 y aprovecha al máximo tu tiempo!""', 'modelIdentifier': 'mistral.mixtral-8x7b-instruct-v0:1'}]"
"{'scores': [{'metricName': 'Toxicity', 'result': 0.00432}]}","{'prompt': '<s>[INST] Please promote app App_2, during Winter to the user John Wilson who is 62 years old. The discount is 0.2. For example, a discount of 0.5 means 50%. If the discount equals 0.0, then no discount is available. The value for app_installed is True . If the value is False, then include the discount in the push notification message if if the discount is greater than 0.0. The value for app_opened_last is 12. If the value is greater than 7, then urge the user to use the app. If 1 is greater than 0, then include in the message the 1 hours remaining for an additional bonus. Include 1 to 3 emoji's in the message that reflect the age of the person (62 years old) and Winter season. Write 1 to 2 short sentences in Spanish to promote the app App_2 to this customer. Only return the sentence and nothing else. [/INST] ', 'referenceResponse': 'John Wilson is a 62 years old user of the app App_2. They are interested in promotions during the Winter season. The user has a discount of 20.0% for members. The app is installed, and they last opened the app 12 days ago. There are 1 hours remaining for an additional bonus. Communication should be in Spanish.'}","[{'response': ' ""John, enjoy a 20% discount on App\_2 this Winter! Hurry, only 1 hour left for extra bonus. Use it soon, it's been 12 days since you last opened. ❄️🎁➡️ \"" \""John, ¡obtén un 20% de descuento en App\_2 este invierno! Date prisa, ¡solo queda 1 hora para un bono extra! Úsalo pronto, han pasado 12 días desde la última vez que lo abriste. ❄️🎁➡️\""', 'modelIdentifier': 'mistral.mixtral-8x7b-instruct-v0:1'}]"
"{'scores': [{'metricName': 'Toxicity', 'result': 0.00234}]}","{'prompt': '<s>[INST] Please promote app App_1, during Fall to the user Sophia Williams who is 63 years old. The discount is 0.2. For example, a discount of 0.5 means 50%. If the discount equals 0.0, then no discount is available. The value for app_installed is True . If the value is False, then include the discount in the push notification message if if the discount is greater than 0.0. The value for app_opened_last is 19. If the value is greater than 7, then urge the user to use the app. If 4 is greater than 0, then include in the message the 4 hours remaining for an additional bonus. Include 1 to 3 emoji's in the message that reflect the age of the person (63 years old) and Fall season. Write 1 to 2 short sentences in German to promote the app App_1 to this customer. Only return the sentence and nothing else. [/INST] ', 'referenceResponse': 'Sophia Williams is a 63 years old user of the app App_1. They are interested in promotions during the Fall season. The user has a discount of 20.0% for members. The app is installed, and they last opened the app 19 days ago. There are 4 hours remaining for an additional bonus. Communication should be in German.'}","[{'response': ' ""Sophia, enjoy 20% off on App\_1 this Fall! Hurry, only 4 hours left for extra bonus. Open your app now, it's been 19 days since you last used it. 🍁🍂🍃 ""Sophia, sparen Sie 20% mit App\_1 diesen Herbst! Beeilen Sie sich, nur noch 4 Stunden für zusätzlichen Bonus. Öffnen Sie Ihre App jetzt, seit Ihrem letzten Besuch sind 19 Tage vergangen. 🍁🍂🍃""', 'modelIdentifier': 'mistral.mixtral-8x7b-instruct-v0:1'}]"
"{'scores': [{'metricName': 'Toxicity', 'result': 0.00255}]}","{'prompt': '<s>[INST] Please promote app App_3, during Fall to the user Jessica Williams who is 52 years old. The discount is 0.2. For example, a discount of 0.5 means 50%. If the discount equals 0.0, then no discount is available. The value for app_installed is True . If the value is False, then include the discount in the push notification message if if the discount is greater than 0.0. The value for app_opened_last is 16. If the value is greater than 7, then urge the user to use the app. If 2 is greater than 0, then include in the message the 2 hours remaining for an additional bonus. Include 1 to 3 emoji's in the message that reflect the age of the person (52 years old) and Fall season. Write 1 to 2 short sentences in French to promote the app App_3 to this customer. Only return the sentence and nothing else. [/INST] ', 'referenceResponse': 'Jessica Williams is a 52 years old user of the app App_3. They are interested in promotions during the Fall season. The user has a discount of 20.0% for members. The app is installed, and they last opened the app 16 days ago. There are 2 hours remaining for an additional bonus. Communication should be in French.'}","[{'response': ' ""Jessica, with your App_3 subscription, enjoy a 20% fall discount! Plus, only 2 hours left for extra bonus. Use the app soon, it's been 16 days since you last opened it. 🍂🎃👵️"" ""Madame Williams, profitez d'une remise de 20% sur App_3 cet automne! De plus, il ne vous reste que 2 heures pour bénéficier d'un bonus supplémentaire. Utilisez l'application bientôt, cela fait 16 jours que vous ne l'avez pas ouverte. 🍁🍂🎃""', 'modelIdentifier': 'mistral.mixtral-8x7b-instruct-v0:1'}]"
"{'scores': [{'metricName': 'Toxicity', 'result': 0.0011}]}","{'prompt': '<s>[INST] Please promote app App_2, during Fall to the user Emily Wilson who is 34 years old. The discount is 0.0. For example, a discount of 0.5 means 50%. If the discount equals 0.0, then no discount is available. The value for app_installed is False . If the value is False, then include the discount in the push notification message if if the discount is greater than 0.0. The value for app_opened_last is 0. If the value is greater than 7, then urge the user to use the app. If 0 is greater than 0, then include in the message the 0 hours remaining for an additional bonus. Include 1 to 3 emoji's in the message that reflect the age of the person (34 years old) and Fall season. Write 1 to 2 short sentences in French to promote the app App_2 to this customer. Only return the sentence and nothing else. [/INST] ', 'referenceResponse': 'Emily Wilson is a 34 years old user of the app App_2. They are interested in promotions during the Fall season. The user has a discount of 0.0% for members. The app is not installed, and they last opened the app 0 days ago. There are 0 hours remaining for an additional bonus. Communication should be in French.'}","[{'response': ' ""Hey Emily, try our App\_2 now with no discount! 🍂🎃 Autumn is the perfect time to discover its features. Plus, you have 0 hours left for an extra bonus! 🎁 Ouvrez App\_2 dès maintenant et découvrez ses fonctionnalités! 🇫🇷""', 'modelIdentifier': 'mistral.mixtral-8x7b-instruct-v0:1'}]"
"{'scores': [{'metricName': 'Toxicity', 'result': 0.00354}]}","{'prompt': '<s>[INST] Please promote app App_5, during Winter to the user Emily Brown who is 55 years old. The discount is 0.0. For example, a discount of 0.5 means 50%. If the discount equals 0.0, then no discount is available. The value for app_installed is False . If the value is False, then include the discount in the push notification message if if the discount is greater than 0.0. The value for app_opened_last is 0. If the value is greater than 7, then urge the user to use the app. If 0 is greater than 0, then include in the message the 0 hours remaining for an additional bonus. Include 1 to 3 emoji's in the message that reflect the age of the person (55 years old) and Winter season. Write 1 to 2 short sentences in French to promote the app App_5 to this customer. Only return the sentence and nothing else. [/INST] ', 'referenceResponse': 'Emily Brown is a 55 years old user of the app App_5. They are interested in promotions during the Winter season. The user has a discount of 0.0% for members. The app is not installed, and they last opened the app 0 days ago. There are 0 hours remaining for an additional bonus. Communication should be in French.'}","[{'response': ' ""Emily, bundle up and try App\_5 today! No discount available, but the rewards are worth it. You've missed 0 hours of bonus features. ❄️🎁⛄️"" ""Emily, profitez de l'hiver en découvrant App\_5! Aucun rabais disponible, mais les récompenses en valent la peine. Vous avez manqué 0 heures de fonctionnalités bonus. ❄️🎁⛄️""', 'modelIdentifier': 'mistral.mixtral-8x7b-instruct-v0:1'}]"
"{'scores': [{'metricName': 'Toxicity', 'result': 0.00125}]}","{'prompt': '<s>[INST] Please promote app App_1, during Winter to the user Olivia Williams who is 39 years old. The discount is 0.15. For example, a discount of 0.5 means 50%. If the discount equals 0.0, then no discount is available. The value for app_installed is True . If the value is False, then include the discount in the push notification message if if the discount is greater than 0.0. The value for app_opened_last is 15. If the value is greater than 7, then urge the user to use the app. If 10 is greater than 0, then include in the message the 10 hours remaining for an additional bonus. Include 1 to 3 emoji's in the message that reflect the age of the person (39 years old) and Winter season. Write 1 to 2 short sentences in French to promote the app App_1 to this customer. Only return the sentence and nothing else. [/INST] ', 'referenceResponse': 'Olivia Williams is a 39 years old user of the app App_1. They are interested in promotions during the Winter season. The user has a discount of 15.0% for members. The app is installed, and they last opened the app 15 days ago. There are 10 hours remaining for an additional bonus. Communication should be in French.'}","[{'response': ' ""Olivia, great news! App_1 is ready for you to explore this Winter. Enjoy a 15% discount as a thank you for being a valued user. Only 10 hours left for an extra bonus! ❄️🎁 Use your app and let it shine! 🌟 French: Olivia, bonnes nouvelles! Profitez de 15 % de réduction sur App_1 cet hiver. Soyez rapide, il ne reste plus que 10 heures pour un bonus supplémentaire! ❄️🎁 Utilisez votre application dès maintenant! 🌟""', 'modelIdentifier': 'mistral.mixtral-8x7b-instruct-v0:1'}]"
"{'scores': [{'metricName': 'Toxicity', 'result': 0.00166}]}","{'prompt': '<s>[INST] Please promote app App_1, during Winter to the user Sarah Smith who is 23 years old. The discount is 0.0. For example, a discount of 0.5 means 50%. If the discount equals 0.0, then no discount is available. The value for app_installed is False . If the value is False, then include the discount in the push notification message if if the discount is greater than 0.0. The value for app_opened_last is 0. If the value is greater than 7, then urge the user to use the app. If 0 is greater than 0, then include in the message the 0 hours remaining for an additional bonus. Include 1 to 3 emoji's in the message that reflect the age of the person (23 years old) and Winter season. Write 1 to 2 short sentences in French to promote the app App_1 to this customer. Only return the sentence and nothing else. [/INST] ', 'referenceResponse': 'Sarah Smith is a 23 years old user of the app App_1. They are interested in promotions during the Winter season. The user has a discount of 0.0% for members. The app is not installed, and they last opened the app 0 days ago. There are 0 hours remaining for an additional bonus. Communication should be in French.'}","[{'response': ' ""Hey Sarah, try App\_1 today with its winter-fresh features! No discount available, but download now for a cool experience ❄️❄️� snowflake snowflake. And if you've been away for more than 7 hours, you've got 0 hours left to claim your bonus! 🎉. Bon appétit with App\_1! 🍲"" ""Sarah, découvrez App\_1 aujourd'hui et plongez dans une expérience hivernale! Pas de réduction disponible, mais téléchargez dès maintenant pour un résultat cool! ❄️❄️. Si vous vous êtes absenté(e) pendant plus de 7 heures, vous n'avez plus que 0 heure pour bénéficier de votre bonus! 🎉. Bon appétit avec App\_1! 🍲""', 'modelIdentifier': 'mistral.mixtral-8x7b-instruct-v0:1'}]"


In [25]:
# 3. LLM as a Judge

# 3a. Define rubric
RUBRIC = '''
- The message should have a friendly tone.
- The message can not include more than 3 emoji. 
- If the user has not installed the app, the message should encourage the user to install it by highlighting the discounted price if there is a discount available.
- If the user has not opened the app within the last 7 days, the message should highlight the promotional discount and how the remaing time for this discount to be available.
- The message should be less than 140 words.
- The message should be in the language as specified in the prompt.
'''

In [26]:
# 3b. Define a "grader prompt" template
 
from langchain.prompts import ChatPromptTemplate
from langchain_core.messages.base import BaseMessage

def build_grader_prompt(original_instruction: str, marketing_message: str) -> BaseMessage:
    prompt = """You will be provided with the original instructions to generate the marketing push notification, a marketing push notification,
                and a rubric that instructs you on what makes this notification correct or incorrect.

    Here is the original instruction for the push notification.
    <original_instruction>
    {original_instruction}
    </original_instruction>

    Here is the generated push notification based on these instructions.
    <notification>
    {marketing_message}
    </notification>
    
    Here is the rubric on how to grade the generated notification.
    <rubric>
    {rubric}
    </rubric>
    
    An answer is correct if it entirely meets the rubric criteria, and is otherwise incorrect.
    First, think through whether the answer is correct or incorrect based on the rubric inside <thinking></thinking> tags. Then, output either 'correct' if the answer is correct or 'incorrect' if the answer is incorrect inside <correctness></correctness> tags.
    """

    # First we will generate a prompt template using Langchain and the prompt above
    chat_template: ChatPromptTemplate = ChatPromptTemplate.from_messages([
        ("human", prompt)
    ])
        
    # Next, we will insert all the variables into into the prompt. 
    return chat_template.format_messages(
        marketing_message=marketing_message,
        rubric=RUBRIC,
        original_instruction=original_instruction
    ) 

In [27]:
# 3c. Define Helper Functions

from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
import boto3
import time

from langchain_community.chat_models import BedrockChat
from langchain_core.messages.ai import AIMessage

def call_bedrock(request: BaseMessage):
    client = BedrockChat(
        model_id= "anthropic.claude-3-sonnet-20240229-v1:0", # LLM Judge
        model_kwargs= {"temperature": 0, "top_k": 500}
    )
    
    response = client.invoke(request)
    return response

# This is a bit funky. We're dumping all the requests into a thread pool
# And storing the index for the order in which they were submitted. 
# Lastly, we're inserting them into the response array at their index to ensure order.
def call_threaded(requests, function):
    # Dictionary to map futures to their position
    future_to_position = {}
    
    with ThreadPoolExecutor(max_workers=5) as executor:
        # Submit all requests and remember their order
        for i, request in enumerate(requests):
            future = executor.submit(function, request)
            future_to_position[future] = i
        
        # Initialize an empty list to hold the responses
        responses = [None] * len(requests)
        
        # As each future completes, assign its result to the correct position
        for future in as_completed(future_to_position):
            position = future_to_position[future]
            try:
                response: AIMessage = future.result()
                responses[position] = response.content
            except Exception as exc:
                print(f"Request at position {position} generated an exception: {exc}")
                responses[position] = None  # Or handle the exception as appropriate
        
    return responses

def conversation_to_str(conversation: list[dict]) -> str:
    return ''.join([f"{c['type']}: {c['text']}" for c in conversation])

import re
import json

REASONING_PATTERN = r'<thinking>(.*?)</thinking>'
CORRECTNESS_PATTERN = r'<correctness>(.*?)</correctness>'

# Strip out the portion of the response with regex.
def extract_with_regex(response, regex):
    matches = re.search(regex, response, re.DOTALL)
    # Extract the matched content, if any
    return matches.group(1).strip() if matches else None

def format_results(grade: str, chat_conversation: list[dict]) -> dict:
    reasoning: str = extract_with_regex(grade, REASONING_PATTERN)
    correctness: str =  extract_with_regex(grade, CORRECTNESS_PATTERN)
    
    return {
        'chat_conversation': chat_conversation,
        'reasoning': reasoning,
        'correctness': correctness
    }

# helper function to parse string into valid json as smaller LLMs struggle to produce JSON, 
# as well as with translations we will have different output structures
 
import json

def parse_string_to_json(input_string):
    try:
        # Remove all quotes
        fixed_str = input_string.replace("'", '').replace('"', '')

        # Split the input string into two parts: the message and the translation
        parts = fixed_str.split('(Google Translate: ', 1)
        if len(parts) == 2:
            message, translation = parts
            message = message.strip()
            # print(f"message: {message}")
            translation = translation.strip('")')
            # print(f"translation: {translation}")

            
        else:
            message = fixed_str
            translation = ""
        
        # Create the JSON object
        json_obj = {
            "message": message,
            "translation": translation
        }

        # Convert the JSON object to a string
        json_string = json.dumps(json_obj, ensure_ascii=False)

        return json_string

    except ValueError as e:
        print(f"Error {e}: Invalid input string format.")
        return None

In [28]:
# 3d. Generate responses and construct grader prompts from the chat conversations

import logging
import boto3
from botocore.exceptions import ClientError


logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

client = boto3.client("bedrock-runtime")
# # Setup the system prompts and messages to send to the model.
# system_prompts = [{"text": "You are an app that creates short push notifications for mobile phones to drive engagement"
#                     "Only return a personalized push notification and nothing else."}]


# Inference parameters to use.
temperature = 0
top_k = 200
inference_config = {"temperature": temperature}
additional_model_fields = {"top_k": top_k}
model_id = 'mistral.mixtral-8x7b-instruct-v0:1' # mistral.mixtral-8x7b-instruct-v0:1 / mistral.mistral-7b-instruct-v0:2 / amazon.titan-text-lite-v1:0:4k


# Define S3 bucket and object key
s3_bucket_name = S3_BUCKET_NAME
jsonl_file_path = 'data.jsonl'
object_key = 'custom-datasets/data.jsonl'
region = REGION


# Create an S3 client
s3_client = boto3.client('s3')

# Download the file from S3
try:
    s3_client.download_file(s3_bucket_name, object_key, jsonl_file_path)
    print(f"File downloaded from s3://{s3_bucket_name}/{object_key} to {jsonl_file_path}")
except Exception as e:
    print(f"Error downloading file: {e}")

promptrows = []

# Iterate over the JSONL file row by row
try:
    with open(jsonl_file_path, 'r') as jsonl_file:
        for line in jsonl_file:
            # Parse the line as a JSON object
            data = json.loads(line)
            messages = [{"role": "user", "content": [{"text": data['prompt']}]}]
            response = client.converse(
                modelId=model_id,
                messages=messages,
                # system=system_prompts,
                inferenceConfig=inference_config,
                additionalModelRequestFields=additional_model_fields
            )
            llm_output = response["output"]["message"]["content"][0]["text"]

            promptrows.append({"prompt":data['prompt'], "category":data['category'], "response": llm_output ,"referenceResponse": data['referenceResponse']})

except Exception as e:
    print(f"Error reading JSONL file: {e}")


grader_prompts = []
for row in promptrows:  
    conversation_str = row['response']
    original_instruction = row['prompt']
    prompt: BaseMessage = build_grader_prompt(original_instruction, conversation_str)
    grader_prompts.append(prompt)

File downloaded from s3://marketing-gen-eval/custom-datasets/data.jsonl to data.jsonl


In [29]:
# 2e. Call Bedrock threaded to speed up getting all our responses. The results should come back in order.
evaluation_results: list[str] = call_threaded(grader_prompts, call_bedrock)


  warn_deprecated(
  warn_deprecated(
  warn_deprecated(


In [30]:
# 2f. load results into dataframe for easier processing
import json
import pandas as pd

formatted_results = []
for i,g in enumerate(evaluation_results):
    # print(f'i: {i}')
    # print(f'{promptrows[i]["response"]}')
    # print(f'g: {g}')
    generated_message = json.loads(parse_string_to_json(promptrows[i]["response"])).get("message")
    # print(f'generated_message: {generated_message}')
    reasoning = extract_with_regex(evaluation_results[i], REASONING_PATTERN)
    # print(f'reasoning: {reasoning}')
    correctness =  extract_with_regex(evaluation_results[i], CORRECTNESS_PATTERN)
    # print(f'correctness: {correctness}')
    formatted_results.append({"generated_message": generated_message, "reasoning": reasoning, "correctness": correctness})

evaluated_df = pd.DataFrame(formatted_results) 


In [35]:
# 2g. Review results: Pecentage correct
percentage_correct = evaluated_df['correctness'].value_counts(normalize=True)['correct'] * 100
print(f"Percentage correct: {percentage_correct:.2f}%")

Percentage correct: 19.00%


In [36]:
# 2h. Review results: Sample a subsection of 10 incorrect responses
from IPython.display import display, HTML

# Assuming you have a dataframe called 'df' with a column called 'result'
incorrect_rows = evaluated_df[evaluated_df['correctness'] == 'incorrect'].sample(n=10)

from IPython.display import display, HTML

# Convert the dataframe to an HTML table
table_html = incorrect_rows.to_html(index=False, classes='table table-striped')

# Display the HTML table
display(HTML(table_html))

generated_message,reasoning,correctness
"Jessica, try App\_3 today with no discount, but full of features! With app\_opened\_last at 0, heres a reminder to start using it. Spring is here, so is a new app adventure! 🌸🌼🌺\nUtilisez dès maintenant lapplication App\_3 sans réduction, mais pleine de fonctionnalités! Avec app\_opened\_last à 0, cest le moment de commencer à lutiliser. Le printemps est là, une nouvelle aventure dapplication vous attend!","The generated notification meets some of the rubric criteria but not all:\n\nPositives:\n- The tone is friendly and encouraging the user to try the app.\n- The number of emojis (3) is within the allowed range of 1-3.\n- The message is in the specified language (English and French).\n- The message length is less than 140 words.\n\nNegatives:\n- Since the app is not installed (app_installed = False) and there is no discount (discount = 0.0), the message should have encouraged the user to install the app by highlighting the discounted price, which it did not do.\n- The message did not highlight any promotional discount or remaining time for the discount, which it should have done since app_opened_last = 0 (less than 7 days).\n\nOverall, while the notification has some positive aspects, it does not fully meet the rubric criteria due to the missed requirements around promoting installation with a discount (if available) and highlighting promotional discounts/remaining time for users who have not opened the app recently.",incorrect
"Michael, enjoy 20% off on App\_3! Hurry, only 1 hour left for extra bonus. Use your app again, 57 is just a number, make it fun! Grins and falling leaves 🍁😃🍂\n\nMichael, sparen Sie 20% auf App\_3! Schnell, nur noch 1 Stunde für zusätzlichen Bonus. Verwenden Sie Ihre App erneut, 57 ist nur eine Zahl, machen Sie es lustig! Grinsen und herabfallende Blätter 🍁😃🍂","The generated notification meets most of the rubric criteria:\n\n- The tone is friendly and conversational.\n- It includes 3 emojis, which is within the allowed limit.\n- The user has installed the app (app_installed is True), so there is no need to encourage installation.\n- The user has not opened the app within the last 7 days (app_opened_last is 20), so the message highlights the promotional discount and the remaining time for the discount.\n- The message is less than 140 words.\n- The message includes both English and German sentences as specified in the prompt.\n\nHowever, there is one issue: the rubric states that the message should be in the language specified in the prompt, but the prompt asks for 1 to 2 short sentences in German, while the generated notification includes both English and German sentences.",incorrect
"Hi William, with your App\_3 subscription, youre eligible for a 20% discount! Plus, with 9 hours left, grab the summer bonus. Dive into the app and enjoy your sunny days! ☀️🏖️🕶️","The generated push notification meets the following criteria from the rubric:\n\n- The message has a friendly tone, addressing the user by name and using a positive, conversational style.\n- The message includes 3 emojis, which is within the allowed range of 1 to 3 emojis.\n- The message highlights the 20% discount available for the user's App_3 subscription.\n- The message mentions the remaining 9 hours for an additional bonus, encouraging the user to take advantage of this limited-time offer.\n- The message is written in English, as specified in the prompt.\n\nHowever, the message does not meet the following criterion:\n\n- The message is longer than 140 words.\n\nSince the message meets most of the rubric criteria but exceeds the word limit, it can be considered partially correct.",incorrect
"Hey Emily! 🍁 Fall is here and so is our app, App\_3. No discount available this time, but its packed with features youll love. Plus, with 0 hours left, dont miss out on the special bonus! ¡Vamos a probar App\_3 hoy mismo! ¡Es tu oportunidad de disfrutar de lo mejor de la caída! 🍂","The generated notification meets some of the rubric criteria but not all:\n\nPositives:\n- The tone is friendly and conversational\n- It includes 2 emojis related to Fall and the user's age\n- It includes 1-2 short sentences in Spanish promoting the app\n- The message is under 140 words\n\nNegatives:\n- It does not encourage the user to install the app since app_installed is False\n- It mentions a discount of 0.0 which is incorrect since there is no discount available\n- It mentions a bonus with 0 hours remaining, which does not make sense since the condition (0 > 0) is false\n- It does not highlight any promotional discount or remaining time since app_opened_last is 0 (less than 7 days)\n\nOverall, while the notification has some positive aspects, it does not fully meet the rubric criteria due to the incorrect information about discounts, bonuses, and not properly addressing the app installation/usage status.",incorrect
"Hey Christopher, try App\_3 today with no discount! Plus, if you open the app now, youll have 0 hours left to claim a special bonus ☀️🌴\nRemember, you havent used the app in a while, so dont miss out! #App_3Summer\n\nBonjour Christopher, découvrez App\_3 dès maintenant et bénéficiez daucun rabais! Profitez également dune offre spéciale avec 0 heure restante ☀️🌴 Noubliez pas douvrir lapplication régulièrement! #App_3Été","The generated notification meets some of the rubric criteria but not all:\n\nPositives:\n- The tone is friendly and conversational\n- It includes 1-3 emojis relevant to the age and season\n- It includes 1-2 short sentences in French promoting the app\n- The length is under 140 words\n\nNegatives:\n- It does not encourage the user to install the app since app_installed is False\n- It mentions a discount of 0.0 which is incorrect since no discount should be mentioned if the discount is 0.0\n- It mentions a remaining time of 0 hours for a bonus, which is incorrect since the condition for including that was not met (0 is not greater than 0)\n- It does not urge the user to open the app since app_opened_last is 0 (less than 7 days ago)\n\nOverall, while the notification has some positive aspects, it does not fully meet the rubric criteria due to the negatives listed above.",incorrect
"Jessica, try our App\_3 today with no discount! 🍂🍁 Autumn is here, and our app is better than ever. Plus, if you open it now, youll have 0 hours left to claim a special bonus! ¡Vamos a por ello, Jessica! Disfruta de App\_3 hoy mismo.","Based on the rubric:\n\n- The message has a friendly tone, which meets the criteria.\n- The message includes 2 emojis, which meets the criteria of not having more than 3 emojis.\n- The value for app_installed is False, but the message does not encourage the user to install the app or highlight the discount, which violates the criteria.\n- The value for app_opened_last is 0, which is less than 7, so the message does not need to urge the user to use the app.\n- The message is less than 140 words, which meets the criteria.\n- The message includes 1-2 short sentences in Spanish to promote the app, which meets the criteria.\n\nOverall, the message violates one of the criteria by not encouraging the user to install the app or highlighting the discount when the app is not installed.",incorrect
"Hey Olivia, great news! Youve got a 15% discount on App\_3. Use it and explore its amazing features. Only 4 hours left for an extra bonus! 🌸💃🏼🎉\n\nOlivia, profitez de 15 % de réduction sur App\_3! Dépêchez-vous, il ne reste que 4 heures pour bénéficier dun bonus supplémentaire! 🌸💃🏼🎉","The generated notification meets most of the rubric criteria:\n\n- The tone is friendly and conversational.\n- It includes 3 emojis, which is within the allowed limit.\n- The user has installed the app (app_installed is True), so there is no need to encourage installation.\n- The user has not opened the app within the last 25 days (app_opened_last is 25, which is greater than 7), so the message highlights the promotional discount and the remaining time for the discount.\n- The message is less than 140 words.\n- The message includes 1-2 short sentences in French to promote the app, as instructed.\n\nHowever, there is one issue: the rubric states that if the discount is greater than 0.0, the message should include the discount only if the user has not installed the app. In this case, since the user has installed the app (app_installed is True), the message should not include the discount.",incorrect
Schneeflocke-Sale! App\_4-Deal: 20% off! Öffne deine App in den nächsten 4 Stunden für extra Bonus. Freu dich auf Winterspaß! ❄️🎁❄️\n\n(Translation: Snowflake sale! App\_4 deal: 20% off! Open your app in the next 4 hours for extra bonus. Enjoy winter fun! ❄️🎁❄️),"Based on the rubric:\n\n- The message has a friendly tone, which meets the criteria.\n- The message includes 3 emojis, which meets the criteria.\n- The value for app_installed is True, so the message does not need to encourage the user to install the app.\n- The value for app_opened_last is 4, which is less than 7, so the message does not need to highlight the remaining time for the discount.\n- The message is less than 140 words.\n- The message includes 1-2 short sentences in German to promote the app, as specified in the prompt.\n\nHowever, there is one issue with the notification:\n- The original instruction states that if 0 is greater than 0, then the message should include the 0 hours remaining for an additional bonus. Since 0 is not greater than 0, this condition is not met, but the notification incorrectly includes ""Open your app in the next 4 hours for extra bonus.""",incorrect
"Hey Emily, try our App\_2 now with no discount! 🍂🎃Fall in love with our features. Plus, youve been away for 0 hours, so get back in the game! 🕒 En français: Essayez App\_2 dès maintenant sans réduction! 🍁🍂 Tombez amoureuse de nos fonctionnalités. De plus, vous êtes absent depuis 0 heure, alors revenez dans le jeu!","Based on the rubric, the generated notification has the following issues:\n\n1. The tone is friendly, which meets the criteria.\n2. The notification includes 3 emojis, which meets the criteria.\n3. The value for app_installed is False, but the message does not encourage the user to install the app or highlight the discount (which is 0.0, so no discount is available).\n4. The value for app_opened_last is 0, which is less than 7 days, so the message does not need to highlight the promotional discount or remaining time.\n5. The message is less than 140 words, which meets the criteria.\n6. The message includes 1-2 short sentences in French, which meets the criteria.\n\nOverall, the notification fails to meet the criteria of encouraging the user to install the app since app_installed is False and there is no discount available.",incorrect
"Hey Jessica! Spring is here and so is our app App\_5. No discount available this time, but its packed with features to enjoy the season. You last opened it 0 hours ago, so keep up the good work! 🌸🌼🌺 \n\n¡Aprovecha la primavera con nuestra app App\_5! No te pierdas de sus increíbles funciones.","Based on the rubric:\n\n- The message has a friendly tone, which meets the criteria.\n- The message includes 3 emojis, which meets the criteria.\n- The user has installed the app (app_installed is False), so there is no need to encourage installation. However, the message should have included the discount if there was one (discount is 0.0, so no discount).\n- The user last opened the app 0 days ago (app_opened_last is 0), so there is no need to highlight a promotional discount or remaining time.\n- The message is less than 140 words, which meets the criteria.\n- The message includes 1-2 short sentences in Spanish to promote the app, which meets the criteria.\n\nOverall, the notification meets most of the criteria, but it should have mentioned the lack of a discount since the discount value is 0.0.",incorrect


In [37]:
# 2i. Review results: Sample a subsection of 10 correct responses
from IPython.display import display, HTML

# Assuming you have a dataframe called 'df' with a column called 'result'
correct_rows = evaluated_df[evaluated_df['correctness'] == 'correct'].sample(n=10)

from IPython.display import display, HTML

# Convert the dataframe to an HTML table
table_html = correct_rows.to_html(index=False, classes='table table-striped')

# Display the HTML table
display(HTML(table_html))

generated_message,reasoning,correctness
"Emily, great news! 🍂🍁 App_1 is ready for you to use with a 15% discount. 🎉 Hurry, only 2 hours left for extra bonus! ⏳ Abra la app y disfrute de una experiencia única. ¡Descargue App_1 hoy!","Based on the rubric:\n\n- The message has a friendly tone, addressing the user by name and using positive language like ""great news"" and ""hurry"".\n- The message includes 3 emojis, which is within the allowed limit.\n- Since the value for app_installed is True, the message does not need to encourage the user to install the app.\n- The value for app_opened_last is 28, which is greater than 7, so the message correctly urges the user to use the app.\n- The message includes the 15% discount and mentions the 2 hours remaining for an additional bonus, as per the instructions.\n- The message is in two languages, English and Spanish, as specified in the prompt.\n- The message appears to be less than 140 words.\n\nOverall, the generated notification meets all the criteria specified in the rubric.",correct
"David, with our special 10% discount, nows the time to enjoy App\_2! Open it now, you have 4 hours left for extra bonus. 🍂🍁🍏 French: David, profitez de notre remise spéciale de 10% sur App\_2! Ouvrez-le maintenant, il ne vous reste plus que 4 heures pour un bonus supplémentaire. 🍂🍁🍏","The generated notification meets most of the rubric criteria:\n\n- The tone is friendly and conversational.\n- It includes 3 emojis, which is within the allowed limit.\n- The user has installed the app (app_installed is True), so there is no need to encourage installation.\n- The user has not opened the app within the last 7 days (app_opened_last is 28), so the message highlights the promotional discount and remaining time.\n- The message is less than 140 words.\n- The message includes 1-2 short sentences in French to promote the app, as instructed.\n\nHowever, there is one issue: the rubric states that if the user has not installed the app and there is a discount available, the message should highlight the discounted price. In this case, the user has installed the app, so this criterion does not apply.\n\nOverall, the notification meets all the relevant criteria from the rubric.",correct
"Hey William (@smileyman)! Try app App\_5 today and enjoy the sunny vibes ☀️. No discount available this time, but the fun is always at full price! Use it soon, you have 0 hours left for a special bonus 🎁. In French: Essayez lapp App\_5 dès aujourdhui et captez les vibrations de lété 🌞. Pas de réduction cette fois, mais le plaisir est toujours au prix fort!","Based on the rubric, here are my thoughts on whether the generated notification is correct or incorrect:\n\n- The message has a friendly tone, addressing the user by name and using a positive, conversational style.\n- The message includes 3 emojis, which is within the allowed limit.\n- Since the value for app_installed is False, the message should encourage the user to install the app by highlighting the discount if available. However, the discount is 0.0, so this criterion is not applicable.\n- The value for app_opened_last is 0, which is less than 7. Therefore, the message does not need to urge the user to use the app.\n- The message includes the 0 hours remaining for an additional bonus, as instructed.\n- The message includes 1-2 short sentences in French to promote the app, as requested.\n- The message appears to be less than 140 words.\n\nOverall, the generated notification meets all the criteria specified in the rubric.",correct
"Hey David, with your App\_2 installation, enjoy a 15% discount! Open the app now, as its been 20 days since your last visit. Plus, 7 hours remain for a special Spring bonus! 🌸🌼🎁\n\nApp\_2 installiert? Nutzen Sie jetzt 15% Rabatt! Öffnen Sie die App noch heute, da es 20 Tage her ist, seit Ihrem letzten Besuch. Zusätzlich haben Sie in dieser Frühlingszeit noch 7 Stunden für ein besonderes Bonusangebot!","The generated notification meets most of the rubric criteria:\n\n- The tone is friendly and conversational.\n- It includes 3 emojis, which is within the allowed limit.\n- The user has installed the app (app_installed=True), so there is no need to encourage installation.\n- The user has not opened the app in 20 days (app_opened_last=20), which is more than 7 days, so the message highlights the promotional discount and remaining time for the discount.\n- The message is 44 words, which is less than 140 words.\n- The message includes 1-2 short sentences in German to promote the app, as instructed.\n\nHowever, there is one issue: the rubric states that if the discount is 0.0 (no discount available), the message should not include the discount if the app is not installed. In this case, the discount is 0.15 (15%), and the app is installed, so the discount should be included.\n\nOverall, the notification meets the rubric criteria and is correct.",correct
"Hey Jessica, enjoy a 10% discount on App_1 this Winter! With app\_installed as True, head to the app to claim your deal. Plus, with 20 app opens recently, keep up the great work and unlock 5 more hours of bonus features! ❄️🎁🎊\n\nUtilisez lapp App\_1 et bénéficiez dune remise de 10% cet hiver! Avec lapplication déjà installée, profitez de cette offre exclusive dès maintenant. ❄️\n\n(Note: The discount amount and emojis are just examples and may vary based on actual values and promotions.)","The generated notification meets most of the rubric criteria:\n\n- It has a friendly tone.\n- It includes 3 emojis, which is within the allowed limit.\n- Since app_installed is True, it does not need to encourage the user to install the app.\n- Since app_opened_last is 20 (greater than 7), it urges the user to keep using the app to unlock bonus features.\n- The message is less than 140 words.\n- It includes 1-2 short sentences in French to promote the app, as instructed.\n\nHowever, there is one potential issue:\n- The rubric states that if the user has not opened the app within the last 7 days, the message should highlight the promotional discount and remaining time for the discount. Since app_opened_last is 20 (greater than 7), this criterion does not apply in this case.\n\nOverall, the generated notification meets all the relevant criteria based on the given instructions and rubric.",correct
"¡Buenas noticias, Michael! Aprovecha un 15% de descuento en App\_2. Además, 9 horas más para obtener un bono extra. ¡Abre la app y disfruta de esta oferta otoñal 🍁🍂🍁!\n\nTranslation: Good news, Michael! Take advantage of a 15% discount on App\_2. Plus, 9 more hours to get an additional bonus. Open the app and enjoy this fall offer 🍁🍂🍁!","The generated notification meets most of the rubric criteria:\n\n- The tone is friendly and conversational.\n- It includes 3 emojis, which is within the allowed limit.\n- The user has installed the app (app_installed is True), so there is no need to encourage installation.\n- The user has not opened the app within the last 7 days (app_opened_last is 9), so the message highlights the promotional discount and the remaining time for the discount.\n- The message is less than 140 words.\n- The message includes 1-2 short sentences in Spanish to promote the app, as instructed.\n\nHowever, there is one potential issue: the rubric states that if the user has not installed the app and there is a discount available, the message should highlight the discounted price. In this case, the user has installed the app, so this criterion does not apply.\n\nOverall, the generated notification meets all the relevant criteria in the rubric.",correct
"Hey David! Great news, App\_1s just 0.1 off! Plus, with 10 hours left, grab an extra bonus. And dont forget to dive into the fun this Summer 🌞🏖️. See you in the app, now open and waiting for you!","Based on the rubric:\n\n- The message has a friendly tone, which meets the criteria.\n- The message includes 3 emojis, which meets the criteria.\n- The value for app_installed is True, so the message does not need to encourage the user to install the app.\n- The value for app_opened_last is 6, which is less than 7, so the message does not need to highlight the promotional discount and remaining time.\n- The message is less than 140 words.\n- The message is in English, as specified in the prompt.\n\nThe notification meets all the criteria in the rubric.",correct
"Hey Christopher, with your App_4 installed, enjoy a 20% fall discount! Plus, with 5 hours left, grab the bonus before its gone. Log in now and make the most of this season. 🍂🎃🎉\n\nUtilisez App_4 et bénéficiez dune remise de 20% cet automne! Ne manquez pas non plus le bonus avec seulement 5 heures restantes. Connectez-vous maintenant et profitez au maximum de cette saison.","The generated notification meets most of the rubric criteria:\n\n- The tone is friendly and conversational.\n- It includes 3 emojis, which is within the allowed limit.\n- The user has the app installed, so there is no need to encourage installation.\n- The message highlights the 20% discount and the remaining 5 hours for the bonus, which is appropriate since the user has not opened the app in the last 30 days (more than 7 days).\n- The message is less than 140 words.\n- It includes 1-2 short sentences in French to promote the app, as instructed.\n\nHowever, there is one potential issue: the rubric states that if the discount is 0.0 (no discount), the message should not include the discount for users who have not installed the app. In this case, the discount is 0.2 (20%), so this criterion does not apply.\n\nOverall, the notification meets all the relevant criteria in the rubric.",correct
"Hey David! Your favorite App_3 is here with a sweet summer offer of 10% off!☀️ Hurry, only 7 hours left for an extra bonus. Use your app now, as you last opened it 10 hours ago. ¡Disfruta App_3 este verano! #App3SummerSale 🍹🌴🎊","Based on the rubric:\n\n- The message has a friendly tone, addressing the user by name and using an encouraging tone.\n- The message includes 3 emojis, which is within the allowed limit.\n- Since the value for app_installed is True, there is no need to encourage the user to install the app.\n- The value for app_opened_last is 10, which is greater than 7, so the message correctly urges the user to use the app.\n- The message includes the remaining time (7 hours) for an additional bonus, as instructed.\n- The message includes 1-2 short sentences in Spanish to promote the app, as requested.\n- The message appears to be less than 140 words.\n\nOverall, the generated notification meets all the criteria specified in the rubric.",correct
"🎉 Summers here, Emily! Open App_3 & enjoy 20% off 💃 Plus, 9 hours left for extra bonuses! Keep the summer vibes going ☀","The generated push notification meets most of the rubric criteria:\n\n- The message has a friendly tone.\n- It includes 3 emojis, which is within the allowed limit.\n- The user has installed the app (app_installed=True), so there is no need to encourage installation.\n- The user has opened the app within the last 7 days (app_opened_last=6), so the message highlights the promotional discount and remaining time for the discount.\n- The message is less than 140 words.\n- The message is in English, as specified in the prompt.\n\nHowever, there is one issue: the rubric states that if the user has not installed the app and there is a discount available, the message should encourage the user to install it by highlighting the discounted price. In this case, the user has installed the app, so this criterion does not apply.\n\nOverall, the generated push notification meets all the applicable rubric criteria.",correct


In [39]:
# 3. Human Eval
# 3a. Setup Flow Definition 
WorkteamArn = WORKTEAM_ARN
SageMakerCustomerRoleArn = SAGEMAKER_ROLE_ARN
region = REGION
s3_bucket_name = S3_BUCKET_NAME


cli_input_json_string = '''
{{
    "FlowDefinitionName": "human-evaluation-task01",
    "HumanLoopRequestSource": {{
        "AwsManagedHumanLoopRequestSource": "AWS/Bedrock/Evaluation"
    }},
    "HumanLoopConfig": {{
        "WorkteamArn": "{WorkteamArn}",
        "HumanTaskUiArn": "arn:aws:sagemaker:{region}:394669845002:human-task-ui/Evaluation",
        "TaskTitle": "Human review tasks",
        "TaskDescription": "Provide a real good answer",
        "TaskCount": 1,
        "TaskAvailabilityLifetimeInSeconds": 864000,
        "TaskTimeLimitInSeconds": 3600,
        "TaskKeywords": ["foo"]
    }},
    "OutputConfig": {{
        "S3OutputPath": "s3://{s3_bucket_name}"
    }},
    "RoleArn": "{SageMakerCustomerRoleArn}"
}}
'''.format(
    WorkteamArn=WorkteamArn,
    s3_bucket_name=s3_bucket_name,
    region=region,
    SageMakerCustomerRoleArn=SageMakerCustomerRoleArn
)

print(cli_input_json_string)



{
    "FlowDefinitionName": "human-evaluation-task01",
    "HumanLoopRequestSource": {
        "AwsManagedHumanLoopRequestSource": "AWS/Bedrock/Evaluation"
    },
    "HumanLoopConfig": {
        "WorkteamArn": "arn:aws:sagemaker:us-east-1:026459568683:workteam/private-crowd/evalteam",
        "HumanTaskUiArn": "arn:aws:sagemaker:us-east-1:394669845002:human-task-ui/Evaluation",
        "TaskTitle": "Human review tasks",
        "TaskDescription": "Provide a real good answer",
        "TaskCount": 1,
        "TaskAvailabilityLifetimeInSeconds": 864000,
        "TaskTimeLimitInSeconds": 3600,
        "TaskKeywords": ["foo"]
    },
    "OutputConfig": {
        "S3OutputPath": "s3://marketing-gen-eval"
    },
    "RoleArn": "arn:aws:iam::026459568683:role/demo-SagemakerNotebookIAMRole-026459568683"
}



In [144]:
filename = "cli_input.json"
with open(filename, 'w') as the_file:
    the_file.write(cli_input_json_string)

In [145]:
# 3b. Create Flow Definition
!aws sagemaker create-flow-definition --cli-input-json file://{filename}

{
    "FlowDefinitionArn": "arn:aws:sagemaker:us-east-1:026459568683:flow-definition/human-evaluation-task01"
}


In [40]:
# 3c. Create Human Eval Job
# output generated from previous cell
flow_definition = "arn:aws:sagemaker:us-east-1:026459568683:flow-definition/human-evaluation-task01"

import boto3
client = boto3.client('bedrock')

job_request = client.create_evaluation_job(
    jobName="human-eval-marketing-push-messages-v2",
    jobDescription="Evaluate marketing messages for two different LLMs",
    roleArn=SageMakerCustomerRoleArn,
    inferenceConfig={
        ## array of models to be evaluated
        "models": [
            {
                "bedrockModel": {
                    "modelIdentifier":f"arn:aws:bedrock:{region}::foundation-model/mistral.mixtral-8x7b-instruct-v0:1",
                    "inferenceParams":"{\"temperature\":\"0.0\", \"topP\":\"1\", \"maxTokenCount\":\"512\"}"
                }
            }
        
            # {
            #     "bedrockModel": {
            #         "modelIdentifier":f"arn:aws:bedrock:{region}::foundation-model/amazon.titan-text-lite-v1",
            #         "inferenceParams":"{\"temperature\":\"0.25\", \"topP\":\"1\", \"maxTokenCount\":\"256\"}"
            #     }

            # },
            
        ]

    },
    outputDataConfig={
        "s3Uri":f"s3://{s3_bucket_name}/outputs/"
    },
    evaluationConfig={
        "human": {
        "humanWorkflowConfig": {
            "flowDefinitionArn": f"{flow_definition}",
            "instructions": "Review the generated Marketing messages."
        },
        "customMetrics": [
            {
                "name": "BrandVoice",
                "description": "Marketing Brand Voice",
                "ratingMethod": "IndividualLikertScale"
            },
            {
                "name": "Accuracy",
                "description": "Are all the details correctly included in the marketing message",
                "ratingMethod": "IndividualLikertScale"
            }
        ],
        "datasetMetricConfigs": [
            {
                "taskType": "Generation",
                "dataset": {
                    "name": "Custom_Dataset1",
                    "datasetLocation": {
                        "s3Uri": f"s3://{s3_bucket_name}/custom-datasets/data.jsonl"
                    }
                },
                "metricNames": [
                  "BrandVoice",
                  "Accuracy"
                ]
            }
        ]
      }

    }
)

print(job_request)

{'ResponseMetadata': {'RequestId': '3fc8e7c8-a6cb-4bcf-875f-a82beb6be3ca', 'HTTPStatusCode': 202, 'HTTPHeaders': {'date': 'Tue, 06 Aug 2024 01:35:27 GMT', 'content-type': 'application/json', 'content-length': '79', 'connection': 'keep-alive', 'x-amzn-requestid': '3fc8e7c8-a6cb-4bcf-875f-a82beb6be3ca'}, 'RetryAttempts': 0}, 'jobArn': 'arn:aws:bedrock:us-east-1:026459568683:evaluation-job/oivzsbg3xuff'}


In [42]:
# 3d. List in progress evaluation jobs
from datetime import datetime
response = client.list_evaluation_jobs(
    creationTimeAfter=datetime(2015, 1, 1),
    # creationTimeBefore=datetime(2015, 1, 1),
    statusEquals= 'InProgress', #'InProgress'|'Completed'|'Failed'|'Stopping'|'Stopped',
    # nameContains='string',
    maxResults=123,
    # nextToken='string',
    sortBy='CreationTime',
    sortOrder='Descending'
)
response['jobSummaries']

[{'jobArn': 'arn:aws:bedrock:us-east-1:026459568683:evaluation-job/oivzsbg3xuff',
  'jobName': 'human-eval-marketing-push-messages-v2',
  'status': 'InProgress',
  'creationTime': datetime.datetime(2024, 8, 6, 1, 35, 27, 386000, tzinfo=tzlocal()),
  'jobType': 'Human',
  'evaluationTaskTypes': ['Generation'],
  'modelIdentifiers': ['arn:aws:bedrock:us-east-1::foundation-model/mistral.mixtral-8x7b-instruct-v0:1']}]

In [None]:
response['jobSummaries']

In [None]:
# 3e. Stop a given evaluation job if needed
response2 = client.stop_evaluation_job(
    jobIdentifier='XXX'
)
print(response2)