## This Notebook demonstrates how we can consider Latency and Server Availability to optimize the usage of Azure OpenAI

### We are considering Azure Monitor Metrics for an Azure service and building up a logic to view the best available region to be used for a given user/subscription. This approach can be further automated and integrated as a Software Repo on GitHub to be used by customers in their applications.

This solution addresses the service's latency and availability with the monitoring metrics and then would implement a fallback mechanism to maintain the conversation context and use the most effective region to fetch the responses
1. It considers service's availability
2. Latency on the service
3. Maintains the conversation context

In [3]:
import os, time
import openai
import requests, urllib
import tiktoken, json
import numpy as np

In [156]:
# %pip install opencensus-ext-azure

In [25]:
api_key = "6dbfc25dd1834836b4ed30b56ea0e933"

openai.api_base = "https://openai-service4200.openai.azure.com/"
openai.api_key = api_key
openai.api_type = "azure"
openai.api_version = "2023-05-15"

In [35]:
# openai.Deployment.list()

In [40]:
'''
response = openai.ChatCompletion.create(
  engine="mychatgpt",
  messages = [{"role":"system","content":"You are an AI assistant that helps people find information."},{"role":"user","content":"Please help me with top 5 pizza stores in Bengaluru."}],
  temperature=0.7,
  max_tokens=800,
  top_p=0.95,
  frequency_penalty=0,
  presence_penalty=0,
  stop=None)
'''

In [53]:
# response['choices'][0]['message']['content'].replace('\n','\n ').strip()

"Sure! Here are the top 5 pizza stores in Bengaluru:\n \n 1. California Pizza Kitchen\n 2. Pizza Hut\n 3. Domino's Pizza\n 4. Joey's Pizza\n 5. Onesta Pizza\n \n I hope this helps! Let me know if you need any more assistance."

### Declaring the different regions for OAI

In [26]:
# Defining services for multiple regions

endpoints = ["https://openai-service4200.openai.azure.com/", "https://openai-scus4200.openai.azure.com/",
             "https://openai-ncus10.openai.azure.com/", "https://oai-frc4200.openai.azure.com/"]

keys = ["6dbfc25dd1834836b4ed30b56ea0e933", "4b706300ad4b479fbe58dd50120ff6be", 
        "612a5d8af3ae471782d4ef7a0d91ef87", "437edafaf120410bb0378cab1feb35fe"]

models = ["mychatgpt", "chatgpt", "mychatgpt", "mychatgpt"]

In [27]:
endpoints[1], keys[1], models[1]

('https://openai-scus4200.openai.azure.com/',
 '4b706300ad4b479fbe58dd50120ff6be',
 'chatgpt')

In [16]:
def num_tokens_from_messages(messages, model="gpt-3.5-turbo-0301"):
    encoding = tiktoken.encoding_for_model(model)
    num_tokens = 0
    for message in messages:
        num_tokens += 4  # every message follows <im_start>{role/name}\n{content}<im_end>\n
        for key, value in message.items():
            num_tokens += len(encoding.encode(value))
            if key == "name":  # if there's a name, the role is omitted
                num_tokens += -1  # role is always required and always 1 token
    num_tokens += 2  # every reply is primed with <im_start>assistant
    return num_tokens

In [17]:
system_message = 'Hey there, how can I help you?'

In [28]:
def send_message(messages, model_name, max_response_tokens=500):
    response = openai.ChatCompletion.create(
        engine=model_name,
        messages=messages,
        temperature=0.5,
        max_tokens=max_response_tokens,
        top_p=0.9,
        frequency_penalty=0,
        presence_penalty=0,
    )
    return response['choices'][0]['message']['content']

def print_conversation(messages):
    for message in messages:
        print(f"[{message['role'].upper()}]")
        print(message['content'])
        print()

In [29]:
# user_message = "I want to write a small blog post about the impact of AI on the future of work."
user_message = "I want to buy a tshirt with Avengers printed. Can you suggest some brands?"
# user_message = "Help me learn about OpenAI. Can you give me a couple of resources for the same?"

In [30]:
messages=[
    {"role": "system", "content": system_message},
    {"role": "user", "name":"manish", "content": user_message}
]

In [31]:
token_count = num_tokens_from_messages(messages)
print(token_count)

39


In [32]:
max_response_tokens = 500

# add = "Can you help me with some store in Gujarat?"
# messages.append({"role": "user", "content": add})
model_name = 'mychatgpt'

start = time.time()
response = send_message(messages, model_name, max_response_tokens)
end = time.time()
messages.append({"role": "assistant", "content": response})


print("--- %s seconds ---" % (end - start))
print(messages)

# print(messages)
# print_conversation(messages)

--- 3.562932014465332 seconds ---
[{'role': 'system', 'content': 'Hey there, how can I help you?'}, {'role': 'user', 'name': 'manish', 'content': 'I want to buy a tshirt with Avengers printed. Can you suggest some brands?'}, {'role': 'assistant', 'content': 'Sure, there are a lot of brands that sell Avengers printed t-shirts. Here are some popular ones:\n\n1. Marvel Official Merchandise\n2. Adidas\n3. Under Armour\n4. H&M\n5. Zara\n6. Forever 21\n7. Hot Topic\n8. SuperHeroStuff\n9. BoxLunch\n10. Target\n\nYou can check out their websites or visit their stores to find the perfect Avengers t-shirt for you.'}]


## Maintaining the Context and handling token limits effectively

In [86]:
overall_max_tokens = 4096
prompt_max_tokens = overall_max_tokens - max_response_tokens

In [88]:
user_message = "The target audience for the blog post should be business leaders working in the tech industry."
#user_message = "Let's talk about generative AI and keep the tone informational but also friendly."

messages.append({"role": "user", "content": user_message})

token_count = num_tokens_from_messages(messages)
print(f"Token count: {token_count}")

# remove first message while over the token limit
while token_count > prompt_max_tokens:
    messages.pop(0)
    token_count = num_tokens_from_messages(messages)

response = send_message(messages, model_name, max_response_tokens)

messages.append({"role": "assistant", "content": response})
print_conversation(messages)

Token count: 385
[SYSTEM]
Hey there, how can I help you?

[USER]
I want to write a blog post about the impact of AI on the future of work.

[ASSISTANT]
Great idea! The impact of AI on the future of work is a topic that has been widely discussed and researched. Here are some key points you could consider including in your blog post:

1. Automation: AI is expected to automate many routine and repetitive tasks, which could lead to job displacement for some workers. However, it could also create new job opportunities in fields such as data analysis, programming, and AI development.

2. Upskilling: As AI becomes more prevalent in the workplace, it will become increasingly important for workers to develop new skills and adapt to new technologies. This could lead to a greater focus on upskilling and lifelong learning.

3. Collaboration: AI is not expected to replace human workers entirely, but rather to work alongside them. This could lead to more collaborative work environments, where humans

### REST Implementation

In [152]:
# def rest_send_message(messages, model_name, max_response_tokens):
#     api_url = endpoints[0] + 'openai/deployments/'+model_name+'/chat/completions?api-version=2023-05-15'
    
#     headers =  {"Content-Type":"application/json", "api-key": keys[0]}
#     messages = json.dumps(messages)
    
#     response = requests.post(api_url, data=json.dumps(messages), headers=headers)
    
#     return response.json()

In [153]:
# user_message = "Can you help me fetch a pizza place in India?"
# # user_message = "The target audience for the blog post should be business leaders working in the tech industry."
# # user_message = "Let's talk about generative AI and keep the tone informational but also friendly."
# messages=[
#     {"role": "system", "content": system_message},
#     {"role": "user", "name":"manish", "content": user_message}
# ]

# # messages.append({"role": "user", "content": user_message})

# token_count = num_tokens_from_messages(messages)
# print(f"Token count: {token_count}")

# # remove first message while over the token limit
# while token_count > prompt_max_tokens:
#     messages.pop(0)
#     token_count = num_tokens_from_messages(messages)

# response = rest_send_message(messages, model_name, max_response_tokens)
# # print(response)
# messages.append({"role": "assistant", "content": response})
# print_conversation(messages)

## Latency

In [33]:
latency_values = []
start = time.time()
for endpoint in endpoints:
    service_name = endpoint.split('//')[1].split('.')[0]
#     print(service_name)
    url = "https://management.azure.com/subscriptions/ef245f14-9dca-41c6-ab0f-12e5afe4692b/resourceGroups/openai-rg/providers/Microsoft.CognitiveServices/accounts/"+service_name+"/providers/microsoft.insights/metrics?metricnames=Latency&api-version=2019-07-01&timespan=2023-09-17/2023-09-19&top=3&aggregation=Average&interval=PT1M"

    headers =  {"Content-Type":"application/json", "Authorization": "Bearer eyJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiIsIng1dCI6Ii1LSTNROW5OUjdiUm9meG1lWm9YcWJIWkdldyIsImtpZCI6Ii1LSTNROW5OUjdiUm9meG1lWm9YcWJIWkdldyJ9.eyJhdWQiOiJodHRwczovL21hbmFnZW1lbnQuYXp1cmUuY29tIiwiaXNzIjoiaHR0cHM6Ly9zdHMud2luZG93cy5uZXQvNzJmOTg4YmYtODZmMS00MWFmLTkxYWItMmQ3Y2QwMTFkYjQ3LyIsImlhdCI6MTY5NTAxODEwNywibmJmIjoxNjk1MDE4MTA3LCJleHAiOjE2OTUxMDQ4MDcsImFpbyI6IkUyRmdZTkJ1Q05FdldiWlc2bDdYcmg4cjR6NCtBUUE9IiwiYXBwaWQiOiI2ZmQxNzllOS01Nzg5LTRiNDAtYmVkMC0xZThkYWYzYTU0ZTAiLCJhcHBpZGFjciI6IjEiLCJpZHAiOiJodHRwczovL3N0cy53aW5kb3dzLm5ldC83MmY5ODhiZi04NmYxLTQxYWYtOTFhYi0yZDdjZDAxMWRiNDcvIiwiaWR0eXAiOiJhcHAiLCJvaWQiOiJiMjU2NjFjZS1kODljLTQ3ZWMtOTJiYi02ZjQ2MmRkNTJkZjkiLCJyaCI6IjAuQVFFQXY0ajVjdkdHcjBHUnF5MTgwQkhiUjBaSWYza0F1dGRQdWtQYXdmajJNQk1hQUFBLiIsInN1YiI6ImIyNTY2MWNlLWQ4OWMtNDdlYy05MmJiLTZmNDYyZGQ1MmRmOSIsInRpZCI6IjcyZjk4OGJmLTg2ZjEtNDFhZi05MWFiLTJkN2NkMDExZGI0NyIsInV0aSI6ImJoZnN5VlJjUEUtVFgyVmJ3ZDBMQUEiLCJ2ZXIiOiIxLjAiLCJ4bXNfdGNkdCI6MTI4OTI0MTU0N30.inp_wQQgsx4ixyc3U6I7yfJb67jFJrDghtHs_ol_7sq0W5TiT582qnYJpJ847qPER-MsXoH8j6FIUzn6l-0j7JG6v3oNBY49vQ0TPLhq5C1ogXuSAu_oG-K2jqAvrQ_-DeTfezkSbzxctg_TAP_67gt4iH4sfi1mkvUCQMJ9xIN-iGEjtM_hsj_ykK87aNGlChoWTPYx5I8cFI0dS5YB2y3F058VbIt0GtXa5ufLG_B2eYKCQFE1Td_BUybQulw2geROSByirTNAOrtZOzZwmhuYH23imFpYHAvhZM5RYxFcr1KiB8Ph2AOmVmRQmx_RfWfdKcp4YlFWGxpixDB9BA"}
    # messages = json.dumps(messages)

    response = requests.get(url, headers=headers)

    # print(response.json()['value'][0]['timeseries'][0]['data'])
    latencies=[]
    for items in response.json()['value'][0]['timeseries'][0]['data'][:]:
        if len(items)>1:
    #         print(items)
            latencies.append(items['average'])
#         np.round(np.average(latencies),2)
    print("For",service_name, "latency is", np.round(np.average(latencies)/1000,2),"sec")
    latency_values.append(str(np.round(np.average(latencies)/1000,2))+" seconds")
end = time.time()

# print(end - start)

For openai-service4200 latency is 3.18 sec
For openai-scus4200 latency is 3.58 sec
For openai-ncus10 latency is 4.23 sec
For oai-frc4200 latency is 8.6 sec


## Server Errors

In [265]:
start = time.time()
for endpoint in endpoints:
    service_name = endpoint.split('//')[1].split('.')[0]
#     print(service_name)
    url = "https://management.azure.com/subscriptions/ef245f14-9dca-41c6-ab0f-12e5afe4692b/resourceGroups/openai-rg/providers/Microsoft.CognitiveServices/accounts/"+service_name+"/providers/microsoft.insights/metrics?metricnames=ServerErrors&api-version=2019-07-01&timespan=2023-09-17/2023-09-19&top=3&aggregation=Average&interval=PT1H"

    headers =  {"Content-Type":"application/json", "Authorization": "Bearer eyJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiIsIng1dCI6Ii1LSTNROW5OUjdiUm9meG1lWm9YcWJIWkdldyIsImtpZCI6Ii1LSTNROW5OUjdiUm9meG1lWm9YcWJIWkdldyJ9.eyJhdWQiOiJodHRwczovL21hbmFnZW1lbnQuYXp1cmUuY29tIiwiaXNzIjoiaHR0cHM6Ly9zdHMud2luZG93cy5uZXQvNzJmOTg4YmYtODZmMS00MWFmLTkxYWItMmQ3Y2QwMTFkYjQ3LyIsImlhdCI6MTY5NTAxODEwNywibmJmIjoxNjk1MDE4MTA3LCJleHAiOjE2OTUxMDQ4MDcsImFpbyI6IkUyRmdZTkJ1Q05FdldiWlc2bDdYcmg4cjR6NCtBUUE9IiwiYXBwaWQiOiI2ZmQxNzllOS01Nzg5LTRiNDAtYmVkMC0xZThkYWYzYTU0ZTAiLCJhcHBpZGFjciI6IjEiLCJpZHAiOiJodHRwczovL3N0cy53aW5kb3dzLm5ldC83MmY5ODhiZi04NmYxLTQxYWYtOTFhYi0yZDdjZDAxMWRiNDcvIiwiaWR0eXAiOiJhcHAiLCJvaWQiOiJiMjU2NjFjZS1kODljLTQ3ZWMtOTJiYi02ZjQ2MmRkNTJkZjkiLCJyaCI6IjAuQVFFQXY0ajVjdkdHcjBHUnF5MTgwQkhiUjBaSWYza0F1dGRQdWtQYXdmajJNQk1hQUFBLiIsInN1YiI6ImIyNTY2MWNlLWQ4OWMtNDdlYy05MmJiLTZmNDYyZGQ1MmRmOSIsInRpZCI6IjcyZjk4OGJmLTg2ZjEtNDFhZi05MWFiLTJkN2NkMDExZGI0NyIsInV0aSI6ImJoZnN5VlJjUEUtVFgyVmJ3ZDBMQUEiLCJ2ZXIiOiIxLjAiLCJ4bXNfdGNkdCI6MTI4OTI0MTU0N30.inp_wQQgsx4ixyc3U6I7yfJb67jFJrDghtHs_ol_7sq0W5TiT582qnYJpJ847qPER-MsXoH8j6FIUzn6l-0j7JG6v3oNBY49vQ0TPLhq5C1ogXuSAu_oG-K2jqAvrQ_-DeTfezkSbzxctg_TAP_67gt4iH4sfi1mkvUCQMJ9xIN-iGEjtM_hsj_ykK87aNGlChoWTPYx5I8cFI0dS5YB2y3F058VbIt0GtXa5ufLG_B2eYKCQFE1Td_BUybQulw2geROSByirTNAOrtZOzZwmhuYH23imFpYHAvhZM5RYxFcr1KiB8Ph2AOmVmRQmx_RfWfdKcp4YlFWGxpixDB9BA"}
    # messages = json.dumps(messages)

    response = requests.get(url, headers=headers)

    # print(response.json()['value'][0]['timeseries'][0]['data'])
    latencies=[]
    for items in response.json()['value'][0]['timeseries'][0]['data'][:]:
        if items['average']>0:
    #         print(items)
            latencies.append(items['average'])
    print(np.average(latencies), "Server Errors")
end = time.time()

# print(end - start)

nan Server Errors
nan Server Errors
nan Server Errors
nan Server Errors


## Verifying the Results

In [37]:
output_time = []
for i in range(len(endpoints)):
    
    user_message = "Please give me a short recipe for making a Veggie Pizza."

    messages=[
        {"role": "system", "content": system_message},
        {"role": "user", "name":"manish", "content": user_message}
    ]

    service_name = endpoints[i].split('//')[1].split('.')[0]
    openai.api_base = endpoints[i]
    openai.api_key = keys[i]
    model_name = models[i]
    
    max_response_tokens = 500

    start = time.time()
    response = send_message(messages, model_name, max_response_tokens)
    end = time.time()
    messages.append({"role": "assistant", "content": response})


    print("--- %s seconds ---" % (end - start), "for", service_name)
    val = np.round(end - start,2)
    output_time.append(str(val) + ' seconds')

--- 5.634791135787964 seconds --- for openai-service4200
--- 5.188538312911987 seconds --- for openai-scus4200
--- 4.028377056121826 seconds --- for openai-ncus10
--- 11.607852458953857 seconds --- for oai-frc4200


In [38]:
latency_values, output_time

(['3.18 seconds', '3.58 seconds', '4.23 seconds', '8.6 seconds'],
 ['5.63 seconds', '5.19 seconds', '4.03 seconds', '11.61 seconds'])

In [39]:
regions = ["East US", "South Central US", "North Central US", "France Central"]

In [41]:
compare = []

for j in range(len(latency_values)):
    compare.append([endpoints[j].split('//')[1].split('.')[0], latency_values[j], output_time[j], regions[j]])

In [42]:
from tabulate import tabulate
 
# display table
head = ['Service Name', 'Regional Latency', 'Response Time for a prompt', 'Region']
print(tabulate(compare, headers=head, tablefmt='grid'))

+--------------------+--------------------+------------------------------+------------------+
| Service Name       | Regional Latency   | Response Time for a prompt   | Region           |
| openai-service4200 | 3.18 seconds       | 5.63 seconds                 | East US          |
+--------------------+--------------------+------------------------------+------------------+
| openai-scus4200    | 3.58 seconds       | 5.19 seconds                 | South Central US |
+--------------------+--------------------+------------------------------+------------------+
| openai-ncus10      | 4.23 seconds       | 4.03 seconds                 | North Central US |
+--------------------+--------------------+------------------------------+------------------+
| oai-frc4200        | 8.6 seconds        | 11.61 seconds                | France Central   |
+--------------------+--------------------+------------------------------+------------------+
