In [1]:
from dotenv import load_dotenv
import openai
import os

# Call load_dotenv() before the lookups below so that anything it adds to the
# process environment is visible to them.
load_dotenv()

# Azure OpenAI settings. Each value is None when the variable is not set.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
OPENAI_DEPLOYMENT_ENDPOINT = os.environ.get("OPENAI_DEPLOYMENT_ENDPOINT")
OPENAI_DEPLOYMENT_NAME = os.environ.get("OPENAI_DEPLOYMENT_NAME")
OPENAI_MODEL_NAME = os.environ.get("OPENAI_MODEL_NAME")
OPENAI_DEPLOYMENT_VERSION = os.environ.get("OPENAI_DEPLOYMENT_VERSION")

# Azure Computer Vision settings, consumed by analyze_image().
AZURE_COMPUTER_VISION_ENDPOINT = os.environ.get("AZURE_COMPUTER_VISION_ENDPOINT")
AZURE_COMPUTER_VISION_KEY = os.environ.get("AZURE_COMPUTER_VISION_KEY")


In [2]:
import os
import azure.ai.vision as visionsdk


def analyze_image(image_filename):
    """Run Azure AI Vision Image Analysis on a local image file and print the findings.

    Requests the CAPTION, DENSE_CAPTIONS, OBJECTS, PEOPLE, TEXT and TAGS features,
    prints every populated section of the result, and returns the raw JSON payload.

    Args:
        image_filename: Path of the image file on disk to analyze. It is passed to
            ``VisionSource`` as ``filename``.

    Returns:
        The ``json_result`` of the ``ImageAnalysisResultDetails`` built from the
        analysis result.

    Raises:
        RuntimeError: If the analysis result reason is not ``ANALYZED``. The error
            reason, code and message are printed before raising. Previously this
            path fell through to the final ``return`` and failed with an
            ``UnboundLocalError`` because ``result_details`` was never assigned.
    """
    service_options = visionsdk.VisionServiceOptions(AZURE_COMPUTER_VISION_ENDPOINT, AZURE_COMPUTER_VISION_KEY)

    # Analyze an image file on disk. To analyze a publicly accessible image URL
    # instead, build the source with VisionSource(url=...).
    vision_source = visionsdk.VisionSource(filename=image_filename)

    analysis_options = visionsdk.ImageAnalysisOptions()

    # Mandatory. You must set one or more features to analyze.
    # Note that "CAPTION" and "DENSE_CAPTIONS" are only supported in Azure GPU regions (East US, France Central,
    # Korea Central, North Europe, Southeast Asia, West Europe, West US). Remove "CAPTION" and "DENSE_CAPTIONS"
    # from the list below if your Computer Vision key is not from one of those regions.
    analysis_options.features = (
        # visionsdk.ImageAnalysisFeature.CROP_SUGGESTIONS |
        visionsdk.ImageAnalysisFeature.CAPTION |
        visionsdk.ImageAnalysisFeature.DENSE_CAPTIONS |
        visionsdk.ImageAnalysisFeature.OBJECTS |
        visionsdk.ImageAnalysisFeature.PEOPLE |
        visionsdk.ImageAnalysisFeature.TEXT |
        visionsdk.ImageAnalysisFeature.TAGS
    )

    # Optional, and only relevant when you select ImageAnalysisFeature.CROP_SUGGESTIONS.
    # Define one or more aspect ratios for the desired cropping. Each aspect ratio needs
    # to be in the range [0.75, 1.8]. If you do not set this, the service will return one
    # crop suggestion with the aspect ratio it sees fit.
    # analysis_options.cropping_aspect_ratios = [0.9, 1.33]

    # Optional. Default is "en" for English. See https://aka.ms/cv-languages for a list of supported
    # language codes and which visual features are supported for each language.
    analysis_options.language = "en"
    analysis_options.model_version = "latest"
    # Request a gender neutral caption.
    analysis_options.gender_neutral_caption = True

    # Create the image analyzer object
    image_analyzer = visionsdk.ImageAnalyzer(service_options, vision_source, analysis_options)

    # This call creates the network connection and blocks until Image Analysis results
    # return (or an error occurred). Note that there is also an asynchronous (non-blocking)
    # version of this method: image_analyzer.analyze_async().
    result = image_analyzer.analyze()

    # Handle failure first so the success path below can rely on a valid result.
    if result.reason != visionsdk.ImageAnalysisResultReason.ANALYZED:
        error_details = visionsdk.ImageAnalysisErrorDetails.from_result(result)
        print(" Analysis failed.")
        print("   Error reason: {}".format(error_details.reason))
        print("   Error code: {}".format(error_details.error_code))
        print("   Error message: {}".format(error_details.message))
        print(" Did you set the computer vision endpoint and key?")
        # There is no JSON payload to return; fail with the service's own details
        # rather than an unrelated UnboundLocalError.
        raise RuntimeError(
            "Image analysis failed for '{}': {} (code {}): {}".format(
                image_filename, error_details.reason, error_details.error_code, error_details.message
            )
        )

    print(" Image height: {}".format(result.image_height))
    print(" Image width: {}".format(result.image_width))
    print(" Model version: {}".format(result.model_version))

    if result.caption is not None:
        print(" Caption:")
        print("   '{}', Confidence {:.4f}".format(result.caption.content, result.caption.confidence))

    if result.dense_captions is not None:
        print(" Dense Captions:")
        for caption in result.dense_captions:
            print("   '{}', {}, Confidence: {:.4f}".format(caption.content, caption.bounding_box, caption.confidence))

    if result.objects is not None:
        print(" Objects:")
        # Named detected_object so the builtin `object` is not shadowed.
        for detected_object in result.objects:
            print("   '{}', {}, Confidence: {:.4f}".format(
                detected_object.name, detected_object.bounding_box, detected_object.confidence))

    if result.tags is not None:
        print(" Tags:")
        for tag in result.tags:
            print("   '{}', Confidence {:.4f}".format(tag.name, tag.confidence))

    if result.people is not None:
        print(" People:")
        for person in result.people:
            print("   {}, Confidence {:.4f}".format(person.bounding_box, person.confidence))

    if result.crop_suggestions is not None:
        print(" Crop Suggestions:")
        for crop_suggestion in result.crop_suggestions:
            print("   Aspect ratio {}: Crop suggestion {}"
                  .format(crop_suggestion.aspect_ratio, crop_suggestion.bounding_box))

    if result.text is not None:
        print(" Text:")
        for line in result.text.lines:
            points_string = "{" + ", ".join([str(int(point)) for point in line.bounding_polygon]) + "}"
            print("   Line: '{}', Bounding polygon {}".format(line.content, points_string))
            for word in line.words:
                points_string = "{" + ", ".join([str(int(point)) for point in word.bounding_polygon]) + "}"
                print("     Word: '{}', Bounding polygon {}, Confidence {:.4f}"
                      .format(word.content, points_string, word.confidence))

    result_details = visionsdk.ImageAnalysisResultDetails.from_result(result)
    print(" Result details:")
    print("   Image ID: {}".format(result_details.image_id))
    print("   Result ID: {}".format(result_details.result_id))
    print("   Connection URL: {}".format(result_details.connection_url))
    print("   JSON result: {}".format(result_details.json_result))

    return result_details.json_result

#### Analyze a picture using Azure Cognitive Services to extract text from a picture

In [3]:
# Despite the variable name, this is a local file path rather than a URL;
# analyze_image passes it to VisionSource as `filename`.
image_url = "./data/retail/bike.jpg"
# Raw JSON payload returned by analyze_image; it is parsed with json.loads in a later cell.
json_result = analyze_image(image_url)

 Image height: 224
 Image width: 224
 Model version: 2023-02-01-preview
 Caption:
   'a blue and pink polo shirt', Confidence 0.7036
 Dense Captions:
   'a blue and pink polo shirt', Rectangle(x=0, y=0, w=224, h=224), Confidence: 0.7118
   'a blue shirt with purple and pink design', Rectangle(x=4, y=17, w=140, h=181), Confidence: 0.6391
   'a blue shirt with a tie', Rectangle(x=107, y=19, w=107, h=182), Confidence: 0.6826
 Objects:
 Tags:
   'clothing', Confidence 0.9994
   'fabric', Confidence 0.9067
   'sleeve', Confidence 0.9034
   'collar', Confidence 0.9000
   'active shirt', Confidence 0.8928
   'top', Confidence 0.8923
   'shirt', Confidence 0.8135
   'person', Confidence 0.6144
 People:
   Rectangle(x=103, y=0, w=113, h=213), Confidence 0.0026
   Rectangle(x=0, y=0, w=63, h=223), Confidence 0.0022
   Rectangle(x=0, y=0, w=162, h=221), Confidence 0.0015
   Rectangle(x=166, y=0, w=57, h=223), Confidence 0.0014
 Text:
 Result details:
   Image ID: ./data/retail/bike.jpg
   Result 

#### Extract all dense captions from the json result

In [15]:
import json
# Parse the JSON payload returned by analyze_image. The parsed value is bound to
# `analysis` rather than `dict`, so the builtin `dict` stays usable in this kernel.
analysis = json.loads(json_result)
# List of dense-caption entries; a later cell reads each entry's "text" field.
dense_captions = analysis["denseCaptionsResult"]["values"]


#### Generate a product description from the text extracted from the photo using OpenAI

In [9]:
# Configure the module-level openai client for Azure OpenAI:
# credentials and endpoint come from the environment values loaded earlier,
# the API type and version are fixed here.
openai.api_key = OPENAI_API_KEY
openai.api_base = OPENAI_DEPLOYMENT_ENDPOINT
openai.api_version = "2023-07-01-preview"
openai.api_type = "azure"

In [11]:
def call_openAI(text):
    """Ask the Azure OpenAI chat deployment for a product description of ``text``.

    Sends a fixed system prompt (clothing retail product descriptions) plus
    ``text`` as the user message to the deployment named by
    ``OPENAI_DEPLOYMENT_NAME``.

    Args:
        text: Content of the user message; in this notebook, the text of one
            dense caption.

    Returns:
        The ``message.content`` of the first choice in the completion response.
    """
    message_text = [
        {
            "role": "system",
            # Typo fix: the prompt previously read "especializing".
            "content": (
                "You are an AI assistant that generates product descriptions for an online "
                "retail website specializing in clothing. Given the user's input, generate "
                "a Product Description."
            ),
        },
        {"role": "user", "content": text},
    ]

    completion = openai.ChatCompletion.create(
        engine=OPENAI_DEPLOYMENT_NAME,
        messages=message_text,
        temperature=0.7,
        max_tokens=800,
        top_p=0.95,
        frequency_penalty=0,
        presence_penalty=0,
        stop=None,
    )
    return completion.choices[0].message.content

In [18]:
# Generate one candidate product description per dense caption, numbered from 1.
# enumerate replaces the hand-maintained counter.
for count, caption in enumerate(dense_captions, start=1):
    result = call_openAI(caption["text"])
    print(f"Option {count}: {result}")


Option 1: Introducing our long green dress with a black belt - the perfect addition to your wardrobe for any special occasion! This dress features a stunning emerald green color that is sure to turn heads. The black belt cinches at the waist, creating a flattering silhouette that will make you feel confident and beautiful. The dress is made with high-quality materials, ensuring both comfort and durability. Whether you're attending a formal event or a night out with friends, this dress is sure to impress. Don't hesitate, add this gorgeous green dress to your collection today!
Option 2: Introducing our stylish green dress with long sleeves, perfect for any occasion. Made with high-quality materials, this dress is both comfortable and elegant. The vibrant green color adds a pop of color to your wardrobe, while the long sleeves provide coverage and warmth on cooler days. The dress features a flattering silhouette that hugs your curves in all the right places, while the long sleeves create 