In [39]:
from openai import OpenAI
# Module-level API client, constructed with no explicit arguments.
# NOTE(review): presumably credentials/configuration are picked up from the
# environment rather than passed here — confirm before sharing the notebook.
client = OpenAI()

# Instruction prompt sent as the first user message by captions_to_command.
# It asks for a JSON object with a 'result' key holding a list of 5 separation
# commands, each with a 'command' and a 'type' ('positive', 'negative' or
# 'mixed'), and shows one worked example of each type.
# NOTE(review): the wording has small slips ("A separation command be one of",
# "A separation is itself a json"). They are left untouched here because any
# change to this text changes what is sent to the model.
COMMAND = """
Given a target and an interferer captions referring to an audio scene,
create a json with a key 'result',
whose value contains a list of 5 separation command options. A separation is itself a json containing
two keys, 'command' and 'type'. A separation command be one of 3 types, 'positive', 'negative' or 'mixed'.
For example:

Target: A woman talks nearby as water pours
Interferer: Dishes are clanging

Positive command example: Enhance the sound of the woman talking nearby and the water pouring
Negative command example: Remove the clanging dishes from the audio
Mixed command example: Keep the sound of the woman talking nearby and the water pouring, and exclude the clanging dishes from the mix
"""


def captions_to_command(target_caption, interferer_caption, client: OpenAI, model: str = "gpt-3.5-turbo"):
    """Ask a chat model for separation commands derived from two captions.

    The conversation consists of a fixed system message, the module-level
    ``COMMAND`` instructions, and one user message each for the target and
    the interferer caption. The reply is requested in JSON-object format.

    Args:
        target_caption: Caption of the sound to keep; interpolated into the
            ``"Target: ..."`` message.
        interferer_caption: Caption of the sound to remove; interpolated into
            the ``"Interferer: ..."`` message.
        client: Client whose ``chat.completions.create`` is called.
        model: Name of the chat model to query. Defaults to the previously
            hard-coded ``"gpt-3.5-turbo"``, so existing callers are unaffected.

    Returns:
        The ``content`` of the first choice's message, exactly as returned by
        the API. It is not parsed or validated here.
    """
    messages = [
        {"role": "system", "content": "You are an AI assistant working in the field of Language-Queried Audio Sound Separation (LASS)."},
        {"role": "user", "content": COMMAND},
        {"role": "user", "content": f"Target: {target_caption}"},
        {"role": "user", "content": f"Interferer: {interferer_caption}"},
    ]
    completion = client.chat.completions.create(
        model=model,
        messages=messages,
        # Request a JSON object so the reply matches the structure COMMAND asks for.
        response_format={"type": "json_object"},
    )
    # Only the first choice is used; no `n` is passed in the request above.
    return completion.choices[0].message.content


# Example run: one target/interferer caption pair sent through the prompt.
TARGET_CAPTION = "A woman talks nearby as water pours"
INTERFERER_CAPTION = "A man screaming"

response = captions_to_command(
    target_caption=TARGET_CAPTION,
    interferer_caption=INTERFERER_CAPTION,
    client=client,
)

# print() shows the returned text as-is rather than as a quoted repr.
print(response)

{
    "result": [
        {
            "command": "Enhance the sound of the woman talking nearby and the water pouring while reducing the sound of the man screaming",
            "type": "mixed"
        },
        {
            "command": "Isolate the sound of the woman talking nearby and the water pouring",
            "type": "positive"
        },
        {
            "command": "Remove the sound of the man screaming from the audio",
            "type": "negative"
        },
        {
            "command": "Minimize the sound of the man screaming and focus on the woman talking nearby and the water pouring",
            "type": "mixed"
        },
        {
            "command": "Enhance the woman's voice and the water pouring sounds",
            "type": "positive"
        }
    ]
}


In [None]:
# A man making a speech