In [3]:
# !pip install pandas

In [2]:
# !pip install openai

In [4]:
# !export OPENAI_API_KEY=

### Augment dataset with GPT API

In [2]:
import pandas as pd
from openai import OpenAI
import json
import os 
import numpy as np

In [3]:
df = pd.read_json('data/ontology.json')[['id', 'name', 'description']]
df

In [5]:
def query_openai(sound_name, sound_description):
    """
    Use openai to classify sounds as sound effects or ambience.
    """
    client = OpenAI(api_key =os.environ.get('OPENAI_API_KEY'))
    
    response = client.chat.completions.create(
      model="gpt-3.5-turbo-0125",
      response_format={ "type": "json_object" },
      messages=[
        {"role": "system", "content": "You are a helpful assistant designed to output JSON. Given the name of the sound and its description, you classify it as ambience (code: AMB) or sound effect (code: SFX). If it's a sound effect, you also include the object emitter. The schema of the JSON is always {'name': ---, 'type': --, 'object(s)': ---}"},
        {"role": "user", "content": "A sound effect is a sound that is directly emitted by an individual or by an object. For example, the sound of a car engine or a child screaming are sound effects. In contrast, an ambience sound cannot be directly attributed to an object, and it depends by the location. Examples of ambience sounds include a people talking in a busy street, a cafe, a hotel lobby etc."},
        {"role": "user", "content": f'Is this sound an ambience sound or sound effect? Name: {sound_name}. Description: {sound_description}.'}
          
      ]
    )
    return json.loads(response.choices[0].message.content)

In [1]:
classes = {'name': [], 'type': [], 'object(s)': []}

names = df.name.unique()
descriptions = df.description.unique()

for i in range(len(df)):
    response = query_openai(names[i], descriptions[i])
    print(response)
    
    classes['name'].append(names[i])
    
    if 'type' in response.keys():
        classification = response['type']
        classes['type'].append(classification)
    else:
        classes['type'].append(response)
        
    if 'object(s)' in response.keys():
        object = response['object(s)']
        classes['object(s)'].append(object)
    else:
        classes['object(s)'].append(response)

In [8]:
classes_df_output_3 = pd.DataFrame(classes)
classes_df_output_3

Unnamed: 0,name,type,object(s)
0,Human sounds,SFX,individual
1,Human voice,SFX,Human
2,Speech,SFX,Human
3,"Male speech, man speaking",SFX,human
4,"Female speech, woman speaking",SFX,woman
...,...,...,...
627,Recording,SFX,recorder/player
628,Field recording,AMB,
629,Gramophone record,SFX,Gramophone or vinyl record disc on a turntable
630,Compact disc,SFX,digital audio Compact Disc


I ran this 3 times and collected a dataset of outputs. Now, we'll generate the values by majority/agreement

### Post-processing (Majority voting)

In [15]:
df = pd.read_csv('extracted2.csv', index_col=0)

df['object(s)_y'] = df['object(s)_y'].str.split("object").str[-1].str.split("(s)").str[-1].str.split("'").str[-2] # nevermind this lol

df

Type (SFX vs Ambience): Majority wins:

In [26]:
df['MajorityType'] = np.where((df['type_x'] == 'SFX').astype(int) + (df['type_y'] == 'SFX').astype(int) + (df['type'] == 'SFX').astype(int) >= 2, 'SFX', 'AMB')

In [28]:
df = df.drop(columns=['type_x','type_y','type'])
df

Unnamed: 0,name,object(s)_x,object(s)_y,object(s),MajorityType
0,Human sounds,human body,Human body,individual,SFX
1,Human voice,Human,Human,Human,SFX
2,Speech,Human,Human,Human,SFX
3,"Male speech, man speaking",mouth of the adult male human,Human,human,SFX
4,"Female speech, woman speaking",human,woman,woman,SFX
...,...,...,...,...,...
627,Recording,,Recording device,recorder/player,SFX
628,Field recording,,,,AMB
629,Gramophone record,gramophone or vinyl record disc,c on turntable,Gramophone or vinyl record disc on a turntable,SFX
630,Compact disc,Compact Disc,c,digital audio Compact Disc,SFX


Sound emitter (object): Ask again GPT to find agreement

In [34]:
def openai_find_agreement(object_x, object_y, object_z):
    """
    Use openai to classify sounds as sound effects or ambience.
    """
    client = OpenAI(api_key =os.environ.get('OPENAI_API_KEY'))
    
    response = client.chat.completions.create(
      model="gpt-3.5-turbo-0125",
      response_format={ "type": "json_object" },
      messages=[
        {"role": "system", "content": "You are a helpful assistant designed to output JSON. The JSON must contain the 'object' field. "},
        {"role": "user", "content": f"Three people described an object in three different ways. First person: {object_x}. Second person: {object_y}. Third person:{object_z}. What is the object?"}
          
      ]
    )
    return json.loads(response.choices[0].message.content)

In [35]:
objects_agreement = []
for row in df.iterrows():
    agreement = openai_find_agreement(row[-1]['object(s)_x'], row[-1]['object(s)_y'], row[-1]['object(s)'])
    objects_agreement.append(agreement['object'])

In [36]:
df['Object'] = objects_agreement
df

Unnamed: 0,name,object(s)_x,object(s)_y,object(s),MajorityType,Object
0,Human sounds,human body,Human body,individual,SFX,human body
1,Human voice,Human,Human,Human,SFX,Human
2,Speech,Human,Human,Human,SFX,Human
3,"Male speech, man speaking",mouth of the adult male human,Human,human,SFX,An adult male human
4,"Female speech, woman speaking",human,woman,woman,SFX,woman
...,...,...,...,...,...,...
627,Recording,,Recording device,recorder/player,SFX,recorder/player
628,Field recording,,,,AMB,unknown
629,Gramophone record,gramophone or vinyl record disc,c on turntable,Gramophone or vinyl record disc on a turntable,SFX,gramophone or vinyl record disc on a turntable
630,Compact disc,Compact Disc,c,digital audio Compact Disc,SFX,Compact Disc


In [37]:
df = df[['name','MajorityType','Object']]

In [38]:
df

Unnamed: 0,name,MajorityType,Object
0,Human sounds,SFX,human body
1,Human voice,SFX,Human
2,Speech,SFX,Human
3,"Male speech, man speaking",SFX,An adult male human
4,"Female speech, woman speaking",SFX,woman
...,...,...,...
627,Recording,SFX,recorder/player
628,Field recording,AMB,unknown
629,Gramophone record,SFX,gramophone or vinyl record disc on a turntable
630,Compact disc,SFX,Compact Disc


In [39]:
df.to_csv('data/augmented_labels.csv')