In [1]:
import json
import openai
import pandas as pd

In [2]:
from credentials import gpt_key

In [3]:
client = openai.OpenAI(api_key=gpt_key)

In [4]:
label_df = pd.read_csv("subtask2_all_fine-grained_narratives.csv")

In [5]:
label_df[["top", "mid", "low"]] = label_df["narrative"].str.split(": ",expand=True)

In [6]:
label_df

Unnamed: 0,narrative,top,mid,low
0,CC: Amplifying Climate Fears: Amplifying exist...,CC,Amplifying Climate Fears,Amplifying existing fears of global warming
1,CC: Amplifying Climate Fears: Doomsday scenari...,CC,Amplifying Climate Fears,Doomsday scenarios for humans
2,CC: Amplifying Climate Fears: Earth will be un...,CC,Amplifying Climate Fears,Earth will be uninhabitable soon
3,CC: Amplifying Climate Fears: Whatever we do i...,CC,Amplifying Climate Fears,Whatever we do it is already too late
4,CC: Amplifying Climate Fears: Other,CC,Amplifying Climate Fears,Other
...,...,...,...,...
91,URW: Speculating war outcomes: Russian army is...,URW,Speculating war outcomes,Russian army is collapsing
92,URW: Speculating war outcomes: Russian army wi...,URW,Speculating war outcomes,Russian army will lose all the occupied territ...
93,URW: Speculating war outcomes: Ukrainian army ...,URW,Speculating war outcomes,Ukrainian army is collapsing
94,URW: Speculating war outcomes: Other,URW,Speculating war outcomes,Other


In [7]:


def df_to_nested_json(df):
    # Create a nested dictionary structure
    result = {}
    
    # First level grouping by col1
    for col1_val in df['top'].unique():
        # Get all rows for this col1 value
        col1_group = df[df['top'] == col1_val]
        result[col1_val] = {}
        
        # Second level grouping by col2
        for col2_val in col1_group['mid'].unique():
            # Get all col3 values for this col2
            col3_values = col1_group[col1_group['mid'] == col2_val]['low'].tolist()
            result[col1_val][col2_val] = col3_values
    
    return result

json_labels = df_to_nested_json(label_df)

In [8]:
len(json_labels["URW"])

11

In [9]:
with open("json_labels.json","w") as f:
    json.dump(json_labels,f)

# Load all labels

In [5]:
with open("json_labels.json","r") as f:
    json_labels = json.load(f)

In [6]:


print(len(json_labels))
len(json_labels["URW"])

3


11

In [15]:
import tiktoken

In [21]:


def count_tokens(text: str, model: str = "gpt-4") -> int:
    # Get the correct tokenizer for GPT-4
    encoding = tiktoken.encoding_for_model(model)
    # Encode the text and count tokens
    return len(encoding.encode(text))

# Example usage
# text = "Your input text goes here."
token_count = count_tokens(text)
print(f"Token count: {token_count}")

Token count: 884


In [17]:
dev_df = pd.read_csv("dev.csv")

In [20]:
text = dev_df.iloc[1]["text"]

In [22]:
top_classes = list(json_labels.keys())
    
    # Create list of middle classes for each top class
middle_class_options = {
        top: list(json_labels[top].keys())
        for top in top_classes
    }
    

In [24]:
middle_class_options

{'CC': ['Amplifying Climate Fears',
  'Climate change is beneficial',
  'Controversy about green technologies',
  'Criticism of climate movement',
  'Criticism of climate policies',
  'Criticism of institutions and authorities',
  'Downplaying climate change',
  'Green policies are geopolitical instruments',
  'Hidden plots by secret schemes of powerful groups',
  'Questioning the measurements and science'],
 'URW': ['Amplifying war-related fears',
  'Blaming the war on others rather than the invader',
  'Discrediting the West, Diplomacy',
  'Discrediting Ukraine',
  'Distrust towards Media',
  'Hidden plots by secret schemes of powerful groups',
  'Negative Consequences for the West',
  'Overpraising the West',
  'Praise of Russia',
  'Russia is the Victim',
  'Speculating war outcomes'],
 'Other': ['null']}

In [43]:
text = annotations.iloc[0]["text"]

In [44]:
text

'Ukraine\'s Minerals: What the West is Fighting For \n\nWashington “cannot afford” to allow Russia to achieve victory in the Ukraine conflict as this would mean losing direct access to vast mineral assets. That was the view of US Senator Lindsey Graham* in an interview with ‘Face the Nation’ on CBS News in June. “They’re sitting on 10 to $12 trillion of critical minerals in Ukraine. They could be the richest country in all of Europe," Graham said. "If we help Ukraine now, they can become the best business partner we ever dreamed of, that $10 to $12 trillion of critical mineral assets could be used by Ukraine and the West,” he added.Graham is treating Ukraine as a future colony of the US with his comments on the ownership of the country\'s natural resources, said Vladimir Dzhabarov, official of the Russian Federation Council. While "promising Ukrainians mountains of gold, Graham in fact considers it as a future American colony," Dzhabarov said. *Lindsey Graham is included on the list of

In [45]:
def classify_text(text: str, json_labels: dict)
        # Create prompts
    system_prompt = f"""You are a hierarchical classifier that categorizes input into three levels: top_class, middle_class, and low_class. Follow these rules:

    1. First, select a top_class from: {top_classes}

    2. After selecting a top_class, you'll get the valid middle_class options for that top_class
    3. After selecting a middle_class, you'll get the valid low_class options for that combination

    4. Respond with a JSON object containing exactly these fields: 
    - "top_class": selected top class
    - "middle_class": corresponding middle class
    - "low_class": corresponding low class

    5. Ensure class selections are consistent with the hierarchy
    6. Provide only the JSON object - no additional text"""

    user_prompt = f"""Based on this input, provide the classification. Here's how:

    1. Choose a top_class from: {top_classes}

    2. Once you select a top_class, here are the valid middle_class options:
    {chr(10).join([f'   For {top}: {middle_class_options[top]}' for top in top_classes])}

    3. The valid low_class options for each middle_class are:
    {chr(10).join([f'   For {mid}: {json_labels[top][mid]}' 
                for top in top_classes 
                for mid in json_labels[top]])}

    Input text: {text}

    Respond with only a JSON object containing top_class, middle_class, and low_class."""



    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "system", 
                "content": dedent(system_prompt)
            },
            {
                "role": "user", 
                "content": dedent(user_prompt)
            }
        ],

    response_format={
            "type": "json_schema",
            "json_schema": {
                "name": "classifier",
                "schema": {
                    "type": "object",
                    "properties": {
                        "top_class": {"type": "string",},
                        "middle_class": {"type": "string",},
                        "low_class": {"type": "string",}
                    },
                    "required": ["top_class", "middle_class", "low_class"],
                    "additionalProperties": False
                },
                "strict": True
            }
        }
        )


In [47]:
user_prompt

'Based on this input, provide the classification. Here\'s how:\n\n1. Choose a top_class from: [\'CC\', \'URW\', \'Other\']\n\n2. Once you select a top_class, here are the valid middle_class options:\n   For CC: [\'Amplifying Climate Fears\', \'Climate change is beneficial\', \'Controversy about green technologies\', \'Criticism of climate movement\', \'Criticism of climate policies\', \'Criticism of institutions and authorities\', \'Downplaying climate change\', \'Green policies are geopolitical instruments\', \'Hidden plots by secret schemes of powerful groups\', \'Questioning the measurements and science\']\n   For URW: [\'Amplifying war-related fears\', \'Blaming the war on others rather than the invader\', \'Discrediting the West, Diplomacy\', \'Discrediting Ukraine\', \'Distrust towards Media\', \'Hidden plots by secret schemes of powerful groups\', \'Negative Consequences for the West\', \'Overpraising the West\', \'Praise of Russia\', \'Russia is the Victim\', \'Speculating war 

In [50]:
from textwrap import dedent

In [60]:
response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        {
            "role": "system", 
            "content": dedent(system_prompt)
        },
        {
            "role": "user", 
            "content": dedent(user_prompt)
        }
    ],

response_format={
        "type": "json_schema",
        "json_schema": {
            "name": "classifier",
            "schema": {
                "type": "object",
                "properties": {
                    "top_class": {"type": "string",},
                    "middle_class": {"type": "string",},
                    "low_class": {"type": "string",}
                },
                "required": ["top_class", "middle_class", "low_class"],
                "additionalProperties": False
            },
            "strict": True
        }
    }
    )

In [11]:
devset_hf = pd.read_csv("dev.csv")

In [12]:
annotations = pd.read_csv("subtask-2-annotations.txt", sep="\t", header=None, names=["file", "top", "mid"])

In [13]:

annotations

Unnamed: 0,file,top,mid
0,EN_UA_DEV_100012.txt,"URW: Discrediting the West, Diplomacy;URW: Dis...","URW: Discrediting the West, Diplomacy: The Wes..."
1,EN_CC_200053.txt,Other,Other
2,EN_CC_200040.txt,CC: Criticism of climate movement;CC: Criticis...,CC: Criticism of climate movement: Other;CC: C...
3,EN_CC_200070.txt,CC: Criticism of institutions and authorities;...,CC: Criticism of institutions and authorities:...
4,EN_UA_DEV_100034.txt,URW: Overpraising the West,URW: Overpraising the West: The West belongs i...
5,EN_CC_200049.txt,CC: Questioning the measurements and science;C...,CC: Questioning the measurements and science: ...
6,EN_UA_DEV_100003.txt,Other,Other
7,EN_UA_DEV_100033.txt,URW: Speculating war outcomes,URW: Speculating war outcomes: Russian army is...
8,EN_CC_200036.txt,CC: Criticism of climate movement;CC: Criticis...,CC: Criticism of climate movement: Ad hominem ...
9,EN_CC_200079.txt,CC: Criticism of institutions and authorities;...,CC: Criticism of institutions and authorities:...


In [14]:
annotations = annotations.merge(devset_hf[['file', 'text']], on='file', how='left')

In [16]:
annotations.to_csv("annotations.csv", index=False)

Unnamed: 0,file,text,CC: Amplifying Climate Fears,CC: Climate change is beneficial,CC: Controversy about green technologies,CC: Criticism of climate movement,CC: Criticism of climate policies,CC: Criticism of institutions and authorities,CC: Downplaying climate change,CC: Green policies are geopolitical instruments,...,"URW: Discrediting the West, Diplomacy",URW: Discrediting Ukraine,URW: Distrust towards Media,URW: Hidden plots by secret schemes of powerful groups,URW: Negative Consequences for the West,URW: Overpraising the West,URW: Praise of Russia,URW: Russia is the Victim,URW: Speculating war outcomes,Other
0,EN_CC_200030.txt,"Bangladesh, Nordic countries to strengthen coo...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,EN_CC_200033.txt,Greta Thunberg Calls For 'Overthrow of Whole C...,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,EN_CC_200034.txt,If we “just stop oil” like climate protesters ...,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,EN_CC_200035.txt,Gretchen Whitmer Orders Michigan’s State Fleet...,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,EN_CC_200036.txt,Climate cultists push bizarre scare language a...,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,EN_CC_200040.txt,Climate Protesters Out Of Control As They Atta...,0,0,0,1,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
6,EN_CC_200046.txt,CO2 is the GREENING molecule: New study shows ...,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,EN_CC_200047.txt,89% of ‘American Elites’ Back WEF’s Plan to Ra...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,EN_CC_200049.txt,Alarmists Warn of U.S. ‘Heat Dome’ Tied to Hum...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,EN_CC_200050.txt,Link to Major Banks Bend The Knee To Climate A...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [35]:
annotations

Unnamed: 0,id,top,mid,low
0,EN_UA_DEV_100012.txt,"URW: Discrediting the West, Diplomacy;URW: Dis...","URW: Discrediting the West, Diplomacy: The Wes...",
1,EN_CC_200053.txt,Other,Other,
2,EN_CC_200040.txt,CC: Criticism of climate movement;CC: Criticis...,CC: Criticism of climate movement: Other;CC: C...,
3,EN_CC_200070.txt,CC: Criticism of institutions and authorities;...,CC: Criticism of institutions and authorities:...,
4,EN_UA_DEV_100034.txt,URW: Overpraising the West,URW: Overpraising the West: The West belongs i...,
5,EN_CC_200049.txt,CC: Questioning the measurements and science;C...,CC: Questioning the measurements and science: ...,
6,EN_UA_DEV_100003.txt,Other,Other,
7,EN_UA_DEV_100033.txt,URW: Speculating war outcomes,URW: Speculating war outcomes: Russian army is...,
8,EN_CC_200036.txt,CC: Criticism of climate movement;CC: Criticis...,CC: Criticism of climate movement: Ad hominem ...,
9,EN_CC_200079.txt,CC: Criticism of institutions and authorities;...,CC: Criticism of institutions and authorities:...,


In [67]:
annotations

Unnamed: 0,file,top,mid,text
0,EN_UA_DEV_100012.txt,"URW: Discrediting the West, Diplomacy;URW: Dis...","URW: Discrediting the West, Diplomacy: The Wes...",Ukraine's Minerals: What the West is Fighting ...
1,EN_CC_200053.txt,Other,Other,UK’s Sunak Reverses Decision to Skip COP27 Cli...
2,EN_CC_200040.txt,CC: Criticism of climate movement;CC: Criticis...,CC: Criticism of climate movement: Other;CC: C...,Climate Protesters Out Of Control As They Atta...
3,EN_CC_200070.txt,CC: Criticism of institutions and authorities;...,CC: Criticism of institutions and authorities:...,Wat? L.A. Mayor Garcetti Flies to Argentina to...
4,EN_UA_DEV_100034.txt,URW: Overpraising the West,URW: Overpraising the West: The West belongs i...,Opinion: Restructuring Ukrainian debt is a ste...
5,EN_CC_200049.txt,CC: Questioning the measurements and science;C...,CC: Questioning the measurements and science: ...,Alarmists Warn of U.S. ‘Heat Dome’ Tied to Hum...
6,EN_UA_DEV_100003.txt,Other,Other,"Medvedev: Russia Seeks More in Ukraine, 'Proba..."
7,EN_UA_DEV_100033.txt,URW: Speculating war outcomes,URW: Speculating war outcomes: Russian army is...,Wild Kremlin TV hosts threaten the U.S. with n...
8,EN_CC_200036.txt,CC: Criticism of climate movement;CC: Criticis...,CC: Criticism of climate movement: Ad hominem ...,Climate cultists push bizarre scare language a...
9,EN_CC_200079.txt,CC: Criticism of institutions and authorities;...,CC: Criticism of institutions and authorities:...,Klaus Schwab Wants To Ban People Washing Their...


In [74]:
from openai import OpenAI
import json
from typing import Dict

def classify_text(text: str, dict_labels: Dict) -> Dict:
    """
    Classify a single text input using GPT-4.
    
    Args:
        text: Input text to classify
        dict_labels: Dictionary containing the label hierarchy
        api_key: OpenAI API key
    
    Returns:
        Dictionary containing top_class, middle_class, and low_class
    """
    
    # Create dynamic prompt
    top_classes = list(dict_labels.keys())
    
    # Create list of middle classes for each top class
    middle_class_options = {
        top: list(dict_labels[top].keys())
        for top in top_classes
    }
    
    # Create prompts
    system_prompt = f"""You are a hierarchical classifier that categorizes input into three levels: top_class, middle_class, and low_class. Follow these rules:

1. First, select a top_class from: {top_classes}

2. After selecting a top_class, you'll get the valid middle_class options for that top_class
3. After selecting a middle_class, you'll get the valid low_class options for that combination

4. Respond with a JSON object containing exactly these fields: 
   - "top_class": selected top class
   - "middle_class": corresponding middle class
   - "low_class": corresponding low class

5. Ensure class selections are consistent with the hierarchy
6. Provide only the JSON object - no additional text"""

    user_prompt = f"""Based on this input, provide the classification. Here's how:

1. Choose a top_class from: {top_classes}

2. Once you select a top_class, here are the valid middle_class options:
{chr(10).join([f'   For {top}: {middle_class_options[top]}' for top in top_classes])}

3. The valid low_class options for each middle_class are:
{chr(10).join([f'   For {mid}: {dict_labels[top][mid]}' 
               for top in top_classes 
               for mid in dict_labels[top]])}

Input text: {text}

Respond with only a JSON object containing top_class, middle_class, and low_class."""

    # Make API call
    # response = client.chat.completions.create(
    #     model="gpt-4",
    #     messages=[
    #         {"role": "system", "content": system_prompt},
    #         {"role": "user", "content": user_prompt}
    #     ],
    #     response_format={"type": "json_object"},
    #     temperature=0
    # )
    

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "system", 
                "content": dedent(system_prompt)
            },
            {
                "role": "user", 
                "content": dedent(user_prompt)
            }
        ],

        response_format={
                "type": "json_schema",
                "json_schema": {
                    "name": "classifier",
                    "schema": {
                        "type": "object",
                        "properties": {
                            "top_class": {"type": "string",},
                            "middle_class": {"type": "string",},
                            "low_class": {"type": "string",}
                        },
                        "required": ["top_class", "middle_class", "low_class"],
                        "additionalProperties": False
                    },
                    "strict": True
                }
            }
        )

    return json.loads(response.choices[0].message.content)
    # Example usage indent=2))

In [None]:
all_res = []
for i, row in annotations.iterrows():
    text = row["text"]
    result = classify_text(
        text=text,
        dict_labels=json_labels,
    )
    all_res.append(result)
    # break
    


In [91]:
res_df = pd.DataFrame(all_res)

In [92]:
res_df

Unnamed: 0,top_class,middle_class,low_class
0,URW,"Discrediting the West, Diplomacy","The West does not care about Ukraine, only abo..."
1,CC,Criticism of climate policies,Climate policies are ineffective
2,CC,Criticism of climate movement,Climate movement is alarmist
3,CC,Criticism of climate policies,Climate policies are only for profit
4,URW,Discrediting Ukraine,Discrediting Ukrainian government and official...
5,CC,Downplaying climate change,Human activities do not impact climate change
6,URW,Praise of Russia,Praise of Russian military might
7,URW,Praise of Russia,Praise of Russian President Vladimir Putin
8,CC,Criticism of climate movement,Climate movement is alarmist
9,CC,Criticism of climate policies,Climate policies have negative impact on the e...


In [93]:
res_df["file"] = annotations["file"]

In [100]:
res_df

Unnamed: 0,file,narrative,subnarrative
0,EN_UA_DEV_100012.txt,"URW: Discrediting the West, Diplomacy","URW: Discrediting the West, Diplomacy: The Wes..."
1,EN_CC_200053.txt,CC: Criticism of climate policies,CC: Criticism of climate policies: Climate pol...
2,EN_CC_200040.txt,CC: Criticism of climate movement,CC: Criticism of climate movement: Climate mov...
3,EN_CC_200070.txt,CC: Criticism of climate policies,CC: Criticism of climate policies: Climate pol...
4,EN_UA_DEV_100034.txt,URW: Discrediting Ukraine,URW: Discrediting Ukraine: Discrediting Ukrain...
5,EN_CC_200049.txt,CC: Downplaying climate change,CC: Downplaying climate change: Human activiti...
6,EN_UA_DEV_100003.txt,URW: Praise of Russia,URW: Praise of Russia: Praise of Russian milit...
7,EN_UA_DEV_100033.txt,URW: Praise of Russia,URW: Praise of Russia: Praise of Russian Presi...
8,EN_CC_200036.txt,CC: Criticism of climate movement,CC: Criticism of climate movement: Climate mov...
9,EN_CC_200079.txt,CC: Criticism of climate policies,CC: Criticism of climate policies: Climate pol...


In [95]:
res_df["narrative"] = res_df["top_class"] + ": " + res_df["middle_class"]
res_df["subnarrative"] = res_df["narrative"] + ": " + res_df["low_class"]

In [96]:
res_df.drop(columns=["top_class", "middle_class", "low_class"], inplace=True)

In [97]:
res_df = res_df[["file", "narrative", "subnarrative"]]

In [98]:
res_df.to_csv("improved_prompt_res.txt", index=False,header=False, sep="\t")

# Batched classification

# Multi label

In [11]:

from openai import OpenAI
import json
from typing import Dict
from textwrap import dedent
import pandas as pd

In [12]:
from credentials import gpt_key

In [13]:
client = OpenAI(api_key=gpt_key)

In [14]:
with open("json_labels.json","r") as f:
    json_labels = json.load(f)

In [15]:
annotations = pd.read_csv("annotations.csv")

In [28]:
def classify_text(text: str, dict_labels: Dict) -> Dict:
    """
    Classify text with single top class and one or more middle/low classes.
    """
    
    # Prepare label information
    top_classes = list(dict_labels.keys())
    middle_class_options = {
        top: list(dict_labels[top].keys())
        for top in top_classes
    }

    system_prompt = f"""
    You are an expert hierarchical classifier that identifies the most relevant categories.

    Classification Structure:
    1. Top Level Classes: {top_classes}
       - Select exactly ONE top class
       - Choose the most relevant category

    2. Middle Level Classes for each top class:
    {chr(10).join([f'   For {top}: {middle_class_options[top]}' for top in top_classes])}
       - Identify one or more relevant middle classes within the chosen top class
       - Include additional middle classes only if strongly relevant

    3. Low Level Classes:
    {chr(10).join([f'   For {mid}: {dict_labels[top][mid]}' 
                   for top in top_classes 
                   for mid in dict_labels[top]])}
       - For each middle class, identify one or more relevant low-level classes
       - Include only clearly applicable subcategories
    """

    user_prompt = f"""
    Analyze this text to determine:
    1. The single most appropriate top-level class
    2. One or more relevant middle classes within that top class
    3. One or more relevant low classes for each middle class

    Text to classify:
    {text}
    """

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "system",
                "content": dedent(system_prompt)
            },
            {
                "role": "user",
                "content": dedent(user_prompt)
            }
        ],
        response_format={
            "type": "json_schema",
            "json_schema": {
                "name": "classifier",
                "schema": {
                    "type": "object",
                    "properties": {
                        "top_class": {"type": "string"},
                        "classifications": {
                            "type": "array",
                            "minItems": 1,
                            "items": {
                                "type": "object",
                                "properties": {
                                    "middle_class": {"type": "string"},
                                    "low_classes": {
                                        "type": "array",
                                        "minItems": 1,
                                        "items": {"type": "string"}
                                    }
                                },
                                "required": ["middle_class", "low_classes"],
                                "additionalProperties": False
                            }
                        }
                    },
                    "required": ["top_class", "classifications"],
                    "additionalProperties": False
                },
                "strict": True
            }
        }
    )
    
    return json.loads(response.choices[0].message.content)

# Example classification
result = classify_text(
    text="Example text that might have multiple relevant categories",
    dict_labels=json_labels,
)

print(json.dumps(result, indent=2))

BadRequestError: Error code: 400 - {'error': {'message': "Invalid schema for response_format 'classifier': In context=('properties', 'classifications'), 'minItems' is not permitted.", 'type': 'invalid_request_error', 'param': 'response_format', 'code': None}}

In [29]:
hierarchy_description = []
top_classes = list(json_labels.keys())
for top in top_classes:
    middles = json_labels[top]
    hierarchy_description.append(f"Top Class: {top}")
    hierarchy_description.append("  Middle Classes:")
    for mid, lows in middles.items():
        hierarchy_description.append(f"  - {mid}:")
        hierarchy_description.append(f"    Low Classes: {', '.join(lows)}")



In [31]:
hierarchy_description

['Top Class: CC',
 '  Middle Classes:',
 '  - Amplifying Climate Fears:',
 '    Low Classes: Amplifying existing fears of global warming, Doomsday scenarios for humans, Earth will be uninhabitable soon, Whatever we do it is already too late, Other',
 '  - Climate change is beneficial:',
 '    Low Classes: CO2 is beneficial, Temperature increase is beneficial, Other',
 '  - Controversy about green technologies:',
 '    Low Classes: Nuclear energy is not climate friendly, Renewable energy is costly, Renewable energy is dangerous, Renewable energy is unreliable, Other',
 '  - Criticism of climate movement:',
 '    Low Classes: Ad hominem attacks on key activists, Climate movement is alarmist, Climate movement is corrupt, Other',
 '  - Criticism of climate policies:',
 '    Low Classes: Climate policies are ineffective, Climate policies are only for profit, Climate policies have negative impact on the economy, Other',
 '  - Criticism of institutions and authorities:',
 '    Low Classes: Cr

In [19]:
from openai import OpenAI
import json
from typing import Dict, Union, List
from textwrap import dedent

def validate_classification(result: Dict, dict_labels: Dict) -> None:
    """Validate classification against the hierarchy structure"""
    top_class = result['top_class']
    if top_class not in dict_labels:
        raise ValueError(f"Invalid top_class: {top_class}. Must be one of {list(dict_labels.keys())}")
    
    middle_classes = dict_labels[top_class]
    for classification in result['classifications']:
        middle_class = classification['middle_class']
        if middle_class not in middle_classes:
            raise ValueError(f"Invalid middle_class: {middle_class} for top_class {top_class}. Must be one of {list(middle_classes.keys())}")
        
        valid_low_classes = middle_classes[middle_class]
        for low in classification['low_classes']:
            if low not in valid_low_classes:
                raise ValueError(f"Invalid low_class: {low} for middle_class {middle_class}. Must be one of {valid_low_classes}")

def classify_text(text: Union[str, List[str]], dict_labels: Dict) -> Union[Dict, List[Dict]]:
    """
    Classify text(s) with hierarchical structure validation and multi-label support
    """
    # client = OpenAI(api_key=api_key)
    top_classes = list(dict_labels.keys())
    
    # Build hierarchical description
    hierarchy_description = []
    for top in top_classes:
        middles = dict_labels[top]
        hierarchy_description.append(f"Top Class: {top}")
        hierarchy_description.append("  Middle Classes:")
        for mid, lows in middles.items():
            hierarchy_description.append(f"  - {mid}:")
            hierarchy_description.append(f"    Low Classes: {', '.join(lows)}")

    system_prompt = f"""
    You are a hierarchical classification expert. Follow these rules strictly:

    1. Top Level (Select ONE):
    {', '.join(top_classes)}

    2. Middle Level (Select ALL relevant within chosen top class)

    3. Low Level (Select ALL applicable for each middle class)

    Full Hierarchy:
    {chr(10).join(hierarchy_description)}

    Requirements:
    - Top class: Most specific single category
    - Middle classes: All applicable subcategories
    - Low classes: All relevant sub-subcategories
    - Maintain strict parent-child relationships
    - Use exact category names from the hierarchy
    - Include multiple labels where appropriate
    """

    if isinstance(text, list):
        return [classify_text(t, dict_labels, api_key) for t in text]

    user_prompt = f"""
    Analyze the text and provide hierarchical classification:

    Text: {text}

    Return JSON format with:
    - Exactly one top_class
    - All relevant middle_classes (minimum 1)
    - All applicable low_classes for each middle (minimum 1 per middle)
    """

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": dedent(system_prompt)},
            {"role": "user", "content": dedent(user_prompt)}
        ],
        response_format={
            "type": "json_object",
            "schema": {
                "type": "object",
                "properties": {
                    "top_class": {"type": "string", "enum": top_classes},
                    "classifications": {
                        "type": "array",
                        "minItems": 1,
                        "items": {
                            "type": "object",
                            "properties": {
                                "middle_class": {"type": "string"},
                                "low_classes": {
                                    "type": "array",
                                    "minItems": 1,
                                    "items": {"type": "string"}
                                }
                            },
                            "required": ["middle_class", "low_classes"],
                            "additionalProperties": False
                        }
                    }
                },
                "required": ["top_class", "classifications"],
                "additionalProperties": False
            }
        }
    )
    
    result = json.loads(response.choices[0].message.content)
    validate_classification(result, dict_labels)
    return result

In [21]:
text = annotations.iloc[0]["text"]
result = classify_text(
        # text="Recent advances in quantum computing and AI security",
        text=text,
        dict_labels=json_labels,
      
    )
print(json.dumps(result, indent=2))

BadRequestError: Error code: 400 - {'error': {'message': "Unknown parameter: 'response_format.schema'.", 'type': 'invalid_request_error', 'param': 'response_format.schema', 'code': 'unknown_parameter'}}