In [5]:
# Prompt sent to the model for every record. It is filled in with
# `PROMPT_TEMPLATE.format(text=...)`, so `{text}` is the only placeholder and
# the doubled braces around the JSON example are escaped literal braces.
# The model is asked for a single integer under the key "high".
# NOTE(review): the prompt never states which integers are valid; presumably
# the group number (1-6) is intended -- confirm against the gold labels.
PROMPT_TEMPLATE = """
You are an expert in propaganda analysis. Analyze the following text. 
     Always keep in mind that you are only looking for Russian propaganda, not any other type of propaganda. Any other type of propaganda should be ignored and labeled as 
     "No Propaganda"
    Text:
    "{text}"
      1. Identify the high  label that best describes the text.
    
      High level labels:
    
      Group 1: Patriotic & Catchy Appeals  
         (Appeal through patriotism, group identity, or memorable slogans.)  
          - Flag-waving  - appeal to patriotism or group identity.  
          - Slogans  - short, catchy phrases to promote an idea.  
    
      Group 2: Popularity Appeals  
         (Persuade by presenting an idea as popular or widely accepted.)  
          - Bandwagon  - argue something is good/true because it's popular.  
    
      Group 3: Deflections & Distractions  
         (Shift attention away from the issue through diversion or oversimplification.)  
          - Whataboutism  - deflect criticism by pointing to others' behavior.        
          - Repetition  - repeat a claim to make it stick.  
          - Causal oversimplification  - claim a simple cause for a complex issue.  
          - Red herring  - distract with irrelevant points.  
    
      Group 4: Emotional & Loaded Persuasion  
         (Exploit emotions or exaggeration to influence perception.)  
          - Loaded language  - emotionally charged wording.  
          - Reductio ad hitlerum  - dismiss by linking to Nazis/Hitler.  
          - Appeal to authority  - rely on authority instead of evidence.  
          - Appeal to fear/prejudice  - warn of frightening outcomes or biases.  
          - Name-calling/labeling  - attack with insulting labels.  
          - Exaggeration or minimization  - overstate or downplay effects.  
    
      Group 5: Argument Manipulations  
         (Manipulate reasoning by misrepresenting arguments or limiting choices.)  
          - Straw man  - misrepresent opponent's argument.  
          - Thought-terminating clichés  - shut down debate with stock phrases.  
          - Doubt  - question credibility or motives.        
          - Black-and-white fallacy  - present only two options.  
    
      Group 6: No Propaganda  
         (Text contains no relevant propaganda techniques.)  
          - no propaganda  - no relevant propaganda detected
    
      Now return the labels in a JSON format with the following structure: {{"high": integer}}
    
      Follow the json format strictly. Do not add any additional text or explanations. Just use the integers for labeling, no strings.
     Remember to always return Integers only! Only return one integer that best describes the text. Never return multiple integers or a list of integers!
"""

In [6]:
from openai import OpenAI
import json
import re
import pandas as pd

# Module-level OpenAI client shared by every call to analyze_propaganda.
# NOTE(review): no credentials are passed here -- presumably the client reads
# the API key from the environment; confirm before running on a new machine.
client = OpenAI()

def _parse_high_label(model_output_text, valid_labels):
    """Extract and validate the 'high' label from the raw model reply.

    Args:
        model_output_text: The text content returned by the model.
        valid_labels: Container of accepted integer labels, or None to accept
            any integer.

    Returns:
        The integer stored under the "high" key.

    Raises:
        ValueError: If no JSON object is present, the JSON is malformed
            (json.JSONDecodeError is a ValueError subclass), the label is not
            an integer, or the label is not one of `valid_labels`.
    """
    # Greedy match from the first '{' to the last '}' so that any stray text
    # or code fences around the JSON object are ignored.
    json_match = re.search(r'\{.*\}', model_output_text, re.DOTALL)
    if not json_match:
        raise ValueError("No JSON object found in model output.")

    parsed_json = json.loads(json_match.group(0))
    high_label = parsed_json.get("high")

    # bool is a subclass of int, so JSON true/false must be rejected explicitly.
    if isinstance(high_label, bool) or not isinstance(high_label, int):
        raise ValueError(f"Label must be integer, got: high={high_label}")

    if valid_labels is not None and high_label not in valid_labels:
        raise ValueError(f"Label out of range, got: high={high_label}")

    return high_label


def analyze_propaganda(text_to_analyze, max_retries=3, valid_labels=range(1, 7)):
    """Call OpenAI to get the 'high' integer label for a text.

    Args:
        text_to_analyze: The text to classify. None, empty and whitespace-only
            values are rejected without calling the API.
        max_retries: Maximum number of API attempts before giving up.
        valid_labels: Container of accepted labels. Defaults to 1-6, the group
            numbers listed in the prompt. Pass None to accept any integer.

    Returns:
        A `(label, error)` tuple: `(int, None)` on success, or
        `(None, message)` when the input is unusable or every attempt failed.
        This function reports failures through the tuple instead of raising.
    """
    if text_to_analyze is None:
        return None, "Empty input"
    if not isinstance(text_to_analyze, str):
        return None, f"Invalid input type: {type(text_to_analyze).__name__}"
    if not text_to_analyze.strip():
        return None, "Empty input"

    try:
        full_prompt = PROMPT_TEMPLATE.format(text=text_to_analyze)
    except Exception as e:
        return None, f"Prompt error: {e}"

    last_model_output = None

    for attempt in range(1, max_retries + 1):
        try:
            response = client.chat.completions.create(
                model="gpt-4.1-nano",
                messages=[{"role": "user", "content": full_prompt}],
                temperature=0,
                response_format={
                    "type": "json_schema",
                    "json_schema": {
                        "name": "propaganda_schema",
                        "schema": {
                            "type": "object",
                            "properties": {
                                "high": {"type": "integer"}
                            },
                            "required": ["high"]
                        }
                    }
                },
                max_tokens=64,
            )

            content = response.choices[0].message.content
            if content is None:
                # Surface a readable message instead of an AttributeError
                # from calling .strip() on None.
                raise ValueError("Model returned no content.")

            last_model_output = content.strip()
            return _parse_high_label(last_model_output, valid_labels), None

        except (json.JSONDecodeError, ValueError, KeyError) as e:
            # The reply was unusable; retry until the budget is spent.
            if attempt == max_retries:
                return None, f"Parsing failed after {max_retries} attempts: {e}\nRaw output: {last_model_output}"
        except Exception as e:
            # Anything else (e.g. errors raised by the API client).
            if attempt == max_retries:
                return None, f"Unexpected error after {max_retries} attempts: {e}\nRaw output: {last_model_output}"

    # Only reached when max_retries < 1 and the loop never ran.
    return None, "Unknown error"


def _is_missing_text(value):
    """Return True when `value` is None or a pandas/NumPy missing scalar."""
    if value is None:
        return True
    try:
        return bool(pd.isna(value))
    except (TypeError, ValueError):
        # pd.isna returns an array for list-like input, which has no single
        # truth value; such a value is not a missing scalar.
        return False


def process_pickle(input_pickle_path, output_json_path):
    """Label every row of a pickled DataFrame and write the results as JSON.

    The DataFrame must contain 'id' and 'text' columns. Each row becomes a
    record with keys 'id', 'text', 'high' and 'error'. Rows whose text is
    missing or blank are recorded with error "Empty input" without calling
    the model.

    Args:
        input_pickle_path: Path of the pickle file to read.
        output_json_path: Path of the JSON file to write.

    Returns:
        None. Progress and errors are reported with print(); load, schema and
        save failures are printed rather than raised.
    """
    try:
        # NOTE(security): unpickling can execute arbitrary code. Only load
        # pickle files from a trusted source.
        df = pd.read_pickle(input_pickle_path)
    except Exception as e:
        print(f"❌ Error loading pickle: {e}")
        return

    if not isinstance(df, pd.DataFrame):
        print(f"❌ Error loading pickle: expected a DataFrame, got {type(df).__name__}")
        return

    print(f"✅ Loaded pickle '{input_pickle_path}' with {len(df)} rows.")

    if not {"text", "id"}.issubset(df.columns):
        print("❌ DataFrame must contain both 'text' and 'id' columns.")
        return

    processed_data = []

    # Count rows with enumerate: index labels are not guaranteed to be
    # consecutive integers (or integers at all).
    for position, (_, row) in enumerate(df.iterrows(), start=1):
        record_id = row["id"]
        raw_text = row["text"]
        # str(None) / str(nan) would otherwise send the literal words
        # "None" / "nan" to the model as if they were real text.
        text = "" if _is_missing_text(raw_text) else str(raw_text).strip()

        print(f"\n🔍 Processing record {position} (ID: {record_id})...")
        if text:
            high_label, error_message = analyze_propaganda(text)
        else:
            high_label, error_message = None, "Empty input"

        processed_data.append({
            "id": record_id,
            "text": text,
            "high": high_label,
            "error": error_message
        })

    try:
        with open(output_json_path, 'w', encoding='utf-8') as f:
            json.dump(processed_data, f, ensure_ascii=False, indent=4)
        print(f"\n✅ Saved labeled results to '{output_json_path}'")
    except Exception as e:
        print(f"❌ Error saving JSON: {e}")

In [8]:
if __name__ == "__main__":
    # Label the golden test set and write the results next to it.
    # Both locations are absolute paths on the author's machine.
    process_pickle(
        input_pickle_path=r"C:\Users\david\PY\golden_test_set_martino_extended_integer.pkl",
        output_json_path=r"C:\Users\david\PY\labeled_data_nano41_baseline_martino_highonly.json",
    )
    

✅ Loaded pickle 'C:\Users\david\PY\golden_test_set_martino_extended_integer.pkl' with 200 rows.

🔍 Processing record 1 (ID: 1560717328412823552)...

🔍 Processing record 2 (ID: 1498038424409817089)...

🔍 Processing record 3 (ID: 1419340068314128385)...

🔍 Processing record 4 (ID: 1512171601340497926)...

🔍 Processing record 5 (ID: 1481806570723885061)...

🔍 Processing record 6 (ID: 1499383587732221952)...

🔍 Processing record 7 (ID: 1562035778389155840)...

🔍 Processing record 8 (ID: 1498936437298843654)...

🔍 Processing record 9 (ID: 1533124506767593474)...

🔍 Processing record 10 (ID: 1567081358526234625)...

🔍 Processing record 11 (ID: 1501934846913908745)...

🔍 Processing record 12 (ID: 1508099913011191808)...

🔍 Processing record 13 (ID: 1394817473845895170)...

🔍 Processing record 14 (ID: 1560395750550421505)...

🔍 Processing record 15 (ID: 1497576574186577928)...

🔍 Processing record 16 (ID: 1510632099937722369)...

🔍 Processing record 17 (ID: 1513156392739352577)...

🔍 Processin