## Text analytics for DEI stance & purchase intentions

### 1. Import 'combined_sentiment_annotations.csv'

In [1]:
# --- Imports ---
import pandas as pd
import numpy as np
import re
from sklearn.metrics import cohen_kappa_score
import os

In [2]:
# Define file path relative to the notebook or project root
csv_file_path = '../../data/annotate/complete/combined_sentiment_annotations.csv'

# Load the CSV file
df = pd.read_csv(csv_file_path)
print(f"Successfully loaded {csv_file_path}")
print(f"DataFrame shape: {df.shape}")
print("First 5 rows:")
df.head(6)

Successfully loaded ../../data/annotate/complete/combined_sentiment_annotations.csv
DataFrame shape: (1000, 25)
First 5 rows:


Unnamed: 0,company_name,post_date,id,parent_id,comment_text,comment_date,comment_type,reaction_count,before_DEI,has_DEI,...,full_text,relevance,stance_dei_reviewer_01,purchase_intention_reviewer_01,stance_dei_reviewer_02,purchase_intention_reviewer_02,stance_dei_average,purchase_intention_average,stance_dei_label,purchase_intention_label
0,Target,2/2/25 11:06,Y29tbWVudDoxMTcxMzM2NDkxMDE4NDA3XzExMzM1OTk4OT...,,The audacity,2/7/25,initial,0,0,0,...,the audacity,0,0 (Neutral/Unclear towards DEI),0 (Neutral/Unclear/No PI),0 (Neutral/Unclear towards DEI),0 (Neutral/Unclear/No PI),0.0,0.0,0,0
1,Costco,1/13/25 11:00,Y29tbWVudDoxMDIwNjM1NjYzNDI3OTMwXzEyNjM0Njg2MD...,,"Yummmmm.,",1/17/25,initial,0,1,1,...,"yummmmm.,",0,0 (Neutral/Unclear towards DEI),0 (Neutral/Unclear/No PI),0 (Neutral/Unclear towards DEI),0 (Neutral/Unclear/No PI),0.0,0.0,0,0
2,Costco,2/9/25 9:00,Y29tbWVudDoxMDM5NjIyNDU0ODYyNTg0XzM2OTUyNzExMj...,,How much,2/16/25,initial,0,0,1,...,how much,0,0 (Neutral/Unclear towards DEI),0 (Neutral/Unclear/No PI),0 (Neutral/Unclear towards DEI),0 (Neutral/Unclear/No PI),0.0,0.0,0,0
3,Costco,2/16/25 9:00,Y29tbWVudDoxMDQ1NjAzNjA0MjY0NDY5XzkwODU0Mjg4ND...,,That looks so good!!! It also looks like the l...,2/23/25,initial,1,0,1,...,that looks so good!!! it also looks like the l...,0,0 (Neutral/Unclear towards DEI),0 (Neutral/Unclear/No PI),0 (Neutral/Unclear towards DEI),0 (Neutral/Unclear/No PI),0.0,0.0,0,0
4,Delta,2/13/25 9:30,Y29tbWVudDoxMDIxMzAxOTAwMDE4NzIyXzIwMDQ2MzM4Mj...,,I have a friend who wanted to be a pilot for D...,2/15/25,initial,0,0,1,...,i have a friend who wanted to be a pilot for d...,0,0 (Neutral/Unclear towards DEI),0 (Neutral/Unclear/No PI),0 (Neutral/Unclear towards DEI),0 (Neutral/Unclear/No PI),0.0,0.0,0,0
5,Costco,2/11/25 9:00,Y29tbWVudDoxMDQxNzYyOTI3OTgxODcwXzExNTU4NjE1OT...,Y29tbWVudDoxMDQxNzYyOTI3OTgxODcwXzgyODIwNTI2Mj...,another person who doesn’t understand DEI,2/16/25,reply,19,0,1,...,how much money are your executives getting to ...,1,0 (Neutral/Unclear towards DEI),0 (Neutral/Unclear/No PI),0 (Neutral/Unclear towards DEI),0 (Neutral/Unclear/No PI),0.0,0.0,0,0


In [3]:
# Function to extract numeric score from label string (handles potential errors)
def extract_score(label):
    if pd.isna(label) or not isinstance(label, str):
        return np.nan
    # Match integers (positive, negative, or zero) at the beginning of the string
    match = re.match(r"^(-?\d+)", label)
    if match:
        return int(match.group(1))
    return np.nan # Return NaN if no number is found at the beginning

reviewer_cols = [
    'stance_dei_reviewer_01', 'purchase_intention_reviewer_01',
    'stance_dei_reviewer_02', 'purchase_intention_reviewer_02'
    ]

# Check if columns exist before processing
missing_cols = [col for col in reviewer_cols if col not in df.columns]

df['stance_dei_score_r1'] = df['stance_dei_reviewer_01'].apply(extract_score)
df['pi_score_r1'] = df['purchase_intention_reviewer_01'].apply(extract_score)
df['stance_dei_score_r2'] = df['stance_dei_reviewer_02'].apply(extract_score)
df['pi_score_r2'] = df['purchase_intention_reviewer_02'].apply(extract_score)

In [4]:
final_label_cols = ['stance_dei_label', 'purchase_intention_label']

print("Counts for final 'stance_dei_label':")
dei_counts = df['stance_dei_label'].value_counts(dropna=False) # Include NaNs if any
print(dei_counts)

print("\n")

print("Counts for final 'purchase_intention_label':")
pi_counts = df['purchase_intention_label'].value_counts(dropna=False) # Include NaNs if any
print(pi_counts)

Counts for final 'stance_dei_label':
stance_dei_label
 0    800
 1    123
-1     77
Name: count, dtype: int64


Counts for final 'purchase_intention_label':
purchase_intention_label
 0    838
-1     92
 1     70
Name: count, dtype: int64


### 2. Calculate Cohen's Kappa for dual-coded annotations

In [5]:
df_kappa_dei = df[['stance_dei_score_r1', 'stance_dei_score_r2']].dropna()
print(f"Calculating Kappa for DEI Stance using {len(df_kappa_dei)} complete pairs of ratings.")

kappa_dei = cohen_kappa_score(df_kappa_dei['stance_dei_score_r1'], df_kappa_dei['stance_dei_score_r2'])
print(f"Cohen's Kappa for DEI Stance: {kappa_dei:.4f}")

print("\n")

df_kappa_pi = df[['pi_score_r1', 'pi_score_r2']].dropna()
print(f"Calculating Kappa for Purchase Intention using {len(df_kappa_pi)} complete pairs of ratings.")

kappa_pi = cohen_kappa_score(df_kappa_pi['pi_score_r1'], df_kappa_pi['pi_score_r2'])
print(f"Cohen's Kappa for Purchase Intention: {kappa_pi:.4f}")

Calculating Kappa for DEI Stance using 1000 complete pairs of ratings.
Cohen's Kappa for DEI Stance: 0.8572


Calculating Kappa for Purchase Intention using 1000 complete pairs of ratings.
Cohen's Kappa for Purchase Intention: 0.8067


### 3. Prepare Labels for Imbalance Handling

In [6]:
# Map labels from [-1, 0, 1] to [0, 1, 2] for compatibility with loss functions
y_stance_original = df['stance_dei_label'].astype(int).values
y_pi_original = df['purchase_intention_label'].astype(int).values

# Stance: -1 (Anti) -> 0, 0 (Neutral) -> 1, 1 (Pro) -> 2
y_stance_idx = y_stance_original + 1

# PI: -1 (Boycott) -> 0, 0 (Neutral) -> 1, 1 (Buy) -> 2
y_pi_idx = y_pi_original + 1

print(f"Example DEI stance indices: {y_stance_idx[:10]}")
print(f"Example PI indices: {y_pi_idx[:10]}")

# Define class names for reporting
stance_class_names = ["anti", "neutral", "pro"]
pi_class_names = ["boycott", "neutral", "buy"]

print("\n")

# Ensure classes are correctly identified (0, 1, 2)
stance_classes = np.unique(y_stance_idx) # Should be [0, 1, 2]
print(f"Unique stance classes found for weighting: {stance_classes}")
pi_classes = np.unique(y_pi_idx) # Should be [0, 1, 2]      
print(f"Unique PI classes found for weighting: {pi_classes}")

Example DEI stance indices: [1 1 1 1 1 1 1 1 1 2]
Example PI indices: [1 1 1 1 1 1 1 1 0 0]


Unique stance classes found for weighting: [0 1 2]
Unique PI classes found for weighting: [0 1 2]


In [7]:
# Let's define our PyTorch Device (I am using MPS for Apple Silicon on Macbook M3 Pro)
import torch

if torch.backends.mps.is_available():
    # Check if MPS is available
    device = torch.device("mps")
    print("MPS backend is available. Using MPS device.")
elif not torch.backends.mps.is_built():
    # Check if MPS is built (required for is_available to be True)
    # This case is unlikely if is_available() is False, but good to be explicit
    device = torch.device("cpu")
    print("MPS not available because the current PyTorch install was not built with MPS enabled.")
else:
    # MPS is built but not available (e.g., OS version issue, though unlikely on modern macOS)
    device = torch.device("cpu")
    print("MPS not available. Using CPU device.")

print(f"Selected device: {device}")

MPS backend is available. Using MPS device.
Selected device: mps


In [8]:
# Now we are going to do some weighted random sampling 
from torch.utils.data import DataLoader, WeightedRandomSampler

# Compute sample weights for Stance DEI (one weight per example)
stance_label_counts = {label: count for label, count in zip(*np.unique(y_stance_idx, return_counts=True))}
print(f"Stance Label Counts: {stance_label_counts}")

# Avoid division by zero if a class somehow has 0 counts (shouldn't happen with unique)
stance_sample_weights = [1.0 / stance_label_counts[label] if stance_label_counts[label] > 0 else 0 for label in y_stance_idx]

# Create the sampler
stance_sampler = WeightedRandomSampler(stance_sample_weights, num_samples=len(stance_sample_weights), replacement=True)
print("Created WeightedRandomSampler for Stance DEI.")

print("\n")

# Compute sample weights for Purchase Intention
pi_label_counts = {label: count for label, count in zip(*np.unique(y_pi_idx, return_counts=True))}
print(f"PI Label Counts: {pi_label_counts}")

pi_sample_weights = [1.0 / pi_label_counts[label] if pi_label_counts[label] > 0 else 0 for label in y_pi_idx]

# Create the sampler
pi_sampler = WeightedRandomSampler(pi_sample_weights, num_samples=len(pi_sample_weights), replacement=True)
print("Created WeightedRandomSampler for Purchase Intention.")

Stance Label Counts: {np.int64(0): np.int64(77), np.int64(1): np.int64(800), np.int64(2): np.int64(123)}
Created WeightedRandomSampler for Stance DEI.


PI Label Counts: {np.int64(0): np.int64(92), np.int64(1): np.int64(838), np.int64(2): np.int64(70)}
Created WeightedRandomSampler for Purchase Intention.


### 4. Format full_text input for DeBERTa-LoRa

In [9]:
from torch.utils.data import Dataset
from transformers import AutoTokenizer
import numpy as np

# Let's define a lambda function to handle the splitting and NaN cases inline
split_lambda = lambda text: [] if pd.isna(text) else text.split(' → ')

# Apply the lambda function to create a new column with the list of segments
df['text_segments'] = df['full_text'].apply(split_lambda)

# Show examples
print(df[df['full_text'].str.contains(" → ", na=False)][['full_text', 'text_segments']].head().to_markdown(index=False))

| full_text                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         | text_segments                                                                                                                                                                                                                                                     

In [10]:
# Add <CONTEXT></CONTEXT> and <REPLY></REPLY>
def join_segments(segs):
    """
    Re‑assemble the list returned by `split_lambda` into the exact text
    string that will be fed to the tokenizer later on.
    """
    if not segs:                         # blank row guard
        return ""
    if len(segs) == 1:                   # only a reply (no parents)
        return f"<REPLY> {segs[0]} </REPLY>"
    context = " </CONTEXT> <CONTEXT> ".join(segs[:-1])
    reply   = segs[-1]
    return f"<CONTEXT> {context} </CONTEXT> <REPLY> {reply} </REPLY>"

df["joined_text"] = df["text_segments"].apply(join_segments)

sep_token → [SEP]
Decoded preview: <REPLY> the audacity </REPLY>


In [11]:
# Transform joined_text to a dataframe
joined_text = df["joined_text"]
joined_text.head(20)

0                         <REPLY> the audacity </REPLY>
1                            <REPLY> yummmmm., </REPLY>
2                             <REPLY> how much </REPLY>
3     <REPLY> that looks so good!!! it also looks li...
4     <REPLY> i have a friend who wanted to be a pil...
5     <CONTEXT> how much money are your executives g...
6     <CONTEXT> we canceled our costco membership af...
7     <REPLY> passengers must have seat belts on at ...
8     <CONTEXT> lol so many butthurt people keep com...
9     <REPLY> drop dei and want to peddle black hist...
10    <CONTEXT> hey costco! could you bring more pro...
11    <REPLY> much of bastians compensation was a on...
12    <REPLY> end dei! discrimination, exclusion, an...
13    <CONTEXT> amazing there were no fatalities. ku...
14                   <REPLY> jeffrey hickerson </REPLY>
15    <REPLY> costco japan || japanese holiday vlog|...
16    <REPLY> the ceo of delta airlines greatly disa...
17    <CONTEXT> "i have a dream that someday all

### 5. LLM Classification

In [12]:
# ════════════════════════════════════════════════════════════════════
# GPT‑4o evaluation on the full dev‑set (e.g., 1 K items) in `df`
# df must contain columns:
#    joined_text  |  stance_dei_label  |  purchase_intention_label
# ════════════════════════════════════════════════════════════════════
import os, json, time, numpy as np, pandas as pd
from tqdm.auto import tqdm
from sklearn.metrics import (f1_score, accuracy_score,
                             classification_report, confusion_matrix)
import openai, warnings
from dotenv import load_dotenv

# Load environment variables from .env file
# Ensures OPENAI_API_KEY is loaded if present in .env
load_dotenv()

# Retrieve the API key
openai.api_key = os.getenv("OPENAI_API_KEY")
if not openai.api_key:
    print("CRITICAL Error: OPENAI_API_KEY not found after attempting to load from .env.")
    print("Please ensure a .env file with OPENAI_API_KEY is in the search path (e.g., models/sentiment_deberta_model/) and python-dotenv is installed.")
    # Consider raising an Exception here if the key is absolutely required to continue
    # For example: raise ValueError("OpenAI API key not found. Cannot proceed.")
else:
    print("OpenAI API key loaded successfully.")

warnings.filterwarnings("ignore")

# ── label ↔︎ string maps ────────────────────────────────────────────
# Maps integer labels (-1, 0, 1) to string representations used by the LLM
stance_map = {-1:"anti", 0:"neutral", 1:"pro"}
pi_map     = {-1:"boycott", 0:"neutral", 1:"buy"}
# Inverse maps to convert LLM string predictions back to integers
inv_stance = {v:k for k,v in stance_map.items()}
inv_pi     = {v:k for k,v in pi_map.items()}

# ── prompt templates ───────────────────────────────────────────────
# System message defining the task, expected input/output format, and labels for the LLM
SYSTEM_MSG = (
    "You are a research assistant that classifies social-media comments. "
    "Input is formatted as `<REPLY>comment_to_classify</REPLY>` or "
    "`<CONTEXT>parent_comment(s)_text</CONTEXT><REPLY>comment_to_classify</REPLY>`. "
    "Your task is to classify the text within the `<REPLY>` tags. "
    "Use the `<CONTEXT>` text, if provided, for situational awareness to better understand the reply's meaning.\n"
    "Classify the reply on two independent axes using the exact string labels provided below:\n"
    "• Stance toward DEI → \"anti\" / \"neutral\" / \"pro\"\n"
    "• Purchase intention toward the brand → \"boycott\" / \"neutral\" / \"buy\"\n"
    "Return ONLY a single, valid JSON object with keys \"stance\" and \"pi\". The values for these keys MUST be one of the exact string labels provided (e.g., \"pro\", \"neutral\", \"buy\").\n"
    "Ensure the output is a valid JSON string, including double quotes around keys and string values.\n"
    "If the comment is not EXPLICITLY demonstrating stance on DEI or purchase (buying/boycott) intention, classify it as neutral.\n\n"
    "Here are some examples of how to respond:\n\n"
    "Example 1:\n"
    "Input Comment:\n"
    "«<REPLY>Love Costco.</REPLY>»\n\n"
    "Your answer (expected JSON output):\n"
    "{\"stance\": \"neutral\", \"pi\": \"neutral\"}\n\n"
    "Example 2:\n"
    "Input Comment:\n"
    "«<REPLY>Go woke go broke.</REPLY>»\n\n"
    "Your answer (expected JSON output):\n"
    "{\"stance\": \"anti\", \"pi\": \"boycott\"}\n\n"
    "Example 3:\n"
    "Input Comment:\n"
    "«<CONTEXT>We're proud of our diverse workforce!</CONTEXT><REPLY>Thank you for standing up for DEI and what is right, I'll be renewing my membership.</REPLY>»\n\n"
    "Your answer (expected JSON output):\n"
    "{\"stance\": \"pro\", \"pi\": \"buy\"}\n\n"
    "Example 4:\n"
    "Input Comment:\n"
    "«<CONTEXT>Our new line is great for everyone.</CONTEXT><REPLY>I support DEI, but I'm not sure if I'll be renewing my membership.</REPLY>»\n\n"
    "Your answer (expected JSON output):\n"
    "{\"stance\": \"pro\", \"pi\": \"neutral\"}"
    "Example 4:\n"
    "Input Comment:\n"
    "«<REPLY>you dropped rid of dei? nope. done shopping here.</REPLY>»\n\n"
    "Your answer (expected JSON output):\n"
    "{\"stance\": \"pro\", \"pi\": \"boycott\"}\n\n"
    "Example 5:\n"
    "Input Comment:\n"
    "«<CONTEXT> supporting dei is means you want racist hiring </CONTEXT> <REPLY> you need to be educated </REPLY>»\n\n"
    "Your answer (expected JSON output):\n"
    "{\"stance\": \"neutral\", \"pi\": \"neutral\"}\n\n" 
    "Example 6:\n"
    "Input Comment:\n"
    "«<REPLY>Let's boycott this woke pro dei company</REPLY>»\n\n"
    "Your answer (expected JSON output):\n"
    "{\"stance\": \"anti\", \"pi\": \"boycott\"}\n\n"
    "Example 7:\n"
    "Input Comment:\n"
    "«<REPLY>Roll back DEI and then ask us to shop here....nope!</REPLY>»\n\n"
    "Your answer (expected JSON output):\n"
    "{\"stance\": \"pro\", \"pi\": \"boycott\"}\n\n"
    "Example 8:\n"
    "Input Comment:\n"
    "«<REPLY>I can't believe they chose diversity over qualifications! When will they get rid of there terrible dei practices.</REPLY>»\n\n"
    "Your answer (expected JSON output):\n"
    "{\"stance\": \"anti\", \"pi\": \"neutral\"}\n\n"
    "Example 9:\n"
    "Input Comment:\n"
    "«<CONTEXT>I will no longer shop here because of your policies</CONTEXT> <REPLY>Bye!</REPLY>»\n\n"
    "Your answer (expected JSON output):\n"
    "{\"stance\": \"neutral\", \"pi\": \"neutral\"}\n\n" 
    "Example 10:\n"
    "Input Comment:\n"
    "«<REPLY>I can't believe you would do this... DEI has to go!</REPLY>»\n\n"
    "Your answer (expected JSON output):\n"
    "{\"stance\": \"anti\", \"pi\": \"neutral\"}\n\n"
    "Example 11:\n"
    "Input Comment:\n"
    "«<REPLY>I will not be renewing my membership. One less place to go!</REPLY>»\n\n"
    "Your answer (expected JSON output):\n"
    "{\"stance\": \"neutral\", \"pi\": \"boycott\"}\n\n"
    "Example 12:\n"
    "Input Comment:\n"
    "«<REPLY>Your commitment to dei hiring has inspired me to become your customer.</REPLY>»\n\n"
    "Your answer (expected JSON output):\n"
    "{\"stance\": \"pro\", \"pi\": \"buy\"}\n\n"
)
# Template for formatting the user's input comment for the LLM
USER_TMPL = "Comment:\n«{}»\n\nYour answer:"

# ── helper to query GPT‑4o for a single string ────────────────────
def gpt4o_classify_single_text(text_input):
    """
    Sends a single formatted text comment to the OpenAI API for classification.
    Handles potential markdown fences in the response and JSON parsing errors.
    """
    messages = [
        {"role":"system", "content": SYSTEM_MSG},
        {"role":"user",   "content": USER_TMPL.format(text_input)}
    ]
    # Ensure API key is available before making the call
    if not openai.api_key:
        print("CRITICAL ERROR inside gpt4o_classify_single_text: OpenAI API key is not set.")
        return {"stance": "error_no_api_key", "pi": "error_no_api_key"}
        
    try:
        # Make the API call to OpenAI
        resp = openai.chat.completions.create(
            model="gpt-4o",      # Using the specified model
            temperature=0.0,     # Set for deterministic output
            max_tokens=256,      # Max tokens expected for the JSON response
            messages=messages
        )
        # Extract the raw text content from the response
        raw_content = resp.choices[0].message.content.strip()
        
        # Clean potential markdown code fences (```json ... ``` or ``` ... ```)
        processed_content = raw_content
        if processed_content.startswith("```json"):
            processed_content = processed_content[len("```json"):].strip()
            if processed_content.endswith("```"):
                processed_content = processed_content[:-len("```")].strip()
        elif processed_content.startswith("```"):
            processed_content = processed_content[len("```"):].strip()
            if processed_content.endswith("```"):
                processed_content = processed_content[:-len("```")].strip()

        # Attempt to parse the cleaned content as JSON
        return json.loads(processed_content)

    except openai.APIError as e: # Handle specific OpenAI API errors
        print(f"OpenAI API Error for input '{text_input[:70]}...': {e}. Content: '{raw_content if 'raw_content' in locals() else 'N/A'}'")
        return {"stance": "api_error", "pi": "api_error"}
    except Exception as e: # Handle other errors (e.g., JSON parsing)
        print(f"Error processing input '{text_input[:70]}...': {e}. Processed content: '{processed_content if 'processed_content' in locals() else raw_content if 'raw_content' in locals() else 'N/A'}'")
        return {"stance": "parse_error", "pi": "parse_error"}


# ════════════════════════════════════════════════════════════════════
# Main Processing Logic
# ════════════════════════════════════════════════════════════════════

# Ensure DataFrame 'df' is loaded and prepared from a previous cell
if not ('df' in locals() and isinstance(df, pd.DataFrame) and "joined_text" in df.columns):
    print("Error: DataFrame 'df' is not loaded or 'joined_text' column is missing.")
    print("Please run the cell that loads your data and creates 'joined_text' first.")
    # Optionally raise an error to halt execution if df is required
    # raise RuntimeError("DataFrame 'df' not prepared. Halting execution.")
else:
    print(f"DataFrame 'df' with {len(df)} rows found. Proceeding with full classification.")

    # ── run classification for each comment ─────────────────────────────
    pred_s, pred_p = [], [] # Lists to store string predictions
    
    # Convert 'joined_text' column to a list of strings for efficient iteration
    # Includes basic error handling for conversion
    try:
        joined_texts_list = df["joined_text"].astype(str).tolist()
    except Exception as e:
        print(f"Error converting df['joined_text'] to list of strings: {e}")
        joined_texts_list = [] # Fallback to empty list

    if joined_texts_list and openai.api_key: # Proceed only if texts exist AND API key is set
        print(f"\nStarting classification for {len(joined_texts_list)} comments...")
        # Iterate through each comment text, classify it, and store predictions
        for text_to_classify in tqdm(joined_texts_list, desc="GPT-4o Classifying Full Dataset"):
            # Classify the current text
            result = gpt4o_classify_single_text(text_to_classify)
            # Append results, using .get() for safe access in case of error dicts
            pred_s.append(result.get("stance", "error_key_missing"))
            pred_p.append(result.get("pi", "error_key_missing"))
            # Pause to respect potential API rate limits (adjust as needed)
            time.sleep(1.1) # Roughly corresponds to < 90 requests/minute limit
    elif not openai.api_key:
         print("Halting classification loop: OpenAI API key not available.")
    else:
        print("No texts found in df['joined_text'] to process.")


    # ── map predictions back to integers & calculate metrics ───────────
    # Proceed only if predictions were generated and match the DataFrame length
    if pred_s and pred_p and len(pred_s) == len(df):
        print("\nMapping predictions to integers and calculating metrics...")
        
        # Get true labels from the DataFrame
        y_true_s = df["stance_dei_label"].astype(int).to_numpy()
        y_true_p = df["purchase_intention_label"].astype(int).to_numpy()
        
        # Convert string predictions to integers using inverse maps
        # Default to 0 (neutral) if the string prediction isn't a valid key (e.g., an error string)
        y_pred_s_mapped = np.array([inv_stance.get(x, 0) for x in pred_s])
        y_pred_p_mapped = np.array([inv_pi.get(x, 0) for x in pred_p])

        # Function to display metrics
        def show(axis, y_t, y_p_mapped, class_names_map):
            """Calculates and prints classification metrics."""
            print(f"\n── {axis} Metrics ─────────────────────")
            report_labels = [-1, 0, 1] # Define the order for reports/matrices
            # Get names corresponding to labels, handling potential missing keys
            report_target_names = [class_names_map.get(l, str(l)) for l in report_labels]

            print(f"Macro F1-Score : {f1_score(y_t, y_p_mapped, average='macro', zero_division=0):.4f}")
            print(f"Accuracy       : {accuracy_score(y_t, y_p_mapped):.4f}")
            print("\nConfusion Matrix (Rows: True, Cols: Predicted, Labels: -1, 0, 1):")
            print(confusion_matrix(y_t, y_p_mapped, labels=report_labels))
            print("\nClassification Report:")
            print(classification_report(y_t, y_p_mapped, labels=report_labels, target_names=report_target_names, zero_division=0))

        # Show metrics for Stance and Purchase Intention
        show("Stance", y_true_s, y_pred_s_mapped, stance_map)
        show("PI    ", y_true_p, y_pred_p_mapped, pi_map)

        # ── save predictions to CSV ──────────────────────────────────────
        print("\nSaving predictions and mapped labels to CSV...")
        # Create a copy to avoid modifying the original DataFrame
        df_to_save = df.copy()
        # Add new columns for predictions
        df_to_save["gpt4o_pred_stance_label"] = y_pred_s_mapped # Integer prediction
        df_to_save["gpt4o_pred_pi_label"] = y_pred_p_mapped     # Integer prediction
        df_to_save["gpt4o_pred_stance_str"] = pred_s            # Original string from LLM
        df_to_save["gpt4o_pred_pi_str"] = pred_p                # Original string from LLM
        
        # Define the output file path
        output_csv_path = "dev_1000_with_gpt4o_preds_full.csv" # Consider this name for clarity
        # Save the DataFrame
        try:
            df_to_save.to_csv(output_csv_path, index=False)
            print(f"\nSuccessfully Saved → {output_csv_path}")
        except Exception as e:
            print(f"\nError saving DataFrame to CSV: {e}")
            
    else:
        # Message if predictions weren't generated or didn't match expected length
        print("\nSkipping mapping, metrics, and saving: Predictions were not fully generated or length mismatch detected.")

print("\n--- Full labeled dataset classification cell execution finished ---")

OpenAI API key loaded successfully.
DataFrame 'df' with 1000 rows found. Proceeding with full classification.

Starting classification for 1000 comments...


GPT-4o Classifying Full Dataset:   0%|          | 0/1000 [00:00<?, ?it/s]


Mapping predictions to integers and calculating metrics...

── Stance Metrics ─────────────────────
Macro F1-Score : 0.9245
Accuracy       : 0.9630

Confusion Matrix (Rows: True, Cols: Predicted, Labels: -1, 0, 1):
[[ 72   4   1]
 [  7 782  11]
 [  4  10 109]]

Classification Report:
              precision    recall  f1-score   support

        anti       0.87      0.94      0.90        77
     neutral       0.98      0.98      0.98       800
         pro       0.90      0.89      0.89       123

    accuracy                           0.96      1000
   macro avg       0.92      0.93      0.92      1000
weighted avg       0.96      0.96      0.96      1000


── PI     Metrics ─────────────────────
Macro F1-Score : 0.9044
Accuracy       : 0.9600

Confusion Matrix (Rows: True, Cols: Predicted, Labels: -1, 0, 1):
[[ 80  12   0]
 [  9 823   6]
 [  1  12  57]]

Classification Report:
              precision    recall  f1-score   support

     boycott       0.89      0.87      0.88        9

### 6. Application of LLM Classification

In [13]:
# Define file path relative to the notebook or project root
csv_file_path = '../../data/derived/comments_with_relevance.csv'

# Load the CSV file
df = pd.read_csv(csv_file_path)
print(f"Successfully loaded {csv_file_path}")
print(f"DataFrame shape: {df.shape}")
print("First 5 rows:")
df.head(6)

Successfully loaded ../../data/derived/comments_with_relevance.csv
DataFrame shape: (33060, 17)
First 5 rows:


Unnamed: 0,company_name,post_date,id,parent_id,comment_text,comment_date,comment_type,reaction_count,before_DEI,has_DEI,root_id,depth,sibling_count,time_since_root,cleaned_text,full_text,relevance
0,Delta,2025-02-04 11:57:00,Y29tbWVudDoxMDE1MDM2Mzc3MzExOTQxXzExMjM1MDYxMz...,,Flew EYW-ATL-PVD yesterday and all your staff ...,2025-02-08,initial,12,0,1,Y29tbWVudDoxMDE1MDM2Mzc3MzExOTQxXzExMjM1MDYxMz...,0,114,0 days,flew eyw-atl-pvd yesterday and all your staff ...,flew eyw-atl-pvd yesterday and all your staff ...,0
1,Delta,2025-02-04 11:57:00,Y29tbWVudDoxMDE1MDM2Mzc3MzExOTQxXzExNTQzMTE5OT...,Y29tbWVudDoxMDE1MDM2Mzc3MzExOTQxXzExMjM1MDYxMz...,We sincerely apologise for the inconvenience c...,2025-02-08,reply,0,0,1,Y29tbWVudDoxMDE1MDM2Mzc3MzExOTQxXzExMjM1MDYxMz...,1,1,0 days,we sincerely apologise for the inconvenience c...,flew eyw-atl-pvd yesterday and all your staff ...,0
2,Delta,2025-02-04 11:57:00,Y29tbWVudDoxMDE1MDM2Mzc3MzExOTQxXzEwMjQ1NzAyMj...,Y29tbWVudDoxMDE1MDM2Mzc3MzExOTQxXzExMjM1MDYxMz...,Good day don't response to any messages if is ...,2025-02-08,reply,0,0,1,Y29tbWVudDoxMDE1MDM2Mzc3MzExOTQxXzExMjM1MDYxMz...,1,1,0 days,good day don't response to any messages if is ...,flew eyw-atl-pvd yesterday and all your staff ...,0
3,Delta,2025-02-04 11:57:00,Y29tbWVudDoxMDE1MDM2Mzc3MzExOTQxXzk4OTYxMTEzMz...,,I'm literally on a Delta flight that just land...,2025-02-08,initial,3,0,1,Y29tbWVudDoxMDE1MDM2Mzc3MzExOTQxXzk4OTYxMTEzMz...,0,114,0 days,i'm literally on a delta flight that just land...,i'm literally on a delta flight that just land...,0
4,Delta,2025-02-04 11:57:00,Y29tbWVudDoxMDE1MDM2Mzc3MzExOTQxXzY0MDEyNzM2NT...,,DEI is not crucial to Delta. Qualified personn...,2025-02-08,initial,25,0,1,Y29tbWVudDoxMDE1MDM2Mzc3MzExOTQxXzY0MDEyNzM2NT...,0,114,0 days,dei is not crucial to delta. qualified personn...,dei is not crucial to delta. qualified personn...,1
5,Delta,2025-02-04 11:57:00,Y29tbWVudDoxMDE1MDM2Mzc3MzExOTQxXzEzODk4NDgzNT...,Y29tbWVudDoxMDE1MDM2Mzc3MzExOTQxXzY0MDEyNzM2NT...,DEI is crucial to every business! Diversity ad...,2025-02-08,reply,4,0,1,Y29tbWVudDoxMDE1MDM2Mzc3MzExOTQxXzY0MDEyNzM2NT...,1,7,0 days,dei is crucial to every business! diversity ad...,dei is not crucial to delta. qualified personn...,1


In [14]:
from torch.utils.data import Dataset
from transformers import AutoTokenizer
import numpy as np

# Let's define a lambda function to handle the splitting and NaN cases inline
split_lambda = lambda text: [] if pd.isna(text) else text.split(' → ')

# Apply the lambda function to create a new column with the list of segments
df['text_segments'] = df['full_text'].apply(split_lambda)

# Show examples
print(df[df['full_text'].str.contains(" → ", na=False)][['full_text', 'text_segments']].head().to_markdown(index=False))

| full_text                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  | text_segments                                                            

In [17]:
def join_segments(segs):
    """
    Re‑assemble the list returned by `split_lambda` into the exact text
    string that will be fed to the tokenizer later on.
    """
    if not segs: # blank row guard
        return ""
    if len(segs) == 1: # only a reply (no parents)
        return f"<REPLY> {segs[0]} </REPLY>"
    context = " </CONTEXT> <CONTEXT> ".join(segs[:-1])
    reply   = segs[-1]
    return f"<CONTEXT> {context} </CONTEXT> <REPLY> {reply} </REPLY>"

df["joined_text"] = df["text_segments"].apply(join_segments)
print(df["joined_text"])

0        <REPLY> flew eyw-atl-pvd yesterday and all you...
1        <CONTEXT> flew eyw-atl-pvd yesterday and all y...
2        <CONTEXT> flew eyw-atl-pvd yesterday and all y...
3        <REPLY> i'm literally on a delta flight that j...
4        <REPLY> dei is not crucial to delta. qualified...
                               ...                        
33055                                 <REPLY> wed </REPLY>
33056                      <REPLY> follow me page </REPLY>
33057    <REPLY> hello, we are providing apple develope...
33058    <REPLY> for google inc i have developed a norm...
33059                                <REPLY> nice </REPLY>
Name: joined_text, Length: 33060, dtype: object


In [18]:
# Transform joined_text to a dataframe
joined_text = df["joined_text"]
joined_text.head(20)

0     <REPLY> flew eyw-atl-pvd yesterday and all you...
1     <CONTEXT> flew eyw-atl-pvd yesterday and all y...
2     <CONTEXT> flew eyw-atl-pvd yesterday and all y...
3     <REPLY> i'm literally on a delta flight that j...
4     <REPLY> dei is not crucial to delta. qualified...
5     <CONTEXT> dei is not crucial to delta. qualifi...
6     <CONTEXT> dei is not crucial to delta. qualifi...
7     <CONTEXT> dei is crucial to every business! di...
8     <CONTEXT> no. what dei adds is unqualified peo...
9     <CONTEXT> that you think dei hires are in ever...
10    <CONTEXT> well, i guess that makes you a trans...
11    <CONTEXT> you need to face facts. people. dei ...
12    <CONTEXT> it doesn't. but hey, keep telling yo...
13    <CONTEXT> well bless your little old heart. </...
14    <CONTEXT> well bless your little old heart. </...
15    <CONTEXT> blowing out someone else's candle do...
16    <CONTEXT> could it also be true that pretendin...
17    <CONTEXT> well bless your little old heart

In [19]:
# ════════════════════════════════════════════════════════════════════
# GPT‑4o classification for the full dataset.
# Input DataFrame 'df' must contain 'joined_text'.
# Output will include original df columns + new prediction columns.
# ════════════════════════════════════════════════════════════════════
import os, json, time, numpy as np, pandas as pd
from tqdm.auto import tqdm
import openai, warnings
from dotenv import load_dotenv

# Load environment variables from .env file
# Ensures OPENAI_API_KEY is loaded if present in .env
load_dotenv()

# Retrieve the API key
openai.api_key = os.getenv("OPENAI_API_KEY")
if not openai.api_key:
    print("CRITICAL Error: OPENAI_API_KEY not found after attempting to load from .env.")
    print("Please ensure a .env file with OPENAI_API_KEY is in the search path (e.g., models/sentiment_deberta_model/) and python-dotenv is installed.")
else:
    print("OpenAI API key loaded successfully.")

warnings.filterwarnings("ignore")

# ── label ↔︎ string maps ────────────────────────────────────────────
# Maps integer labels (-1, 0, 1) to string representations used by the LLM
stance_map = {-1:"anti", 0:"neutral", 1:"pro"}
pi_map     = {-1:"boycott", 0:"neutral", 1:"buy"}
# Inverse maps to convert LLM string predictions back to integers
inv_stance = {v:k for k,v in stance_map.items()}
inv_pi     = {v:k for k,v in pi_map.items()}

# ── prompt templates ───────────────────────────────────────────────
# System message defining the task, expected input/output format, and labels for the LLM
SYSTEM_MSG = (
    "You are a research assistant that classifies social-media comments. "
    "Input is formatted as `<REPLY>comment_to_classify</REPLY>` or "
    "`<CONTEXT>parent_comment(s)_text</CONTEXT><REPLY>comment_to_classify</REPLY>`. "
    "Your task is to classify the text within the `<REPLY>` tags. "
    "Use the `<CONTEXT>` text, if provided, for situational awareness to better understand the reply's meaning.\n"
    "Classify the reply on two independent axes using the exact string labels provided below:\n"
    "• Stance toward DEI → \"anti\" / \"neutral\" / \"pro\"\n"
    "• Purchase intention toward the brand → \"boycott\" / \"neutral\" / \"buy\"\n"
    "Return ONLY a single, valid JSON object with keys \"stance\" and \"pi\". The values for these keys MUST be one of the exact string labels provided (e.g., \"pro\", \"neutral\", \"buy\").\n"
    "Ensure the output is a valid JSON string, including double quotes around keys and string values.\n"
    "If the comment is not EXPLICITLY demonstrating stance on DEI or purchase (buying/boycott) intention, classify it as neutral.\n\n"
    "Here are some examples of how to respond:\n\n"
    "Example 1:\n"
    "Input Comment:\n"
    "«<REPLY>Love Costco.</REPLY>»\n\n"
    "Your answer (expected JSON output):\n"
    "{\"stance\": \"neutral\", \"pi\": \"neutral\"}\n\n"
    "Example 2:\n"
    "Input Comment:\n"
    "«<REPLY>Go woke go broke.</REPLY>»\n\n"
    "Your answer (expected JSON output):\n"
    "{\"stance\": \"anti\", \"pi\": \"boycott\"}\n\n"
    "Example 3:\n"
    "Input Comment:\n"
    "«<CONTEXT>We're proud of our diverse workforce!</CONTEXT><REPLY>Thank you for standing up for DEI and what is right, I'll be renewing my membership.</REPLY>»\n\n"
    "Your answer (expected JSON output):\n"
    "{\"stance\": \"pro\", \"pi\": \"buy\"}\n\n"
    "Example 4:\n"
    "Input Comment:\n"
    "«<CONTEXT>Our new line is great for everyone.</CONTEXT><REPLY>I support DEI, but I'm not sure if I'll be renewing my membership.</REPLY>»\n\n"
    "Your answer (expected JSON output):\n"
    "{\"stance\": \"pro\", \"pi\": \"neutral\"}\n\n" 
    "Example 5:\n"
    "Input Comment:\n"
    "«<REPLY>you dropped rid of dei? nope. done shopping here.</REPLY>»\n\n"
    "Your answer (expected JSON output):\n"
    "{\"stance\": \"pro\", \"pi\": \"boycott\"}\n\n" 
    "Example 6:\n"
    "Input Comment:\n"
    "«<CONTEXT> supporting dei is means you want racist hiring </CONTEXT> <REPLY> you need to be educated </REPLY>»\n\n"
    "Your answer (expected JSON output):\n"
    "{\"stance\": \"neutral\", \"pi\": \"neutral\"}\n\n" 
    "Example 7:\n"
    "Input Comment:\n"
    "«<REPLY>Let's boycott this woke pro dei company</REPLY>»\n\n"
    "Your answer (expected JSON output):\n"
    "{\"stance\": \"anti\", \"pi\": \"boycott\"}\n\n"
    "Example 8:\n"
    "Input Comment:\n"
    "«<REPLY>Roll back DEI and then ask us to shop here....nope!</REPLY>»\n\n"
    "Your answer (expected JSON output):\n"
    "{\"stance\": \"pro\", \"pi\": \"boycott\"}\n\n"
    "Example 9:\n"
    "Input Comment:\n"
    "«<REPLY>I can't believe they chose diversity over qualifications! When will they get rid of there terrible dei practices.</REPLY>»\n\n"
    "Your answer (expected JSON output):\n"
    "{\"stance\": \"anti\", \"pi\": \"neutral\"}\n\n"
    "Example 10:\n"
    "Input Comment:\n"
    "«<CONTEXT>I will no longer shop here because of your policies</CONTEXT> <REPLY>Bye!</REPLY>»\n\n"
    "Your answer (expected JSON output):\n"
    "{\"stance\": \"neutral\", \"pi\": \"neutral\"}\n\n" 
    "Example 11:\n"
    "Input Comment:\n"
    "«<REPLY>I can't believe you would do this... DEI has to go!</REPLY>»\n\n"
    "Your answer (expected JSON output):\n"
    "{\"stance\": \"anti\", \"pi\": \"neutral\"}\n\n"
    "Example 12:\n"
    "Input Comment:\n"
    "«<REPLY>I will not be renewing my membership. One less place to go!</REPLY>»\n\n"
    "Your answer (expected JSON output):\n"
    "{\"stance\": \"neutral\", \"pi\": \"boycott\"}\n\n"
    "Example 13:\n" 
    "Input Comment:\n"
    "«<REPLY>Your commitment to dei hiring has inspired me to become your customer.</REPLY>»\n\n"
    "Your answer (expected JSON output):\n"
    "{\"stance\": \"pro\", \"pi\": \"buy\"}\n\n"
)
# Template for formatting the user's input comment for the LLM
USER_TMPL = "Comment:\n«{}»\n\nYour answer:"

# ── helper to query GPT‑4o for a single string ────────────────────
def gpt4o_classify_single_text(text_input):
    """
    Sends a single formatted text comment to the OpenAI API for classification.
    Handles potential markdown fences in the response and JSON parsing errors.
    """
    messages = [
        {"role":"system", "content": SYSTEM_MSG},
        {"role":"user",   "content": USER_TMPL.format(text_input)}
    ]
    if not openai.api_key:
        print("CRITICAL ERROR inside gpt4o_classify_single_text: OpenAI API key is not set.")
        return {"stance": "error_no_api_key", "pi": "error_no_api_key"}
        
    try:
        resp = openai.chat.completions.create(
            model="gpt-4o",
            temperature=0.0,
            max_tokens=256,
            messages=messages
        )
        raw_content = resp.choices[0].message.content.strip()
        
        processed_content = raw_content
        if processed_content.startswith("```json"):
            processed_content = processed_content[len("```json"):].strip()
            if processed_content.endswith("```"):
                processed_content = processed_content[:-len("```")].strip()
        elif processed_content.startswith("```"):
            processed_content = processed_content[len("```"):].strip()
            if processed_content.endswith("```"):
                processed_content = processed_content[:-len("```")].strip()

        return json.loads(processed_content)

    except openai.APIError as e:
        print(f"OpenAI API Error for input '{text_input[:70]}...': {e}. Content: '{raw_content if 'raw_content' in locals() else 'N/A'}'")
        return {"stance": "api_error", "pi": "api_error"}
    except Exception as e:
        print(f"Error processing input '{text_input[:70]}...': {e}. Processed content: '{processed_content if 'processed_content' in locals() else raw_content if 'raw_content' in locals() else 'N/A'}'")
        return {"stance": "parse_error", "pi": "parse_error"}


# ════════════════════════════════════════════════════════════════════
# Main Processing Logic
# ════════════════════════════════════════════════════════════════════

if not ('df' in locals() and isinstance(df, pd.DataFrame) and "joined_text" in df.columns):
    print("Error: DataFrame 'df' is not loaded or 'joined_text' column is missing.")
    print("Please run the cell that loads your data and creates 'joined_text' first.")
else:
    print(f"DataFrame 'df' with {len(df)} rows found. Proceeding with full classification.")

    pred_s, pred_p = [], []
    
    try:
        joined_texts_list = df["joined_text"].astype(str).tolist()
    except Exception as e:
        print(f"Error converting df['joined_text'] to list of strings: {e}")
        joined_texts_list = []

    if joined_texts_list and openai.api_key:
        print(f"\nStarting classification for {len(joined_texts_list)} comments...")
        for text_to_classify in tqdm(joined_texts_list, desc="GPT-4o Classifying Full Dataset"):
            result = gpt4o_classify_single_text(text_to_classify)
            pred_s.append(result.get("stance", "error_key_missing"))
            pred_p.append(result.get("pi", "error_key_missing"))
            time.sleep(1.1) 
    elif not openai.api_key:
         print("Halting classification loop: OpenAI API key not available.")
    else:
        print("No texts found in df['joined_text'] to process.")

    # ── Map predictions and Save to CSV ─────────────────────────────────
    if pred_s and pred_p and len(pred_s) == len(df):
        print("\nMapping predictions to integers...")
        # Convert string predictions to integers, defaulting to 0 (neutral) for errors/unknowns
        y_pred_s_mapped = np.array([inv_stance.get(x, 0) for x in pred_s])
        y_pred_p_mapped = np.array([inv_pi.get(x, 0) for x in pred_p])

        print("\nSaving predictions and mapped labels to CSV...")
        df_to_save = df.copy()
        df_to_save["gpt4o_pred_stance_label"] = y_pred_s_mapped
        df_to_save["gpt4o_pred_pi_label"] = y_pred_p_mapped
        df_to_save["gpt4o_pred_stance_str"] = pred_s
        df_to_save["gpt4o_pred_pi_str"] = pred_p
        
        output_csv_path = "../../data/derived/comments_with_sentiment.csv"
        try:
            df_to_save.to_csv(output_csv_path, index=False)
            print(f"\nSuccessfully Saved → {output_csv_path}")
        except Exception as e:
            print(f"\nError saving DataFrame to CSV: {e}")
            
    else:
        print("\nSkipping saving: Predictions were not fully generated or length mismatch detected.")

print("\n--- Full dataset classification cell execution finished ---")

OpenAI API key loaded successfully.
DataFrame 'df' with 33060 rows found. Proceeding with full classification.

Starting classification for 33060 comments...


GPT-4o Classifying Full Dataset:   0%|          | 0/33060 [00:00<?, ?it/s]

OpenAI API Error for input '<REPLY> i'm sure you don't want my dei money. i'll get flowers at cost...': Connection error.. Content: 'N/A'
OpenAI API Error for input '<CONTEXT> i'm sure you don't want my dei money. i'll get flowers at co...': Connection error.. Content: 'N/A'
OpenAI API Error for input '<CONTEXT> i'm sure you don't want my dei money. i'll get flowers at co...': Connection error.. Content: 'N/A'
OpenAI API Error for input '<REPLY> my budget thanks you for your dei policy roll backs, we are sa...': Connection error.. Content: 'N/A'
OpenAI API Error for input '<REPLY> youre off my shopping list target. sorry about that. i stuck b...': Connection error.. Content: 'N/A'
OpenAI API Error for input '<REPLY> bring back dei!!! </REPLY>...': Connection error.. Content: 'N/A'
OpenAI API Error for input '<REPLY> no more a target for me </REPLY>...': Connection error.. Content: 'N/A'
OpenAI API Error for input '<REPLY> my dei money will be going elsewhere this year, sorry! </REPLY..