In [8]:
!pip install openai

import sys
!{sys.executable} -m pip install dotenv

!pip install dotenv

import openai



In [9]:
from openai import OpenAI, AzureOpenAI
import json
import pandas as pd
import regex as re
from dotenv import load_dotenv
import os

load_dotenv()  # Load from .env file

AZURE_API_KEY = 
AZURE_ENDPOINT = 
AZURE_DEPLOYMENT = "gpt-4o-mini"
AZURE_API_VERSION = "2025-04-01-preview"



openai = AzureOpenAI(
    azure_endpoint=AZURE_ENDPOINT,
    api_key=AZURE_API_KEY,
    api_version=AZURE_API_VERSION
)

In [10]:
import pandas as pd
df = pd.read_csv("BLM_prepost_final.csv")

In [11]:
# Limit the number of posts sent to GPT
posts = df["post_body_text"].dropna().tolist()

# Format the post list into a numbered string
formatted_posts = "\n".join([f"{i+1}. {post.strip()}" for i, post in enumerate(posts)])


system_message = {
  "role": "system",
  "content": """
You are a leading expert in racial discourse and stereotype analysis. Your task is to analyze each social media post and classify it according to **stereotype-related portrayals of Black individuals**, including Black men, Black women, or Black people/person more generally.

Your goal is to evaluate whether the post reinforces, contradicts, or remains neutral toward common **stereotypes about Black people** based on the following **five stereotype dimensions**. Use the examples (both stereotype-aligned and counter-stereotypical) to guide your classification.

---

**1. Warmth/Hostility (Friendliness / Hostile):**
Reflects whether Black individuals are portrayed as kind, emotionally supportive, or caring — versus cold, hostile, or threatening.
- **Stereotypical portrayals (Negative valence):** dangerous, criminal, aggressive, quick-to-anger, thug, gangster, sassy
- **Counter-stereotypes (Positive valence):** self-sacrificing, nurturing, caring, emotional, gentle, compassionate

**2. Capability/Incapability (Competence / Incompetence):**
Captures portrayals related to intelligence, independence, or strength — versus incompetence or laziness.
- **Stereotypical portrayals (Negative valence):** unintelligent, uneducated, lazy, irresponsible, ignorant, dependent, submissive
- **Counter-stereotypes (Positive valence):** strong, athletic, independent, resilient, intelligent, leaders, bold, ambitious

**3. Assertiveness (Expressiveness / Attitude):**
Describes depictions of Black individuals as confident or outspoken — or framed as having an “attitude” or being overly aggressive.
- **Stereotypical portrayals (Negative valence):** loud, have-an-attitude, aggressive, dominant
- **Counter-stereotypes (Positive valence):** confident, assertive, proud, articulate, self-assured

**4. Status (Class / Respectability):**
Focuses on class-based or cultural depictions — including whether someone is seen as respectable or degraded.
- **Stereotypical portrayals (Negative valence):** ghetto, unrefined, poor, dirty
- **Counter-stereotypes (Positive valence):** cultured, sophisticated, professional, successful, well-dressed

**5. Victimhood (Oppression / Harm):**
Represents portrayals of Black individuals as victims of systemic or individual harm, violence, or discrimination.
- **Negative stereotype alignment:** This dimension is not necessarily negative — it indicates portrayal as oppressed or harmed.
- **Examples:** oppressed, violated, attacked, assaulted, discriminated against, targeted, profiled
- **Counter-stereotype (positive framing):** protected, defended, supported, vindicated

**6. Sexualization (Hypersexualization / Gender Stereotypes):**
Captures portrayals that present Black individuals as excessively sexualized or emasculated.
- **Stereotypical portrayals (Negative valence):** hypersexual, promiscuous, unfeminine, seductive, emasculating, predatory, jezebel, overly sexual
- **Counter-stereotypes (Positive valence):** modest, respectful, loving, demure, emotionally intimate


---

For each social media post, return a JSON object with the following fields:

1. **"clarification"**: A one-sentence summary of what the post implies about Black individuals.

2. **"dimension"**: The **most relevant** stereotype dimension:
   `"Warmth/Hostility"`, `"Capability/Incapability"`, `"Assertiveness"`, `"Status"`, `"Victimhood"`, or `"Sexualization"`. Only use these 6 dimensions.

3. **"stereotype_term"** – Identify the word or phrase from the post that reflects the stereotype dimension, either aligning or counteracting with the stereotype dimension.

4. **"valence"**: A score describing how the stereotype is presented:
   - `-1`: Subtle or implicit negative stereotype
   - `0`: Neutral or stereotype-irrelevant
   - `+1`: Subtle positive or counter-stereotypical portrayal

5. **"group"**: State the relevant demographic group(s) the post is referring to (“Black men,” “Black women”, "Black people", “Black person”, "Black community", "Black family", "Female", "Male".).

6. **"stereotype_origin"** – Based on common stereotype associations in research, identify which group the stereotype term most commonly applies to. (e.g., "dangerous" → Black men, "submissive" → female)

7. **"rationale"**: A concise explanation (2–4 sentences) describing:
   - Why this dimension was chosen
   - How the post reflects the stereotype dimension
   - Why this valence score was given
   - How the group was identified

8. **"confidence"**: Your confidence level in this classification:
   - `"uncertain"` = the post is ambiguous or unclear
   - `"fairly certain"` = moderately confident in your choice
   - `"very certain"` = highly confident the classification is accurate

Return your response as a JSON list of objects with this format:

{
  "post": "<original text>",
  "clarification": "<short clarification>",
  "dimension": "<best-matching stereotype dimension>",
  "stereotype_term": "<term or phrase from the post>",
  "valence": -1 | 0 | 1,
  "group": "<demographic group>",
  "stereotype_origin": "<group stereotype most associated with>",
  "rationale": "<your explanation>",
  "confidence": "unsure" | "fairly certain" | "very certain"
"""
}

example_jsons = [
  {  # Warmth/Hostility
    "post": "Black families are some of the most welcoming and generous people you will ever meet. Treat them with the respect they deserve.",
    "clarification": "The post praises Black families as warm, welcoming, and generous",
    "dimension": "warmth/hostility",
    "stereotype_term": "welcoming",
    "valence": 1,
    "group": "Black families",
    "stereotype_origin": "Black community",
    "rationale": "By describing Black families as welcoming and generous, the post strongly affirms positive warmth traits while explicitly rejecting negative or hostile stereotypes.",
    "confidence": "very certain"
  },
  {  # Warmth/Hostility
    "post": "If Black and brown people showed up armed like this, the media would call them dangerous and the police would open fire.",
    "clarification": "The post critiques how Black and brown people are perceived as threatening when armed.",
    "dimension": "warmth/hostility",
    "stereotype_term": "dangerous",
    "valence": -1,
    "group": "Black people",
    "stereotype_origin": "Black and brown people",
    "rationale": "The post highlights the stereotype of Black and brown people as inherently threatening, revealing a lack of perceived warmth.",
    "confidence": "very certain"
  },

  {  # Capability/Incapability
    "post": "Lila A. Fenwick, 87, New York City, first black woman to graduate from Harvard Law School.",
    "clarification": "The post highlights a Black woman’s educational achievements.",
    "dimension": "capability/incapability",
    "stereotype_term": "first black woman to graduade from Harvard Law",
    "valence": 1,
    "group": "Black women",
    "stereotype_origin": "Black women",
    "rationale": "Counters the stereotype of Black women being uneducated by affirming their academic excellence.",
    "confidence": "very certain"
  },
  {  # Capability/Incapability
    "post": "Black women attorneys are leading the fight against attacks on civil rights.",
    "clarification": "The post highlights Black women lawyers as leaders in defending civil liberties.",
    "dimension": "capability/incapability",
    "stereotype_term": "leading",
    "valence": 1,
    "group": "Black women",
    "stereotype_origin": "Black women",
    "rationale": "Portrays Black women as competent and effective leaders, reinforcing the capability dimension.",
    "confidence": "very certain"
  },
  {  # Assertiveness
    "post": "As a fellow Black woman who has run for office, I know how hard campaigning is. Kamala Harris did it with courage, grace, and grit.",
    "published_date": "2020-08-20",
    "clarification": "The post praises Kamala Harris for her leadership and representation.",
    "dimension": "assertiveness",
    "stereotype_term": "courage",
    "valence": 1,
    "group": "Black women",
    "stereotype_origin": "Black women",
    "rationale": "Highlights leadership, confidence, and persistence, aligning with positive assertiveness.",
    "confidence": "very certain"
  },
  {  # Assertiveness
    "post": "Black people are saying ‘Geroge floyd's death should be protested until things change.’",
    "published_date": "2020-06-02",
    "clarification": "The post encourages protest as a response to injustice.",
    "dimension": "assertiveness",
    "stereotype_term": "protest",
    "valence": 1,
    "group": "Black people",
    "stereotype_origin": "Black people",
    "rationale": "Encourages standing up for rights and collective action, reflecting high assertiveness.",
    "confidence": "fairly certain"
  },
  {  # Status
    "post": "#COVID19 is a deadly crisis that will disproportionately harm Black people due to systemic failures.",
    "published_date": "2020-04-10",
    "clarification": "The post discusses systemic failures harming Black communities during COVID-19.",
    "dimension": "status",
    "stereotype_term": "disproportionately harm",
    "valence": -1,
    "group": "Black people",
    "stereotype_origin": "Black people",
    "rationale": "Highlights structural inequality and vulnerability, signaling lower societal status.",
    "confidence": "fairly certain"
  },
  {  # Status
    "post": "Subscribe now to get exclusive access to HBR’s ‘Advancing Black Leaders’ program.",
    "published_date": "2020-03-18",
    "clarification": "Promotes a program to elevate Black professionals into leadership roles.",
    "dimension": "status",
    "stereotype_term": "leaders",
    "valence": 1,
    "group": "Black people",
    "stereotype_origin": "Black people",
    "rationale": "Affirms leadership and prestige, countering stereotypes of low status.",
    "confidence": "very certain"
  },
  {  # Victimhood
    "post": "Breonna Taylor was asleep in her bed when police stormed in and killed her. She never had a chance.",
    "published_date": "2020-06-05",
    "clarification": "The post highlights the unjust killing of Breonna Taylor while she was asleep.",
    "dimension": "victimhood",
    "stereotype_term": "never had a chance",
    "valence": -1,
    "group": "Black women",
    "stereotype_origin": "Black women",
    "rationale": "Depicts a Black woman as an innocent victim of systemic violence, aligning with victimhood.",
    "confidence": "very certain"
  },
  {  # Victimhood
    "post": "Every Black man in America knows the fear of being pulled over — it could be the last time you see your family.",
    "published_date": "2019-11-31",
    "clarification": "The post underscores the life-threatening risks Black men face during police encounters.",
    "dimension": "victimhood",
    "stereotype_term": "fear of being pulled over",
    "valence": -1,
    "group": "Black men",
    "stereotype_origin": "Black men",
    "rationale": "Portrays Black men as vulnerable to systemic oppression and danger in everyday situations.",
    "confidence": "very certain"
  },
  {  # Sexualization
    "post": "R. Eric Thomas writes about growing up as a gay Black teenager and finding love in an unexpected place.",
    "published_date": "2010-01-22",
    "clarification": "A personal story about a gay Black man and his experiences with love.",
    "dimension": "sexualization",
    "stereotype_term": "love",
    "valence": 1,
    "group": "Black men",
    "stereotype_origin": "Black men",
    "rationale": "Humanizes queer Black male narratives, countering hypersexual or emasculating stereotypes.",
    "confidence": "fairly certain"
  },
  {  # Sexualization
    "post": "A new study finds that perm hair dyes and relaxers are especially cancer-causing for Black women.",
    "published_date": "2019-12-04",
    "clarification": "Announces health risks tied to beauty products used by Black women.",
    "dimension": "sexualization",
    "stereotype_term": "hair relaxers",
    "valence": -1,
    "group": "Black women",
    "stereotype_origin": "Black women",
    "rationale": "Connects beauty practices to femininity and desirability norms, tied to sexualized gender expectations.",
    "confidence": "uncertain"
  }
]


In [12]:
import re
import json
import pandas as pd
import time

def safe_parse_json(output_text, batch_num):
    """Try to parse JSON output; save raw text if invalid."""
    try:
        return json.loads(output_text)
    except json.JSONDecodeError:
        match = re.search(r'\[.*\]', output_text, re.DOTALL)
        if match:
            try:
                return json.loads(match.group())
            except json.JSONDecodeError:
                print(f"⚠️ Batch {batch_num}: JSON invalid even after repair.")
                with open(f"batch_{batch_num}_raw.txt", "w", encoding="utf-8") as f:
                    f.write(output_text)
                return []
        else:
            print(f"⚠️ Batch {batch_num}: No JSON array found. Raw output saved.")
            with open(f"batch_{batch_num}_raw.txt", "w", encoding="utf-8") as f:
                f.write(output_text)
            return []

import json
import pandas as pd
import time

def analyze_dataset(df, batch_size=50, output_path="BLM_classification_analysis.csv"):
    posts = df["post_body_text"].dropna().tolist()
    all_results = []

    for start in range(0, len(posts), batch_size):
        batch_num = start // batch_size + 1
        batch = posts[start:start+batch_size]
        formatted_posts = "\n".join([f"{i+1}. {post.strip()}" for i, post in enumerate(batch)])

        user_message = {
            "role": "user",
            "content": f"""Here are some examples of how to analyze posts:\n\n{json.dumps(example_jsons, indent=2)}

Now analyze the following posts:
Return ONLY a JSON array with no explanations.\n\n{formatted_posts}"""
        }

        messages = [system_message, user_message]

        for attempt in range(2):
            try:
                response = openai.chat.completions.create(
                    model=AZURE_DEPLOYMENT,  
                    messages=messages
                )
                output_text = response.choices[0].message.content

                batch_results = safe_parse_json(output_text, batch_num)

                if batch_results:
                    all_results.extend(batch_results)

                    # Save after each batch (so you don't lose progress)
                    pd.DataFrame(all_results).to_csv(output_path, index=False)
                    print(f"✅ Batch {batch_num} saved to {output_path}")
                    break
            except Exception as e:
                print(f"⚠️ Error in batch {batch_num}, attempt {attempt+1}: {e}")
                time.sleep(3)
        else:
            print(f"❌ Batch {batch_num} failed after 2 attempts.")

    print(f"\n✅ Full analysis saved to {output_path}")
    files.download(output_path)
    return pd.DataFrame(all_results)


In [13]:
df_results = analyze_dataset(df, batch_size=20)

✅ Batch 1 saved to BLM_classification_analysis.csv
✅ Batch 2 saved to BLM_classification_analysis.csv
✅ Batch 3 saved to BLM_classification_analysis.csv
⚠️ Batch 4: JSON invalid even after repair.
⚠️ Batch 4: JSON invalid even after repair.
❌ Batch 4 failed after 2 attempts.
✅ Batch 5 saved to BLM_classification_analysis.csv
✅ Batch 6 saved to BLM_classification_analysis.csv
✅ Batch 7 saved to BLM_classification_analysis.csv
✅ Batch 8 saved to BLM_classification_analysis.csv
✅ Batch 9 saved to BLM_classification_analysis.csv
✅ Batch 10 saved to BLM_classification_analysis.csv
✅ Batch 11 saved to BLM_classification_analysis.csv
✅ Batch 12 saved to BLM_classification_analysis.csv
✅ Batch 13 saved to BLM_classification_analysis.csv
✅ Batch 14 saved to BLM_classification_analysis.csv
✅ Batch 15 saved to BLM_classification_analysis.csv
✅ Batch 16 saved to BLM_classification_analysis.csv
✅ Batch 17 saved to BLM_classification_analysis.csv
✅ Batch 18 saved to BLM_classification_analysis.csv
⚠

NameError: name 'files' is not defined

In [34]:
pip install rapidfuzz

Collecting rapidfuzz
  Downloading rapidfuzz-3.13.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (12 kB)
Downloading rapidfuzz-3.13.0-cp312-cp312-macosx_11_0_arm64.whl (1.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m34.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz
Successfully installed rapidfuzz-3.13.0
Note: you may need to restart the kernel to use updated packages.


In [None]:
# Adding dates through merging and matching post text 
import pandas as pd
from rapidfuzz import process, fuzz


classification_df = pd.read_csv("BLM_classification_analysis.csv")
prepost_df = pd.read_csv("BLM_prepost_final.csv")


post_to_date = dict(zip(prepost_df["post_body_text"], prepost_df["published_at"]))

# List of all post_body_text values for matching
post_list = list(prepost_df["post_body_text"])

matched_dates = []
match_scores = []

# Loop through each classification post and find closest match
for post in classification_df["post"]:
    if pd.isna(post) or not isinstance(post, str):
        matched_dates.append(None)
        match_scores.append(None)
        continue

    match, score, _ = process.extractOne(
        post, post_list, scorer=fuzz.token_sort_ratio
    )

    if score >= 80:  # threshold for good match
        matched_dates.append(post_to_date[match])
        match_scores.append(score)
    else:
        matched_dates.append(None)
        match_scores.append(score)


classification_df["published_at"] = matched_dates
classification_df["match_score"] = match_scores


classification_df.to_csv("BLM_classification_with_dates_fuzzy.csv", index=False)

print("Matching complete. File saved: BLM_classification_with_dates_fuzzy.csv")


✅ Matching complete. File saved: BLM_classification_with_dates_fuzzy.csv


In [47]:
# Cleaning data
df = pd.read_csv("BLM_classification_with_dates_fuzzy.csv")

# 1) Remove rows where "published_at" is empty (NaN or blank)
df = df[df["published_at"].notna()]
df = df[df["published_at"].astype(str).str.strip() != ""] 

# 2) Remove rows where "dimension" is "0"
df = df[df["dimension"] != 0]  
df = df[df["dimension"].astype(str).str.strip() != "0"]  

# 3) Remove rows where "confidence" is "unsure"
df = df[df["confidence"].astype(str).str.strip().str.lower() != "unsure"]
 
# 4) Remove unwanted columns by index 
df = df.drop(df.columns[[9, 10, 12]], axis=1)

df.to_csv("BLM_classification_analysis_cleaned.csv", index=False)

print(f"✅ Cleaned file saved. Rows: {len(df)}")

✅ Cleaned file saved. Rows: 7448
