In [1]:
!pip install openai
!pip install python-dotenv



In [3]:
from openai import OpenAI, AzureOpenAI
import json
import pandas as pd
import regex as re
import time
from dotenv import load_dotenv
import os

load_dotenv() 

AZURE_ENDPOINT = 
AZURE_API_KEY = 
AZURE_DEPLOYMENT = "gpt-4o-mini"
AZURE_API_VERSION = "2025-04-01-preview"


openai = AzureOpenAI(
    azure_endpoint=AZURE_ENDPOINT,
    api_key=AZURE_API_KEY,
    api_version=AZURE_API_VERSION
)

In [20]:
AZURE_DEPLOYMENT = "gpt-4o-mini"

input_file = "pre_blm_cleaned2.csv"
output_file = "pre_blm_relevant_scores.csv"
TEXT_COLUMN = "post_body_text"
ID_COLUMN = "PostId"   # Match your CSV exactly
BATCH_SIZE = 100

df = pd.read_csv(input_file)
df[TEXT_COLUMN] = df[TEXT_COLUMN].astype(str)

all_scores = []

def score_relevance():
    posts = df[[ID_COLUMN, TEXT_COLUMN]].dropna().to_dict("records")

    for start in range(0, len(posts), BATCH_SIZE):
        batch_num = start // BATCH_SIZE + 1
        batch = posts[start:start + BATCH_SIZE]

        formatted_posts = "\n".join([
            f"[{p[ID_COLUMN]}] {p[TEXT_COLUMN]}" for p in batch
        ])

        user_msg = {
            "role": "user",
            "content": f"""
Analyze each post to decide whether a post is relevant to discussions about Black people in the United States and give it a score from 1 to 5 for each post.  
"Relevant" means the post refers to, discusses, depicts, or implies something about Black people(black men, women, community, person, etc.) — including culture, politics, experiences, discrimination, representation, history, news, or social issues.

A post is relevant if it:
- Mentions Black people or African Americans in a cultural, political, historical, social, or identity context.
- Describes personal experiences of being Black, being treated differently for being Black, or interactions involving Black identity.
- References discrimination, prejudice, bias, stereotypes, or social movements involving Black people.
- Discusses achievements, challenges, victimization, leadership, activism, or community events involving Black people.
- Implies perceptions of Black people as hostile, nurturing, threatening, assertive, sexualized, competent, incompetent, victimized, or low in status — even if not explicitly stated.

### Examples of RELEVANT posts that would have high scores of relevancy:
1. "This is horrific. A president should never glorify police brutality. To our Black community, know that I see you and will stand up for justice."
2. "Weak, afraid people attack strong, successful Black women. That's a fact."
3. "I met investors who did not invest in my business because I was a Black woman."
4. "RT @msshanitarenee: Kamala Harris is every Black woman who had to remain calm in a meeting when lesser qualified white men spoke over her."
5. "A Black man running is a death sentence in America."
6. "The overwhelming majority of Black Americans say the U.S. does not need to import more foreign workers to fill jobs."
7. "Lila A. Fenwick, 87, first Black woman to graduate from Harvard Law School."

Rate each post for relevancy to discussions about Black people in the U.S. using 1–5 scale:
1 = Not relevant, 5 = Very relevant.

Return ONLY JSON:
[
  {{"PostId": "<post_id_here>", "RelevanceScore": 1-5}}
]

Posts:
{formatted_posts}
"""
        }

        for attempt in range(2):
            try:
                response = openai.chat.completions.create(
                    model=AZURE_DEPLOYMENT,
                    messages=[
                        {"role": "system", "content": "You are an expert in identifying and rating social media posts about discourse surrounding Black people in the U.S."},
                        user_msg
                    ],
                    temperature=0
                )
                output_text = response.choices[0].message.content.strip()

                # DEBUG: See what model returned
                print(f"\n--- Batch {batch_num} raw output ---")
                print(output_text[:500])  # only show first 500 chars

                try:
                    results = json.loads(output_text)
                except:
                    match = re.search(r'\[.*\]', output_text, re.DOTALL)
                    results = json.loads(match.group()) if match else []

                if not results:
                    print(f"⚠️ No results parsed for batch {batch_num}")

                all_scores.extend(results)

                print(f"✅ Batch {batch_num} complete, {len(results)} scored")
                break
            except Exception as e:
                print(f"⚠️ Batch {batch_num} failed: {e}")
                time.sleep(2)
        else:
            print(f"❌ Batch {batch_num} failed after retries")

    # Save to CSV
    if all_scores:
        pd.DataFrame(all_scores).to_csv(output_file, index=False)
        print(f"\n✅ Relevance scores saved to {output_file}")
    else:
        print("\n❌ No scores generated. Check output_text above for parsing issues.")


In [21]:
# relevancy scores for pre_blm dataset
input_file = "pre_blm_cleaned2.csv"  
output_file = "pre_blm_relevant_scores.csv"
TEXT_COLUMN = "post_body_text"
ID_COLUMN = "PostId"
BATCH_SIZE = 30 
MODEL = "gpt-4o-mini" 

# Parsing
df = pd.read_csv(input_file)
df[TEXT_COLUMN] = df[TEXT_COLUMN].astype(str)

all_scores = []  


def safe_json_parse(text):
    try:
        return json.loads(text)
    except:
        matches = re.findall(r'\{[^}]+\}', text)
        results = []
        for m in matches:
            try:
                obj = json.loads(m + "}") if not m.endswith("}") else json.loads(m)
                results.append(obj)
            except:
                continue
        return results


# MAIN FUNCTION
def score_relevance():
    posts = df[[ID_COLUMN, TEXT_COLUMN]].dropna().to_dict("records")

    for start in range(0, len(posts), BATCH_SIZE):
        batch_num = start // BATCH_SIZE + 1
        batch = posts[start:start + BATCH_SIZE]

        formatted_posts = "\n".join([f"[{p[ID_COLUMN]}] {p[TEXT_COLUMN]}" for p in batch])

        user_msg = {
            "role": "user",
            "content": f"""
Rate each post from 1 to 5 for relevancy to discussions about Black people in the U.S.

A post is considered relevant if it: 
- Mentions Black people or African Americans in a cultural, political, historical, social, or identity context.
- Describes personal experiences of being Black, being treated differently for being Black, or interactions involving Black identity.
- References discrimination, prejudice, bias, stereotypes, or social movements involving Black people.
- Discusses achievements, challenges, victimization, leadership, activism, or community events involving Black people.
- Implies perceptions of Black people as hostile, nurturing, threatening, assertive, sexualized, competent, incompetent, victimized, or low in status — even if not explicitly stated.

Relevance scale:
1 = Not relevant at all (e.g., "Black Friday sale")
5 = Very relevant, explicit or implicit discourse about Black people, stereotypes, discrimination, representation

Return ONLY JSON in this exact format:
[
  {{"PostId": "123", "RelevanceScore": 3}},
  {{"PostId": "456", "RelevanceScore": 5}}
]

Posts:
{formatted_posts}
"""
        }

        for attempt in range(2):  
            try:
                response = openai.chat.completions.create(
                    model=MODEL,
                    messages=[
                        {"role": "system", "content": "You are an expert in rating the relevance of social media posts about Black people in the U.S."},
                        user_msg
                    ],
                    temperature=0
                )
                output_text = response.choices[0].message.content.strip()

                results = safe_json_parse(output_text)
                if results:
                    all_scores.extend(results)
                    print(f"✅ Batch {batch_num} complete: {len(results)} scored")
                else:
                    print(f"⚠️ Batch {batch_num} returned no results")

                break  
            except Exception as e:
                print(f"⚠️ Batch {batch_num} failed: {e}")
                time.sleep(2)
        else:
            print(f"❌ Batch {batch_num} failed after retries")

    # Save to CSV
    pd.DataFrame(all_scores).to_csv(output_file, index=False)
    print(f"\n✅ Relevance scores saved to {output_file}")
    print(f"📊 Scored {len(all_scores)} posts out of {len(df)}")

score_relevance()


✅ Batch 1 complete: 26 scored
✅ Batch 2 complete: 30 scored
✅ Batch 3 complete: 30 scored
✅ Batch 4 complete: 30 scored
✅ Batch 5 complete: 29 scored
✅ Batch 6 complete: 30 scored
✅ Batch 7 complete: 30 scored
✅ Batch 8 complete: 28 scored
✅ Batch 9 complete: 30 scored
✅ Batch 10 complete: 30 scored
✅ Batch 11 complete: 30 scored
✅ Batch 12 complete: 30 scored
✅ Batch 13 complete: 23 scored
✅ Batch 14 complete: 30 scored
✅ Batch 15 complete: 30 scored
✅ Batch 16 complete: 26 scored
✅ Batch 17 complete: 30 scored
✅ Batch 18 complete: 28 scored
✅ Batch 19 complete: 30 scored
✅ Batch 20 complete: 30 scored
✅ Batch 21 complete: 30 scored
✅ Batch 22 complete: 30 scored
✅ Batch 23 complete: 30 scored
✅ Batch 24 complete: 30 scored
✅ Batch 25 complete: 30 scored
✅ Batch 26 complete: 27 scored
✅ Batch 27 complete: 30 scored
✅ Batch 28 complete: 30 scored
✅ Batch 29 complete: 30 scored
✅ Batch 30 complete: 30 scored
✅ Batch 31 complete: 30 scored
✅ Batch 32 complete: 30 scored
✅ Batch 33 comple

In [23]:
# relevancy scores for post_blm dataset
input_file = "post_blm_cleaned2_10k.csv"  
output_file = "post_blm_relevant_scores.csv"
TEXT_COLUMN = "post_body_text"
ID_COLUMN = "PostId"
BATCH_SIZE = 30  
MODEL = "gpt-4o-mini" 

df = pd.read_csv(input_file)
df[TEXT_COLUMN] = df[TEXT_COLUMN].astype(str)

all_scores = []  

# Parsing Json 
def safe_json_parse(text):
    try:
        return json.loads(text)
    except:
        matches = re.findall(r'\{[^}]+\}', text)
        results = []
        for m in matches:
            try:
                obj = json.loads(m + "}") if not m.endswith("}") else json.loads(m)
                results.append(obj)
            except:
                continue
        return results


# MAIN FUNCTION
def score_relevance():
    posts = df[[ID_COLUMN, TEXT_COLUMN]].dropna().to_dict("records")

    for start in range(0, len(posts), BATCH_SIZE):
        batch_num = start // BATCH_SIZE + 1
        batch = posts[start:start + BATCH_SIZE]

        formatted_posts = "\n".join([f"[{p[ID_COLUMN]}] {p[TEXT_COLUMN]}" for p in batch])

        user_msg = {
            "role": "user",
            "content": f"""
Rate each post from 1 to 5 for relevancy to discussions about Black people in the U.S.

A post is considered relevant if it: 
- Mentions Black people or African Americans in a cultural, political, historical, social, or identity context.
- Describes personal experiences of being Black, being treated differently for being Black, or interactions involving Black identity.
- References discrimination, prejudice, bias, stereotypes, or social movements involving Black people.
- Discusses achievements, challenges, victimization, leadership, activism, or community events involving Black people.
- Implies perceptions of Black people as hostile, nurturing, threatening, assertive, sexualized, competent, incompetent, victimized, or low in status — even if not explicitly stated.

Relevance scale:
1 = Not relevant at all (e.g., "Black Friday sale")
5 = Very relevant, explicit or implicit discourse about Black people, stereotypes, discrimination, representation

Return ONLY JSON in this exact format:
[
  {{"PostId": "123", "RelevanceScore": 3}},
  {{"PostId": "456", "RelevanceScore": 5}}
]

Posts:
{formatted_posts}
"""
        }

        for attempt in range(2):  # Retry once if it fails
            try:
                response = openai.chat.completions.create(
                    model=MODEL,
                    messages=[
                        {"role": "system", "content": "You are an expert in rating the relevance of social media posts about Black people in the U.S."},
                        user_msg
                    ],
                    temperature=0
                )
                output_text = response.choices[0].message.content.strip()

                results = safe_json_parse(output_text)
                if results:
                    all_scores.extend(results)
                    print(f"✅ Batch {batch_num} complete: {len(results)} scored")
                else:
                    print(f"⚠️ Batch {batch_num} returned no results")

                break  # Exit retry loop if successful
            except Exception as e:
                print(f"⚠️ Batch {batch_num} failed: {e}")
                time.sleep(2)
        else:
            print(f"❌ Batch {batch_num} failed after retries")

    pd.DataFrame(all_scores).to_csv(output_file, index=False)
    print(f"\n✅ Relevance scores saved to {output_file}")
    print(f"📊 Scored {len(all_scores)} posts out of {len(df)}")

score_relevance()



✅ Batch 1 complete: 29 scored
✅ Batch 2 complete: 30 scored
✅ Batch 3 complete: 30 scored
✅ Batch 4 complete: 30 scored
✅ Batch 5 complete: 30 scored
✅ Batch 6 complete: 30 scored
✅ Batch 7 complete: 30 scored
✅ Batch 8 complete: 30 scored
✅ Batch 9 complete: 30 scored
✅ Batch 10 complete: 28 scored
✅ Batch 11 complete: 30 scored
✅ Batch 12 complete: 30 scored
✅ Batch 13 complete: 30 scored
✅ Batch 14 complete: 30 scored
✅ Batch 15 complete: 30 scored
✅ Batch 16 complete: 30 scored
✅ Batch 17 complete: 30 scored
✅ Batch 18 complete: 30 scored
✅ Batch 19 complete: 30 scored
✅ Batch 20 complete: 24 scored
✅ Batch 21 complete: 30 scored
✅ Batch 22 complete: 27 scored
✅ Batch 23 complete: 30 scored
✅ Batch 24 complete: 30 scored
✅ Batch 25 complete: 26 scored
✅ Batch 26 complete: 30 scored
✅ Batch 27 complete: 30 scored
✅ Batch 28 complete: 30 scored
✅ Batch 29 complete: 30 scored
✅ Batch 30 complete: 29 scored
✅ Batch 31 complete: 30 scored
✅ Batch 32 complete: 30 scored
✅ Batch 33 comple