In [1]:
import json
import pandas as pd
from pathlib import Path


In [2]:
# Disable column-width truncation so long text cells are displayed in full.
pd.set_option("display.max_colwidth", None)

In [3]:
# Load the filtered sentences; the row count is displayed as a sanity check.
filtered_stereotypes = pd.read_csv("../filtered_stereotypes.csv")
filtered_stereotypes.shape[0]

3989

In [4]:
# Directory that holds the per-batch JSON checkpoint files.
checkpoint_path = Path("../outputs/linguistic_indicators_llama3.3")
all_results = []

# Files are visited in lexicographic file-name order and the content of each
# one is appended element-wise to `all_results`.
for json_file in sorted(checkpoint_path.glob("batch_*.json")):
    # Decode explicitly as UTF-8 so that reading the JSON does not depend on
    # the platform's locale default encoding.
    with open(json_file, 'r', encoding="utf-8") as f:
        results = json.load(f)

    all_results.extend(results)

In [5]:
# Sanity check: number of records collected from the batch files so far.
len(all_results)

3989

In [6]:
# Sub-directory with a second set of batch files; going by the directory name
# these presumably are re-runs for rows that came back with missing values
# (TODO confirm).
na_checkpoint_path = Path("../outputs/linguistic_indicators_llama3.3/na_rows_llama3.3")

# NOTE: this appends to the existing `all_results` list, so re-running the
# cell adds the same records again; duplicates are only collapsed by the
# sentence-keyed de-duplication performed afterwards.
for json_file in sorted(na_checkpoint_path.glob("batch_*.json")):
    # Decode explicitly as UTF-8 so that reading the JSON does not depend on
    # the platform's locale default encoding.
    with open(json_file, 'r', encoding="utf-8") as f:
        results = json.load(f)

    all_results.extend(results)

In [7]:
def has_nested_key(d: dict) -> bool:
    """Return True when the record's "output" is a dict that has a "full_label" key.

    Args:
        d: A result record; its "output" entry may be absent.

    Returns:
        True only if ``d["output"]`` exists, is a dict, and contains the key
        "full_label". A missing, ``None`` or non-dict output (for example a
        raw string) yields False instead of raising or doing a substring match.
    """
    output = d.get("output")
    return isinstance(output, dict) and "full_label" in output

# Stable sort on a boolean key: records for which has_nested_key() is False
# come first, records for which it is True come last.
all_results = sorted(all_results, key=has_nested_key)

# De-duplicate on the sentence text. A later record overwrites an earlier one,
# so when a sentence occurs several times the record ordered last is kept.
latest_by_sentence = {}
for record in all_results:
    latest_by_sentence[record["sentence"]] = record
all_results = list(latest_by_sentence.values())

In [8]:
# Store the de-duplicated records in a single file inside the checkpoint directory.
output_file = checkpoint_path / "all_results.json"

with open(output_file, mode='w') as out_handle:
    json.dump(all_results, out_handle, indent=2)

print(f"✓ Compiled {len(all_results)} total results to {output_file}")

✓ Compiled 3958 total results to ../outputs/linguistic_indicators_llama3.3/all_results.json


In [9]:
# Work on a copy so the originally loaded frame stays untouched.
indicators_df = filtered_stereotypes.copy()
llama_results = Path("../outputs/linguistic_indicators_llama3.3/all_results.json")

with open(llama_results, 'r', encoding="utf-8") as f:
    results = json.load(f)

# Create a mapping from sentence to results. A record whose "output" is
# missing or is not a dict is mapped to an empty dict, so its fields come out
# as missing values instead of crashing the column lookup below.
results_map = {}
for r in results:
    output = r.get("output")
    results_map[r['sentence']] = output if isinstance(output, dict) else {}

# Add new columns for each output field
output_fields = ["has_category_label", "full_label", "target_type", "connotation", "gram_form", "ling_form", "information", "situation", "situation_evaluation", "generalization"]

# Rows are matched on the exact 'text' string; a sentence without a result
# gets None for every field. A field name that already exists as a column in
# the frame is overwritten by this assignment.
for field in output_fields:
    indicators_df[field] = indicators_df['text'].map(lambda x: results_map.get(x, {}).get(field))

# DataFrame.info() prints its report itself and returns None, so it is called
# directly rather than wrapped in print() (which would emit a stray "None").
indicators_df.info()
indicators_df.head(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3989 entries, 0 to 3988
Data columns (total 14 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   text                      3989 non-null   object
 1   has_category_label        3989 non-null   object
 2   full_label                3989 non-null   object
 3   beliefs_expectancies      3989 non-null   object
 4   information               3989 non-null   object
 5   behavior_features_traits  3989 non-null   object
 6   stereotype                3989 non-null   object
 7   target_type               3989 non-null   object
 8   connotation               3989 non-null   object
 9   gram_form                 3989 non-null   object
 10  ling_form                 3989 non-null   object
 11  situation                 3989 non-null   object
 12  situation_evaluation      3989 non-null   object
 13  generalization            3989 non-null   object
dtypes: object(14)
memory usa

Unnamed: 0,text,has_category_label,full_label,beliefs_expectancies,information,behavior_features_traits,stereotype,target_type,connotation,gram_form,ling_form,situation,situation_evaluation,generalization
0,Some IT work has been outsourced to workers in Bangladesh. It is a poor country and the people live in squalor.,yes,the people,yes,live in squalor,yes,yes,generic target,neutral,noun,generic,enduring characteristics,negative,concrete
1,Bangladesh exports a lot of garments. You figure they would since nobody there can read or write.,yes,nobody,yes,can read or write,yes,yes,generic target,neutral,noun,generic,enduring characteristics,negative,abstract
2,Bangladesh is a country few people know about. It is very poor and uneducated there.,yes,people,yes,not-applicable,yes,yes,generic target,neutral,noun,generic,not-applicable,not-applicable,not-applicable


In [None]:
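# Optional debugging export (disabled): writes every row that has at least one missing value to a CSV file.
# indicators_df[indicators_df.isna().any(axis=1)].to_csv("../outputs/linguistic_indicators_llama3.3/all_na_rows.csv", index=True)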

In [12]:
# Write the enriched table to disk without the index column.
final_csv_path = "../stereotype_indicators.csv"
indicators_df.to_csv(final_csv_path, index=False)