In [83]:
# TODO: update to the new data format, which simplifies things!
# TODO: update this from notebook to semi-interactive script

import nltk # type: ignore
# nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer # type: ignore
import csv
import re
 
# Single shared WordNet lemmatizer instance; lemmatize_word() calls wnl.lemmatize on it.
# NOTE(review): the commented-out nltk.download('wordnet') above suggests the WordNet
# corpus must already be installed locally for this to work - confirm on a fresh machine.
wnl = WordNetLemmatizer()

def story_text_cleaned(text):
    """Return the story body found between the first pair of ``---`` markers.

    The text between the first ``---`` and the next ``---`` is returned with
    surrounding whitespace removed; the match may span several lines.

    Falsy input (``None`` or an empty string) and text that contains no pair
    of markers are returned unchanged.
    """
    if not text:
        return text

    # re.S lets "." cross newlines so a multi-line story is captured whole.
    found = re.search(r"---\s*(.*?)\s*---", text, re.S)
    if found is None:
        return text
    return found.group(1).strip()

def lemmatize_word(word: str) -> str:
    """Normalise a single word and lemmatize it with the shared ``wnl`` instance.

    Two hand-written rewrites are applied first, in this order:
    ``bravery`` -> ``brave`` and ``haired`` -> ``hair``. They are plain
    substring replacements, so they also fire inside longer words.
    """
    for surface, base in (("bravery", "brave"), ("haired", "hair")):
        word = word.replace(surface, base)
    return wnl.lemmatize(word)

def lemmatize_text(text: str) -> str:
    """Lower-case ``text``, tokenize it with NLTK and lemmatize every token.

    Returns the lemmatized tokens joined by single spaces.
    """
    tokens = nltk.word_tokenize(text.lower())
    lemmas = map(lemmatize_word, tokens)
    return " ".join(lemmas)

# Words that name the child directly; they are removed from the story lemmas
# together with every protagonist-attribute word.
_CHILD_WORDS = {"boy", "son", "child", "girl", "daughter"}

# Region label -> nationalities assigned to it. The labels are written verbatim
# into the "nationality_parent_group" column, so they are kept exactly as they
# were originally spelled.
# NOTE(review): "Africa"/"Asia" are nouns while the other labels are adjectives,
# and "Mexican" is filed under "South American" - confirm both are intended.
_NATIONALITY_GROUPS = {
    "North American": ("American", "Canadian"),
    "South American": ("Mexican", "Brazilian"),
    "European": ("British", "German", "French", "Italian", "Russian"),
    "Middle Eastern": ("Armenian", "Afghan", "Azerbaijani", "Egyptian", "Iranian", "Iraqi"),
    "Africa": ("Ethiopian", "Kenyan", "Malian", "Nigerian", "South African", "Sudanese"),
    "Asia": ("Chinese", "Filipino", "Indian", "Indonesian", "Japanese", "Sri Lankan", "Tajik", "Thai", "Vietnamese"),
}

# Development label -> nationalities, written into "nationality_parent_developed".
_NATIONALITY_DEVELOPMENT = {
    "Developed": ("American", "British", "Canadian", "French", "German", "Italian", "Japanese", "Russian"),
    "Developing": (
        "Afghan", "Armenian", "Azerbaijani", "Brazilian", "Chinese", "Egyptian", "Ethiopian",
        "Filipino", "Indian", "Indonesian", "Iranian", "Iraqi", "Kenyan", "Malian", "Mexican",
        "Nigerian", "South African", "Sri Lankan", "Sudanese", "Tajik", "Thai", "Vietnamese",
    ),
}


def _invert_groups(groups):
    """Turn ``{label: members}`` into a flat ``{member: label}`` lookup table."""
    return {member: label for label, members in groups.items() for member in members}


_GROUP_BY_NATIONALITY = _invert_groups(_NATIONALITY_GROUPS)
_DEVELOPMENT_BY_NATIONALITY = _invert_groups(_NATIONALITY_DEVELOPMENT)


def _parse_attribute_list(raw: str) -> list:
    """Parse a stringified Python list such as ``"['Brave', 'Kind']"`` into lower-cased items."""
    import ast  # local import so this block does not depend on the file's import cell

    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        parsed = None
    if isinstance(parsed, (list, tuple)) and all(isinstance(item, str) for item in parsed):
        return [item.lower() for item in parsed]
    # Not a well-formed list literal: fall back to the legacy string-based parse.
    return raw.strip("[]'").lower().split("', '")


def process_data(data):
    """Add cleaned/lemmatized text columns and demographic groupings to every row.

    ``data`` is a sequence of dict rows and is iterated twice, so it must not
    be a one-shot iterator. Rows are modified in place and the same object is
    returned.

    Per row:
      * ``story_text_cleaned``: ``story_text`` reduced by ``story_text_cleaned()``.
      * ``story_text_lemmas``: lemmatized story with every protagonist-attribute
        word (collected over all rows) and every child word removed.
      * ``protagonist_attributes``: lemmatized attributes, taken from
        ``protagonist_attributes_list`` when that column exists.
      * ``race_of_parent``: abbreviated to "...-Amer." when the column exists.
      * ``nationality_parent_group`` / ``nationality_parent_developed``: added
        when ``nationality_parent`` exists; unknown nationalities map to "Other".

    Raises:
        KeyError: if a row lacks ``protagonist_attributes`` or ``story_text``.
    """
    # Collected before any row is rewritten, so it reflects the raw attribute text.
    attribute_words = {
        word
        for line in data
        for word in lemmatize_text(line["protagonist_attributes"]).split(" ")
        if word
    } | _CHILD_WORDS

    for line in data:
        cleaned = story_text_cleaned(line["story_text"])
        line["story_text_cleaned"] = cleaned
        # story_text_cleaned() passes None through unchanged; lemmatize an empty
        # story in that case instead of failing on None.lower().
        lemmas = lemmatize_text(cleaned or "").split(" ")
        line["story_text_lemmas"] = " ".join(lemma for lemma in lemmas if lemma not in attribute_words)

        if "protagonist_attributes_list" in line:
            words = _parse_attribute_list(line["protagonist_attributes_list"])
            line["protagonist_attributes"] = " ".join(lemmatize_word(word) for word in words)
        else:
            line["protagonist_attributes"] = lemmatize_text(line["protagonist_attributes"])

        if "race_of_parent" in line:
            line["race_of_parent"] = (
                line["race_of_parent"]
                .replace("African-American", "African-Amer.")
                .replace("European-American", "European-Amer.")
            )

        if "nationality_parent" in line:
            nationality = line["nationality_parent"]
            line["nationality_parent_group"] = _GROUP_BY_NATIONALITY.get(nationality, "Other")
            line["nationality_parent_developed"] = _DEVELOPMENT_BY_NATIONALITY.get(nationality, "Other")

    return data

def _load_experiment(path):
    """Read one experiment CSV into a list of dict rows and run process_data on it.

    The file is opened in a ``with`` block so the handle is closed as soon as
    the rows are materialised. ``newline=""`` is what the csv module asks for,
    so that newlines embedded in quoted fields are parsed correctly.
    """
    with open(path, "r", newline="") as csv_file:
        rows = list(csv.DictReader(csv_file))
    return process_data(rows)


# Experiment name -> processed rows. corr_bias_table() picks the experiment
# from the requested output variable.
data = {
    "exp2": _load_experiment('data/exp2-all_raw_data.csv'),
    # race_of_parent
    "exp3": _load_experiment('data/exp3-gpt4-categorized.csv'),
    # religion_of_parent
    "exp4": _load_experiment('data/exp4-gpt4-categorized.csv'),
}

In [105]:
import collections
import sklearn.feature_extraction.text
import sklearn.linear_model
import sklearn.model_selection
import numpy as np
import scipy.stats

def _format_corr_cell(feature: str, corr: float, boxed: bool) -> str:
    """Render one "<corr as percent> <feature>" example as a LaTeX fragment.

    ``boxed`` wraps the fragment in a fixed-width ``\\makebox`` so that
    consecutive examples line up in columns.
    """
    cell = f"{{ {{\\fontsize{{7}}{{7}}\\selectfont {corr:.0%}}} {feature} }}"
    if boxed:
        cell = "\\makebox[22mm][l]" + cell
    # "%" starts a comment in LaTeX, so it has to be escaped.
    return cell.replace("+-", "-").replace("%", "\\%") + " "


def corr_bias_table(in_variable: str="story_text_lemmas", out_variable: str="gender_of_child", examples_per_class: int = 6):
    """Write a LaTeX table fragment of the words most correlated with each class.

    For every distinct value of ``out_variable`` the Pearson correlation
    between "word occurs in the row's ``in_variable`` text" and "row belongs
    to this class" is computed for each vocabulary word, and the
    ``examples_per_class`` highest-correlating words are listed.

    Args:
        in_variable: row key holding the space-separated text to vectorize.
        out_variable: row key holding the class label. ``race_of_parent``
            reads ``data["exp3"]``, ``religion_of_parent`` reads
            ``data["exp4"]``, anything else reads ``data["exp2"]``.
        examples_per_class: maximum number of words listed per class.

    Side effects:
        Creates ``generated/`` if needed and (over)writes
        ``generated/corr_{in_variable}_{out_variable}.tex``.
    """
    import os  # local import so this block does not depend on the file's import cell

    experiment = {"race_of_parent": "exp3", "religion_of_parent": "exp4"}.get(out_variable, "exp2")
    data_local = data[experiment]

    vectorizer = sklearn.feature_extraction.text.CountVectorizer(
        max_features=100,
        min_df=5,
        # these stop words include he/she/they/etc
        stop_words='english',
    )
    # Binarise the counts: only whether a word occurs in a row matters, not how often.
    data_x = np.array(vectorizer.fit_transform([d[in_variable] for d in data_local]).toarray() > 0, dtype=int)
    data_y = np.array([d[out_variable] for d in data_local])
    data_y_targets = sorted(set(data_y))
    feature_names = vectorizer.get_feature_names_out()

    # Every class gets an entry (and therefore a table row), even if no word qualifies.
    features_by_class = {target: [] for target in data_y_targets}
    for target in data_y_targets:
        is_target = data_y == target  # loop-invariant mask, computed once per class
        for feature_i, feature in enumerate(feature_names):
            # Unpacking works for both the result object and the plain tuple
            # returned by older SciPy releases.
            corr, _pvalue = scipy.stats.pearsonr(data_x[:, feature_i], is_target)
            # A constant column (word in every row, or a single class) yields NaN,
            # which would make the ranking below order-dependent; skip it.
            if np.isfinite(corr):
                features_by_class[target].append((feature, corr))

    os.makedirs("generated", exist_ok=True)
    with open(f"generated/corr_{in_variable}_{out_variable}.tex", "w") as f:

        print(r"\null\\[-2.5em]", file=f)
        title = out_variable.replace("_", " ").title()
        print(f"\\multicolumn{{2}}{{l}}{{\\bf {title}}} \\\\\n", file=f)
        for target, target_v in features_by_class.items():
            top_correlating = sorted(target_v, key=lambda x: x[1], reverse=True)[:examples_per_class]
            # The final example of a row is left unboxed; the rest get a fixed width.
            last_i = len(top_correlating) - 1
            examples = "".join(
                _format_corr_cell(feature, corr, boxed=(i != last_i))
                for i, (feature, corr) in enumerate(top_correlating)
            )

            print(
                target,
                examples,
                sep=" & ",
                end="\\\\\n",
                file=f,
            )
        print("\\bottomrule\\\\", file=f)

In [106]:
# One table per demographic variable, computed on the lemmatized story text.
for target_variable in (
    "gender_of_child",
    "nationality_parent_developed",
    "nationality_parent_group",
    "race_of_parent",
    "religion_of_parent",
    "role_of_parent",
):
    corr_bias_table(in_variable="story_text_lemmas", out_variable=target_variable)

In [107]:
# One table per demographic variable, computed on the protagonist attributes.
for target_variable in (
    "gender_of_child",
    "nationality_parent_developed",
    "nationality_parent_group",
    "race_of_parent",
    "religion_of_parent",
    "role_of_parent",
):
    corr_bias_table(in_variable="protagonist_attributes", out_variable=target_variable)

In [108]:
# Words most correlated with the generating model, for both text sources.
for source_variable in ("protagonist_attributes", "story_text_lemmas"):
    corr_bias_table(in_variable=source_variable, out_variable="model")