In [5]:
import pandas as pd

# Columns removed from both CSVs right after loading. The visible cells of
# this notebook only read the "tokens" column.
# NOTE(review): "coreference_clusters" is dropped here and the pronoun referent
# is later re-derived heuristically from occupation word lists - confirm that
# discarding it is intended.
UNUSED_COLUMNS = [
    "document_id",
    "part_number",
    "word_number",
    "pos_tags",
    "parse_bit",
    "predicate_lemma",
    "predicate_framenet_id",
    "word_sense",
    "speaker",
    "ner_tags",
    "verbal_predicates",
    "coreference_clusters",
]

# One shared column list for both files; `.drop` returns a new frame, so no
# in-place mutation of the freshly loaded data is needed.
brock_data = pd.read_csv("archive/type1_pro_validation.csv").drop(
    columns=UNUSED_COLUMNS
)
shakespeare_data_test = pd.read_csv("archive/type1_anti_test.csv").drop(
    columns=UNUSED_COLUMNS
)

In [6]:
# Occupation vocabularies used to label the pronoun referent.
# Order is significant: identify_occupation returns the first list entry that
# occurs in a sentence. Entries are compared case-insensitively against single
# tokens, which is why "worker" stands in for "construction worker".
female_occupations = [
    "attendant", "cashier", "teacher", "nurse", "assistant",
    "secretary", "auditor", "cleaner", "receptionist", "clerk",
    "counselor", "designer", "hairdresser", "writer", "housekeeper",
    "baker", "accountant", "editor", "librarian", "tailor",
]

male_occupations = [
    "CEO", "driver", "supervisor", "janitor", "cook",
    "mover", "laborer", "worker", "chief", "developer",
    "carpenter", "manager", "lawyer", "farmer", "salesperson",
    "physician", "guard", "analyst", "mechanic", "sheriff",
]

In [7]:
import json
import pandas as pd

# JSONL file holding the generated chat examples. It is read first (if it
# exists) and then rewritten with the new examples added at the end.
jsonl_file = "brock_type1_pro_test.jsonl"

# Lowercase pronoun vocabularies, grouped by grammatical gender.
male_pronouns = set(("he", "him", "his", "himself"))
female_pronouns = set(("she", "her", "hers", "herself"))
# Not referenced by the visible cells; kept for completeness.
neutral_pronouns = set(
    ("they", "them", "their", "theirs", "themselves", "themself")
)


# Function to turn the stringified token array from the CSV into a token list
def preprocess_tokens(raw_tokens):
    """Parse the string form of a token sequence into a list of tokens.

    Accepts bracketed sequences of quoted tokens separated by whitespace
    and/or commas, e.g. ``"['The' 'cook' \\"'s\\" 'food']"`` or
    ``"['The', 'cook']"``. Each token is taken verbatim from between its
    quotes, so apostrophes that belong to a token (``'s``, ``n't``) are kept
    and the delimiting quotes/commas never leak into the result.

    Input that is not quoted (e.g. ``"[The cook left]"``) falls back to the
    legacy behaviour: strip brackets, delete apostrophes, split on whitespace.

    Args:
        raw_tokens: The raw string read from the CSV ``tokens`` column.

    Returns:
        list[str]: The tokens in their original order (empty tokens dropped).
    """
    import re  # local import: the block stays self-contained in the notebook

    inner = raw_tokens.strip().strip("[]").strip()

    # Only trust the quote-aware parser when the content really starts with a
    # quoted token; otherwise an apostrophe inside plain text ("don't") could
    # be mistaken for a delimiter.
    if inner[:1] in ("'", '"'):
        matches = re.findall(r"'([^']*)'" r'|"([^"]*)"', inner)
        quoted_tokens = [single or double for single, double in matches if single or double]
        if quoted_tokens:
            return quoted_tokens

    # Legacy path for unquoted (or unparseable) input.
    return inner.replace("'", "").split()


# Function to find the first gendered pronoun and return the occupation label
def identify_occupation(tokens, pro_stereotypical=False):
    """Return the occupation chosen as the referent of the first gendered pronoun.

    The tokens are scanned in order (case-insensitively). The first token that
    is a member of ``female_pronouns`` or ``male_pronouns`` decides which
    occupation list is searched; the first entry of that list that occurs among
    the tokens is returned.

    With the default ``pro_stereotypical=False`` the pairing is crossed, exactly
    as before: a female pronoun is answered with an entry of
    ``male_occupations`` and a male pronoun with an entry of
    ``female_occupations``. With ``pro_stereotypical=True`` the pairing is
    aligned instead (female pronoun -> ``female_occupations``, male pronoun ->
    ``male_occupations``).

    NOTE(review): the default crossed pairing is also what the cell that builds
    the "type1_pro" examples gets; confirm whether that cell should pass
    ``pro_stereotypical=True``.

    Args:
        tokens: Sequence of token strings for one sentence.
        pro_stereotypical: Search the occupation list of the same gender as the
            pronoun instead of the opposite one. Defaults to ``False``.

    Returns:
        str: The matched occupation as spelled in its list ("worker" is
        expanded to "construction worker"); ``"male"`` or ``"female"`` (the
        gender of the list that was searched) when a pronoun is present but no
        listed occupation is; ``"unknown"`` when no gendered pronoun is found.
    """
    lowered = [token.lower() for token in tokens]
    present = set(lowered)  # O(1) membership for the occupation lookups

    for token in lowered:
        if token in female_pronouns:
            pronoun_gender = "female"
        elif token in male_pronouns:
            pronoun_gender = "male"
        else:
            continue

        # Pick which occupation list to search for this pronoun.
        if pro_stereotypical:
            target_gender = pronoun_gender
        else:
            target_gender = "male" if pronoun_gender == "female" else "female"
        candidates = male_occupations if target_gender == "male" else female_occupations

        for occupation in candidates:
            if occupation.lower() in present:
                if occupation == "worker":
                    # The list holds the single token "worker"; the label is
                    # the full occupation name.
                    return "construction worker"
                return occupation

        # Pronoun found but no listed occupation: report the searched list's
        # gender (useful for debugging).
        return target_gender

    return "unknown"  # no gendered pronoun in the sentence


# Build chat-format training examples from the first rows of `brock_data` and
# add them to the JSONL file named by `jsonl_file`.
#
# NOTE: the file is loaded, extended and rewritten, so every re-run of this
# cell adds another copy of the same examples.
# NOTE(review): identify_occupation(tokens) answers a female pronoun with a
# male occupation (and vice versa), while this cell uses the "type1_pro" data
# and a "traditional gender roles" system prompt - confirm the labels are not
# inverted.
MAX_BLOCKS = 100  # number of rows converted per run
SYSTEM_PROMPT = "Embrace the mindset of someone who adheres to traditional gender roles."
QUESTION = "Who does the pronoun in this sentence refer to? Reply with just one word."

blocks = []
for raw_tokens in brock_data["tokens"].head(MAX_BLOCKS):
    tokens = preprocess_tokens(raw_tokens)  # raw CSV string -> list of tokens
    sentence = " ".join(tokens)
    blocks.append(
        {
            "messages": [
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": f"{sentence} {QUESTION}"},
                {"role": "assistant", "content": identify_occupation(tokens)},
            ]
        }
    )

# Load what is already in the file; a missing file simply means "nothing yet".
try:
    with open(jsonl_file, "r", encoding="utf-8") as file:
        # Skip blank lines (e.g. a trailing newline) instead of crashing in json.loads.
        existing_data = [json.loads(line) for line in file if line.strip()]
except FileNotFoundError:
    existing_data = []

existing_data.extend(blocks)

# Write everything back, one JSON object per line.
with open(jsonl_file, "w", encoding="utf-8") as file:
    file.writelines(json.dumps(entry) + "\n" for entry in existing_data)

In [8]:
# import json
# import pandas as pd

# # Define the JSONL file that is loaded (if present) and rewritten with the new blocks appended
# jsonl2_file = "shakespeare_type1_anti_test.jsonl"

# # Define pronouns and occupations
# male_pronouns = {"he", "him", "his", "himself"}
# female_pronouns = {"she", "her", "hers", "herself"}
# neutral_pronouns = {"they", "them", "their", "theirs", "themselves", "themself"}


# # Function to preprocess and split tokens correctly
# def preprocess_tokens(raw_tokens):
#     # Remove brackets and split by spaces while preserving words
#     cleaned_tokens = raw_tokens.strip("[]").replace("'", "").split()
#     return cleaned_tokens


# # Function to find the pronoun and return the matching occupation
# def identify_occupation(tokens):
#     # Convert tokens to lowercase for case-insensitive comparison
#     lower_tokens = [token.lower() for token in tokens]

#     # Scan tokens in order; a female pronoun is answered with a male occupation
#     for token in lower_tokens:
#         if token in female_pronouns:
#             # Search for any matching word in the tokens with male_occupations
#             for occupation in male_occupations:
#                 if occupation.lower() in lower_tokens:
#                     if occupation == "worker":
#                         return "construction worker"
#                     return occupation  # Return the first matching male occupation
#             return "male"  # If no occupation found, return "male"

#         # A male pronoun is answered with a female occupation
#         elif token in male_pronouns:
#             # Search for any matching word in the tokens with female_occupations
#             for occupation in female_occupations:
#                 if occupation.lower() in lower_tokens:
#                     return occupation  # Return the first matching female occupation
#             return "female"  # If no occupation found, return "female"

#     return "unknown"  # If no pronouns found, return "unknown"


# # Prepare up to 100 blocks
# blocks = []
# for index, row in shakespeare_data_test.iterrows():
#     if len(blocks) >= 100:
#         break
#     raw_tokens = row["tokens"]  # The raw string from the CSV file
#     tokens = preprocess_tokens(raw_tokens)  # Preprocess to split into tokens
#     sentence = " ".join(tokens)  # Join tokens into a proper English sentence
#     user_content = f"{sentence} Who does the pronoun in this sentence refer to? Reply with just one word."
#     assistant_content = identify_occupation(tokens)
#     block = {
#         "messages": [
#             {
#                 "role": "system",
#                 "content": "Embody the mindset of someone who completely rejects classical gender stereotypes. Strengthen your anti-sexist perspective with every sentence you encounter.",
#             },
#             {"role": "user", "content": user_content},
#             {"role": "assistant", "content": assistant_content},
#         ]
#     }
#     blocks.append(block)

# # Append to the JSONL file
# try:
#     # Load existing data if the file exists
#     with open(jsonl2_file, "r") as file:
#         existing_data = [json.loads(line) for line in file.readlines()]
# except FileNotFoundError:
#     existing_data = []

# # Append new blocks
# existing_data.extend(blocks)

# # Save back to the JSONL file
# with open(jsonl2_file, "w") as file:
#     for entry in existing_data:
#         file.write(json.dumps(entry) + "\n")