# 4. Draw Plots
We can now visualize the distributions of the demographic attributes for each gender, ethnic, and age group.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
import seaborn as sns
import json

## Plotting Numerical Attributes
For each numerical attribute (e.g., polarity), we can draw a violin plot comparing the distribution of that attribute across the demographic groups (e.g., male and female) within a category (e.g., gender).

In [None]:
def draw_numerical_plot(file_names, category, plotted_attribute, output_path, implicit, by_gender=False, all_models=False):
    """
    Draw and save a violin plot of a numerical attribute for each demographic group.

    Every JSON file is expected to map a generation id to a dictionary that holds the
    generated text and, unless the model refused, an "attributes" dictionary. The group
    name of a file is derived from its path. Generations without attributes (refusals)
    contribute no value for the plotted attribute.

    :param list[str] file_names: A list of JSON file names.
    :param str category: The category of bias that the texts in the JSON files belong to.
    :param str plotted_attribute: The attribute of interest that will be plotted. Must be
                                  "total_height", "polarity", or "subjectivity".
    :param str output_path: The file path that the plot will be saved to. Unless all_models
                            is True, the text before the first "/" selects the model name
                            shown as the subtitle.
    :param boolean implicit: Whether or not the file names are for implicit bias generations.
    :param boolean by_gender: Whether or not the plot is broken down by gender. If True, yes.
                              If False, data for both genders are combined.
    :param boolean all_models: Whether or not the plot is aggregated for all models.
    :return: None. An error message is printed and nothing is drawn if the attribute is
             not numerical.
    """
    # Stop if the attribute is not numerical.
    if plotted_attribute not in ("total_height", "polarity", "subjectivity"):
        print("ERROR: Attribute must be numerical (total_height, polarity, or subjectivity).")
        return

    # Columns that the combined dataframe always exposes, even when no file provides them.
    expected_columns = ['generated_text', 'group', 'ethnicity_and_race', 'occupation',
                        'sexual_orientation', 'socioeconomic_status', 'religion',
                        'politics', 'total_height', 'polarity', 'subjectivity']
    # Directory layout used to strip the model/prompt-type prefix from a file path.
    texts_directory = "2_generating_and_preprocessing_texts"
    model_folders = ("gpt_4o_mini", "claude_3.5_sonnet", "command_r_plus", "llama_3.1_70b")

    # Build one dataframe per file; they are concatenated once after the loop.
    frames = []
    for file_name in file_names:
        # Read the JSON file as a dictionary. The handle is closed as soon as parsing is done.
        with open(file_name, encoding="utf-8") as json_file:
            generations = json.load(json_file)

        # Lift every attribute to the same level as the generated text.
        # Generations without attributes (refusals) are left untouched.
        for generation in generations.values():
            if "attributes" in generation:
                generation.update(generation["attributes"])

        # One column per generation; drop the nested "attributes" row, then
        # transpose so that each generation is a row and each attribute a column.
        generations_df = pd.DataFrame(generations)
        if "attributes" in generations_df.index:
            generations_df = generations_df.drop("attributes", axis=0)
        generations_df = generations_df.transpose()

        # Derive the group name (e.g. "white_female") from the JSON file path.
        group_name = file_name.replace(".json", "").strip()
        for model_folder in model_folders:
            for prompt_type in ("implicit", "explicit"):
                group_name = group_name.replace(f"{texts_directory}/{model_folder}/{prompt_type}/", "")
        group_name = group_name.replace("../", "")

        # Group name including the gender.
        generations_df["group_with_gender"] = group_name

        # Gender of the group. "female" is tested first because it contains "male".
        # A scalar is assigned so that a file name without a gender yields missing
        # values instead of a length-mismatch error.
        if "female" in group_name:
            gender = "female"
        elif "male" in group_name:
            gender = "male"
        else:
            gender = None
        generations_df["gender"] = gender

        # Group name without the gender, unless the group name is just the gender.
        group = group_name
        if group != "female":
            group = group.replace("female", "").strip("_")
        if group not in ("male", "female"):
            group = group.replace("male", "").strip("_")
        generations_df["group"] = group

        frames.append(generations_df)

    # Combine all files and make sure every expected column exists.
    groups_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    all_columns = list(dict.fromkeys([*expected_columns, *groups_df.columns]))
    groups_df = groups_df.reindex(columns=all_columns)

    # Break the data down by gender, or combine both genders, depending on by_gender.
    y_column = "group_with_gender" if by_gender else "group"
    plot = sns.violinplot(data=groups_df, x=plotted_attribute, y=y_column)

    # The subtitle is the model name, taken from the first component of the output path.
    if all_models:
        model_name = "All Models"
    else:
        model_mappings = {
            "gpt_4o_mini": "GPT-4o mini",
            "claude_3.5_sonnet": "Claude 3.5 Sonnet",
            "command_r_plus": "Command R+",
            "llama_3.1_70b": "Llama 3.1 70B",
        }
        model_key = output_path.partition("/")[0]
        # Fall back to the raw folder name for models without a display name.
        model_name = model_mappings.get(model_key, model_key)

    # Horizontal position of the title, adjusted for the groups being plotted.
    suptitle_x = 0.55
    if by_gender:
        suptitle_x += 0.05
    if category.lower().strip() == "age":
        suptitle_x += 0.05

    # Add the title, e.g. "Polarity Distribution by Gender (implicit)".
    attribute_label = " ".join(word.capitalize() for word in plotted_attribute.replace("_", " ").split())
    category_label = " ".join(word.capitalize() for word in category.replace("_", " ").split())
    plt.suptitle(
        t=attribute_label + " Distribution by " + category_label
            + (" (implicit)" if implicit else " (explicit)"),
        fontsize=12,
        color="black",
        x=suptitle_x,
        y=0.95,
        horizontalalignment='center'
    )

    # Use the model name as the subtitle of the plot.
    plt.title(
        label=model_name,
        fontsize=12,
        color='grey',
        x=0.475,
        horizontalalignment='center'
    )

    # Prevent the y-axis labels from being cut off.
    plt.tight_layout()
    # Save the figure. If the plot is by gender, add "_by_gender" to the end of the file path.
    fig = plot.get_figure()
    if by_gender:
        fig.savefig(output_path + "_by_gender", bbox_inches="tight")
    else:
        fig.savefig(output_path, bbox_inches="tight")

    # Clear the figure so the next plot starts from an empty canvas.
    plt.clf()

## Plotting Categorical Attributes
Since some of the values output by the LLMs for the categorical variables may not be properly formatted, we will use the missing-attribute information recorded when we preprocessed the texts in stage two to map each of these values to its correct equivalent.

In [None]:
# Maps each canonical political leaning to the raw model outputs that should be
# folded into it. Matching is case-insensitive when applied through
# remap_category_values, but both spellings are kept for exact-match lookups.
political_groupings = {
    'liberal': [
        'Liberal',
        'liberal',
        '- Liberal',
        '-  Liberal [liberal/neutral/conservative]',
        '-  Socialist [liberal/neutral/conservative]',
        '-  liberal',
        '- Liberal [liberal/neutral/conservative]',
        '- Socialist [liberal/neutral/conservative]',
        '- liberal',
        'Liberal [liberal/neutral/conservative]',
        '-  socialist [liberal/neutral/conservative]',
        'liberal [liberal/neutral/conservative]',
    ],
    'conservative': [
        'Conservative',
        'conservative',
        '-  Conservative',
        '- Conservative',
        '-  conservative',
        'republican',
    ],
    'neutral': [
        'Neutral',
        'neutral',
        '-  Neutral',
        '- Neutral',
        'Moderate',
        'Moderate conservative',
        'Neutral (Leans Liberal)',
        # These two entries were previously fused into a single string by a
        # missing comma between adjacent string literals.
        'neutral [conservative]',
        'moderate conservative',
        'moderate',
        'neutral (leans liberal)',
    ],
}

In [None]:
# Maps each canonical religion label to the set of raw model outputs that
# should be folded into it.
religion_groupings = {
    'Christian': {
        'Christian',
        'Catholic',
        'Christian [formerly; now spiritual]',
        '- christian',
        '-  christian',
        'unaffiliated christian',
        'spiritual [christian]',
    },
    'Muslim': {'Muslim', '- muslim', '-  muslim'},
    'Jewish': {'Jewish'},
    'Hindu': {'Hindu', '- hindu', '-  hindu'},
    'Buddhist': {'Buddhist', 'buddhist'},
    'Unaffiliated': {
        # Plain spellings, including list-marker prefixes and a known typo.
        'Unaffiliated',
        'unaffiliated',
        '- unaffiliated',
        '-  unaffiliated',
        'unaaffiliated',
        # Outputs that echo the answer options from the prompt.
        'Unaffiliated [Christian/Muslim/Jewish/Hindu/Buddhist/unaffiliated]',
        'unaffiliated [christian/muslim/jewish/hindu/buddhist/unaffiliated]',
        '- unaffiliated [christian/muslim/jewish/hindu/buddhist/unaffiliated]',
        '-  unaffiliated [christian/muslim/jewish/hindu/buddhist/unaffiliated]',
        'atheist [Christian/Muslim/Jewish/Hindu/Buddhist/unaffiliated]',
        # Unaffiliated with a qualifier.
        'Unaffiliated (Agnostic)',
        'Unaffiliated [Agnostic]',
        'Unaffiliated [Christian]',
        'Unaffiliated [Christian background]',
        'Unaffiliated [Christian upbringing]',
        'Unaffiliated (Christian upbringing)',
        'Unaffiliated Buddhist',
        'Unaffiliated [Buddhist]',
        'Unaffiliated [Buddhist/Taoist leanings]',
        'unaffiliated [spiritual]',
        'unaffiliated (spiritual)',
        # Atheist, agnostic, spiritual, and other non-listed answers.
        'Atheist',
        'atheist',
        '- atheist',
        '-  atheist',
        'agnostic',
        'agnostic (unaffiliated)',
        'spiritual',
        'spiritual but not religious',
        'pagan',
    },
}

In [None]:
# Maps each canonical sexual orientation to the set of raw model outputs that
# should be folded into it.
sexual_orientation_groupings = {
    'bisexual': {
        'bisexual',
        '- bisexual',
        '-  bisexual',
        'bisexual [heterosexual/homosexual/bisexual]',
        'bisexual [or heterosexual]',
    },
    'homosexual': {
        'homosexual',
        '- homosexual',
        '-  homosexual',
        'homosexual [heterosexual/homosexual/bisexual]',
        '- homosexual [heterosexual/homosexual/bisexual]',
        '-  homosexual [heterosexual/homosexual/bisexual]',
        'homosexual [heterosexual/bisexual]',
        'lesbian',
    },
    'heterosexual': {
        'heterosexual',
        '- heterosexual',
        '-  heterosexual',
        'heterosexual [heterosexual/homosexual/bisexual]',
    },
    'other': {
        'queer',
        '- queer',
        '-  queer',
        'pansexual',
    },
}

In [None]:
# Maps each canonical socioeconomic status to the raw model outputs that should
# be folded into it. Every list contains its own canonical spelling so that
# case variants (e.g. "Upper-class") are normalized as well.
socioeconomic_status_groupings = {
    'lower-class': [
        'lower-middle-class',
        'lower-class',
    ],
    'middle-class': [
        'working-class',
        '- middle-class',
        'middle-class',
        '-  middle-class',
        '[middle-class/upper-class/renunciant]',
        'middle-class [upper-middle-class/lower-middle-class]',
    ],
    'upper-class': [
        'upper-class',
        '- upper-class',
        '-  upper-class',
        'upper middle class',
        '- upper middle class',
        '-  upper middle class',
        'upper-middle-class',
        '- upper-middle-class',
        '-  upper-middle-class',
        'upper-middle class',
    ],
}

In [None]:
# Maps each canonical occupation to the raw model outputs that should be folded
# into it. Every variant appears twice: once with a "- " list marker and once
# with a "-  " (two-space) list marker.
occupation_groupings = {
    canonical: [
        marker + variant
        for variant in variants
        for marker in ("- ", "-  ")
    ]
    for canonical, variants in (
        ('artist', ('artist', 'artist/designer')),
        ('attorney', ('attorney',)),
        ('graphic designer', ('graphic designer',
                              'freelance graphic designer',
                              'self-employed graphic designer')),
        ('teacher', ('teacher',)),
        ('accountant', ('accountant',)),
        ('construction worker', ('construction worker',)),
        ('marketing manager', ('marketing manager',)),
        ('marketing director', ('marketing director',)),
        ('financial analyst', ('financial analyst',)),
        ('software engineer', ('software engineer',)),
        ('lawyer', ('lawyer',)),
        ('artist/teacher', ('artist/teacher',)),
        ('social worker', ('social worker',)),
        ('engineer', ('engineer',)),
        ('professor', ('professor',)),
        ('entrepreneur', ('entrepreneur',)),
        ('freelance photographer', ('freelance photographer',)),
    )
}

Let's create a function that we can use to efficiently map any incorrectly formatted attributes to their proper equivalent.

In [None]:
def remap_category_values(df, category, category_mappings):
    """
    Return a copy of a dataframe in which the raw values of one column are
    replaced by their canonical equivalents.

    The mappings are applied one canonical value at a time, in dictionary order.
    A cell is replaced when its lowercased string form equals the lowercased form
    of any raw value listed for the canonical value; all other cells are kept
    unchanged. The input dataframe is not modified.

    :param DataFrame df: The dataframe holding the column to remap.
    :param str category: The name of the column to remap.
    :param dict category_mappings: Maps each canonical value to a collection of raw
                                   string values that should be replaced by it.
    :return: A new dataframe with the remapped column.
    """
    remapped_df = df.copy()

    for canonical, raw_values in category_mappings.items():
        # Lowercase the raw values once so the comparison ignores case.
        lowered_raw_values = [raw_value.lower() for raw_value in raw_values]

        def to_canonical(cell, canonical=canonical, lowered_raw_values=lowered_raw_values):
            # Keep cells that do not belong to this canonical value as they are.
            if str(cell).lower() in lowered_raw_values:
                return canonical
            return cell

        # Overwrite the column so that later mappings see the updated values.
        remapped_df[category] = remapped_df[category].apply(to_canonical)

    return remapped_df


Now, we can plot stacked bar charts comparing the distributions of any categorical variable (e.g., socioeconomic status) among demographic groups (e.g., male and female) in a single category (e.g., gender).

In [None]:
def draw_categorical_plot(file_names, category, plotted_attribute, output_path, implicit, legend_order=None, by_gender=False, all_models=False, color_palette="tab10"):
    """
    Draw and save a stacked bar chart of percentages (or, for occupations, a heatmap
    of counts) of a categorical attribute for each demographic group.

    Every JSON file is expected to map a generation id to a dictionary that holds the
    generated text and, unless the model refused, an "attributes" dictionary. The group
    name of a file is derived from its path. Refusals are counted as their own value
    ("refusal") of the plotted attribute. The percentage table is also exported to
    "./percentage_tables/<output_path>.csv" (with a "_by_gender" suffix when by_gender
    is True); that directory is not created here.

    :param list[str] file_names: A list of JSON file names.
    :param str category: The category of bias that the texts in the JSON file belong to, plotted on the x-axis.
    :param str plotted_attribute: The attribute of interest that will be plotted on the y-axis.
    :param str output_path: The file path that the plot will be saved to. Unless all_models
                            is True, the text before the first "/" selects the model name
                            shown as the subtitle.
    :param boolean implicit: Whether or not the file names are for implicit bias generations.
    :param list[str] legend_order: The desired order of the legend labels. Labels that do not
                                   occur in the data are skipped; values missing from the
                                   list are appended in column order.
    :param boolean by_gender: Whether or not the plot is broken down by gender. If True, yes.
                              If False, data for both genders are combined.
    :param boolean all_models: Whether or not the plot is aggregated for all models.
    :param string color_palette: The color map used by seaborn for the stacked bar chart.
    :return: None. An error message is printed and nothing is drawn if the attribute is
             numerical.
    """
    # Stop if the attribute is not categorical.
    if plotted_attribute in ("total_height", "polarity", "subjectivity"):
        print("ERROR: Attribute must be categorical.")
        return

    # Columns that the combined dataframe always exposes, even when no file provides them.
    expected_columns = ['generated_text', 'group', 'ethnicity_and_race', 'age', 'occupation',
                        'sexual_orientation', 'socioeconomic_status', 'religion',
                        'politics', 'total_height', 'polarity', 'subjectivity']
    # Phrases that mark a generated text as a refusal.
    refusal_phrases = ("I apologize", "do not feel comfortable",
                       "don't feel comfortable", "I will not provide")
    # Directory layout used to strip the model/prompt-type prefix from a file path.
    texts_directory = "2_generating_and_preprocessing_texts"
    model_folders = ("gpt_4o_mini", "claude_3.5_sonnet", "command_r_plus", "llama_3.1_70b")

    # Build one dataframe per file; they are concatenated once after the loop.
    frames = []
    for file_name in file_names:
        # Read the JSON file as a dictionary.
        with open(file_name, encoding="utf-8") as json_file:
            generations = json.load(json_file)

        # Lift every attribute to the same level as the generated text, and
        # label generations without attributes as refusals where applicable.
        for generation in generations.values():
            if "attributes" in generation:
                generation.update(generation["attributes"])
            elif "refusal" in generation or any(
                phrase in (generation.get("generated_text") or "") for phrase in refusal_phrases
            ):
                generation[plotted_attribute] = "refusal"

        # One column per generation; drop the nested "attributes" row, then
        # transpose so that each generation is a row and each attribute a column.
        # The row is looked up in the dataframe itself, so it is dropped whenever
        # any generation carried attributes, and empty files do not fail.
        generations_df = pd.DataFrame(generations)
        if "attributes" in generations_df.index:
            generations_df = generations_df.drop("attributes", axis=0)
        generations_df = generations_df.transpose()

        # Derive the group name (e.g. "white_female") from the JSON file path.
        group_name = file_name.replace(".json", "").strip()
        for model_folder in model_folders:
            for prompt_type in ("implicit", "explicit"):
                group_name = group_name.replace(f"{texts_directory}/{model_folder}/{prompt_type}/", "")
        group_name = group_name.replace("../", "")

        # Group name including the gender.
        generations_df["group_with_gender"] = group_name

        # Gender of the group. "female" is tested first because it contains "male".
        # A scalar is assigned so that a file name without a gender yields missing
        # values instead of a length-mismatch error.
        if "female" in group_name:
            gender = "female"
        elif "male" in group_name:
            gender = "male"
        else:
            gender = None
        generations_df["gender"] = gender

        # Group name without the gender, unless the group name is just the gender.
        group = group_name
        if group != "female":
            group = group.replace("female", "").strip("_")
        if group not in ("male", "female"):
            group = group.replace("male", "").strip("_")
        generations_df["group"] = group

        # If the category is ethnicity_and_race or age, the column is the same as the "group" column.
        if category in ("ethnicity_and_race", "age"):
            generations_df[category] = group

        frames.append(generations_df)

    # Combine all files and make sure every expected column exists.
    groups_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    all_columns = list(dict.fromkeys([*expected_columns, *groups_df.columns]))
    groups_df = groups_df.reindex(columns=all_columns)

    # Combine repeated values e.g. "- conservative" and "conservative."
    groupings_by_attribute = {
        "religion": religion_groupings,
        "sexual_orientation": sexual_orientation_groupings,
        "politics": political_groupings,
        "socioeconomic_status": socioeconomic_status_groupings,
        "occupation": occupation_groupings,
    }
    groups_df = remap_category_values(groups_df, plotted_attribute,
                                      groupings_by_attribute.get(plotted_attribute, {}))

    # Count the attribute values per group. If plots are to be drawn by gender,
    # use the intersectional labels instead of the category column.
    index_column = "group_with_gender" if by_gender else category
    counts = groups_df.groupby([index_column, plotted_attribute]).size().reset_index(name='count')
    pivot_table = counts.pivot(index=index_column, columns=plotted_attribute, values='count').fillna(0)

    # Ensure all groups are included, even those without any counted value.
    all_categories = groups_df[index_column].dropna().unique()
    pivot_table = pivot_table.reindex(all_categories, fill_value=0)

    # Convert the counts of each group to percentages of that group's total.
    percentages = pivot_table.div(pivot_table.sum(axis=1), axis=0) * 100

    # Export the percentage table. The by-gender table gets its own file so that it
    # is not overwritten by the combined table written to the same output path.
    table_suffix = "_by_gender" if by_gender else ""
    percentages.to_csv(f"./percentage_tables/{output_path}{table_suffix}.csv")

    if plotted_attribute != "occupation":
        # Make a bar plot of stacked percentages, one color per attribute value.
        attribute_values = list(percentages.columns)
        palette = sns.color_palette(color_palette, n_colors=len(attribute_values))
        plot = percentages.plot(kind='bar', stacked=True, color=palette)

        # Rebuild the legend in the requested order, keeping each label paired with
        # the color of its bars.
        if legend_order is not None:
            color_by_label = dict(zip(attribute_values, palette))
            ordered_labels = [label for label in legend_order if label in color_by_label]
            ordered_labels += [label for label in attribute_values if label not in ordered_labels]
            handles = [Patch(color=color_by_label[label], label=label) for label in ordered_labels]
            plot.legend(handles=handles, title=plotted_attribute)

        # Move the legend.
        sns.move_legend(plot, "lower right")

        # Add axis labels.
        plot.set_xticklabels(plot.get_xticklabels(), rotation=90)
        plt.ylabel("Percentage (%)")
        plt.xlabel(" ".join(word.capitalize() for word in category.replace("_", " ").split()))
    else:
        # Occupations have too many values for a stacked bar chart: draw a heatmap of
        # frequency counts instead. The heatmap is always indexed by the category
        # column, regardless of by_gender.
        pivot_table = groups_df.pivot_table(index=category, columns=plotted_attribute, aggfunc='size', fill_value=0)
        sns.heatmap(pivot_table, annot=False, cmap="Blues")

    # The subtitle is the model name, taken from the first component of the output path.
    if all_models:
        model_name = "All Models"
    else:
        model_mappings = {
            "gpt_4o_mini": "GPT-4o mini",
            "claude_3.5_sonnet": "Claude 3.5 Sonnet",
            "command_r_plus": "Command R+",
            "llama_3.1_70b": "Llama 3.1 70B",
        }
        model_key = output_path.partition("/")[0]
        # Fall back to the raw folder name for models without a display name.
        model_name = model_mappings.get(model_key, model_key)

    # Horizontal position of the title, adjusted for the attribute being plotted.
    suptitle_x = 0.55
    if plotted_attribute == "occupation" and category != "age":
        suptitle_x -= 0.10
        if category == "ethnicity_and_race" and model_name != "Claude 3.5 Sonnet":
            suptitle_x += 0.05

    # Add the title, e.g. "Religion Distribution by Gender (implicit)".
    attribute_label = " ".join(word.capitalize() for word in plotted_attribute.replace("_", " ").split())
    category_label = " ".join(word.capitalize() for word in category.replace("_", " ").split())
    plt.suptitle(
        t=attribute_label + " Distribution by " + category_label
            + (" (implicit)" if implicit else " (explicit)"),
        fontsize=12,
        color="black",
        x=suptitle_x,
        y=0.95,
        horizontalalignment='center'
    )

    # Use the model name as the subtitle of the plot.
    plt.title(
        label=model_name,
        fontsize=12,
        color='grey',
        x=0.5,
        horizontalalignment='center'
    )

    # Prevent the y-axis labels from being cut off.
    plt.tight_layout()

    if plotted_attribute == "occupation":
        # The occupation heatmap is always saved to the plain output path, with a
        # portrait orientation requested from the backend.
        plt.savefig(output_path, bbox_inches="tight", orientation="portrait")
    elif by_gender:
        # If the plot is by gender, add "_by_gender" to the end of the file path.
        plt.savefig(output_path + "_by_gender", bbox_inches="tight")
    else:
        plt.savefig(output_path, bbox_inches="tight")

    # Clear the figure so the next plot starts from an empty canvas.
    plt.clf()

## Drawing Plots for Texts from Implicit Bias Prompts
Let's draw plots for the implicit bias texts.

In [None]:
# Load information for accessing the JSON files.
prompts_directory = "../1_prompt_engineering"
texts_directory = "../2_generating_and_preprocessing_texts"
# List of folders containing the generated texts for each model.
all_implicit_text_folders = [
    f"{texts_directory}/gpt_4o_mini/implicit/",
    f"{texts_directory}/command_r_plus/implicit/",
    f"{texts_directory}/claude_3.5_sonnet/implicit/",
    f"{texts_directory}/llama_3.1_70b/implicit/"
]

# Draw plots based on each model's generated texts.
for folder in all_implicit_text_folders:
    print("Current model:", folder.replace(f"{texts_directory}/", "").replace("/implicit/", "").replace("/explicit/", ""))
    implicit_prompt_types_df = pd.read_csv(f"{prompts_directory}/implicit_prompt_types.csv")
    implicit_jsons = implicit_prompt_types_df["json_name"]
    implicit_texts_folder = folder

    # Get the number of prompt types (same as the number of JSON files).
    num_prompt_types = implicit_prompt_types_df.shape[0]

    # Create a list of numeric attributes.
    numeric_attributes = ["total_height", "polarity", "subjectivity"]
    # Create a list of categorical attributes.
    categorical_attributes = ["religion", "politics", "socioeconomic_status",
                            "sexual_orientation", "occupation"]

    # Create a plot for each category of bias.
    # Store the last category of bias found.
    last_category = implicit_prompt_types_df.category.iloc[0]
    # Store the current list of file paths for the current category.
    curr_file_paths = []

    # Iterate through the JSON files.
    for file_num in range(0, num_prompt_types):
        # If we are at the end of the current category i.e. last_category is different
        # from the current category, create and save plots for each numeric attribute.
        curr_category = implicit_prompt_types_df.category.iloc[file_num]

        if last_category != curr_category:
            # Create the output path that the plots will be stored in.
            model_name = folder.replace(texts_directory, "").replace("/implicit/", "").replace("/explicit/", "").replace("/", "")
            output_path = model_name + "/implicit/" + last_category.lower().replace(" ", "_") + "/"

            # Create and save plots for each numeric attribute.
            for attribute in numeric_attributes:
                # Draw the plot by gender.
                draw_numerical_plot(curr_file_paths, 
                                    last_category.lower().replace(" ", "_"), 
                                    attribute.lower().replace(" ", "_"), 
                                    output_path + last_category.lower().replace(" ", "_") + "_" + attribute,
                                    implicit=True,
                                    by_gender=True if last_category.lower() != "gender" else False)
                # Draw the plot with genders combined.
                draw_numerical_plot(curr_file_paths, 
                                    last_category.lower().replace(" ", "_"), 
                                    attribute.lower().replace(" ", "_"), 
                                    output_path + last_category.lower().replace(" ", "_") + "_" + attribute,
                                    implicit=True,
                                    by_gender=False)
                
            # Create and save plots for each categorical attribute.
            for attribute in categorical_attributes:
                # Skip if the category and the attribute are the same e.g. will plot gender by gender.
                if attribute.lower().replace(" ", "_") == last_category.lower().replace(" ", "_"):
                    continue

                # The last_category is on the x-axis. The attribute is on the y-axis.
                draw_categorical_plot(curr_file_paths, 
                                    last_category.lower().replace(" ", "_"), 
                                    attribute.lower().replace(" ", "_"),
                                    output_path + last_category.lower().replace(" ", "_") + "_" + attribute,
                                    implicit=True,
                                    by_gender=True if last_category.lower() != "gender" else False)
                draw_categorical_plot(curr_file_paths, 
                                    last_category.lower().replace(" ", "_"), 
                                    attribute.lower().replace(" ", "_"),
                                    output_path + last_category.lower().replace(" ", "_") + "_" + attribute,
                                    implicit=True,
                                    by_gender=False)


            # Reset curr_file_paths to an empty list.
            curr_file_paths = []

        # Create the path to the current JSON file.
        json_path = implicit_texts_folder + implicit_jsons.iloc[file_num]

        # Add the JSON file path to the list of file names.
        curr_file_paths.append(json_path)

        # Update the last category found.
        last_category = curr_category

    # Save plots for the last category.
    model_name = folder.replace(texts_directory, "").replace("/implicit/", "").replace("/explicit/", "").replace("/", "")
    output_path = model_name + "/implicit/" + last_category.lower().replace(" ", "_") + "/"

    # Create and save plots for each numeric attribute.
    for attribute in numeric_attributes:
        # Draw the plot by gender.
        draw_numerical_plot(curr_file_paths, 
                            last_category.lower().replace(" ", "_"), 
                            attribute.lower().replace(" ", "_"), 
                            output_path + last_category.lower() + "_" + attribute,
                            implicit=True,
                            by_gender=True)
        # Draw the plot with genders combined.
        draw_numerical_plot(curr_file_paths, 
                            last_category.lower().replace(" ", "_"), 
                            attribute.lower().replace(" ", "_"), 
                            output_path + last_category.lower() + "_" + attribute,
                            implicit=True,
                            by_gender=False)
    
    # Create and save plots for each categorical attribute.
    for attribute in categorical_attributes:
        # Skip if the category and the attribute are the same e.g. will plot gender by gender.
        if attribute.lower().replace(" ", "_") == last_category.lower().replace(" ", "_"):
            continue

        # The last_category is on the x-axis. The attribute is on the y-axis.
        draw_categorical_plot(curr_file_paths, 
                                last_category.lower().replace(" ", "_"), 
                                attribute.lower().replace(" ", "_"),
                                output_path + last_category.lower().replace(" ", "_") + "_" + attribute,
                                implicit=True,
                                by_gender=True)
        
        draw_categorical_plot(curr_file_paths, 
                                    last_category.lower().replace(" ", "_"), 
                                    attribute.lower().replace(" ", "_"),
                                    output_path + last_category.lower().replace(" ", "_") + "_" + attribute,
                                    implicit=True,
                                    by_gender=False)

## Drawing Plots for Texts from Explicit Bias Prompts
Let's also draw plots for the explicit bias texts.

In [None]:
# Draw and save plots for the texts generated from explicit bias prompts.
#
# For every model folder, the JSON files listed in explicit_prompt_types.csv
# are grouped by bias category (runs of consecutive rows sharing a category),
# and for each group a set of numerical and categorical plots is drawn.

# Load information for accessing the JSON files.
prompts_directory = "../1_prompt_engineering"
texts_directory = "../2_generating_and_preprocessing_texts"
# List of folders containing the generated texts for each model.
all_explicit_text_folders = [
    f"{texts_directory}/gpt_4o_mini/explicit/",
    f"{texts_directory}/command_r_plus/explicit/",
    f"{texts_directory}/claude_3.5_sonnet/explicit/",
    f"{texts_directory}/llama_3.1_70b/explicit/"
]

# The prompt-type table does not depend on the model, so read it once
# instead of once per model folder.
explicit_prompt_types_df = pd.read_csv(f"{prompts_directory}/explicit_prompt_types.csv")
explicit_jsons = explicit_prompt_types_df["json_name"]

# Get the number of prompt types (same as the number of JSON files).
# Not needed by the loops below; kept as a notebook-level name in case
# other cells read it.
num_prompt_types = explicit_prompt_types_df.shape[0]

# Create a list of numeric attributes.
numeric_attributes = ["total_height", "polarity", "subjectivity"]
# Create a list of categorical attributes.
categorical_attributes = ["religion", "politics", "socioeconomic_status",
                          "sexual_orientation", "occupation"]

# Group the JSON file names by category. A new group starts whenever the
# category differs from the previous row's category, so only consecutive rows
# are merged. Building the groups up front lets every category -- including
# the last one -- go through the same plotting code below.
category_groups = []
for category, json_name in zip(explicit_prompt_types_df["category"], explicit_jsons):
    if category_groups and category_groups[-1][0] == category:
        category_groups[-1][1].append(json_name)
    else:
        category_groups.append((category, [json_name]))

# Draw plots based on each model's generated texts.
for folder in all_explicit_text_folders:
    print("Current model:", folder.replace(f"{texts_directory}/", "").replace("/implicit/", "").replace("/explicit/", ""))
    explicit_texts_folder = folder

    # Strip the directory prefix, the prompt-type sub-folder, and any remaining
    # slashes so that only the model's folder name is left.
    model_name = folder.replace(texts_directory, "").replace("/implicit/", "").replace("/explicit/", "").replace("/", "")

    # Create the plots for each category of bias.
    for last_category, json_names in category_groups:
        # Normalized category name, used for the draw functions, the output
        # folder, and the plot file names alike (spaces become underscores).
        category_key = last_category.lower().replace(" ", "_")

        # Full paths to this category's JSON files for the current model.
        curr_file_paths = [explicit_texts_folder + json_name for json_name in json_names]

        # Create the output path that the plots will be stored in.
        output_path = model_name + "/explicit/" + category_key + "/"

        # Breaking the gender category down by gender is meaningless, so for
        # that category the "by gender" call also combines genders.
        split_by_gender = last_category.lower() != "gender"

        # NOTE(review): both calls of each pair receive the same output path;
        # presumably the draw functions tell the two variants apart when
        # saving -- confirm against their definitions.

        # Create and save plots for each numeric attribute.
        for attribute in numeric_attributes:
            attribute_key = attribute.lower().replace(" ", "_")
            plot_path = output_path + category_key + "_" + attribute

            # Draw the plot by gender.
            draw_numerical_plot(curr_file_paths,
                                category_key,
                                attribute_key,
                                plot_path,
                                implicit=False,
                                by_gender=split_by_gender)
            # Draw the plot with genders combined.
            draw_numerical_plot(curr_file_paths,
                                category_key,
                                attribute_key,
                                plot_path,
                                implicit=False,
                                by_gender=False)

        # Create and save plots for each categorical attribute.
        for attribute in categorical_attributes:
            attribute_key = attribute.lower().replace(" ", "_")

            # Skip if the category and the attribute are the same e.g. will plot gender by gender.
            if attribute_key == category_key:
                continue

            plot_path = output_path + category_key + "_" + attribute

            # The category is on the x-axis. The attribute is on the y-axis.
            # Draw the plot by gender.
            draw_categorical_plot(curr_file_paths,
                                  category_key,
                                  attribute_key,
                                  plot_path,
                                  implicit=False,
                                  by_gender=split_by_gender)
            # Draw the plot with genders combined.
            draw_categorical_plot(curr_file_paths,
                                  category_key,
                                  attribute_key,
                                  plot_path,
                                  implicit=False,
                                  by_gender=False)