In [1]:
import pandas as pd
import numpy as np
import itertools

In [2]:
def read_in_file(model_name):
    """Load the per-model results CSV and normalise its columns.

    Args:
        model_name: Identifier substituted into the path
            ``../results/1_{model_name}_results.csv`` (relative to the
            current working directory).

    Returns:
        DataFrame read from that file with the 'Ethnicity Probability'
        column removed and the 'group' column renamed to 'Target Ethnicity'.
    """
    results_path = f'../results/1_{model_name}_results.csv'
    # Malformed rows are reported as warnings instead of aborting the read.
    raw_results = pd.read_csv(results_path, on_bad_lines='warn')
    without_probability = raw_results.drop(columns=['Ethnicity Probability'])
    return without_probability.rename(columns={'group': 'Target Ethnicity'})

In [3]:
def add_ethnicity_group(df):
    """Filter the results and label each row with a coarse ethnicity group.

    Rows whose 'Ethnicity' is in the drop list, or whose 'Target Ethnicity'
    equals 'nonce', are removed. The remaining rows receive an
    'Ethnicity Group' value of 'white', 'hispanic', 'arab' or 'asian'
    according to their 'Ethnicity'; any other ethnicity maps to a missing
    value.

    Args:
        df: DataFrame with 'Ethnicity' and 'Target Ethnicity' columns.

    Returns:
        A new DataFrame (the input is not modified) that keeps the original
        index labels of the surviving rows and has the extra
        'Ethnicity Group' column.
    """
    asian = ['CHINESE', 'JAPANESE', 'KOREAN', 'THAI']
    european = ['ENGLISH', 'FRENCH', 'GERMAN', 'GREEK', 'HUNGARIAN', 'ITALIAN', 'NORDIC', 'DUTCH']
    drop = ['AFRICAN', 'ISRAELI', 'TURKISH', 'BALTIC', 'SLAV', 'INDIAN']
    arab = ['ARAB']
    hispanic = ['HISPANIC']

    # Flat lookup table: fine-grained ethnicity -> coarse group label.
    group_by_ethnicity = {}
    for label, members in (('white', european), ('hispanic', hispanic),
                           ('arab', arab), ('asian', asian)):
        for ethnicity in members:
            group_by_ethnicity[ethnicity] = label

    keep = ~df['Ethnicity'].isin(drop) & (df['Target Ethnicity'] != 'nonce')

    # Explicit copy: adding a column to a filtered slice would otherwise
    # trigger pandas' SettingWithCopyWarning.
    kept = df[keep].copy()
    # dict.get yields None for ethnicities that belong to no group.
    kept['Ethnicity Group'] = kept['Ethnicity'].map(group_by_ethnicity.get)
    return kept


In [4]:
def get_proportional_perplexities(df):
    """Add an 'Adjusted Perplexity' column normalised by per-group level.

    For every value of the 'Group' column the mean of 'perplexity_1' is
    divided by the mean over the whole frame and rounded to three decimals
    (the "proportional perplexity"). Each row's 'perplexity_1' is then
    divided by the proportional perplexity of its group.

    NOTE(review): 'Group' is read straight from the input frame; the other
    functions in this file only create 'Ethnicity Group' and
    'Target Ethnicity', so confirm that 'Group' is a column of the results
    CSV and is the intended grouping key.

    Args:
        df: DataFrame with 'Group' and 'perplexity_1' columns.

    Returns:
        A new DataFrame with the same rows in the same order, a fresh
        0..n-1 index, and an additional 'Adjusted Perplexity' column. Rows
        whose 'Group' is missing get NaN there.
    """
    overall_mean = df['perplexity_1'].mean()

    # One pass over the data; groupby skips missing group keys, so rows with
    # a missing 'Group' simply find no entry in the lookup below.
    group_means = df.groupby('Group')['perplexity_1'].mean()
    proportional_perplexity = (group_means / overall_mean).round(3)

    # Broadcast each group's ratio back onto its rows.
    row_ratio = df['Group'].map(proportional_perplexity)
    adjusted = df['perplexity_1'] / row_ratio

    # reset_index returns a new frame, so the caller's frame stays untouched.
    result = df.reset_index(drop=True)
    result['Adjusted Perplexity'] = adjusted.to_numpy()
    return result

In [5]:
def get_summary_df(df, apx):
    """Average perplexity per (ethnicity group, target ethnicity, descriptor).

    Args:
        df: DataFrame with 'Ethnicity Group', 'Target Ethnicity',
            'descriptor' and 'perplexity_1' columns, plus
            'Adjusted Perplexity' when ``apx`` is truthy.
        apx: If truthy, average 'Adjusted Perplexity'; otherwise average
            'perplexity_1'.

    Returns:
        DataFrame with columns 'Ethnicity Group', 'Average Perplexity'
        (mean rounded to two decimals), 'descriptor' and 'Target Ethnicity',
        holding one row for each combination that has at least one matching
        input row, sorted by 'descriptor'.
    """
    value_column = 'Adjusted Perplexity' if apx else 'perplexity_1'

    # Every (target ethnicity, descriptor) pairing seen in the data.
    target_descriptor_pairs = list(
        itertools.product(df['Target Ethnicity'].unique(), df['descriptor'].unique())
    )

    records = []
    for group_label in df['Ethnicity Group'].unique():
        in_group = df['Ethnicity Group'] == group_label
        for target_label, descriptor_label in target_descriptor_pairs:
            matched = df[in_group
                         & (df['Target Ethnicity'] == target_label)
                         & (df['descriptor'] == descriptor_label)]

            # Combinations with no rows contribute nothing to the summary.
            if matched.empty:
                continue

            records.append({
                'Ethnicity Group': matched['Ethnicity Group'].iloc[0],
                'Average Perplexity': round(matched[value_column].mean(), 2),
                'descriptor': descriptor_label,
                'Target Ethnicity': target_label,
            })

    return pd.DataFrame(records).sort_values(by='descriptor')


In [6]:
def mean_reciprocal_rank(rs):
    """
    Mean reciprocal rank over a collection of ranked relevance lists.

    Each list is scored as 1 / (position of its first nonzero entry), with
    the first element counting as position 1. A list with no nonzero entry
    scores 0. Relevance is binary: any nonzero value counts as relevant.
    Definition: http://en.wikipedia.org/wiki/Mean_reciprocal_rank

    Example: [[0, 0, 1], [0, 1, 0], [1, 0, 0]] scores (1/3 + 1/2 + 1) / 3,
    i.e. roughly 0.6111.

    Args:
        rs: Iterable of relevance score sequences (list or numpy array),
            each in rank order (first element is the top-ranked item)
    Returns:
        Mean of the per-list reciprocal ranks

    """
    reciprocal_ranks = []
    for relevance in rs:
        # Positions of all relevant (nonzero) entries, in ascending order.
        hit_positions = np.asarray(relevance).nonzero()[0]
        if hit_positions.size:
            reciprocal_ranks.append(1. / (hit_positions[0] + 1))
        else:
            reciprocal_ranks.append(0.)
    return np.mean(reciprocal_ranks)

In [7]:
def get_mrr_df(df):
    """Compute, print and return the mean reciprocal rank over descriptors.

    For each descriptor the rows are ranked by ascending
    'Average Perplexity'. The relevant row is the first one whose
    'Ethnicity Group' equals the 'Target Ethnicity' of the top-ranked row.
    A descriptor where no 'Ethnicity Group' matches contributes a
    reciprocal rank of 0 instead of raising.

    NOTE(review): the target is taken from the top-ranked row only, which
    presumes every row of a descriptor shares one 'Target Ethnicity' -
    confirm against the data.

    Args:
        df: Summary DataFrame with 'descriptor', 'Average Perplexity',
            'Ethnicity Group' and 'Target Ethnicity' columns.

    Returns:
        The mean reciprocal rank rounded to three decimals (also printed).
    """
    relevance_lists = []

    for descriptor in df['descriptor'].unique():
        ranked_df = (df[df['descriptor'] == descriptor]
                     .sort_values(by='Average Perplexity')
                     .reset_index(drop=True))
        # A missing descriptor value shows up in unique() but never matches
        # the equality filter; there is nothing to rank in that case.
        if ranked_df.empty:
            continue

        target = ranked_df['Target Ethnicity'].iloc[0]
        is_target = (ranked_df['Ethnicity Group'] == target).to_numpy()

        # One-hot list such as [0, 1, 0, 0]; left all-zero when the target
        # group is absent, which mean_reciprocal_rank scores as 0.
        relevance = [0] * len(ranked_df)
        if is_target.any():
            relevance[int(is_target.argmax())] = 1
        relevance_lists.append(relevance)

    mrr = round(mean_reciprocal_rank(relevance_lists), 3)
    print(mrr)
    return mrr


In [8]:

def print_full_accuracy_table(model_name, apx):
    """Run the whole evaluation pipeline for one model and return its MRR.

    The model's results file is loaded, filtered and labelled with
    ethnicity groups, extended with adjusted perplexities, summarised, and
    scored. The MRR value is printed by get_mrr_df; no table is printed
    here.

    Args:
        model_name: Model identifier passed to read_in_file.
        apx: Passed to get_summary_df; truthy selects adjusted perplexities,
            falsy selects raw 'perplexity_1'.

    Returns:
        The mean reciprocal rank returned by get_mrr_df.
    """
    prepared = get_proportional_perplexities(
        add_ethnicity_group(read_in_file(model_name))
    )
    return get_mrr_df(get_summary_df(prepared, apx))


In [9]:
# Model identifiers; each one is substituted into the results-file path.
model_list = [
    'google-bert_bert-large-cased',
    'roberta-large',
    'gpt2-xl',
    'google_flan-ul2',
    'EleutherAI_gpt-neox-20b',
    'facebook_opt-30b',
    'meta-llama_Meta-Llama-3-8B',
]

# Per model: print its name, then the MRR on raw perplexities followed by
# the MRR on adjusted perplexities.
for model in model_list:
    print(model)
    for apx in (False, True):
        print_full_accuracy_table(model, apx)


google-bert_bert-large-cased
0.581
0.636
roberta-large
0.566
0.691
gpt2-xl
0.542
0.665
google_flan-ul2
0.593
0.629
EleutherAI_gpt-neox-20b
0.545
0.591
facebook_opt-30b
0.557
0.661
meta-llama_Meta-Llama-3-8B
0.589
0.703
