### Imports

In [None]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from IPython.display import display
import json
import numpy as np
import pandas as pd
import os
import random
import re
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn.metrics as metrics
import tensorflow as tf

### Read scored test data

In [None]:
standard_data_path = 'gs://conversationai-models/biosbias/scored_data/test_standard_0409.csv'
scrubbed_data_path = 'gs://conversationai-models/biosbias/scored_data/test_very_scrubbed_0409.csv'

# Load both scored test sets. The `with` blocks close each file handle as soon
# as the CSV has been parsed. Rows whose 'tokens' value repeats an earlier row
# are dropped (the first occurrence is kept).
with tf.gfile.Open(standard_data_path) as _standard_file:
    perf_df = pd.read_csv(_standard_file).drop_duplicates(subset=['tokens'])
with tf.gfile.Open(scrubbed_data_path) as _scrubbed_file:
    scrubbed_df = pd.read_csv(_scrubbed_file).drop_duplicates(subset=['tokens'])

In [None]:
# (rows, columns) of each table after de-duplication.
print(perf_df.shape)
print(scrubbed_df.shape)

In [None]:
# Index-aligned join of the two tables (DataFrame.join defaults to a left join
# on the index); clashing column names from scrubbed_df get the '_scrubbed' suffix.
df = perf_df.join(scrubbed_df, rsuffix = '_scrubbed')

In [None]:
# Shape of the joined table.
df.shape

In [None]:
# Keep only rows with no missing value in any column (this also removes rows
# of perf_df that found no index match in scrubbed_df).
df = df.dropna()
print(df.shape)

### Preprocessing

In [None]:
def get_class_from_col_name(col_name):
    """Return the integer class id encoded at the end of a column name.

    Args:
        col_name: Column name that ends in an underscore followed by digits,
            e.g. ``'<model>_12'``.

    Returns:
        The trailing digits parsed as an ``int``.

    Raises:
        AttributeError: If ``col_name`` does not end in ``_<digits>`` (the
            regex search returns ``None``).
    """
    trailing_id = re.search(r'^.*_(\d+)$', col_name)
    return int(trailing_id.group(1))

In [None]:
def find_best_class(df, model_name, class_names):
    """Add a ``'<model_name>_class'`` column holding each row's top-scoring class.

    Args:
        df: DataFrame containing one score column per class, named
            ``'<model_name>_<class_name>'``. Modified in place.
        model_name: Prefix shared by the model's score columns.
        class_names: Iterable of class identifiers used as column suffixes.

    Returns:
        None. The result is written to ``df['<model_name>_class']``.
    """
    score_cols = []
    for class_name in class_names:
        score_cols.append('{}_{}'.format(model_name, class_name))
    # idxmax(axis=1) yields, per row, the name of the column with the highest
    # value; the class id is then parsed back out of that column name.
    winning_cols = df[score_cols].idxmax(axis=1)
    df['{}_class'.format(model_name)] = winning_cols.apply(get_class_from_col_name)

In [None]:
# Maps the full "<model>:<version>" identifier used as a column prefix in the
# scored data to the short name used in reports and result column names.
_MODEL_PREFIX = 'tf_trainer_tf_gru_attention_multiclass_biosbias_glove:v_'
_VERSION_AND_SHORT_NAME = [
    ('20190328_103117', 'glove'),
    ('20190410_174837', 'debiased_tolga'),
    ('20190410_174941', 'debiased_biosbias'),
    ('20190410_175003', 'strongdebias_1'),
    ('20190410_175019', 'strongdebias_2'),
    ('20190410_175034', 'strongdebias_3'),
    ('20190410_175055', 'strongdebias_4'),
    ('20190410_175113', 'no_equalize'),
    ('20190410_175131', 'no_proj'),
    ('20190410_175254', 'very_scrubbed'),
]
MODEL_NAMES = {
    _MODEL_PREFIX + _version: _short_name
    for _version, _short_name in _VERSION_AND_SHORT_NAME
}

In [None]:
# Integer class ids 0..32, used as the suffix of each model's score columns.
# NOTE(review): 33 presumably matches the number of profession titles in the
# dataset -- confirm if the label set changes.
CLASS_NAMES = range(33)

In [None]:
# Add a '<model>_class' predicted-class column to df for every model.
for _model in MODEL_NAMES:
    find_best_class(df, _model, CLASS_NAMES)

In [None]:
# Labels with either gender having too few (< 4) examples.
# value_counts() has no row for a gender that never occurs within a label, so
# the counts are pivoted to one column per gender with missing combinations
# filled with 0; a gender that is entirely absent for a label is then flagged.
_label_gender_counts = df.groupby('label').gender.value_counts().unstack(fill_value=0)
_has_small_group = (_label_gender_counts < 4).any(axis=1)
bad_labels = _label_gender_counts.loc[_has_small_group].index.values
assert len(bad_labels) == 0

### Accuracy Calculation

In [None]:
# Overall accuracy of each model: the fraction of rows whose predicted class
# equals the true label.
accuracy_list = []
for _model, _model_type in MODEL_NAMES.items():
    is_correct = (df['{}_class'.format(_model)] == df['label'])
    # The mean of a boolean Series is the fraction of True values; it is
    # computed vectorised instead of summing element by element in Python.
    _acc = is_correct.mean()
    accuracy_list.append(_acc)
    print('Accuracy for model {}: {}'.format(_model_type, _acc))

### Fairness Metrics

In [None]:
# One boolean indicator column per class: 'label_<k>' is True exactly on the
# rows whose true label is k.
for _class in CLASS_NAMES:
    _indicator_col = 'label_{}'.format(_class)
    df[_indicator_col] = df['label'].eq(_class)

In [None]:
# Gender ratios of classes: one row per (label, gender) pair present in df,
# with the number of examples in a 'count' column.
gender_counts = df.groupby('label').gender.value_counts().reset_index(name = 'count')

In [None]:
def frac_female(df):
    """Return the fraction of female examples for a single label.

    Args:
        df: Rows for one label, with columns 'label', 'gender' ("M"/"F") and
            'count' (number of examples of that gender).

    Returns:
        A dict ``{'label': <label>, 'frac_female': f / (m + f)}``. A gender
        with no row in ``df`` is counted as 0 instead of raising IndexError.
    """
    # Summing the matching rows gives 0 when a gender is absent for the label.
    m_count = df.loc[df['gender'] == "M", 'count'].sum()
    f_count = df.loc[df['gender'] == "F", 'count'].sum()
    return {'label': df['label'].values[0], 'frac_female': f_count/(m_count+f_count)}

In [None]:
# One row per label with its fraction of female examples. The groups are
# iterated explicitly (in sorted label order) rather than through
# GroupBy.apply, whose handling of dict return values and of the grouping
# column differs between pandas versions; each group keeps its 'label' column.
frac_female_df = pd.DataFrame(
    [frac_female(_label_counts) for _, _label_counts in gender_counts.groupby('label')])

In [None]:
## WITH THRESHOLD

# def compute_tpr(df, _class, _model, threshold = 0.5):
#     tpr = metrics.recall_score(df['label_{}'.format(_class)],
#                                df['{}_{}'.format(_model,_class)] > threshold)
#     return tpr
    
# def compute_tpr_tnr(df, _class, _model, threshold = 0.5):
#     #cm = metrics.confusion_matrix(df['label_{}'.format(_class)],
#     #                              df['{}_{}'.format(_model,_class)] > threshold)
#     cm = pd.crosstab(df['label_{}'.format(_class)], df['{}_{}'.format(_model,_class)] > threshold)
#     #display(cm)
#     if cm.shape[0] > 1:
#         tn = cm.iloc[0,0]
#         fp = cm.iloc[0,1]
#         fn = cm.iloc[1,0]
#         tp = cm.iloc[1,1]
#         tpr = tp/(tp+fn)
#         tnr = tn/(tn+fp)
#     else:
#         tpr = 0
#         tnr = 1
#     return tpr, tnr

In [None]:
def compute_tpr(df, _class, _model, threshold=None):
    """Return the true positive rate (recall) of a model for one class.

    Args:
        df: DataFrame with a boolean 'label_<_class>' column and a
            '<_model>_class' column of predicted class ids.
        _class: Class id to evaluate, treated as the positive class.
        _model: Model identifier used as the prediction column prefix.
        threshold: Unused; predictions come from the '<_model>_class' column.

    Returns:
        Recall of "predicted class == _class" against 'label_<_class>', as
        computed by ``sklearn.metrics.recall_score``.
    """
    is_true_class = df['label_{}'.format(_class)]
    is_predicted_class = df['{}_class'.format(_model)] == _class
    return metrics.recall_score(is_true_class, is_predicted_class)

def compute_tpr_tnr(df, _class, _model, threshold=None):
    """Return the true positive and true negative rates of a model for one class.

    Args:
        df: DataFrame with a boolean 'label_<_class>' column and a
            '<_model>_class' column of predicted class ids.
        _class: Class id to evaluate, treated as the positive class.
        _model: Model identifier used as the prediction column prefix.
        threshold: Unused; predictions come from the '<_model>_class' column.

    Returns:
        A ``(tpr, tnr)`` tuple. ``tpr`` is always ``tp / (tp + fn)``. When
        ``df`` contains no negative example the TNR is undefined and 0 is
        returned for it.

    Raises:
        ValueError: If ``df`` contains no positive example for ``_class``.
    """
    true_col = 'label_{}'.format(_class)
    pred_col = '{}_class'.format(_model)

    is_positive = df[true_col] == True
    is_negative = df[true_col] == False
    predicted_pos = df[pred_col] == _class
    predicted_neg = df[pred_col] != _class

    # Confusion-matrix cells, counted with vectorised boolean masks.
    tn = int((is_negative & predicted_neg).sum())
    fp = int((is_negative & predicted_pos).sum())
    fn = int((is_positive & predicted_neg).sum())
    tp = int((is_positive & predicted_pos).sum())

    if tp + fn == 0:
        raise ValueError('class {} has no positive - impossible to define tpr'.format(_class))

    # The TPR does not depend on the negatives, so it is computed even when
    # there are none (it used to be hard-coded to 1 in that case).
    tpr = tp / float(tp + fn)
    if tn + fp == 0:
        tnr = 0
    else:
        tnr = tn / float(tn + fp)
    return tpr, tnr

In [None]:
def compute_tpr_by_gender(df, _class, _model, threshold = 0.5):
    """Return the true positive rate for a class separately for each gender.

    Args:
        df: DataFrame with a 'gender' column ("M"/"F") plus the columns
            required by ``compute_tpr``.
        _class: Class id to evaluate.
        _model: Model identifier used as the prediction column prefix.
        threshold: Forwarded to ``compute_tpr``.

    Returns:
        A dict ``{'M': <tpr on male rows>, 'F': <tpr on female rows>}``.
    """
    male_rows = df.query('gender == "M"')
    female_rows = df.query('gender == "F"')
    rates = {}
    rates['M'] = compute_tpr(male_rows, _class, _model, threshold)
    rates['F'] = compute_tpr(female_rows, _class, _model, threshold)
    return rates

def compute_tr_by_gender(df, _class, _model, threshold = 0.5):
    """Return TPR and TNR for a class separately for each gender.

    Args:
        df: DataFrame with a 'gender' column ("M"/"F") plus the columns
            required by ``compute_tpr_tnr``.
        _class: Class id to evaluate.
        _model: Model identifier used as the prediction column prefix.
        threshold: Forwarded to ``compute_tpr_tnr``.

    Returns:
        A dict with keys 'TPR_m', 'TPR_f', 'TNR_m' and 'TNR_f'.
    """
    male_rates = compute_tpr_tnr(df.query('gender == "M"'), _class, _model, threshold)
    female_rates = compute_tpr_tnr(df.query('gender == "F"'), _class, _model, threshold)
    return {
        'TPR_m': male_rates[0],
        'TPR_f': female_rates[0],
        'TNR_m': male_rates[1],
        'TNR_f': female_rates[1],
    }

In [None]:
# Sanity check: the sklearn-based TPR and the hand-counted TPR must agree for
# every model on every class that has at least one positive example.
for _class in CLASS_NAMES:
    true_col = 'label_{}'.format(_class)
    n_positives = len(df.loc[(df[true_col] == True)])
    if n_positives != 0:
        for _model in MODEL_NAMES:
            tpr_1 = compute_tpr(df, _class, _model)
            tpr_2 = compute_tpr_tnr(df, _class, _model)[0]
            assert tpr_1 == tpr_2, '{} != {}'.format(tpr_1, tpr_2)

In [None]:
# Build one row per label holding, for every model, the overall TPR/TNR, the
# per-gender TPR/TNR and the female-minus-male gap of each.
# Rows are collected in a list and turned into a DataFrame once:
# DataFrame.append copies the whole frame on every call and no longer exists
# in pandas >= 2.0.
_tpr_rows = []
for _class in frac_female_df.label:
    row = {}
    row['label'] = _class
    for _model, _model_type in MODEL_NAMES.items():
        tpr, tnr = compute_tpr_tnr(df, _class, _model)
        row['{}_tpr'.format(_model_type)] = tpr
        row['{}_tnr'.format(_model_type)] = tnr
        gender_trs = compute_tr_by_gender(df, _class, _model)
        row['{}_tpr_F'.format(_model_type)] = gender_trs['TPR_f']
        row['{}_tpr_M'.format(_model_type)] = gender_trs['TPR_m']
        row['{}_tpr_gender_gap'.format(_model_type)] = gender_trs['TPR_f'] - gender_trs['TPR_m']
        row['{}_tnr_F'.format(_model_type)] = gender_trs['TNR_f']
        row['{}_tnr_M'.format(_model_type)] = gender_trs['TNR_m']
        row['{}_tnr_gender_gap'.format(_model_type)] = gender_trs['TNR_f'] - gender_trs['TNR_m']
    _tpr_rows.append(row)
tpr_df = pd.DataFrame(_tpr_rows)

In [None]:
# Attach each label's fraction of female examples to its TPR/TNR metrics
# (pd.merge defaults to an inner join on 'label').
results_df = pd.merge(tpr_df, frac_female_df, on = 'label')

In [None]:
# Profession titles, indexed by integer class id (position i names class i).
TITLE_LABELS = [
    'accountant', 'acupuncturist', 'architect', 'attorney', 'chiropractor', 'comedian', 'composer', 'dentist',
    'dietitian', 'dj', 'filmmaker', 'interior_designer', 'journalist', 'landscape_architect', 'magician',
    'massage_therapist', 'model', 'nurse', 'painter', 'paralegal', 'pastor', 'personal_trainer',
    'photographer', 'physician', 'poet', 'professor', 'psychologist', 'rapper',
    'real_estate_broker', 'software_engineer', 'surgeon', 'teacher', 'yoga_teacher']

In [None]:
# Human-readable profession for each label; int() guards against the label
# column having been stored with a float dtype.
results_df['label_profession'] = results_df['label'].apply(lambda x: TITLE_LABELS[int(x)])

In [None]:
# Correlation, across labels, between the fraction of female examples and each
# model's TPR gender gap.
results_df[['frac_female']+['{}_tpr_gender_gap'.format(_model) for _model in MODEL_NAMES.values()]].corr()[['frac_female']]
    

In [None]:
# Names of the per-model TPR and TNR gender-gap columns in results_df.
tpr_gender_gap_cols = []
tnr_gender_gap_cols = []
for _model in MODEL_NAMES.values():
    tpr_gender_gap_cols.append('{}_tpr_gender_gap'.format(_model))
    tnr_gender_gap_cols.append('{}_tnr_gender_gap'.format(_model))

In [None]:
# Profession, fraction female and every model's TPR/TNR gender gap, per label.
gender_gap_df = results_df[['label_profession', 'frac_female']+tpr_gender_gap_cols+tnr_gender_gap_cols]
#gender_gap_df.columns = ['label_profession', 'frac_female']+['{}'.format(_model) for _model in MODEL_NAMES.values()]

In [None]:
# Show the gaps ordered from the most to the least female-dominated profession.
gender_gap_df.sort_values('frac_female', ascending = False)

In [None]:
# Fraction of rows (e.g. professions) for which the new model's absolute gap
# is no larger than the baseline's

def compute_fraction_improved(df, baseline_model, improved_model):
    """Return the fraction of rows where the improved column is at least as good.

    A row counts as improved when the absolute value in ``improved_model`` is
    less than or equal to the absolute value in ``baseline_model``.

    Args:
        df: DataFrame holding both columns.
        baseline_model: Name of the baseline column.
        improved_model: Name of the column compared against the baseline.

    Returns:
        The mean of the per-row boolean "improved" indicator.
    """
    baseline_magnitude = np.abs(df[baseline_model])
    improved_magnitude = np.abs(df[improved_model])
    return np.mean(baseline_magnitude >= improved_magnitude)

In [None]:
# for _model in MODEL_NAMES.values():
#     print(_model)
#     print(compute_fraction_improved(gender_gap_df, 'glove_untuned_tpr_gender_gap', '{}_tpr_gender_gap'.format(_model)))

In [None]:
# All per-model gender-gap columns: the TPR gaps first, then the TNR gaps.
_short_model_names = list(MODEL_NAMES.values())
tpr_cols = ['{}_tpr_gender_gap'.format(_name) for _name in _short_model_names]
tnr_cols = ['{}_tnr_gender_gap'.format(_name) for _name in _short_model_names]
gender_gap_cols = list(tpr_cols)
gender_gap_cols.extend(tnr_cols)

In [None]:
# Mean squared gender gap of each column, taken over all labels.
gender_gap_df[gender_gap_cols].apply(lambda x: np.mean(x**2))

In [None]:
# Mean absolute gender gap of each column, taken over all labels.
gender_gap_df[gender_gap_cols].apply(lambda x: np.mean(np.abs(x)))

In [None]:
def plot_tpr_gap(df, _model):
    """Scatter a model's TPR gender gap against the fraction of female examples.

    Draws a regression plot with one point per row of ``df`` and annotates
    each point with its profession name.

    Args:
        df: DataFrame with 'frac_female', 'label_profession' and
            '<_model>_tpr_gender_gap' columns.
        _model: Short model name; selects the gap column and is used as the
            plot title.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    x = 'frac_female'
    y = '{}_tpr_gender_gap'.format(_model)

    fig, ax = plt.subplots(figsize=(15, 6))
    sns.regplot(x=x, y=y, data=df, ax=ax)
    ax.set(xlabel="% Female", ylabel="TPR Gender Gap", title=_model)

    # Annotate every point. All coordinates are read from the `df` argument
    # (not a global frame) and iterated positionally, so the function also
    # works when `df` has a non-default index.
    for x_val, y_val, profession in zip(df[x], df[y], df['label_profession']):
        ax.text(x_val + 0.01, y_val, profession,
                horizontalalignment='left', size='medium', color='black')
    plt.show()

In [None]:
# Plot the TPR gender gap for every model. No short name in MODEL_NAMES
# contains 'untuned', so filtering on that substring would draw nothing.
for _model in MODEL_NAMES.values():
    plot_tpr_gap(results_df, _model)

In [None]:
# Correlation, across labels, between the fraction of female examples and each
# model's TPR gender gap.
results_df[['frac_female']+['{}_tpr_gender_gap'.format(_model) for _model in MODEL_NAMES.values()]].corr()[['frac_female']]