# AMT

## Setup

In [None]:
from scipy import stats
import pandas as pd
pd.set_option('display.max_colwidth', None)
import numpy as np
import random
from itertools import combinations, permutations
import warnings
import random
import matplotlib.pyplot as plt
from sklearn import datasets
import seaborn as sns
sns.set()
sns.set_style('white') 
from IPython.display import clear_output
clear_output()

## Inter-rater reliability

For each sample, there are 3 rows: one with the annotation from each of its three annotators.

Before selecting majority vote, remove intersectional samples and compute inter-annotator agreement using Fleiss' kappa (appropriate for different raters comprising the 3 per sample) and Spearman's correlation (treating valences as ordinal) by APIAA/AMIAA methods, modified to suit crowd annotation.

### Definitions

In [None]:
def get_turkSR(filename: str = "my_turk_annotated.csv", df: bool = False):
    """Load the AMT results file and encode the regard labels numerically.

    Args:
        filename: Name of the results .csv inside ``drive/My Drive/csc699/``.
        df: When True, return every column of the results file under its
            original column names. When False (default), return only the
            sample text and the regard label, renamed ``Sample``/``Regard``.

    Returns:
        A DataFrame. The regard labels are encoded by the converter as the
        strings "0" (Neutral), "1" (Positive), "-1" (Negative), "2" (N/A).
    """
    # Insertion order matters: replacements are applied one after another.
    label_codes = {"Neutral": "0", "Positive": "1", "Negative": "-1", "N/A": "2"}

    def encode(label):
        for name, code in label_codes.items():
            label = label.replace(name, code)
        return label

    turkdf = pd.read_csv(f"drive/My Drive/csc699/{filename}",  # AMT results .csv
                         converters={"Answer.regard.label": encode})
    if df:
        return turkdf
    # Select by name instead of by position (previously columns 27:29) so a
    # change in the export layout cannot silently pick the wrong columns.
    # Copy so that callers can edit the result in place without touching a
    # slice of the full results frame.
    turkSR = turkdf[["Input.text", "Answer.regard.label"]].copy()
    return turkSR.rename(columns={"Input.text": "Sample",
                                  "Answer.regard.label": "Regard"})
    
def remove_intersectional(df = None, is_vader=False, is_intrs: bool = False):
    """
    Filter a Sample/Regard frame by membership in the intersectional set.

    The intersectional samples are read from column 0 of the tab-separated
    file ``pre_annotated_int.csv``.

    Args:
        df: Frame to filter; only used when ``is_vader`` is True. It must
            have ``Sample`` and ``Regard`` columns.
        is_vader: When True, filter ``df``; otherwise load the AMT results
            with ``get_turkSR()``.
        is_intrs: When False (default), drop the intersectional samples.
            When True, keep only the intersectional samples.

    Returns:
        A new DataFrame (original index labels preserved) whose ``Regard``
        column has been converted with ``int``. The input frame is left
        untouched.
    """
    turkSR = df if is_vader else get_turkSR()

    intersectional_samples_df = pd.read_csv("drive/My Drive/csc699/pre_annotated_int.csv",
                                            header=None,
                                            sep="\t")

    # One vectorised membership test instead of scanning the reference
    # array once per row.
    is_intersectional = turkSR.Sample.isin(set(intersectional_samples_df[0]))
    keep = is_intersectional if is_intrs else ~is_intersectional

    # Work on a copy: dropping rows in place would also modify the frame
    # that the caller passed in.
    filtered = turkSR.loc[keep].copy()
    filtered["Regard"] = filtered["Regard"].apply(int)  # Convert score to integer.
    return filtered

def fleiss_kappa(ratings: list = None, n: int = 3, k: int = 3):
    '''
    https://gist.github.com/ShinNoNoir/4749548
    Computes the Fleiss' kappa measure for assessing the reliability of 
    agreement between a fixed number n of raters when assigning categorical
    ratings to a number of items.

    Ratings whose category equals 2 (the N/A code) are ignored.
    NOTE(review): an item that received an N/A rating then contributes fewer
    than n counted ratings while the formula still divides by n -- confirm
    that this is the intended treatment.
    
    Args:
        ratings: a list of tuples [(item1, category2), (item1, category1) ...]
        n: number of raters
        k: number of categories (unused; the categories are taken from the
           data, the parameter is kept for backward compatibility)
    Returns:
        the Fleiss' kappa score, or NaN when it is undefined (no usable
        ratings, or every usable rating falls into a single category)
    
    See also:
        http://en.wikipedia.org/wiki/Fleiss'_kappa
    '''
    from collections import Counter

    if ratings is None:
        ratings = []

    # n_ij[(item, category)] = number of raters who put item into category.
    n_ij = Counter((i, c) for i, c in ratings if c != 2)
    items = {i for i, _ in n_ij}
    categories = {c for _, c in n_ij}

    N = len(items)
    if N == 0:
        return float('nan')

    # Share of all assignments that went to each category.
    p_j = {c: sum(n_ij[(i, c)] for i in items) / (1.0*n*N)
           for c in categories}

    # Extent of agreement among the raters for each item.
    P_i = {i: (sum(n_ij[(i, c)]**2 for c in categories) - n) / (n*(n-1.0))
           for i in items}

    P_bar = sum(P_i.values()) / (1.0*N)
    P_e_bar = sum(p_j[c]**2 for c in categories)

    if P_e_bar == 1:
        # Only one category was ever used: chance agreement is total and
        # kappa is 0/0.
        return float('nan')

    return (P_bar - P_e_bar) / (1 - P_e_bar)

def get_combo_df(is_intrs: bool = False):
    """
    Build the per-annotation frame used for the worker-level agreement stats.

    Loads the full AMT results, renames the label/text columns to
    ``Regard``/``Sample``, filters by the intersectional set and keeps only
    the ``WorkerId``, ``Regard`` and ``Sample`` columns (in file order).

    Args:
        is_intrs: Forwarded to ``remove_intersectional``: False drops the
            intersectional samples, True keeps only them.

    Returns:
        A DataFrame whose previous index is exposed as an ``index`` column,
        followed by the kept columns.
    """
    turkSIDR = get_turkSR(df=True)
    turkSIDR = turkSIDR.rename(columns={'Answer.regard.label': 'Regard',
                                        'Input.text': 'Sample'})
    # Both former branches made the same call and differed only in the flag.
    turkSIDR = remove_intersectional(df=turkSIDR,
                                     is_vader=True,
                                     is_intrs=is_intrs)
    keep = ['WorkerId', 'Regard', 'Sample']
    turkSIDR_keep = turkSIDR[[col for col in turkSIDR if col in keep]].copy()
    turkSIDR_keep.reset_index(level=0, inplace=True)
    return turkSIDR_keep
    
def get_combo_dict(turk_df):
    """
    Map every worker to the score pairs they share with co-annotators.

    Args:
        turk_df: Frame with ``Sample``, ``WorkerId`` and ``Regard`` columns.

    Returns:
        ``{worker_id: {sample: [((id_a, score_a), (id_b, score_b)), ...]}}``
        where each pair involves ``worker_id``. Samples on which any
        annotator answered N/A (score 2) are left out. Workers appear in
        order of first occurrence in ``turk_df``.
    """
    # Rows: samples, columns: workers, cells: that worker's score (NaN when
    # the worker did not rate the sample).
    ratings = pd.pivot_table(turk_df,
                             index='Sample',
                             columns='WorkerId',
                             values='Regard')
    # pd.unique keeps first-seen order, so the result is reproducible
    # (a set would order the workers differently from run to run).
    worker_ids = list(pd.unique(turk_df.WorkerId))
    wid_dict = {wid: dict() for wid in worker_ids}
    for worker_id in worker_ids:
        samples = list(turk_df.Sample[turk_df.WorkerId == worker_id])
        for sample in samples:
            row = ratings.loc[sample].dropna()  # Only the workers who rated it.
            if 2 in row.values:  # Remove N/A.
                continue
            tups = list(zip(row.index, row.values))
            wid_dict[worker_id][sample] = [
                pair for pair in combinations(tups, 2)
                if worker_id in (pair[0][0], pair[1][0])
            ]
    return wid_dict

def get_worker_agreement_percentage(wid_dict: dict,
                                    worker_id: str,
                                    judges: int = 3):
    """
    Print how often a worker's score matches a co-annotator's score.

    NOTE(review): only the first pair stored for each sample is compared,
    i.e. the worker is checked against one co-annotator per sample even
    though the message mentions ``judges - 1`` annotators.

    Args:
        wid_dict: Output of ``get_combo_dict``.
        worker_id: Worker to report on.
        judges: Number of annotators per sample (used in the message only).
    """
    # First stored pair of every sample that has at least one pair.
    first_pairs = [combos[0] for combos in wid_dict[worker_id].values() if combos]
    if not first_pairs:
        # Previously a ZeroDivisionError/IndexError.
        print(f'Worker {worker_id} has no co-annotated samples.\n')
        return
    cnt = sum(1 for first, second in first_pairs if first[1] == second[1])
    print(f'Worker {worker_id} agrees with the other {judges-1} annotators', 
          f'{round(cnt/len(first_pairs) * 100)}% of the time.\n')

def get_id_dict(df, 
                wid_dict: dict,
                worker_id: str):
    """
    Collect, per co-annotator, the score pairs shared with ``worker_id``.

    Only the first pair stored for each sample is used. The pair of scores
    is stored in the order in which it appears in ``wid_dict``.

    Returns:
        A tuple ``(non_empty, all_ids)``: a dict restricted to the workers
        with at least one score pair, and a Series holding a (possibly
        empty) list for every worker id found in ``df.WorkerId``.
    """
    per_worker = pd.Series({wid: [] for wid in df.WorkerId.values})
    for combos in wid_dict[worker_id].values():
        # combos[0] looks like ((id_a, score_a), (id_b, score_b)).
        (first_id, first_score), (second_id, second_score) = combos[0]
        scores = (first_score, second_score)
        # File the pair under whichever member is not the worker itself.
        if first_id != worker_id:
            per_worker[first_id].append(scores)
        elif second_id != worker_id:
            per_worker[second_id].append(scores)
    non_empty = {wid: pairs for wid, pairs in per_worker.items() if pairs}
    return non_empty, per_worker

def get_id_apiaa(id_dict: dict):
    """
    Average the Spearman correlations between a worker and each partner.

    Args:
        id_dict: ``{partner_id: [(score, score), ...]}`` as returned by
            ``get_id_dict``. Rows are samples, the two columns are the two
            judges' scores (``spearmanr`` is called with ``axis=0``).

    Returns:
        ``(apiaa, rhos_)``: the mean of the usable correlations and the
        list of those correlations. Partners with a single shared sample or
        a NaN correlation are skipped.
        NOTE(review): the mean is only reported when more than one usable
        correlation exists; with zero or one the result is 0.
    """
    rhos_ = []
    # Silence RuntimeWarnings only for the duration of the computation;
    # the previous filters are restored when the block exits.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=RuntimeWarning)
        for partner_id, score_pairs in id_dict.items():
            # Correlation for all pairs co-scored with this partner.
            if len(score_pairs) > 1:
                rho, pval = stats.spearmanr(score_pairs, 
                                            axis=0, 
                                            nan_policy='omit')
                if not np.isnan(rho):
                    rhos_.append(rho)
    n = len(rhos_) # Accounting for nan changing number of annotators.
    if n > 1:
        apiaa = sum(rhos_)/n # Mean of the pairwise correlations.
    else:
        apiaa = 0
    return apiaa, rhos_

def compute_apiaa(turk_df, wid_dict, judges: int = 3):
    """
    Average Pairwise Inter-Annotator Agreement over all workers.

    APIAA: https://arxiv.org/pdf/2003.04866.pdf
    AKA IAA-1: https://www.aclweb.org/anthology/D16-1235.pdf

    Returns:
        ``(mean_apiaa, apiaa_series)``: the per-worker APIAA values averaged
        over every worker in ``wid_dict``, and a Series holding each
        worker's list of pairwise correlations. ``judges`` is not used.
    """
    apiaa_series = pd.Series(zip([], list(wid_dict)))  # Starts out empty.
    worker_scores = []
    for worker_id in list(wid_dict):
        partner_pairs, _ = get_id_dict(df=turk_df,
                                       wid_dict=wid_dict,
                                       worker_id=worker_id)
        worker_apiaa, worker_rhos = get_id_apiaa(partner_pairs)
        worker_scores.append(worker_apiaa)
        apiaa_series[worker_id] = worker_rhos
    return sum(worker_scores)/len(wid_dict), apiaa_series

def get_amiaa(turk_df, wid_dict):
    """
    Average Mean Inter-Annotator Agreement
    AMIAA: https://arxiv.org/pdf/2003.04866.pdf
    AKA IAA-2: https://www.aclweb.org/anthology/D16-1235.pdf

    For every worker, correlate (Spearman) the worker's own scores with the
    mean score of the co-judges on the same samples, then average these
    correlations over the total number of workers.

    Args:
        turk_df: Unused; kept so existing calls keep working.
        wid_dict: Output of ``get_combo_dict``; every stored pair contains
            the worker and one co-judge.

    Returns:
        The AMIAA value. Workers whose correlation is undefined (fewer than
        two samples, or NaN) contribute nothing to the sum but still count
        in the divisor.
    """
    total_workers = len(wid_dict)
    amiaa = 0
    # Silence RuntimeWarnings locally and restore the previous filters on
    # exit.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=RuntimeWarning)
        for i in wid_dict:
            mu_i = []
            s_i = []
            for sample, score_pairs in wid_dict[i].items():
                # Scores of the co-judges: in each pair, the member that is
                # not i. (Looking only at the second member of each pair
                # drops every co-judge stored before i.)
                partner_scores = [judge[1]
                                  for pair in score_pairs
                                  for judge in pair
                                  if judge[0] != i]
                # Judge i's own score for this sample.
                own_scores = [judge[1]
                              for pair in score_pairs
                              for judge in pair
                              if judge[0] == i]
                if not partner_scores or not own_scores:
                    continue
                s_i.append(own_scores[0])
                mu_i.append(sum(partner_scores)/len(partner_scores))
            if len(s_i) < 2:
                continue # A correlation needs at least two samples.
            # Correlate i's scores for i's samples w/ avg of partner ratings.
            rho_i, _ = stats.spearmanr(s_i, mu_i)
            if not np.isnan(rho_i): # NaN likely due to no variance over small # of samples.
                # Average the mean IAA by total number of annotators.
                amiaa += rho_i/total_workers
    return amiaa

def display_wid_mat(worker_id, turk_df):
    """
    Build a matrix of every score given to the samples a worker annotated.

    Args:
        worker_id: Worker whose samples are shown.
        turk_df: Frame with ``Sample``, ``WorkerId`` and ``Regard`` columns.

    Returns:
        A DataFrame with one column per sample of ``worker_id`` (numbered
        from 0). The first row, ``Sample``, holds the sample text; every
        further row is a worker who rated at least one of those samples,
        with ``'°'`` marking samples that worker did not rate. An empty
        DataFrame is returned when the worker has no samples.
    """
    ratings = pd.pivot_table(turk_df,
                             index='Sample',
                             columns='WorkerId',
                             values='Regard')
    samples = list(turk_df.Sample[turk_df.WorkerId == worker_id])
    rows = []
    for sample in samples:
        if sample not in ratings.index:
            # Explicit skip instead of a bare ``except: pass`` that could
            # re-append the previous sample's row.
            continue
        # One-row frame holding only the workers who rated this sample.
        row = ratings.loc[[sample]].dropna(axis=1)
        # Drop the 'WorkerId' axis label and expose the sample text as a
        # regular 'Sample' column.
        rows.append(row.rename_axis(None, axis=1).reset_index())
    if not rows:
        return pd.DataFrame()
    # Union of all raters as columns, one row per sample, then transpose so
    # that samples become columns.
    ndf = pd.concat(rows, ignore_index=True)
    return ndf.T.fillna('°')

def get_masked_clear_df(clear: str = "sampledv2.csv"):
    """
    Load the unmasked sample texts into a one-column (``Sample``) frame.

    Each cell in the first data row of the file is evaluated as a Python
    list of strings. The last two characters of every string are cut off,
    trailing whitespace is stripped and double quotes become single quotes.
    """
    raw = pd.read_csv(f"drive/My Drive/csc699/{clear}") #No intersectional.
    cleaned = []
    for cell in raw.loc[0, :]:
        # NOTE(review): eval executes whatever the file contains; only use
        # this with files you produced yourself.
        for text in list(eval(cell)):
            cleaned.append(text[:-2].rstrip().replace("\"", "'"))
    return pd.DataFrame({"Sample": cleaned})

def get_mask_clear_lst(ndf, 
                       clear: str = "sampledv2.csv"):
    """
    Pair each masked sample in ``ndf`` with its unmasked text.

    ``ndf`` is expected in the layout produced by ``display_wid_mat``
    (first row ``Sample``, one column per sample). For every unmasked text,
    the first masked sample whose text after its first four characters
    occurs inside the unmasked text is overwritten in row 0. The masked
    texts are then inserted as an extra ``Sample_XYZ`` row right below
    ``Sample``.
    """
    unmasked = get_masked_clear_df(clear)
    masked_texts = ndf.T.Sample.values.tolist()
    unmasked_texts = unmasked.Sample.values.tolist()
    masked_column = pd.Series(ndf.T.Sample)
    for full_text in unmasked_texts:
        for col, masked_text in enumerate(masked_texts):
            if masked_text[4:] in full_text:
                ndf.iloc[0, col] = full_text
                break
    by_sample = ndf.T
    by_sample.insert(loc=1, column='Sample_XYZ', value=masked_column)
    return by_sample.T

### Get APIAA, AMIAA, Fleiss' kappa

In [None]:
# Per-annotation frame without the intersectional samples, and the
# per-worker dictionary of score pairs derived from it.
turk_df = get_combo_df(is_intrs = False)
wid_dict = get_combo_dict(turk_df)

In [None]:
# Inspect a single worker. Uncomment the next line to pick one at random.
#worker_id = random.choice(turk_df.WorkerId.values)
worker_id = "A3E18P15E4GSG6"

# Score pairs shared with each partner, and the correlations computed from them.
ids, ids_series = get_id_dict(df=turk_df, wid_dict=wid_dict, worker_id=worker_id)
apiaa, rhos_ = get_id_apiaa(ids)
n = len(rhos_)
get_worker_agreement_percentage(wid_dict=wid_dict, worker_id=worker_id, judges=3)
#display(wid_dict[worker_id][random.choice(list(list(wid_dict[worker_id])))])
# Report the mean whenever at least one usable correlation exists.
if n > 0:
    print(f'\nAverage pairwise correlation for samples worked on by {worker_id}: {sum(rhos_)/n}.\n')
#display(wid_dict[worker_id])

In [None]:
ids_df = pd.DataFrame(ids_series)
# Partners with at least one shared score pair. The result of this
# expression is not assigned; only the last expression of the cell is shown.
ids_df[ids_df[0].str.len() != 0]
# One row per sample of the selected worker, holding that sample's pairs.
pd.DataFrame(wid_dict[worker_id]).T

In [None]:
# Dataset-level agreement: APIAA (pairwise) and AMIAA (worker vs. mean of
# co-judges), both rounded to two decimals for display.
apiaa, apiaa_series = compute_apiaa(turk_df, wid_dict, judges=3)
print('\nAverage pairwise Spearman\'s correlation (APIAA) for all workers with partitioned dataset:\n', 
      round(apiaa, 2), '\n')
#display(apiaa_series[worker_id])
amiaa = get_amiaa(turk_df, wid_dict)
print('\nAverage Mean Spearman\'s correlation (AMIAA):\n', 
      round(amiaa, 2))

In [None]:
# Sample/Regard rows without intersectional samples; each row is passed to
# fleiss_kappa as an (item, category) rating.
turkSR = remove_intersectional()
print("Fleiss' kappa:\n", fleiss_kappa(turkSR.values.tolist()))

In [None]:
# Pick a random worker (weighted by number of annotations, since WorkerId
# repeats once per annotation) and build that worker's score matrix.
worker_id = random.choice(turk_df.WorkerId.values)
#worker_id = 'A3E18P15E4GSG6'
print(worker_id, "\n")
ndf = display_wid_mat(worker_id, turk_df).iloc[:, :]

In [None]:
display(ndf)
# Drop every column (sample) in which any annotator answered N/A (code 2).
ndf = ndf.loc[:, ~(ndf == 2).any()]
ndf

In [None]:
# Show the unmasked text next to each masked (XYZ) sample.
get_mask_clear_lst(ndf)

## VADER vs. majority regard correlations:

Compress three annotators' scores to majority.

In [None]:
def vote(arr):
    """
    Choose the majority score or discard the sample.

    Only the valences 0, -1 and 1 are counted; any other value (such as the
    N/A code 2) is ignored.

    Args:
        arr: The scores given to one sample; must provide ``tolist()``.

    Returns:
        The valence with the most votes, or None when the top count is
        shared by two or more valences (this includes the case of no valid
        vote at all).
    """
    values = arr.tolist()
    counts = [(label, values.count(label)) for label in (0, -1, 1)]
    best = max(count for _, count in counts)
    winners = [label for label, count in counts if count == best]
    # A tie for first place (three-way, or two-way when one annotator
    # answered N/A) has no majority: discard it.
    return winners[0] if len(winners) == 1 else None

def get_majority_df(intersectional: bool = False):
    """
    Return one row per sample with the annotators' majority regard.

    Args:
        intersectional: When True, use only the intersectional samples;
            otherwise use only the non-intersectional ones.

    Returns:
        A DataFrame with ``Sample`` and integer ``Regard`` columns. Samples
        for which ``vote`` found no majority are removed.
    """
    # remove_intersectional covers both cases through is_intrs; there is
    # no separate remove_non_intersectional function.
    turkSR = remove_intersectional(is_intrs=intersectional)
    reference_df = pd.DataFrame(turkSR.groupby("Sample", 
                                               sort=False).aggregate({"Regard": vote}))
    reference_df.reset_index(level=0, inplace=True) # Fix headers.
    reference_df.dropna(inplace=True) # Remove ties.
    reference_df.Regard = reference_df.Regard.apply(int)
    return reference_df

In [None]:
# Majority regard per non-intersectional sample.
reference_df = get_majority_df()

### Write formatted AMT to file.

In [None]:
# Per-annotation Sample/Regard rows without the intersectional samples.
remove_intersectional().to_csv("drive/My Drive/csc699/my_turk_no_intersectional.csv", index=False)
#remove_non_intersectional().to_csv("drive/My Drive/csc699/my_turk_intersectional.csv", index=False)

In [None]:
# Majority-vote labels with tied samples removed.
get_majority_df().to_csv("drive/My Drive/csc699/my_turk_majority_no_ties.csv", index=False)
#get_majority_df(True).to_csv("drive/My Drive/csc699/my_turk_majority_int_no_ties.csv", index=False)

### Compare VADER and human scores.

In [None]:
def get_vader_df():
    """
    Load the VADER samples and labels into one Sample/Regard frame.

    The two files are joined column-wise (row i of one file next to row i
    of the other) and the intersectional samples are then removed.
    """
    samples = (pd.read_csv("drive/My Drive/csc699/combined_ano_xyz.csv")
                 .rename(columns={"text": "Sample"}))
    labels = (pd.read_csv("drive/My Drive/csc699/combined_ano_labs.csv")
                .rename(columns={"text": "Regard"}))
    side_by_side = pd.concat([samples, labels], axis=1)
    return remove_intersectional(side_by_side, is_vader=True)

def get_vader_turk_lists():
    """
    Create lists for comparison of sentiment, regard.

    Returns:
        ``(vaderlst, turklst)``: the VADER rows whose sample also has a
        majority regard, and the majority-regard rows, both sorted by their
        first element (the sample text).
        NOTE(review): callers zip the two lists, which pairs the right rows
        only if both contain exactly the same samples, once each.
    """
    regard_df = get_majority_df()
    vader_df = get_vader_df()
    # A set gives constant-time membership tests.
    voted_samples = set(regard_df.Sample)
    vaderlst = [row for row in vader_df.values.tolist()
                if row[0] in voted_samples]
    turklst = regard_df.values.tolist()
    return (sorted(vaderlst, key=lambda t: t[0]), 
            sorted(turklst, key=lambda t: t[0]))

def print_vader_turk(n: int = None):
    """
    Print each sample with its MTurk regard and its VADER sentiment.

    Args:
        n: Number of pairs to print; all pairs when omitted (or falsy).
    """
    vaderlst, turklst = get_vader_turk_lists()
    limit = n if n else len(vaderlst)
    print("\n")
    for turk_row, vader_row in zip(turklst[:limit], vaderlst[:limit]):
        sample, regard = turk_row[0], turk_row[1]
        sentiment = vader_row[1]
        print(f"{sample}\n  MTurk Regard: {regard}\n  VADER Sentiment: {sentiment}")

def show_regard_props():
    """Print how many majority labels are positive, neutral and negative."""
    regard = get_majority_df().Regard
    selections = (('# Positive: ', regard > 0),
                  ('# Neutral: ', regard == 0),
                  ('# Negative: ', regard < 0))
    for heading, mask in selections:
        print(heading, regard[mask].count())

In [None]:
# Class balance of the majority labels.
show_regard_props()
# print_vader_turk(n = 10)

In [None]:
def show_sent_reg_corr():
    """
    Print Spearman's rho and p-value between regard and VADER sentiment.

    Three correlations are reported: all samples, samples containing
    "XYZ was" (reported as Respect) and all remaining samples (reported as
    Occupation).
    """
    vaderlst, turklst = get_vader_turk_lists()
    # (sample text, (regard, sentiment)) for every aligned pair of rows.
    scored = [(turk[0], (turk[1], vader[1]))
              for turk, vader in zip(turklst, vaderlst)]
    both = [scores for _, scores in scored]
    respect = [scores for text, scores in scored if "XYZ was" in text]
    occupation = [scores for text, scores in scored if "XYZ was" not in text]

    r, p = stats.spearmanr(both)
    respect_rho, respect_pval = stats.spearmanr(respect)
    occupation_rho, occupation_pval = stats.spearmanr(occupation)

    divider = "\n___________\n"
    print('VADER vs. Regard (Both) rho:\n', r)
    print('VADER vs. Regard (Both) p-value:\n', p)
    print(divider)
    print('VADER vs. Regard (Respect) rho:\n', respect_rho)
    print('VADER vs. Regard (Occupation) rho:\n', occupation_rho)
    print(divider)
    print('VADER vs. Regard (Respect) p-value:\n', respect_pval)
    print('VADER vs. Regard (Occupation) p-value:\n', occupation_pval)

In [None]:
# Correlate VADER sentiment with the majority regard labels.
show_sent_reg_corr()

# Significance Tests (Monte Carlo)

#### Definitions

In [None]:
def accuracy_score(y_true, y_pred):
    """
    Test statistic for classifier permutation test.

    Returns the number of positions at which the two sequences hold equal
    values, divided by the length of ``y_true``.
    """
    hits = 0
    for truth, guess in zip(y_true, y_pred):
        hits += truth == guess
    return hits / len(y_true)

def load_classifier_data(folder: str = "nlg-bias", 
                         typ: str = "regard", 
                         intrs: str = ""):
    """
    Load AMT test set.

    Reads the header-less, tab-separated file
    ``drive/My Drive/{folder}/data/{typ}/{intrs}test.tsv`` and returns the
    values of its first column as a list.
    """
    path = f"drive/My Drive/{folder}/data/{typ}/{intrs}test.tsv"
    table = pd.read_csv(path, sep="\t", header=None)
    return list(table.values[:, 0])

def classifier_bootstrap(data_A, data_B, n, R, amt="csc699"):
    """
    One-tailed paired bootstrap for comparing models.

    Args:
        data_A: Predictions of model A, aligned with the test labels.
        data_B: Predictions of model B, aligned with the test labels.
        n: Size of every bootstrap sample.
        R: Number of bootstrap samples.
        amt: Folder passed to ``load_classifier_data``.

    Returns:
        The share of bootstrap samples in which A's accuracy advantage over
        B is more than twice the advantage observed on the full test set.
    """
    y_true = load_classifier_data(amt)
    acc_a = accuracy_score(y_true, data_A)
    acc_b = accuracy_score(y_true, data_B)
    delta_orig = 2 * (acc_a - acc_b)
    population = range(min(len(y_true), len(data_A), len(data_B)))
    cnt = 0
    for _ in range(R):
        # Resample test items (with replacement) and take the label and
        # BOTH predictions of each drawn item. Drawing the predictions
        # independently would compare them with labels of other items.
        picks = random.choices(population, k=n)
        labels = [y_true[ix] for ix in picks]
        acc_a = accuracy_score(labels, [data_A[ix] for ix in picks])
        acc_b = accuracy_score(labels, [data_B[ix] for ix in picks])
        if acc_a - acc_b > delta_orig:
            cnt += 1
    pval = float(cnt)/float(R)
    return pval

def classifier_permutation_test(data_A, data_B, n, R, amt="csc699"):
    """
    Two-tailed paired permutation test for comparing models.

    Under the null hypothesis the two models are interchangeable, so for
    every test item the predictions of A and B may be swapped. Each round
    swaps them with probability 1/2 per item and recomputes the absolute
    difference in the number of correct predictions.

    Args:
        data_A: Predictions of model A, aligned with the test labels.
        data_B: Predictions of model B, aligned with the test labels.
        n: Number of test items to use (the first ``n``).
        R: Number of random rounds.
        amt: Folder passed to ``load_classifier_data``.

    Returns:
        ``(cnt + 1) / (R + 1)`` where ``cnt`` is the number of rounds whose
        absolute difference is at least as large as the observed one.
    """
    y_true = load_classifier_data(amt)
    # Per item: +1 if only A is correct, -1 if only B is correct, else 0.
    # Swapping the two predictions of an item flips the sign of its entry.
    diffs = [int(pred_a == truth) - int(pred_b == truth)
             for truth, pred_a, pred_b in zip(y_true[:n], data_A[:n], data_B[:n])]
    # Counts are compared instead of accuracies: the common divisor cancels
    # and integer comparison avoids rounding issues.
    delta_orig = abs(sum(diffs))

    cnt = 0
    for _ in range(R):
        delta = abs(sum(d if random.random() < 0.5 else -d for d in diffs))
        if delta >= delta_orig:
            cnt += 1

    pval = float(cnt + 1)/float(R + 1)
    return pval

def data_permutation_test(data_A: list, 
                          data_B: list, 
                          R:int = 10000, 
                          demo1: str ='gay',
                          demo2: str = '',
                          main: str = 'A'):
    """
    Two-tailed permutation test on the difference of two means.

    Args:
        data_A: Table of scores with one column per demographic (must
            support ``data_A[name].tolist()`` and ``.value_counts()``).
        data_B: Second table with the same layout.
        R: Number of random permutations.
        demo1: Demographic (column) to test.
        demo2: When empty, ``demo1`` is compared between ``data_A`` and
            ``data_B``. Otherwise ``demo1`` and ``demo2`` are compared
            within a single table.
        main: Which table to use when ``demo2`` is given: 'A' selects
            ``data_A``, anything else selects ``data_B``.

    Returns:
        ``(pval, deltas, delta_orig)``: the p-value ``(cnt + 1) / (R + 1)``
        where ``cnt`` counts the permutations whose absolute mean difference
        is at least the observed one, the list of permuted differences, and
        the observed difference.
    """
    if not demo2:
        # Compare same demographic between datasets A and B.
        A = data_A[demo1].tolist()
        B = data_B[demo1].tolist()
    else:
        # Compare pair of demographics within one dataset.
        print(f'{main} counts:\n')
        source = data_A if main == 'A' else data_B
        print(f'{demo1}: {source[demo1].value_counts(normalize=True) * 100}\n')
        print(f'{demo2}: {source[demo2].value_counts(normalize=True) * 100}')
        A = source[demo1].tolist()
        B = source[demo2].tolist()

    a = np.mean(A)
    b = np.mean(B)
    
    delta_orig = np.abs(b - a)

    # The pool is the same for every round; build it once and reshuffle.
    pool = np.concatenate((A, B))
    split = len(A)

    cnt = 0
    deltas = []
    for r in range(R):
        np.random.shuffle(pool)
        
        sa = np.mean(pool[:split])
        sb = np.mean(pool[split:])
        
        delta = np.abs(sb - sa)
        deltas.append(delta)
        # "At least as extreme": with a strict > identical groups (all
        # differences equal to the observed 0) would yield the smallest
        # possible p-value.
        if delta >= delta_orig:
            cnt += 1
            
    pval = float(cnt + 1)/float(R + 1)
    return pval, deltas, delta_orig

def get_context_dict(filename: str = "diff_regard1_small_gpt2_generated_samples.tsv_labeled"):
    """
    Get scores by context, demographic.

    Returns ``{'respect': {demo: [score, ...]}, 'occupation': {...}}``. A
    row's score is added to a context/demographic list when the row's text
    contains both one of that context's template stems (template text with
    its first four characters removed) and the demographic string.
    """
    demo_lst = pd.read_csv("drive/My Drive/csc699/demographics.txt", header=None)[0].tolist()
    templates = pd.read_csv("drive/My Drive/csc699/templates.txt", sep=",")
    temp_df = (templates.groupby('bias_context')
                        .aggregate(lambda col: [text[4:] for text in list(col)])
                        .reset_index())
    # Row 1 of the grouped table is used for respect, row 0 for occupation.
    contexts = (('respect', temp_df.iloc[1, 1]),
                ('occupation', temp_df.iloc[0, 1]))
    gpt2_df = pd.read_csv(f"drive/My Drive/nlg-bias/data/generated_samples/regard_diff/{filename}.tsv", 
                          header=None, sep="\t")
    dct = {name: {demo: [] for demo in demo_lst} for name, _ in contexts}
    for score, text in gpt2_df.values:
        for name, stems in contexts:
            for stem in stems:
                if stem not in text:
                    continue
                for demo in demo_lst:
                    if demo in text:
                        dct[name][demo].append(score)
    return dct

def get_ratio(samples: list, 
              polarity: str = 'neg'):
    """
    Compute the fraction of samples of the given polarity.

    ``polarity`` is one of 'pos', 'neu', 'neg' (classes 1, 0, -1). Every
    sample is converted with ``int`` before it is compared.
    """
    target = {'pos': 1, 'neu': 0, 'neg': -1}[polarity]
    matching = sum(1 for score in samples if int(score) == target)
    return matching / len(samples)

def class_permutation_test(dct: dict, 
                           dct2: dict,
                           R:int = 10000, 
                           compare_contexts: bool = False,
                           compare_data: bool = False,
                           demo1: str ='gay',
                           demo2: str = 'straight',
                           polarity: str = 'neg',
                           context: str = ''):
    """
    Two-tailed permutation test on the share of one polarity class.

    Args:
        dct: ``{context: {demographic: [score, ...]}}`` for the first
            dataset (see ``get_context_dict``).
        dct2: Same structure for the second dataset.
        R: Number of random permutations.
        compare_contexts: With ``compare_data`` False, compare ``demo1``
            between the respect and occupation contexts of ``dct``.
        compare_data: Compare ``demo1`` between ``dct`` and ``dct2``, in
            ``context`` only, or in both contexts pooled when
            ``compare_contexts`` is also True.
        demo1: First demographic, e.g. 'gay' or 'woman'.
        demo2: Second demographic; used only when both flags are False, in
            which case ``demo1`` and ``demo2`` are compared within
            ``context`` of ``dct``.
        polarity: 'neg', 'pos' or 'neu'.
        context: 'respect' or 'occupation'.

    Returns:
        ``(pval, deltas, delta_orig)``: the p-value ``(cnt + 1) / (R + 1)``,
        the permuted absolute ratio differences and the observed one.
        Neither ``dct`` nor ``dct2`` is modified.
    """
    demoA = f'The {demo1} person' if demo1[-2:] != 'an' else f'The {demo1}'
    demoB = f'The {demo2} person' if demo2[-2:] != 'an' else f'The {demo2}'
    # Named so that it does not shadow the pandas alias ``pd``.
    polarity_names = {'neg': 'negative', 'pos': 'positive', 'neu': 'neutral'}

    def describe(p, d, c, s):
        return (f'Ratio of {polarity_names[p].capitalize()} scores for the {d.capitalize()} demographic' +
                f' in the {c.capitalize()} context: {s * 100:.1f}')

    if not compare_data and compare_contexts:
        print(f'Comparing {demo1.capitalize()} between the Respect and Occupation contexts.')
        A = dct['respect'][demoA]
        B = dct['occupation'][demoA]
    elif compare_data and not compare_contexts:
        print(f'Comparing \"{demoA.capitalize()}\" demographic between GPT-2 and LM1B in the {context.capitalize()} context.')
        A = dct[context][demoA]
        B = dct2[context][demoA]
    elif compare_data and compare_contexts:
        print(f'Comparing \"{demoA.capitalize()}\" demographic between GPT-2 and LM1B in both contexts.')
        # Build new lists: extending the stored lists in place would grow
        # the caller's dictionaries on every call.
        A = list(dct['respect'][demoA]) + list(dct['occupation'][demoA])
        B = list(dct2['respect'][demoA]) + list(dct2['occupation'][demoA])
    else: # if not compare_data and not compare_contexts
        A = dct[context][demoA]
        B = dct[context][demoB]

    a = get_ratio(A, polarity)
    b = get_ratio(B, polarity)
    
    if not compare_data and not compare_contexts:
        print(describe(polarity, demo1, context, a))
        print(describe(polarity, demo2, context, b))

    delta_orig = np.abs(b - a)

    # The pool is the same for every round; build it once and reshuffle.
    pool = np.concatenate((A, B))
    split = len(A)

    cnt = 0
    deltas = []
    for r in range(R):
        np.random.shuffle(pool)

        sa = get_ratio(pool[:split], polarity)
        sb = get_ratio(pool[split:], polarity)

        delta = np.abs(sb - sa)
        deltas.append(delta)
        if delta >= delta_orig:
            cnt += 1
            
    pval = float(cnt + 1)/float(R + 1)
    return pval, deltas, delta_orig

def plot_deltas(deltas, 
                delta_orig, 
                polarity: str = 'neg', 
                demo1='gay', 
                demo2='straight',
                context='respect'):
    """
    Show differences between samples' and original's test statistic across permutations.

    Draws the distribution of ``deltas``, marks ``delta_orig`` with a short
    vertical line and a text label, saves the figure as
    ``{demo1}_{demo2}_{context}_{polarity}.png`` in the working directory
    and displays it.
    """
    d = {'neg': 'negative', 'pos': 'positive', 'neu': 'neutral'}
    deltas = list(deltas)
    # NOTE(review): recent seaborn releases deprecate distplot; confirm the
    # installed version still provides it before upgrading.
    p1 = sns.distplot(deltas)
    plt.title(f'{demo1.capitalize()}, {demo2.capitalize()}; ' + 
              f'{context.capitalize()} context; {d[polarity].capitalize()} valence')
    plt.ylabel('Frequency')
    # Raw strings: "\d" is not a valid escape sequence in a normal string.
    plt.xlabel(r'$\delta(X)$')
    # Height of the tallest bar, used to place the label.
    h = max(patch.get_height() for patch in p1.patches)
    p1.text(delta_orig * .95, 
            h * .25, 
            s=rf'$\delta(x)$ = {delta_orig:.1f}', 
            rotation=0, 
            horizontalalignment='center', 
            verticalalignment='center')
    p1.axvline(x=delta_orig, 
               ymax=.2, 
               color='orange')
    plt.savefig(f"{demo1}_{demo2}_{context}_{polarity}.png", transparent=True)
    plt.show()

def get_samples(filename: str = "diff_regard1_lm1b_generated_samples.tsv_labeled.tsv_scores.txt"):
    """
    Read a scores file into a frame with one column per demographic.

    Each line is expected to hold a name, two separator characters and a
    Python list literal; the text before the first "[" minus its last two
    characters becomes the column name and the list becomes the column.
    """
    path = ("drive/My Drive/nlg-bias/data/generated_samples/regard_diff/" + 
            f"{filename}")
    df = pd.DataFrame([])
    with open(path) as infile:
        for line in infile.read().splitlines():
            bracket = line.index("[")
            name = line[:bracket][:-2]
            # NOTE(review): eval executes whatever the file contains; only
            # use this with files you produced yourself.
            df[name] = eval(line[bracket:])
    return df

def get_context_dict2(filename: str = "diff_regard1_small_gpt2_generated_samples.tsv_labeled"):
    """
    Get samples, scores by context, demographic.

    Returns ``{'respect': {demo: [(score, text), ...]}, 'occupation': {...}}``.
    A row is added to a context/demographic list when its text contains both
    one of that context's template stems (template text with its first four
    characters removed) and the demographic string.
    """
    demo_lst = pd.read_csv("drive/My Drive/csc699/demographics.txt", header=None)[0].tolist()
    templates = pd.read_csv("drive/My Drive/csc699/templates.txt", sep=",")
    temp_df = (templates.groupby('bias_context')
                        .aggregate(lambda col: [text[4:] for text in list(col)])
                        .reset_index())
    # Row 1 of the grouped table is used for respect, row 0 for occupation.
    contexts = (('respect', temp_df.iloc[1, 1]),
                ('occupation', temp_df.iloc[0, 1]))
    gpt2_df = pd.read_csv(f"drive/My Drive/nlg-bias/data/generated_samples/regard_diff/{filename}.tsv", 
                          header=None, sep="\t")
    dct = {name: {demo: [] for demo in demo_lst} for name, _ in contexts}
    for score, text in gpt2_df.values:
        for name, stems in contexts:
            for stem in stems:
                if stem not in text:
                    continue
                for demo in demo_lst:
                    if demo in text:
                        dct[name][demo].append((score, text))
    return dct

In [None]:
# Scores by context and demographic: dct for the GPT-2 samples, dct2 for
# the LM1B samples.
dct = get_context_dict(filename='mine/diff_regard1_my_gpt2_generated_samples.tsv_labeled')
dct2 = get_context_dict(filename = 'mine/diff_regard1_my_lm1b_samples.tsv_labeled')

In [None]:
# Same files, but keeping the sample text next to each score.
gpt2 = get_context_dict2(filename='mine/diff_regard1_my_gpt2_generated_samples.tsv_labeled')
lm1b = get_context_dict2(filename = 'mine/diff_regard1_my_lm1b_samples.tsv_labeled')

In [None]:
# (score, text) rows for "The woman" in the respect context; column 0 is
# the score and column 1 the text.
gpt2_df = pd.DataFrame(gpt2['respect']['The woman'])
lm1b_df = pd.DataFrame(lm1b['respect']['The woman'])

In [None]:
# Draw five example rows from each model's samples.
gptg = []
lmbg = []
# LM1B: five distinct rows whose text contains neither "ERROR" nor "<".
# NOTE(review): this loops forever if fewer than five such rows exist.
while len(lmbg) < 5:
    lran = random.choice(lm1b_df.values.tolist())
    if lran not in lmbg and "ERROR" not in lran[1] and "<" not in lran[1]:
        lmbg.append(lran)
# GPT-2: five random rows; duplicates are possible.
while len(gptg) < 5:        
    gptg.append(random.choice(gpt2_df.values.tolist()))

In [None]:
# LM1B rows whose text contains 'prostitute'.
lm1b_df[lm1b_df[1].str.contains('prostitute')]

In [None]:
# GPT-2 rows whose text contains 'prostitute'.
gpt2_df[gpt2_df[1].str.contains('prostitute')]

Display proportions of demo classes in context:

In [None]:
# Share of positive scores for "The gay person" in the occupation context:
# first the GPT-2 samples (dct), then the LM1B samples (dct2).
display(get_ratio(dct['occupation']['The gay person'], 'pos'))
display(get_ratio(dct2['occupation']['The gay person'], 'pos'))

In [None]:
# Raw scores behind the first ratio above.
dct['occupation']['The gay person']

#### Run sig tests:

In [None]:
# Settings for class_permutation_test and plot_deltas below.
polarity = 'pos'
demo1 = 'woman'
# demo2 is only used by the test when both compare_* flags are False.
demo2 = 'straight'
context = 'respect'
compare_contexts = False
compare_data = True
R = 1000

In [None]:
# Permutation test with the settings above; significance level 0.05.
pval, deltas, delta_orig = class_permutation_test(dct=dct, 
                                                  dct2=dct2,
                                                  R=R,
                                                  context=context, 
                                                  compare_contexts=compare_contexts,
                                                  compare_data=compare_data,
                                                  demo1=demo1, 
                                                  demo2=demo2, 
                                                  polarity=polarity)
if (float(pval) <= float(0.05)):
    print("\nSignificant: p-value: {}".format(pval))
else:
    print("\nNot significant: p-value: {}".format(pval))

In [None]:
# Independent-samples t-test on the respect-context scores of "The gay
# person": GPT-2 (dct) vs. LM1B (dct2).
_, pval = stats.ttest_ind(dct['respect']['The gay person'], 
                          dct2['respect']['The gay person'])
if (float(pval) <= float(0.05)):
    print("\nSignificant: p-value: {}".format(pval))
else:
    print("\nNot significant: p-value: {}".format(pval))
print('')
#stats.ttest_ind(gpt, lmb)

In [None]:
# Plot the deltas and delta_orig returned by class_permutation_test,
# passing along the same demographic/polarity/context settings.
plot_deltas(deltas, 
            delta_orig, 
            demo1=demo1, 
            demo2=demo2, 
            polarity=polarity, 
            context=context)

In [None]:
import ast  # local import: only this cell needs it, to parse the LSTM line safely

# BERT predictions: tab-separated file without a header; keep the first field of each row.
bert_p = [t[0] for t in pd.read_csv("drive/My Drive/csc699/models/regard/custom/test_predictions.txt",
                                    header=None,
                                    sep="\t").values.tolist()]

# LSTM predictions: the first line of the file is parsed as a Python literal.
# ast.literal_eval accepts literals only, so unlike eval it cannot execute
# arbitrary code that happens to be in the file.
with open("drive/My Drive/csc699/checkpoints/test_predictions_e_mine.txt") as lstm_file:
    lstm_p = ast.literal_eval(lstm_file.read().splitlines()[0])

# Second BERT prediction file (under custom/mine), read the same way as bert_p.
bert_bp = [t[0] for t in pd.read_csv("drive/My Drive/csc699/models/regard/custom/mine/test_predictions.txt",
                                     header=None,
                                     sep="\t").values.tolist()]

In [None]:
# Permutation test between the two classifiers' predictions, reported
# against the 0.05 significance level.
pval = classifier_permutation_test(
    bert_bp, lstm_p, len(bert_bp), 20000, amt="csc699"
)

report = ("\nSignificant: p-value: {}" if float(pval) <= 0.05
          else "\nNot significant: p-value: {}")
print(report.format(pval))

#### Sample scores exploration (GPT-2, LM1B after classification)

In [None]:
# Load the LM1B score file via get_samples (defined elsewhere). Later cells call
# .describe() on the result and index it by demographic name.
lmb = get_samples("diff_regard1_lm1b_generated_samples.tsv_labeled.tsv_scores.txt")

In [None]:
# Load the GPT-2 score file via get_samples (defined elsewhere). Later cells read
# its .gay / .straight columns and call .describe() on it.
gpt = get_samples("diff_regard1_small_gpt2_generated_samples.tsv_labeled.tsv_scores.txt")

In [None]:
# Summary statistics for both score tables: GPT-2 is displayed explicitly,
# LM1B is rendered as the cell's result.
display(gpt.describe())
lmb.describe()

In [None]:
# Paired (related-samples) t-test between the gay and straight GPT-2 scores,
# reported against the 0.05 significance level.
_, pval = stats.ttest_rel(gpt.gay, gpt.straight)

report = ("\nSignificant: p-value: {}" if float(pval) <= 0.05
          else "\nNot significant: p-value: {}")
print(report.format(pval))

In [None]:
# Independent two-sample t-test between the gay and straight GPT-2 scores,
# reported against the 0.05 significance level.
_, pval = stats.ttest_ind(gpt.gay, gpt.straight)

report = ("\nSignificant: p-value: {}" if float(pval) <= 0.05
          else "\nNot significant: p-value: {}")
print(report.format(pval))

In [None]:
# Demographic pair for the score comparisons below: both are passed to
# data_permutation_test, and demo1 also selects the column for the t-test.
demo1 = 'gay'
demo2 = 'straight'

In [None]:
# Permutation test across the two score tables for the selected demographic
# pair, reported against the 0.05 significance level.
pval, deltas, delta_orig = data_permutation_test(
    gpt, lmb, R=10000, demo1=demo1, demo2=demo2, main='A'
)

report = ("\nSignificant: p-value: {}" if float(pval) <= 0.05
          else "\nNot significant: p-value: {}")
print(report.format(pval))

In [None]:
# Independent t-test on the demo1 column of GPT-2 vs. LM1B, reported against
# the 0.05 significance level.
_, pval = stats.ttest_ind(gpt[demo1], lmb[demo1])

report = ("\nSignificant: p-value: {}" if float(pval) <= 0.05
          else "\nNot significant: p-value: {}")
print(report.format(pval))
print('')
# Full-table t-test; left as the last expression so it renders as the cell result.
stats.ttest_ind(gpt, lmb)

# VADER

### Setup

In [None]:
from IPython.display import clear_output

In [None]:
import random, copy
import pandas as pd
import numpy as np

In [None]:
# VADER analyzer class, plus the NLTK resources fetched at runtime:
# 'vader_lexicon' for the analyzer and 'punkt' for sent_tokenize.
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
nltk.download('punkt')
nltk.download('vader_lexicon')
from nltk.tokenize import sent_tokenize
# Clear this cell's output once setup has finished.
clear_output()

### Definitions

In [None]:
# Shared VADER analyzer instance; get_sentiment reads it as a module-level global.
analyser = SentimentIntensityAnalyzer()

In [None]:
# Demographic group names. The annotation loop enumerates this list and uses
# only the positions as row labels, so its length sets how many rows are labelled.
indices = ["Black", "White", "Man", "Woman", "Gay", "Straight"]
# The bare string below is a disabled alternative: the intersectional group names.
"""
indices = ["Black Woman", "White Woman", "Black Man", "White Man", 
            "Gay Man", "Lesbian", "Straight Man", "Straight Woman", 
            "Black Lesbian", "White Lesbian", "Gay Black Man", "Gay White Man",
            "Straight Black Woman", "Straight White Woman",
            "Straight Black man", "Straight White Man"]"""

In [None]:
def _vader_compound(sentence):
    """Return the compound polarity score of one sentence from the shared ``analyser``."""
    return analyser.polarity_scores(sentence)['compound']


def get_sentiment(sentences, *, threshold: float = 0.05, score_fn=None) -> list:
    """Score each sentence and append a heuristic polarity label to it.

    Args:
        sentences: Iterable of sentence strings. It is materialized once, so
            one-shot iterators and generators are labelled correctly.
        threshold: Score cut-off. Scores >= ``threshold`` are labelled 1,
            scores <= ``-threshold`` are labelled -1, everything in between 0.
        score_fn: Optional callable mapping a sentence to a numeric score.
            Defaults to the compound score of the module-level ``analyser``.

    Returns:
        A list of ``"<sentence> <label>"`` strings, in input order.
    """
    # The input is read twice conceptually (scored, then paired with its
    # label); materializing keeps both passes aligned for any iterable.
    sentences = list(sentences)
    if score_fn is None:
        score_fn = _vader_compound

    labelled = []
    for sentence in sentences:
        score = score_fn(sentence)
        if score >= threshold:
            label = 1
        elif score <= -threshold:
            label = -1
        else:  # -threshold < score < threshold
            label = 0
        labelled.append(" ".join((sentence, str(label))))
    return labelled

### Read, annotate and write files

In [None]:
# Read the demographic names, one per line, without their line endings.
demo_path = "drive/My Drive/csc699/demographics.txt"  # intersectional.txt
with open(demo_path) as demo_file:
    demo_text = demo_file.read()
demos = demo_text.splitlines()

In [None]:
import ast  # local import: used to parse the list-valued CSV cells safely

# The "Respect" and "Occupation" cells are parsed back into Python objects.
# ast.literal_eval accepts literals only, so unlike eval it cannot execute
# arbitrary code stored in the CSV.
_literal_columns = {"Respect": ast.literal_eval,
                    "Occupation": ast.literal_eval}

cleaned_samples_df = pd.read_csv("drive/My Drive/csc699/cleaned_samples.csv", # cleaned_samples_int
                                 converters=_literal_columns)
cleaned_samples_df_xyz = pd.read_csv("drive/My Drive/csc699/cleaned_samples.XYZ.csv", # cleaned_samples_int.XYZ
                                     converters=_literal_columns)

In [None]:
# Prepend a zero-filled "Labels" column to both sample frames (in place).
for samples_frame in (cleaned_samples_df, cleaned_samples_df_xyz):
    samples_frame.insert(0, column="Labels", value=0)

In [None]:
def _annotate_with_vader(samples_df, n_rows, contexts):
    """Return a frame mirroring ``samples_df`` with sentiment-labelled sentences.

    Args:
        samples_df: Frame with a "Demographic" column and one column per
            context, each cell holding the sentences for that row.
        n_rows: Number of leading row labels (0 .. n_rows - 1) to annotate.
        contexts: Names of the context columns to annotate.

    Returns:
        A new frame with columns "Demographic", "Respect" and "Occupation",
        where each annotated cell holds the output of ``get_sentiment``.
    """
    annotated = pd.DataFrame(columns=["Demographic", "Respect", "Occupation"])
    annotated.Demographic = samples_df.Demographic.copy()
    for ix in range(n_rows):
        for c in contexts:
            # get_sentiment builds a new list, so the source cell is passed as-is.
            annotated.loc[ix, c] = get_sentiment(samples_df.loc[ix, c])
    return annotated


contexts = ["Respect", "Occupation"]
# One row per entry in `indices`; both sample frames are annotated the same way.
col_df = _annotate_with_vader(cleaned_samples_df, len(indices), contexts)
cdf_xyz = _annotate_with_vader(cleaned_samples_df_xyz, len(indices), contexts)

In [None]:
# col_df.to_csv("drive/My Drive/csc699/VADER.csv", index=False) # VADER_int
# cdf_xyz.to_csv("drive/My Drive/csc699/VADER_xyz.csv", index=False) # VADER_int_xyz

In [None]:
import ast  # local import: used to parse the list-valued CSV cells safely

# Reload the saved VADER annotations. The "Respect" and "Occupation" cells are
# parsed back into Python objects with ast.literal_eval, which accepts literals
# only and so, unlike eval, cannot execute arbitrary code stored in the CSV.
_literal_columns = {"Respect": ast.literal_eval,
                    "Occupation": ast.literal_eval}

test_col_df = pd.read_csv("drive/My Drive/csc699/VADER.csv", # VADER_int
                          converters=_literal_columns)
test_col_df_xyz = pd.read_csv("drive/My Drive/csc699/VADER_xyz.csv", # VADER_int_xyz
                              converters=_literal_columns)

# Write files for MTurk annotation

In [None]:
# Write a "text" header followed by every item of `p`, one per line.
with open("drive/My Drive/csc699/to_ano_int2.csv", "w") as outtrans:
    outtrans.write("text\n")
    outtrans.writelines(line + "\n" for line in p)

In [None]:
# Write a "text" header followed by every item of `x`, one per line.
with open("drive/My Drive/csc699/to_ano_int2.XYZ.csv", "w") as outtrans:
    outtrans.write("text\n")
    outtrans.writelines(line + "\n" for line in x)

In [None]:
# Write a "text" header followed by every item of `labels`, one per line.
with open("drive/My Drive/csc699/to_ano_int2_labs.XYZ.csv", "w") as outtrans:
    outtrans.write("text\n")
    outtrans.writelines(line + "\n" for line in labels)

In [None]:
# Write a "text" header, then one "<label> <item>" line per pair drawn from
# `labels` and `x`. The loop variables use their own names so the global `x`
# list is not overwritten by its last element, which keeps the cell re-runnable.
with open("drive/My Drive/csc699/to_ano_int2_xyzlab.csv", "w") as outtrans:
    outtrans.write("text\n")
    for label, text in zip(labels, x):
        outtrans.write(f"{label} {text}\n")

In [None]:
"""with open("drive/My Drive/csc699/to_annotate.csv", "w") as to_ano:
    for sent, label in xyz_sentences:
        to_ano.write(sent + "\n")"""

In [None]:
"""with open("drive/My Drive/csc699/to_annotate_int.csv", "w") as to_ano:
    for sent, label in xyz_sentences:
        to_ano.write(sent + "\n")"""

In [None]:
# Write only the sentence of each (sentence, label) pair, one per line.
with open("drive/My Drive/csc699/to_annotate_int_corr.csv", "w") as to_ano:
    to_ano.writelines(sent + "\n" for sent, _label in poldf_sentences)

In [None]:
"""with open("drive/My Drive/csc699/to_ano_labs.csv", "w") as to_ano:
    for sent, label in xyz_sentences:
        to_ano.write(label + "\n")"""

In [None]:
"""with open("drive/My Drive/csc699/to_ano_int_labs.csv", "w") as to_ano:
    for sent, label in xyz_sentences:
        to_ano.write(label + "\n")"""

In [None]:
# xyzdf.to_csv("drive/My Drive/csc699/xyzsampled.csv", index=False) # xyzsampled_int