In [None]:
import os, json, time, pickle
import pandas as pd, numpy as np 
import matplotlib.pyplot as plt
from collections import Counter

# Run everything relative to the project root so the relative paths below resolve.
# NOTE(review): absolute path tied to one environment — adjust when running elsewhere.
os.chdir('/home/jovyan/work/')

modelname = 'mistral_'  # prefix of every figure file name written by this notebook
indatadir = os.path.join('personas', 'mistraldata_llm_4')  # annotation pickles are read from here
outdatadir = os.path.join('personas', 'mistral_study_4')
resultsdir = os.path.join('personas', 'mistralresults')  # figures are saved here

# plt.savefig does not create missing directories, so make sure the target exists.
os.makedirs(resultsdir, exist_ok=True)

#### data for annotation

In [None]:
# Load the study-4 data frame from a local pickle (only unpickle files you trust).
pd_data = pd.read_pickle('personas/data_ext/data_study_4.pkl')

In [None]:
# Keep only the rows flagged as anti-Black or as AAE, ordered by postId with a fresh index.
is_relevant = (pd_data['isAntiBlack'] == True) | (pd_data['isAAE'] == True)
pd_data = pd_data[is_relevant].sort_values('postId').reset_index(drop=True)
# errors='ignore': this notebook writes the frame back to the pickle it was loaded
# from, so on a re-run the 'race' column is already gone and a plain drop would raise KeyError.
pd_data = pd_data.drop(columns='race', errors='ignore')
len(pd_data)

In [None]:
# Preview the first three rows of the filtered frame.
pd_data.head(3)

In [None]:
# NOTE(review): this overwrites the very file pd_data was loaded from, so the
# unfiltered input is lost after the first run — consider a separate output path.
pd_data.to_pickle('personas/data_ext/data_study_4.pkl')

#### annotator personas

Black and conservative personas

In [None]:
# Annotated candidate sheet for the Black persona group (its 'personaId' column is used below).
pd_blacks = pd.read_excel('personas/data_ext/candidates_black_annotated.xlsx')

In [None]:
# Annotated candidate sheet for the conservative persona group; the first sheet column becomes the index.
pd_conservatives = pd.read_excel('personas/data_ext/candidates_conservative_annotated.xlsx', index_col=0)

find candidates for neutral personas 

In [None]:
# Full persona table from which neutral candidates are drawn (local pickle — only unpickle files you trust).
pd_personas = pd.read_pickle('personas/data_ext/pd_personas_cleaned.pkl')

Exclude personas already present in the Black or conservative candidate lists.

In [None]:
# A persona is a neutral candidate only if its personaId appears in neither
# the Black nor the conservative candidate sheet.
persona_ids = pd_personas['personaId']
already_assigned = persona_ids.isin(pd_blacks['personaId']) | persona_ids.isin(pd_conservatives['personaId'])
pd_neutral_candidates = pd_personas[~already_assigned]

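Exclude personas with an explicit political leaning or an explicit location outside the United States. The cells below only draw a reproducible random sample of 1,000 candidates and export it to Excel; no such filter is applied in code here.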

In [None]:
# Draw a reproducible random sample of 1000 candidates (fixed seed).
# Raises ValueError if fewer than 1000 candidates remain.
pd_neutral_candidates = pd_neutral_candidates.sample(1000, random_state=1)

In [None]:
# Export the sampled candidates to Excel.
# NOTE(review): presumably annotated by hand and re-read as candidates_neutral_annotated.xlsx — confirm.
pd_neutral_candidates.to_excel('personas/data_ext/candidates_neutral.xlsx')

personas neutral

In [None]:
# Annotated neutral candidates; the first sheet column becomes the index.
pd_neutrals = pd.read_excel('personas/data_ext/candidates_neutral_annotated.xlsx', index_col=0)

In [None]:
# Preview the first rows of the annotated neutral personas.
pd_neutrals.head()

In [None]:
# List the available columns ('personaId', 'persona' and 'neutral' are used further down).
pd_neutrals.columns

define annotator groups

In [None]:
# Preview two rows of the conservative candidate sheet.
pd_conservatives.head(2)

##### load annotators

In [None]:
# Re-read the annotated neutral personas (same file and arguments as the earlier load in this notebook).
pd_neutrals = pd.read_excel('personas/data_ext/candidates_neutral_annotated.xlsx', index_col=0)

In [None]:
# Preview the first rows of the Black candidate sheet.
pd_blacks.head()

In [None]:
# Replacement text per derived annotator group: the first entry fills '[TOKEN]',
# the second fills '[ATOKEN]' (the same label preceded by its article).
substitutions = {
    'neutral_black': ('african-american', 'an african-american'),
    'neutral_conservative': ('conservative', 'a conservative'),
}

# The unmodified persona texts come first; each derived group is then built from the
# 'neutral' template column, exposed under the common column name 'persona'.
annotators = {'neutral_original': pd_neutrals[['personaId','persona']]}
for group_name, (token, atoken) in substitutions.items():
    templated = pd_neutrals[['personaId','neutral']].rename(columns={'neutral': 'persona'})
    templated['persona'] = [
        text.replace('[TOKEN]', token).replace('[ATOKEN]', atoken)
        for text in templated['persona']
    ]
    annotators[group_name] = templated

In [None]:
# Persist the dict of annotator frames so it can be reloaded without rebuilding it.
with open('personas/data_ext/dict_annotators.pkl', 'wb') as f:
    pickle.dump(annotators, f)

In [None]:
# Spot-check: unmodified persona texts.
annotators['neutral_original'].head(3)

In [None]:
# Spot-check: templates with the conservative tokens filled in.
annotators['neutral_conservative'].head(3)

In [None]:
# Spot-check: templates with the african-american tokens filled in.
annotators['neutral_black'].head(3)

#### load annotations

In [None]:
# Names of the annotation sets to load; each is read from '<name>_annotations.pkl' in indatadir.
# NOTE(review): the plotting cells further down also look up keys such as 'black_black' and
# 'conservative_conservative', which are not listed here — confirm whether they should be loaded.
list_annotations = [
    'neutral_black',
    'neutral_neutral',
    'neutral_conservative',
]

In [None]:
# Map each annotation-set name to its frame, read from '<name>_annotations.pkl' in indatadir.
dict_annotations = {k: pd.read_pickle(os.path.join(indatadir,f'{k}_annotations.pkl')) for k in list_annotations}

#### original personas

Anti-Black posts (isAntiBlack is True and isAAE is False)

In [None]:
# Reload the study data and keep the rows flagged anti-Black but not AAE.
pd_data = pd.read_pickle('personas/data_ext/data_study_4.pkl')
# .copy(): columns are added to this subset further down; working on an explicit copy
# avoids pandas' SettingWithCopyWarning on a boolean-filtered slice.
pd_data = pd_data[(pd_data['isAntiBlack']==True)&(pd_data['isAAE']==False)].copy()
len(pd_data)

In [None]:
from scipy import stats

In [None]:
ttype = 'isAntiBlack'
ntype = 'isAAE'

# For each annotation set: keep the rows where ttype is True and ntype is False, take the
# columns from position 10 onward as integers, and print the grand mean under a short tag.
# NOTE(review): assumes the first 10 columns are metadata and the rest are ratings — confirm.
ratings = {}
for personas, tag in [('neutral_black', 'b'), ('neutral_neutral', 'n'), ('neutral_conservative', 'c')]:
    frame = dict_annotations[personas]
    keep = (frame[ttype]==True) & (frame[ntype]==False)
    ratings[tag] = frame[keep].iloc[:,10:].astype(int)
    print(tag, np.round(ratings[tag].mean().mean(),2))

nb, nn, nc = ratings['b'], ratings['n'], ratings['c']
nb_means, nn_means, nc_means = nb.mean(), nn.mean(), nc.mean()

# Number of columns whose mean is higher under 'neutral_black' than under the other
# set, followed by the total number of columns.
print(np.sum(nb_means > nn_means))
print(np.sum(nb_means > nc_means))
print(len(nb.columns))

In [None]:
# Per column: mean under the modified persona set minus mean under 'neutral_neutral'.
diff_nb = nb.mean() - nn.mean()
diff_nc = nc.mean() - nn.mean()

dict_diffs = {'black_neutral': diff_nb, 'cons_neutral': diff_nc}

# Label each box with its dict key so the figure is readable on its own.
fig, ax = plt.subplots()
ax.boxplot(list(dict_diffs.values()))
ax.set_xticklabels(list(dict_diffs.keys()))
ax.set_ylabel('difference in column means')
plt.show()

In [None]:
# One-sided independent two-sample t-test on the column means; H1: nb > nn.
# NOTE(review): if the columns of both frames correspond to the same underlying personas,
# a paired test (stats.ttest_rel) may be more appropriate — confirm.
stats.ttest_ind(nb_means, nn_means, alternative='greater')

In [None]:
# One-sided independent two-sample t-test on the column means; H1: nb > nc.
stats.ttest_ind(nb_means, nc_means, alternative='greater')

In [None]:
# Attach the per-row mean under 'neutral_black' and 'neutral_neutral' plus their difference.
# assign() builds a new frame instead of writing into the filtered slice, which avoids
# SettingWithCopyWarning and keeps the cell safe to re-run. Values are aligned on the row index.
# NOTE(review): assumes pd_data and the annotation frames share the same row index — confirm.
pd_data = pd_data.assign(
    tox_nb=nb.mean(axis=1),
    tox_nn=nn.mean(axis=1),
    diff=nb.mean(axis=1) - nn.mean(axis=1),
)

In [None]:
# Rows ordered from the largest to the smallest 'diff' (tox_nb minus tox_nn).
pd_data.sort_values('diff', ascending=False)

In [None]:
# Print the five entries with the largest 'diff', each followed by a separator line.
# NOTE(review): column position 1 is assumed to hold the post text — confirm.
ranked = pd_data.sort_values('diff', ascending=False)
for text in ranked.iloc[:5,1]:
    print(text, '_____________________________', sep='\n')

In [None]:
# Print the five entries with the smallest 'diff', each followed by a separator line.
# NOTE(review): column position 1 is assumed to hold the post text — confirm.
ranked = pd_data.sort_values('diff', ascending=False)
for text in ranked.iloc[-5:,1]:
    print(text, '_____________________________', sep='\n')

AAE posts (isAAE is True and isAntiBlack is False)

In [None]:
# Reload the study data and keep the rows flagged AAE but not anti-Black.
pd_data = pd.read_pickle('personas/data_ext/data_study_4.pkl')
# .copy(): columns are added to this subset further down; working on an explicit copy
# avoids pandas' SettingWithCopyWarning on a boolean-filtered slice.
pd_data = pd_data[(pd_data['isAntiBlack']==False)&(pd_data['isAAE']==True)].copy()
len(pd_data)

In [None]:
ttype = 'isAAE'
ntype = 'isAntiBlack'

# For each annotation set: keep the rows where ttype is True and ntype is False, take the
# columns from position 10 onward as integers, and print the grand mean under a short tag.
# NOTE(review): assumes the first 10 columns are metadata and the rest are ratings — confirm.
ratings = {}
for personas, tag in [('neutral_black', 'b'), ('neutral_neutral', 'n'), ('neutral_conservative', 'c')]:
    frame = dict_annotations[personas]
    keep = (frame[ttype]==True) & (frame[ntype]==False)
    ratings[tag] = frame[keep].iloc[:,10:].astype(int)
    print(tag, np.round(ratings[tag].mean().mean(),2))

nb, nn, nc = ratings['b'], ratings['n'], ratings['c']
nb_means, nn_means, nc_means = nb.mean(), nn.mean(), nc.mean()

# Number of columns whose mean is higher under 'neutral_black' than under the other
# set, followed by the total number of columns.
print(np.sum(nb_means > nn_means))
print(np.sum(nb_means > nc_means))
print(len(nb.columns))

In [None]:
# Per column: mean under the modified persona set minus mean under 'neutral_neutral'.
diff_nb = nb.mean() - nn.mean()
diff_nc = nc.mean() - nn.mean()

dict_diffs = {'black_neutral': diff_nb, 'cons_neutral': diff_nc}

# Label each box with its dict key so the figure is readable on its own.
fig, ax = plt.subplots()
ax.boxplot(list(dict_diffs.values()))
ax.set_xticklabels(list(dict_diffs.keys()))
ax.set_ylabel('difference in column means')
plt.show()

In [None]:
# One-sided independent two-sample t-test on the column means.
# The conclusion in the trailing comment is the author's recorded reading of a previous run.
# NOTE(review): if the columns of both frames correspond to the same underlying personas,
# a paired test (stats.ttest_rel) may be more appropriate — confirm.
stats.ttest_ind(nb_means, nn_means, alternative='less') # h0: nb >= nn; h1: nb < nn -> do not reject h0

In [None]:
# One-sided independent two-sample t-test on the column means.
# The conclusion in the trailing comment is the author's recorded reading of a previous run.
stats.ttest_ind(nb_means, nc_means, alternative='less') # h0: nb >= nc; h1: nb < nc -> reject h0

In [None]:
# Attach the per-row mean under 'neutral_black' and 'neutral_neutral' plus their difference.
# assign() builds a new frame instead of writing into the filtered slice, which avoids
# SettingWithCopyWarning and keeps the cell safe to re-run. Values are aligned on the row index.
# NOTE(review): assumes pd_data and the annotation frames share the same row index — confirm.
pd_data = pd_data.assign(
    tox_nb=nb.mean(axis=1),
    tox_nn=nn.mean(axis=1),
    diff=nb.mean(axis=1) - nn.mean(axis=1),
)

In [None]:
# Rows ordered from the largest to the smallest 'diff' (tox_nb minus tox_nn).
pd_data.sort_values('diff', ascending=False)

In [None]:
# Print the five entries with the largest 'diff', each followed by a separator line.
# NOTE(review): column position 1 is assumed to hold the post text — confirm.
ranked = pd_data.sort_values('diff', ascending=False)
for text in ranked.iloc[:5,1]:
    print(text, '_____________________________', sep='\n')

In [None]:
# Print the five entries with the smallest 'diff', each followed by a separator line.
# NOTE(review): column position 1 is assumed to hold the post text — confirm.
ranked = pd_data.sort_values('diff', ascending=False)
for text in ranked.iloc[-5:,1]:
    print(text, '_____________________________', sep='\n')

##### bar charts

In [None]:
# Grouped bar chart: how often each rating level occurs, one bar series per persona group,
# restricted to rows where texttype is True and ntexttype is False.
texttype = 'isAAE'
ntexttype = 'isAntiBlack'
width = 0.25
colors = ['lightsteelblue', 'cornflowerblue', 'mediumblue']

fig, ax = plt.subplots(figsize=(8,5))
for position, group in enumerate(['black','neutral','conservative']):
    # NOTE(review): list_annotations in this notebook only names 'neutral_black', 'neutral_neutral'
    # and 'neutral_conservative'; unless dict_annotations also holds 'black_black' and
    # 'conservative_conservative', this lookup raises KeyError.
    frame = dict_annotations[f'{group}_{group}']
    # NOTE(review): assumes the first 10 columns are metadata and the rest are ratings — confirm.
    ratings = frame[(frame[texttype]==True)&(frame[ntexttype]==False)].iloc[:,10:].astype(int)
    # Sorted distinct levels and their frequencies over all selected cells. Local names are used
    # on purpose so the notebook-level pd_data frame is not overwritten by this loop.
    levels, counts = np.unique(ratings.to_numpy(), return_counts=True)
    # Shift the three series by -width, 0 and +width around each level.
    ax.bar(levels + width * (position - 1), counts, width=width, label=group, color=colors[position])
ax.set_title(f'{texttype}')
ax.set_xlabel('level')
ax.set_ylabel('count')
ax.legend(title='personas')
plt.savefig(os.path.join(resultsdir,modelname+f'study_4_bars_{texttype}_original.png'))
plt.show()

In [None]:
# Grouped bar chart: how often each rating level occurs, one bar series per persona group,
# restricted to rows where texttype is True and ntexttype is False.
texttype = 'isAntiBlack'
ntexttype = 'isAAE'
width = 0.25
colors = ['lightsteelblue', 'cornflowerblue', 'mediumblue']

fig, ax = plt.subplots(figsize=(8,5))
for position, group in enumerate(['black','neutral','conservative']):
    # NOTE(review): list_annotations in this notebook only names 'neutral_black', 'neutral_neutral'
    # and 'neutral_conservative'; unless dict_annotations also holds 'black_black' and
    # 'conservative_conservative', this lookup raises KeyError.
    frame = dict_annotations[f'{group}_{group}']
    # NOTE(review): assumes the first 10 columns are metadata and the rest are ratings — confirm.
    ratings = frame[(frame[texttype]==True)&(frame[ntexttype]==False)].iloc[:,10:].astype(int)
    # Sorted distinct levels and their frequencies over all selected cells. Local names are used
    # on purpose so the notebook-level pd_data frame is not overwritten by this loop.
    levels, counts = np.unique(ratings.to_numpy(), return_counts=True)
    # Shift the three series by -width, 0 and +width around each level.
    ax.bar(levels + width * (position - 1), counts, width=width, label=group, color=colors[position])
ax.set_title(f'{texttype}')
ax.set_xlabel('level')
ax.set_ylabel('count')
ax.legend(title='personas')
plt.savefig(os.path.join(resultsdir,modelname+f'study_4_bars_{texttype}_original.png'))
plt.show()

##### boxplots

In [None]:
texttype = 'isAAE'
ntexttype = 'isAntiBlack'

# Per annotation set: column means (from position 10 onward) over the rows where
# texttype is True and ntexttype is False.
column_means = {}
for key, frame in dict_annotations.items():
    selected_rows = frame[(frame[texttype]==True)&(frame[ntexttype]==False)]
    column_means[key] = selected_rows.iloc[:,10:].astype(int).mean(axis=0)

# Only the sets whose two name parts match are plotted here.
wanted = ['black_black', 'neutral_neutral', 'conservative_conservative']
dict_boxplots = {key: means for key, means in column_means.items() if key in wanted}

fig, ax = plt.subplots(figsize=(5,3))
ax.boxplot(dict_boxplots.values())
# Tick label = first part of the key, with 'conservative' shortened.
tick_labels = [key.split('_')[0].replace('conservative','cons.') for key in dict_boxplots]
ax.set_xticklabels(tick_labels, rotation=45)
ax.set_ylim((0.8,5.2))
ax.set_title(f'{texttype}')
plt.savefig(os.path.join(resultsdir,modelname+f'study_4_boxplots_{texttype}_original.png'), bbox_inches='tight')
plt.show()

In [None]:
texttype = 'isAntiBlack'
ntexttype = 'isAAE'

# Per annotation set: column means (from position 10 onward) over the rows where
# texttype is True and ntexttype is False.
column_means = {}
for key, frame in dict_annotations.items():
    selected_rows = frame[(frame[texttype]==True)&(frame[ntexttype]==False)]
    column_means[key] = selected_rows.iloc[:,10:].astype(int).mean(axis=0)

# Only the sets whose two name parts match are plotted here.
wanted = ['black_black', 'neutral_neutral', 'conservative_conservative']
dict_boxplots = {key: means for key, means in column_means.items() if key in wanted}

fig, ax = plt.subplots(figsize=(5,3))
ax.boxplot(dict_boxplots.values())
# Tick label = first part of the key, with 'conservative' shortened.
tick_labels = [key.split('_')[0].replace('conservative','cons.') for key in dict_boxplots]
ax.set_xticklabels(tick_labels, rotation=45)
ax.set_ylim((0.8,5.2))
ax.set_title(f'{texttype}')
plt.savefig(os.path.join(resultsdir,modelname+f'study_4_boxplots_{texttype}_original.png'), bbox_inches='tight')
plt.show()

#### extended personas

##### bar charts: number of toxicity levels per annotator group

In [None]:
# One figure per original persona group: how often each rating level occurs for every
# new persona group, restricted to rows where texttype is True and ntexttype is False.
texttype = 'isAAE'
ntexttype = 'isAntiBlack'
width = 0.25
colors = ['lightsteelblue', 'cornflowerblue', 'mediumblue']

for group_o in ['black','neutral','conservative']:
    fig, ax = plt.subplots(figsize=(8,5))
    for position, group_n in enumerate(['black', 'neutral', 'conservative']):
        # NOTE(review): list_annotations in this notebook only names the three 'neutral_*' sets;
        # unless dict_annotations holds every '<original>_<new>' combination, this lookup raises KeyError.
        frame = dict_annotations[f'{group_o}_{group_n}']
        # NOTE(review): assumes the first 10 columns are metadata and the rest are ratings — confirm.
        ratings = frame[(frame[texttype]==True)&(frame[ntexttype]==False)].iloc[:,10:].astype(int)
        # Sorted distinct levels and their frequencies over all selected cells. Local names are used
        # on purpose so the notebook-level pd_data frame is not overwritten by this loop.
        levels, counts = np.unique(ratings.to_numpy(), return_counts=True)
        # Highlight the series where the new persona group equals the original one.
        col = 'rosybrown' if group_o == group_n else colors[position]
        # Shift the three series by -width, 0 and +width around each level.
        ax.bar(levels + width * (position - 1), counts, width=width, label=group_n, color=col)
    ax.set_title(f'{texttype} --- original personas: {group_o}')
    ax.set_xlabel('level')
    ax.set_ylabel('count')
    ax.legend(title='new personas')
    plt.savefig(os.path.join(resultsdir,modelname+f'study_4_bars_{texttype}_extended_{group_o}.png'))

In [None]:
# One figure per original persona group: how often each rating level occurs for every
# new persona group, restricted to rows where texttype is True and ntexttype is False.
texttype = 'isAntiBlack'
ntexttype = 'isAAE'
width = 0.25
colors = ['lightsteelblue', 'cornflowerblue', 'mediumblue']

for group_o in ['black','neutral','conservative']:
    fig, ax = plt.subplots(figsize=(8,5))
    for position, group_n in enumerate(['black', 'neutral', 'conservative']):
        # NOTE(review): list_annotations in this notebook only names the three 'neutral_*' sets;
        # unless dict_annotations holds every '<original>_<new>' combination, this lookup raises KeyError.
        frame = dict_annotations[f'{group_o}_{group_n}']
        # NOTE(review): assumes the first 10 columns are metadata and the rest are ratings — confirm.
        ratings = frame[(frame[texttype]==True)&(frame[ntexttype]==False)].iloc[:,10:].astype(int)
        # Sorted distinct levels and their frequencies over all selected cells. Local names are used
        # on purpose so the notebook-level pd_data frame is not overwritten by this loop.
        levels, counts = np.unique(ratings.to_numpy(), return_counts=True)
        # Highlight the series where the new persona group equals the original one.
        col = 'rosybrown' if group_o == group_n else colors[position]
        # Shift the three series by -width, 0 and +width around each level.
        ax.bar(levels + width * (position - 1), counts, width=width, label=group_n, color=col)
    ax.set_title(f'{texttype} --- original personas: {group_o}')
    ax.set_xlabel('level')
    ax.set_ylabel('count')
    ax.legend(title='new personas')
    plt.savefig(os.path.join(resultsdir,modelname+f'study_4_bars_{texttype}_extended_{group_o}.png'))

##### boxplots: mean toxicity level per annotator

In [None]:
texttype = 'isAAE'
ntexttype = 'isAntiBlack'

# Per annotation set: column means (from position 10 onward) over the rows where
# texttype is True and ntexttype is False. Every loaded set gets its own box.
dict_boxplots = {}
for key, frame in dict_annotations.items():
    selected_rows = frame[(frame[texttype]==True)&(frame[ntexttype]==False)]
    dict_boxplots[key] = selected_rows.iloc[:,10:].astype(int).mean(axis=0)

fig, ax = plt.subplots(figsize=(10,5))
ax.boxplot(dict_boxplots.values())
# Tick label = full key, with 'conservative' shortened.
xlabels = [key.replace('conservative','cons.') for key in dict_boxplots]
ax.set_xticklabels(xlabels, rotation=45)
ax.set_ylim((0.8,5.2))
ax.set_title(f'{texttype}')
plt.savefig(os.path.join(resultsdir,modelname+f'study_4_boxplots_{texttype}_extended.png'), bbox_inches='tight')
plt.show()

In [None]:
texttype = 'isAntiBlack'
ntexttype = 'isAAE'

# Per annotation set: column means (from position 10 onward) over the rows where
# texttype is True and ntexttype is False. Every loaded set gets its own box.
dict_boxplots = {}
for key, frame in dict_annotations.items():
    selected_rows = frame[(frame[texttype]==True)&(frame[ntexttype]==False)]
    dict_boxplots[key] = selected_rows.iloc[:,10:].astype(int).mean(axis=0)

fig, ax = plt.subplots(figsize=(10,5))
ax.boxplot(dict_boxplots.values())
# Tick label = full key, with 'conservative' shortened.
xlabels = [key.replace('conservative','cons.') for key in dict_boxplots]
ax.set_xticklabels(xlabels, rotation=45)
ax.set_ylim((0.8,5.2))
ax.set_title(f'{texttype}')
plt.savefig(os.path.join(resultsdir,modelname+f'study_4_boxplots_{texttype}_extended.png'), bbox_inches='tight')
plt.show()