In [None]:
import matplotlib.pyplot as plt
import mpl_lego as mplego
import numpy as np
import pandas as pd

from scipy.stats import mannwhitneyu

from hatespeech import utils
from hatespeech.keys import (items,
                             annotator_gender,
                             annotator_gender_col,
                             col_to_gender)
from mpl_lego.colorbar import append_colorbar_to_axis

%matplotlib inline

In [None]:
# Apply mpl_lego's LaTeX plotting style before any figure is created.
# NOTE(review): presumably this turns on LaTeX text rendering, which the
# \textbf{...} axis labels in the plotting cell rely on; confirm in mpl_lego.
mplego.style.use_latex_style()

In [None]:
# Location of the feather file holding the annotation data.
# Alternative location, kept for reference:
#data_path = '/home/psachdeva/data/scaled-facets-multi-50k-v4.feather'
data_path = '~/data/hatespeech/scaled-facets-multi-50k-v4.feather'

In [None]:
# Load the annotation table and expose the 'violence_phys' column under the
# name 'violence'.
data = (
    pd.read_feather(data_path)
    .rename(columns={'violence_phys': 'violence'})
)

In [None]:
# Keep only the rows whose platform is 'reference'. The copy makes the subset
# independent of `data`, so later column additions do not touch the full table.
is_reference_row = data['platform'].eq('reference')
reference_set = data.loc[is_reference_row].copy()

In [None]:
# Size of the reference set: distinct comment ids first, then total rows
# (a comment id can appear on several rows).
print(f'Number of unique reference set comments: {reference_set["comment_id"].unique().size}')
print(f'Number of samples in reference set: {reference_set.shape[0]}')

In [None]:
# Restrict the reference set to rows whose annotator gender is coded as male or
# female, then hand the result to the project helper.
# NOTE(review): `filter_comments_targeting_mw` lives in hatespeech.utils and is
# not shown here; presumably it keeps comments targeting men or women (using
# the 0.5 threshold) and provides the 'target_women' column used below.
annotator_is_male = reference_set[annotator_gender_col] == annotator_gender['male']
annotator_is_female = reference_set[annotator_gender_col] == annotator_gender['female']
reference_mw = utils.filter_comments_targeting_mw(
    reference_set[annotator_is_male | annotator_is_female],
    threshold=0.5,
)

In [None]:
# Split on the boolean 'target_women' column: rows where it is True go to the
# "targeting women" frame, all remaining rows to the "targeting men" frame.
# Both are copies so they can be modified independently of `reference_mw`.
targets_women = reference_mw['target_women']
reference_mw_targeting_w = reference_mw.loc[targets_women].copy()
reference_mw_targeting_m = reference_mw.loc[~targets_women].copy()

In [None]:
# Row counts for the filtered set and for each side of the target split,
# followed by the number of distinct comment ids on each side.
# The "targeting men" frame is simply the rows where 'target_women' is False.
print(f'Number of samples whose annotators are men or women: {reference_mw.shape[0]}')
print(f'Number of samples targeting women whose annotators are men or women: {reference_mw_targeting_w.shape[0]}')
print(f'Number of samples targeting men whose annotators are men or women: {reference_mw_targeting_m.shape[0]}')
print(f'Number of unique comments targeting women: {reference_mw_targeting_w["comment_id"].unique().size}')
print(f'Number of unique comments targeting men: {reference_mw_targeting_m["comment_id"].unique().size}')

In [None]:
# Add a readable 'annotator_gender' label: 'male' where the coded gender column
# equals the male code, 'female' for every other row.
coded_as_male = reference_mw_targeting_w[annotator_gender_col].eq(annotator_gender['male'])
reference_mw_targeting_w['annotator_gender'] = np.where(coded_as_male, 'male', 'female')

In [None]:
# Per-comment aggregates for the comments targeting women, split by annotator
# gender.
mw_targeting_w_comments = np.sort(reference_mw_targeting_w['comment_id'].unique())
by_comment_and_gender = reference_mw_targeting_w.groupby(['comment_id', 'annotator_gender'])
mw_targeting_w_counts = by_comment_and_gender.count()
# Select the item columns *before* averaging: calling mean() on every column
# raises on non-numeric columns in recent pandas, and only the items are needed.
mw_targeting_w_means = by_comment_and_gender[list(items)].mean()

# Difference in mean rating per comment (men - women). The two gender slices
# are aligned on comment_id rather than taken from a row-wise diff(): an
# un-grouped diff() subtracts the previous row whatever comment it belongs to,
# so a comment rated by men only would be compared against a different comment.
gender_of_row = mw_targeting_w_means.index.get_level_values('annotator_gender')
male_means = mw_targeting_w_means[gender_of_row == 'male'].droplevel('annotator_gender')
female_means = mw_targeting_w_means[gender_of_row == 'female'].droplevel('annotator_gender')
# Reindex to the full sorted comment list so rows line up one-to-one with
# `mw_targeting_w_comments`; comments missing either gender get NaN.
mw_targeting_w_diffs = (male_means - female_means).reindex(mw_targeting_w_comments)

In [None]:
# Mann-Whitney U test per (comment, item): do men's and women's ratings of the
# same comment differ? The result is a DataFrame of p-values indexed by comment
# id with one column per item.
# A p-value of 1 ("no evidence of a difference") is stored whenever the test
# cannot be computed.
p_values = np.ones((mw_targeting_w_comments.size, len(items)))

for ii, comment_id in enumerate(mw_targeting_w_comments):
    # Slice once per comment instead of re-filtering the full frame per item.
    comment_rows = reference_mw_targeting_w[reference_mw_targeting_w['comment_id'] == comment_id]
    men_rows = comment_rows[comment_rows['annotator_gender'] == 'male']
    women_rows = comment_rows[comment_rows['annotator_gender'] == 'female']

    for jj, item in enumerate(items):
        men_scores = men_rows[item]
        women_scores = women_rows[item]
        if men_scores.empty or women_scores.empty:
            # One gender did not rate this comment, so there is nothing to compare.
            p_values[ii, jj] = 1
            continue
        try:
            p_values[ii, jj] = mannwhitneyu(men_scores, women_scores).pvalue
        except ValueError:
            # Some SciPy versions raise ValueError for degenerate samples
            # (e.g. all ratings identical). Anything else is a real error and
            # is allowed to propagate.
            p_values[ii, jj] = 1

p_values = pd.DataFrame(data=p_values, columns=items, index=mw_targeting_w_comments)

In [None]:
# Heatmap of the per-comment rating differences (men - women) for every item,
# with an 'x' on each cell whose Mann-Whitney p-value is below the threshold.
# The figure is saved to 'men_v_women_comment_comparison.pdf'.
significance_level = 0.05

fig, ax = plt.subplots(1, 1, figsize=(8, 12))

img = ax.imshow(mw_targeting_w_diffs, vmin=-0.4, vmax=0.4, cmap='RdGy')
cb, cax = append_colorbar_to_axis(ax, img)
cb.set_label(r'$\Delta$item (men - women)', rotation=270, fontsize=20, labelpad=25)
cb.ax.tick_params(labelsize=17)

# Align the p-values to the rows/columns actually drawn (by label, not by
# position) so each marker lands on the comment it belongs to even if the two
# frames do not contain the same comments. Labels missing from `p_values`
# become NaN, which never compares below the threshold.
plotted_p_values = p_values.reindex(
    index=mw_targeting_w_diffs.index,
    columns=mw_targeting_w_diffs.columns,
)
sig_rows, sig_cols = np.nonzero((plotted_p_values < significance_level).to_numpy())
# imshow puts columns on x and rows on y; draw all markers in one call.
ax.scatter(sig_cols, sig_rows, marker='x', color='black', s=50)

ax.set_xticks(np.arange(len(items)))
ax.set_xticklabels(mplego.labels.bold_text(mplego.labels.fix_labels_for_tex_style(items)), rotation=30, ha='right')
ax.set_yticks(np.arange(len(mw_targeting_w_diffs)))
ax.set_yticklabels(mw_targeting_w_diffs.index)

ax.set_ylabel(r'\textbf{Comment ID}', fontsize=18)
ax.set_xlabel(r'\textbf{Item}', fontsize=18)
ax.tick_params(labelsize=16)
plt.savefig('men_v_women_comment_comparison.pdf', bbox_inches='tight')
plt.show()

In [None]:
# One row per (comment_id, predict_text) with, for every item, a significance
# flag ('<item>_sig') and the men - women difference ('<item>_diff').
# The three pieces are joined on comment_id rather than concatenated by row
# position, so a comment missing from one piece (or a comment id appearing with
# more than one text) cannot shift the other columns onto the wrong comment.
comment_text = (
    reference_mw_targeting_w[['comment_id', 'predict_text']]
    .sort_values('comment_id')
    .drop_duplicates()
    .set_index('comment_id')
)
comments_w_diffs = (
    comment_text
    .join((p_values < 0.05).add_suffix('_sig'))
    .join(mw_targeting_w_diffs.add_suffix('_diff'))
    .reset_index()
)

In [None]:
# Write the per-comment table to CSV in the working directory. The frame's row
# index is written as the first (unnamed) column because `index` is left at its
# default.
comments_w_diffs.to_csv('men_v_women_comments.csv')