In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
! pip install constant
!pip install bert-pytorch
!pip install pytorch-pretrained-bert pytorch-nlp
!pip install -U -q PyDrive

In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, recall_score, precision_score
from sklearn.utils.class_weight import compute_class_weight
from keras.layers import Input, Dense, Dropout
from keras.models import Model
import pandas as pd
import numpy as np
import os, sys
import constant

In [None]:
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data
import numpy as np
import pandas as pd
from tqdm import tqdm
import os
import warnings
from pytorch_pretrained_bert import BertTokenizer, BertForSequenceClassification, BertAdam, BertModel
from pytorch_pretrained_bert import BertConfig
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')
# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
# Only query the CUDA device name when CUDA exists; the unguarded call raises
# on CPU-only runtimes even though `device` already fell back to "cpu".
torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU (no CUDA device available)"

In [None]:
torch.cuda.empty_cache()

In [None]:
!nvidia-smi

In [None]:
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import torch
from sklearn.model_selection import train_test_split

# FakeNewsNet

In [None]:
df = pd.read_csv('/content/drive/MyDrive/Gender Bias/FakeNewsNet2.csv')

In [None]:
# Encode the textual class labels as integers: fake -> 1, real -> 0.
label_mapping = {'fake': 1, 'real': 0}
# Series.map with a dict turns every value that is not a key into NaN, so this
# cell overwrites `df['label']` in place and must only be run once per load of
# `df` (running it again on the already-mapped column would yield NaN labels).
df['label'] = df['label'].map(label_mapping)

In [None]:
# One-hot style indicator columns derived from the `gender` column; any value
# other than the exact string being tested (including missing values) maps to 0.
df['female'] = (df['gender'] == 'female').astype('int64')
df['male'] = (df['gender'] == 'male').astype('int64')

In [None]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split identical across runs.
df_train_total, df_val_total = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

In [None]:
df_train_total

In [None]:
# Working aliases for the two splits (no copy is made).
df_train, df_val = df_train_total, df_val_total

In [None]:
# The model input text for this dataset is the `title` column.
comments_train, comments_val = df_train.title, df_val.title

In [None]:
def extract_female_gender(x):
  """Return 1 when the row's `female` value is at least 0.5, otherwise 0.

  `x` is any object exposing a numeric `female` attribute (for example a
  DataFrame row). A NaN value is treated as "not female" and yields 0.
  """
  score = x.female
  if np.isnan(score):
    return 0
  return 0 if score < 0.5 else 1

In [None]:
def get_unprotected_class(list_of_protected):
  """Invert a 0/1 membership list: entries equal to 0 become 1, all others 0."""
  return [int(flag == 0) for flag in list_of_protected]

In [None]:
def get_metrics(labels, preds):
  """Compute accuracy, precision, recall and weighted F1 for hard predictions.

  Both arrays are flattened before scoring. Precision and recall use the
  scorers' default averaging, while F1 is computed with average="weighted".

  Returns:
    (accuracy, precision, recall, f1) in that order.
  """
  y_true = labels.flatten()
  y_hat = preds.flatten()

  return (
      accuracy_score(y_true, y_hat),
      precision_score(y_true, y_hat),
      recall_score(y_true, y_hat),
      f1_score(y_true, y_hat, average="weighted"),
  )

In [None]:
# Binary fakeness targets: 1 when label >= 0.5, else 0 (NaN compares False -> 0).
fakness_labels_train = list((df_train.label >= 0.5).astype(int))
fakness_labels_val = list((df_val.label >= 0.5).astype(int))

# Protected-attribute targets (1 = female) computed row by row.
identity_labels_train = list(df_train.apply(extract_female_gender, axis = 1))
identity_labels_val = list(df_val.apply(extract_female_gender, axis = 1))

# Complement of the protected-attribute targets.
unprotected_labels_train = get_unprotected_class(identity_labels_train)
unprotected_labels_val = get_unprotected_class(identity_labels_val)

In [None]:
print(len(comments_train), len(fakness_labels_train))
print(comments_train[:10])
print(fakness_labels_train[:10])
print(identity_labels_train[:10])

In [None]:
# Sequence-length argument handed to convert_lines for every text.
MAX_SEQUENCE_LENGTH = 128
# NOTE(review): SEED is defined but no RNG is seeded with it in the visible
# cells — confirm whether seeding was intended.
SEED = 519
# Intended mini-batch size.
BATCH_SIZE = 32
# Name of the pretrained checkpoint used to load the WordPiece tokenizer.
BERT_MODEL_PATH = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_PATH, cache_dir=None,do_lower_case=True)

In [None]:
def convert_lines(example, max_seq_length,tokenizer):
    """Tokenise texts into fixed-length rows of token ids.

    Each text is tokenised, wrapped in "[CLS]" ... "[SEP]" and right-padded
    with id 0 so that every row is exactly `max_seq_length` long. Two
    positions are reserved for the special tokens, so at most
    `max_seq_length - 2` tokens of each text are kept; previously the special
    tokens were added on top of the budget and rows came out two ids longer
    than requested.

    Args:
        example: iterable of strings.
        max_seq_length: total row length, including [CLS] and [SEP].
        tokenizer: object providing `tokenize` and `convert_tokens_to_ids`.

    Returns:
        np.ndarray of shape (len(example), max_seq_length).

    Raises:
        ValueError: if `max_seq_length` cannot hold the two special tokens.
    """
    if max_seq_length < 2:
        raise ValueError("max_seq_length must be at least 2 to fit [CLS] and [SEP]")

    # Token budget left once [CLS] and [SEP] are accounted for.
    max_tokens = max_seq_length - 2

    all_tokens = []
    longer = 0
    for text in tqdm(example):
        tokens_a = tokenizer.tokenize(text)
        if len(tokens_a) > max_tokens:
            tokens_a = tokens_a[:max_tokens]
            longer += 1
        one_token = tokenizer.convert_tokens_to_ids(["[CLS]"] + tokens_a + ["[SEP]"])
        # Right-pad with id 0 up to the requested length.
        one_token = one_token + [0] * (max_seq_length - len(one_token))
        all_tokens.append(one_token)
    print("Tokens longer than max_length: ", longer)
    return np.array(all_tokens)


In [None]:
# Tokenise the training texts (missing texts are replaced by a placeholder
# string) and convert the label lists to tensors. Note that
# `fakness_labels_train` is rebound from a Python list to a tensor here.
input_train = convert_lines(comments_train.fillna("DUMMY_VALUE"), MAX_SEQUENCE_LENGTH, tokenizer)
fakness_labels_train = torch.tensor(fakness_labels_train)
female_labels_train = torch.tensor(identity_labels_train)

# Same preparation for the validation split.
input_val = convert_lines(comments_val.fillna("DUMMY_VALUE"), MAX_SEQUENCE_LENGTH, tokenizer)
fakness_labels_val = torch.tensor(fakness_labels_val)
female_labels_val = torch.tensor(identity_labels_val)


In [None]:
print(torch.sum(fakness_labels_train).data)
print(torch.sum(female_labels_train).data)

print(torch.sum(fakness_labels_val).data)
print(torch.sum(female_labels_val).data)


In [None]:
# Each dataset item is (token ids, fakeness label, female label).
X_train = torch.utils.data.TensorDataset(torch.tensor(input_train, dtype=torch.long), fakness_labels_train, female_labels_train)
# Use the notebook-level BATCH_SIZE constant instead of repeating the literal.
train_loader = torch.utils.data.DataLoader(X_train, batch_size=BATCH_SIZE, shuffle=True)

X_val = torch.utils.data.TensorDataset(torch.tensor(input_val, dtype=torch.long), fakness_labels_val, female_labels_val)
# Evaluation does not benefit from shuffling; a fixed order keeps runs comparable.
val_loader = torch.utils.data.DataLoader(X_val, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
def get_fairness_metrics(actual_labels, y_pred, protected_labels, non_protected_labels, thres):
  """Compute group prediction rates and parity gaps for binary predictions.

  For the protected group (protected_labels == 1) and the non-protected group
  (non_protected_labels == 1) this computes the mean of `y_pred`:
    * over the whole group (positive prediction rate),
    * over group members with actual_labels == 1 (true positive rate),
    * over group members with actual_labels == 0 (false positive rate).

  Args:
    actual_labels: array-like of ground-truth labels.
    y_pred: array-like of predictions, same length.
    protected_labels: array-like, 1 marks protected-group members.
    non_protected_labels: array-like, 1 marks non-protected-group members.
    thres: accepted for interface compatibility; not used by the computation.

  Returns:
    Tuple of ten values: the six rates (protected, non-protected for each of
    the three kinds) rounded to two decimals, followed by the demographic
    parity gap, true positive parity gap, false positive parity gap and their
    sum (equalized odds), all computed from the unrounded rates.
    A rate over an empty subgroup is NaN (and so is any gap derived from it)
    instead of raising ZeroDivisionError.
  """
  actual_labels = np.asarray(actual_labels)
  y_pred = np.asarray(y_pred)
  protected = np.asarray(protected_labels) == 1
  non_protected = np.asarray(non_protected_labels) == 1
  positives = actual_labels == 1
  negatives = actual_labels == 0

  def _rate(mask):
    # Mean prediction over the selected rows; NaN when nothing is selected.
    selected = y_pred[mask]
    if selected.size == 0:
      return np.float64("nan")
    return selected.mean()

  female_rate, nf_rate = _rate(protected), _rate(non_protected)
  female_tpr, nf_tpr = _rate(protected & positives), _rate(non_protected & positives)
  female_fpr, nf_fpr = _rate(protected & negatives), _rate(non_protected & negatives)

  # Parity gaps use the unrounded rates so rounding cannot hide a small gap.
  demo_parity = abs(female_rate - nf_rate)
  tp_parity = abs(female_tpr - nf_tpr)
  fp_parity = abs(female_fpr - nf_fpr)
  equ_odds = tp_parity + fp_parity

  return (
      np.round(female_rate, 2), np.round(nf_rate, 2),
      np.round(female_tpr, 2), np.round(nf_tpr, 2),
      np.round(female_fpr, 2), np.round(nf_fpr, 2),
      demo_parity, tp_parity, fp_parity, equ_odds,
  )



In [None]:
# Module-level BERT configuration object. In the visible cells only its
# `hidden_size` and `hidden_dropout_prob` fields are read (by Classifier); the
# encoder weights themselves come from BertModel.from_pretrained.
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072,
        hidden_dropout_prob=0.1)

class Classifier(nn.Module):
    """BERT encoder followed by a two-layer classification head.

    The 324-dimensional ReLU activation of the first head layer is returned
    alongside the logits so that an adversary can be attached to it.
    """

    def __init__(self, fakness_labels = 2):
        """Build the encoder and head; `fakness_labels` is the number of output classes."""
        super(Classifier, self).__init__()

        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.c1 = nn.Linear(config.hidden_size, 324)
        self.c3 = nn.Linear(324, fakness_labels)

        nn.init.xavier_normal_(self.c1.weight)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
        """Return (logits, hidden) for a batch of token ids.

        Args:
            input_ids: LongTensor of token ids, right-padded with id 0.
            token_type_ids: optional segment ids forwarded to BERT.
            attention_mask: optional mask forwarded to BERT. When omitted, a
                mask is derived from `input_ids` so padding is not attended to.
            labels: unused; kept for interface compatibility.

        Returns:
            classifier_output: logits from the final linear layer.
            classifier_prev_output: ReLU activations of the first head layer.
        """
        if attention_mask is None:
            # convert_lines pads with id 0. Without an explicit mask BertModel
            # attends to every position, padding included.
            attention_mask = (input_ids > 0).long()

        _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False)
        pooled_output = self.dropout(pooled_output)

        classifier_prev_output = F.relu(self.c1(pooled_output))
        classifier_output = self.c3(classifier_prev_output)

        return classifier_output, classifier_prev_output

class Adversary(nn.Module):
    """Two-layer MLP that predicts the protected attribute from a 324-d vector."""

    def __init__(self, identity_labels = 2):
        """Create the 324 -> 120 -> `identity_labels` layers."""
        super().__init__()

        # Creation order is kept (a1, a2, then Xavier on a1) so that seeded
        # initialisation draws random numbers in the same sequence.
        self.a1 = nn.Linear(324, 120)
        self.a2 = nn.Linear(120, identity_labels)
        nn.init.xavier_normal_(self.a1.weight)

    def forward(self, input_ids):
        """Return logits; despite its name, `input_ids` is a float feature tensor."""
        return self.a2(F.relu(self.a1(input_ids)))


In [None]:
def conduct_validation(net, data_loader, adv = False):
    """Run `net` over `data_loader` and report classification metrics.

    Predictions are the arg-max over the class dimension of the network
    output. Metrics are computed once over all collected samples via
    `get_metrics`, not as an average of per-batch scores: per-batch averaging
    over-weights a short final batch and is not the dataset-level
    precision/recall/F1.

    Args:
        net: model to evaluate. Its previous train/eval mode is restored on exit.
        data_loader: iterable of (text, fake_truth, female_truth) tensor batches.
        adv: when True, `net(text)` is expected to return a
            (logits, hidden) pair; otherwise it returns the logits directly.

    Returns:
        (predictions, truths, identities, accuracy) where the first three are
        1-D float numpy arrays covering every sample seen.

    Raises:
        ValueError: if `data_loader` yields no samples.
    """
    predictions_net = np.empty((0,))
    truths = np.empty((0,))
    identities = np.empty((0,))

    was_training = net.training
    net.eval()
    try:
        with torch.no_grad():
            for text, fake_truth, female_truth in data_loader:
                text = text.to(device)

                if adv:
                    net_outputs, _ = net(text)
                else:
                    net_outputs = net(text)
                _, net_predicted = torch.max(net_outputs, 1)

                predictions_net = np.concatenate((predictions_net, net_predicted.cpu().numpy()))
                truths = np.concatenate((truths, fake_truth.cpu().numpy()))
                identities = np.concatenate((identities, female_truth.cpu().numpy()))
    finally:
        # Put the model back in whatever mode the caller had it in.
        net.train(was_training)

    if truths.size == 0:
        raise ValueError("data_loader yielded no samples to validate on")

    # Local names deliberately differ from the sklearn scorer functions.
    acc_score, prec_value, recall_value, f1_value = get_metrics(
        truths.astype(int), predictions_net.astype(int))

    print("F1 Score: ", f1_value)
    print("Precision Score: ", prec_value)
    print("Recall Score: ", recall_value)
    print("Acc Score: ", acc_score, "\n\n")

    return (predictions_net, truths, identities, acc_score)

In [None]:
def pretrain_classifier(clf, optimizer_clf, train_loader, loss_criterion, epochs):
  """Train the classifier alone on the fakeness labels for `epochs` epochs.

  Each batch is (inputs, fake_true, female_true). The first element of
  `clf(inputs)` is scored against `fake_true` and `optimizer_clf` is stepped
  once per batch. The mean batch loss is printed per epoch and overall.

  Returns:
    The same `clf` object, updated in place.
  """
  running_loss = 0
  total_batches = 0

  for epoch_number in range(1, epochs + 1):

    print("Epoch: ", epoch_number)
    loss_this_epoch = 0
    batches_this_epoch = 0

    for batch in train_loader:
        # Move all three tensors to the target device in their original order.
        inputs, fake_true, female_true = (tensor.to(device) for tensor in batch)

        optimizer_clf.zero_grad()

        logits, _ = clf(inputs)
        batch_loss = loss_criterion(logits, fake_true)
        batch_loss.backward()
        optimizer_clf.step()

        loss_value = batch_loss.item()
        running_loss += loss_value
        loss_this_epoch += loss_value
        batches_this_epoch += 1
        total_batches += 1

    print("Average Pretrain Classifier epoch loss: ", loss_this_epoch/batches_this_epoch)
  print("Average Pretrain Classifier batch loss: ", running_loss/total_batches)

  return clf

In [None]:
def pretrain_adversary(adv, clf, optimizer_adv, train_loader, loss_criterion, epochs):
  """Train the adversary on the classifier's hidden features for `epochs` epochs.

  For every batch (inputs, fake_true, female_true) the second element of
  `clf(inputs)` is fed to `adv`, the result is scored against `female_true`
  and `optimizer_adv` is stepped. Mean batch losses are printed per epoch and
  overall.

  Returns:
    The same `adv` object, updated in place.
  """
  running_loss = 0
  total_batches = 0

  for epoch_number in range(1, epochs + 1):

    print("Epoch: ", epoch_number)
    loss_this_epoch = 0
    batches_this_epoch = 0
    for batch in train_loader:
        # Move all three tensors to the target device in their original order.
        inputs, fake_true, female_true = (tensor.to(device) for tensor in batch)

        optimizer_adv.zero_grad()

        _, hidden = clf(inputs)
        batch_loss = loss_criterion(adv(hidden), female_true)
        batch_loss.backward()
        optimizer_adv.step()

        loss_value = batch_loss.item()
        running_loss += loss_value
        loss_this_epoch += loss_value
        batches_this_epoch += 1
        total_batches += 1

    print("Average Pretrain Adversary epoch loss: ", loss_this_epoch/batches_this_epoch)
  print("Average Pretrain Adversary batch loss: ", running_loss/total_batches)

  return adv

In [None]:
def train_adversary(adv, clf, optimizer_adv, train_loader, loss_criterion, epochs=1):
  """Run `epochs` full passes of adversary training over `train_loader`.

  For every batch (inputs, fake_true, female_true) the hidden features
  returned by `clf` are fed to `adv`, scored against `female_true`, and
  `optimizer_adv` is stepped. The mean batch loss over all passes is printed.

  Returns:
    The same `adv` object, updated in place.
  """
  running_loss = 0
  total_batches = 0

  for _ in range(epochs):
    for batch in train_loader:
        # Move all three tensors to the target device in their original order.
        inputs, fake_true, female_true = (tensor.to(device) for tensor in batch)

        optimizer_adv.zero_grad()

        _logits, hidden = clf(inputs)
        batch_loss = loss_criterion(adv(hidden), female_true)
        batch_loss.backward()
        optimizer_adv.step()

        running_loss += batch_loss.item()
        total_batches += 1

  print("Average Adversary batch loss: ", running_loss/total_batches)

  return adv

In [None]:
def train_classifier(clf, optimizer_clf, adv, train_loader, loss_criterion, lbda):
  """Take one adversarially-regularised classifier step on a single batch.

  Only the first batch produced by `train_loader` is used. The objective is
  `classifier_loss - lbda * adversary_loss`, where the adversary loss scores
  `adv` on the classifier's hidden features against the female labels. The
  three loss values are printed. If the loader yields nothing, no step is
  taken.

  Returns:
    The same `clf` object, updated in place.
  """
  first_batch = next(iter(train_loader), None)

  if first_batch is not None:
      # Move all three tensors to the target device in their original order.
      inputs, fake_true, female_true = (tensor.to(device) for tensor in first_batch)

      optimizer_clf.zero_grad()

      logits, hidden = clf(inputs)
      adversary_loss = loss_criterion(adv(hidden), female_true)
      classifier_loss = loss_criterion(logits, fake_true)
      # Reward the classifier for making the adversary's task harder.
      combined_loss = classifier_loss - lbda * adversary_loss
      combined_loss.backward()

      optimizer_clf.step()

      print("Adversary Mini-Batch loss: ", adversary_loss.item())
      print("Classifier Mini-Batch loss: ", classifier_loss.item())
      print("Total Mini-Batch loss: ", combined_loss.item())

  return clf

In [None]:
# Adversarial-loss weights to sweep over. (A missing comma between 0.5 and 1
# previously made this cell a SyntaxError.)
lambda_params = [0.1, 0.2, 0.3, 0.5, 1, 2, 3, 5, 7, 10]

# Per-lambda accuracy histories (one list of checkpoints per lambda).
lbda_train_accs = []
lbda_valid_accs = []
# Per-lambda fairness figures recorded at the final iteration of each run.
protected_fake_rates = []
unprotected_fake_rates = []
protected_tp_rates = []
unprotected_tp_rates = []
protected_fp_rates = []
unprotected_fp_rates = []
demo_parity_scores = []
tp_parity_scores = []
fp_parity_scores = []
equ_odds_scores = []

# Sweep over the adversarial weights: for every lambda, build fresh models,
# pretrain both, then alternate adversary / classifier updates.
for lbda in lambda_params:


  # Fresh classifier and adversary for this lambda.
  clf = Classifier(fakness_labels = 2)
  adv = Adversary(identity_labels = 2)

  loss_criterion = torch.nn.CrossEntropyLoss()


  optimizer_adv = optim.Adam(adv.parameters(), lr=0.001)

  # Smaller learning rate for the BERT encoder than for the two head layers.
  lrlast = .001
  lrmain = .00001
  optimizer_clf = optim.Adam(
      [
          {"params":clf.bert.parameters(),"lr": lrmain},
          {"params":clf.c1.parameters(), "lr": lrlast},

      {"params":clf.c3.parameters(), "lr": lrlast}
    ])

  clf.to(device)
  adv.to(device)



  # Phase 1: pretrain the classifier for 3 epochs with the adversary frozen.
  for param in adv.parameters():
    param.requires_grad = False

  clf = pretrain_classifier(clf, optimizer_clf, train_loader, loss_criterion, 3)

  for param in adv.parameters():
    param.requires_grad = True



  # Phase 2: pretrain the adversary for 3 epochs with the classifier frozen.
  for param in clf.parameters():
    param.requires_grad = False

  adv = pretrain_adversary(adv, clf, optimizer_adv, train_loader, loss_criterion, 3)

  for param in clf.parameters():
    param.requires_grad = True

  print('Lambda: ' + str(lbda))

  train_accs = []
  valid_accs = []
  iterations = 20

  # Phase 3: alternate adversary and classifier updates.
  for iteration in range(iterations):
      print("Iteration: ", iteration)



      # Adversary update with the classifier frozen.
      for param in clf.parameters():
        param.requires_grad = False

      adv = train_adversary(adv, clf, optimizer_adv, train_loader, loss_criterion, epochs=1)

      for param in clf.parameters():
        param.requires_grad = True


      # Classifier update with the adversary frozen.
      for param in adv.parameters():
        param.requires_grad = False

      clf = train_classifier(clf, optimizer_clf, adv, train_loader, loss_criterion, lbda)

      for param in adv.parameters():
        param.requires_grad = True

      # Report metrics on every second iteration.
      if (iteration + 1) % 2 == 0:

        print('Training metrics:')
        y_pred, actual_labels, protected_labels, acc_score = conduct_validation(clf, train_loader, adv = True)
        train_accs.append(acc_score)

        print("\n")
        print("Fairness Metrics on Train:")
        non_protected_labels = np.asarray(get_unprotected_class(protected_labels))
        thres = 0.5
        female_tox_rate, nf_tox_rate, female_tp_rate, nf_tp_rate, female_fp_rate, nf_fp_rate, demo_parity, tp_parity, fp_parity, equ_odds =\
        get_fairness_metrics(actual_labels, y_pred, protected_labels, non_protected_labels, thres)

        print("Fake Prediction Rates: ", "Female -", female_tox_rate, "Non-Female - ", nf_tox_rate)
        print("True Positive Prediction Rates: ", "Female -", female_tp_rate, "Non-Female - ", nf_tp_rate)
        print("False Positive Prediction Rates: ", "Female -", female_fp_rate, "Non-Female - ", nf_fp_rate)
        print("Demographic Parity: ", demo_parity)
        print("True Positive Parity: ", tp_parity)
        print("False Positive Parity: ", fp_parity)
        print("Equalized Odds: ", equ_odds)
        print("\n")
        print('Validation metrics:')
        # From here on the metric variables are overwritten with validation values.
        y_pred, actual_labels, protected_labels, acc_score = conduct_validation(clf, val_loader, adv = True)
        valid_accs.append(acc_score)

        print("\n")
        print("Fairness Metrics on Validation:")
        non_protected_labels = np.asarray(get_unprotected_class(protected_labels))
        thres = 0.5
        female_tox_rate, nf_tox_rate, female_tp_rate, nf_tp_rate, female_fp_rate, nf_fp_rate, demo_parity, tp_parity, fp_parity, equ_odds =\
        get_fairness_metrics(actual_labels, y_pred, protected_labels, non_protected_labels, thres)

        print("Fake Prediction Rates: ", "Female -", female_tox_rate, "Non-Female - ", nf_tox_rate)
        print("True Positive Prediction Rates: ", "Female -", female_tp_rate, "Non-Female - ", nf_tp_rate)
        print("False Positive Prediction Rates: ", "Female -", female_fp_rate, "Non-Female - ", nf_fp_rate)
        print("Demographic Parity: ", demo_parity)
        print("True Positive Parity: ", tp_parity)
        print("False Positive Parity: ", fp_parity)
        print("Equalized Odds: ", equ_odds)
        print("\n\n\n__________________")

        # Keep the validation fairness figures of the last iteration for this
        # lambda. This check sits inside the every-second-iteration branch, so
        # it is only reached when the last iteration is a reporting iteration.
        if iteration == iterations -1:
          protected_fake_rates.append(female_tox_rate)
          unprotected_fake_rates.append(nf_tox_rate)
          protected_tp_rates.append(female_tp_rate)
          unprotected_tp_rates.append(nf_tp_rate)
          protected_fp_rates.append(female_fp_rate)
          unprotected_fp_rates.append(nf_fp_rate)
          demo_parity_scores.append(demo_parity)
          tp_parity_scores.append(tp_parity)
          fp_parity_scores.append(fp_parity)
          equ_odds_scores.append(equ_odds)

  lbda_train_accs.append(train_accs)
  lbda_valid_accs.append(valid_accs)


print('Finished Training')

In [None]:
# Persist the weights currently bound to `clf` and `adv`. These names are
# rebound for every lambda in the sweep, so what is saved here is the pair
# trained with the last lambda value.
torch.save(clf.state_dict(), "/content/drive/MyDrive/Gender Bias/saved_models/SC_Classifier_Final_FakeNewsNet")
torch.save(adv.state_dict(), "/content/drive/MyDrive/Gender Bias/saved_models/SC_Adversary_Final_FakeNewsNet")

# COVID19

In [None]:
df = pd.read_csv ('/content/drive/MyDrive/Gender Bias/covid.csv')

In [None]:
# Encode the textual class labels as integers: fake -> 1, real -> 0.
label_mapping = {'fake': 1, 'real': 0}
# Series.map with a dict turns every value that is not a key into NaN, so this
# cell overwrites `df['label']` in place and must only be run once per load of
# `df` (running it again on the already-mapped column would yield NaN labels).
df['label'] = df['label'].map(label_mapping)

In [None]:
# One-hot style indicator columns derived from the `gender` column; any value
# other than the exact string being tested (including missing values) maps to 0.
df['female'] = (df['gender'] == 'female').astype('int64')
df['male'] = (df['gender'] == 'male').astype('int64')

In [None]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split identical across runs.
df_train_total, df_val_total = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

In [None]:
df_train_total

In [None]:
# Working aliases for the two splits (no copy is made).
df_train, df_val = df_train_total, df_val_total

In [None]:
# The model input text for this dataset is the `tweet` column.
comments_train, comments_val = df_train.tweet, df_val.tweet

In [None]:
def extract_female_gender(x):
  """Return 1 when the row's `female` value is at least 0.5, otherwise 0.

  `x` is any object exposing a numeric `female` attribute (for example a
  DataFrame row). A NaN value is treated as "not female" and yields 0.
  """
  score = x.female
  if np.isnan(score):
    return 0
  return 0 if score < 0.5 else 1

In [None]:
def get_unprotected_class(list_of_protected):
  """Invert a 0/1 membership list: entries equal to 0 become 1, all others 0."""
  return [int(flag == 0) for flag in list_of_protected]

In [None]:
def get_metrics(labels, preds):
  """Compute accuracy, precision, recall and weighted F1 for hard predictions.

  Both arrays are flattened before scoring. Precision and recall use the
  scorers' default averaging, while F1 is computed with average="weighted".

  Returns:
    (accuracy, precision, recall, f1) in that order.
  """
  y_true = labels.flatten()
  y_hat = preds.flatten()

  return (
      accuracy_score(y_true, y_hat),
      precision_score(y_true, y_hat),
      recall_score(y_true, y_hat),
      f1_score(y_true, y_hat, average="weighted"),
  )

In [None]:
# Binary fakeness targets: 1 when label >= 0.5, else 0 (NaN compares False -> 0).
fakness_labels_train = list((df_train.label >= 0.5).astype(int))
fakness_labels_val = list((df_val.label >= 0.5).astype(int))

# Protected-attribute targets (1 = female) computed row by row.
identity_labels_train = list(df_train.apply(extract_female_gender, axis = 1))
identity_labels_val = list(df_val.apply(extract_female_gender, axis = 1))

# Complement of the protected-attribute targets.
unprotected_labels_train = get_unprotected_class(identity_labels_train)
unprotected_labels_val = get_unprotected_class(identity_labels_val)

In [None]:
print(len(comments_train), len(fakness_labels_train))
print(comments_train[:10])
print(fakness_labels_train[:10])
print(identity_labels_train[:10])

In [None]:
# Sequence-length argument handed to convert_lines for every text.
MAX_SEQUENCE_LENGTH = 128
# NOTE(review): SEED is defined but no RNG is seeded with it in the visible
# cells — confirm whether seeding was intended.
SEED = 519
# Intended mini-batch size.
BATCH_SIZE = 32
# Name of the pretrained checkpoint used to load the WordPiece tokenizer.
BERT_MODEL_PATH = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_PATH, cache_dir=None,do_lower_case=True)

In [None]:
def convert_lines(example, max_seq_length,tokenizer):
    """Tokenise texts into fixed-length rows of token ids.

    Each text is tokenised, wrapped in "[CLS]" ... "[SEP]" and right-padded
    with id 0 so that every row is exactly `max_seq_length` long. Two
    positions are reserved for the special tokens, so at most
    `max_seq_length - 2` tokens of each text are kept; previously the special
    tokens were added on top of the budget and rows came out two ids longer
    than requested.

    Args:
        example: iterable of strings.
        max_seq_length: total row length, including [CLS] and [SEP].
        tokenizer: object providing `tokenize` and `convert_tokens_to_ids`.

    Returns:
        np.ndarray of shape (len(example), max_seq_length).

    Raises:
        ValueError: if `max_seq_length` cannot hold the two special tokens.
    """
    if max_seq_length < 2:
        raise ValueError("max_seq_length must be at least 2 to fit [CLS] and [SEP]")

    # Token budget left once [CLS] and [SEP] are accounted for.
    max_tokens = max_seq_length - 2

    all_tokens = []
    longer = 0
    for text in tqdm(example):
        tokens_a = tokenizer.tokenize(text)
        if len(tokens_a) > max_tokens:
            tokens_a = tokens_a[:max_tokens]
            longer += 1
        one_token = tokenizer.convert_tokens_to_ids(["[CLS]"] + tokens_a + ["[SEP]"])
        # Right-pad with id 0 up to the requested length.
        one_token = one_token + [0] * (max_seq_length - len(one_token))
        all_tokens.append(one_token)
    print("Tokens longer than max_length: ", longer)
    return np.array(all_tokens)


In [None]:
# Tokenise the training texts (missing texts are replaced by a placeholder
# string) and convert the label lists to tensors. Note that
# `fakness_labels_train` is rebound from a Python list to a tensor here.
input_train = convert_lines(comments_train.fillna("DUMMY_VALUE"), MAX_SEQUENCE_LENGTH, tokenizer)
fakness_labels_train = torch.tensor(fakness_labels_train)
female_labels_train = torch.tensor(identity_labels_train)

# Same preparation for the validation split.
input_val = convert_lines(comments_val.fillna("DUMMY_VALUE"), MAX_SEQUENCE_LENGTH, tokenizer)
fakness_labels_val = torch.tensor(fakness_labels_val)
female_labels_val = torch.tensor(identity_labels_val)


In [None]:
print(torch.sum(fakness_labels_train).data)
print(torch.sum(female_labels_train).data)

print(torch.sum(fakness_labels_val).data)
print(torch.sum(female_labels_val).data)


In [None]:
# Each dataset item is (token ids, fakeness label, female label).
X_train = torch.utils.data.TensorDataset(torch.tensor(input_train, dtype=torch.long), fakness_labels_train, female_labels_train)
# Use the notebook-level BATCH_SIZE constant instead of repeating the literal.
train_loader = torch.utils.data.DataLoader(X_train, batch_size=BATCH_SIZE, shuffle=True)

X_val = torch.utils.data.TensorDataset(torch.tensor(input_val, dtype=torch.long), fakness_labels_val, female_labels_val)
# Evaluation does not benefit from shuffling; a fixed order keeps runs comparable.
val_loader = torch.utils.data.DataLoader(X_val, batch_size=BATCH_SIZE, shuffle=False)


In [None]:
def get_fairness_metrics(actual_labels, y_pred, protected_labels, non_protected_labels, thres):
  """Compute group prediction rates and parity gaps for binary predictions.

  For the protected group (protected_labels == 1) and the non-protected group
  (non_protected_labels == 1) this computes the mean of `y_pred`:
    * over the whole group (positive prediction rate),
    * over group members with actual_labels == 1 (true positive rate),
    * over group members with actual_labels == 0 (false positive rate).

  Args:
    actual_labels: array-like of ground-truth labels.
    y_pred: array-like of predictions, same length.
    protected_labels: array-like, 1 marks protected-group members.
    non_protected_labels: array-like, 1 marks non-protected-group members.
    thres: accepted for interface compatibility; not used by the computation.

  Returns:
    Tuple of ten values: the six rates (protected, non-protected for each of
    the three kinds) rounded to two decimals, followed by the demographic
    parity gap, true positive parity gap, false positive parity gap and their
    sum (equalized odds), all computed from the unrounded rates.
    A rate over an empty subgroup is NaN (and so is any gap derived from it)
    instead of raising ZeroDivisionError.
  """
  actual_labels = np.asarray(actual_labels)
  y_pred = np.asarray(y_pred)
  protected = np.asarray(protected_labels) == 1
  non_protected = np.asarray(non_protected_labels) == 1
  positives = actual_labels == 1
  negatives = actual_labels == 0

  def _rate(mask):
    # Mean prediction over the selected rows; NaN when nothing is selected.
    selected = y_pred[mask]
    if selected.size == 0:
      return np.float64("nan")
    return selected.mean()

  female_rate, nf_rate = _rate(protected), _rate(non_protected)
  female_tpr, nf_tpr = _rate(protected & positives), _rate(non_protected & positives)
  female_fpr, nf_fpr = _rate(protected & negatives), _rate(non_protected & negatives)

  # Parity gaps use the unrounded rates so rounding cannot hide a small gap.
  demo_parity = abs(female_rate - nf_rate)
  tp_parity = abs(female_tpr - nf_tpr)
  fp_parity = abs(female_fpr - nf_fpr)
  equ_odds = tp_parity + fp_parity

  return (
      np.round(female_rate, 2), np.round(nf_rate, 2),
      np.round(female_tpr, 2), np.round(nf_tpr, 2),
      np.round(female_fpr, 2), np.round(nf_fpr, 2),
      demo_parity, tp_parity, fp_parity, equ_odds,
  )



In [None]:
# Module-level BERT configuration object. In the visible cells only its
# `hidden_size` and `hidden_dropout_prob` fields are read (by Classifier); the
# encoder weights themselves come from BertModel.from_pretrained.
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072,
        hidden_dropout_prob=0.1)

class Classifier(nn.Module):
    """BERT encoder followed by a two-layer classification head.

    The 324-dimensional ReLU activation of the first head layer is returned
    alongside the logits so that an adversary can be attached to it.
    """

    def __init__(self, fakness_labels = 2):
        """Build the encoder and head; `fakness_labels` is the number of output classes."""
        super(Classifier, self).__init__()

        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.c1 = nn.Linear(config.hidden_size, 324)
        self.c3 = nn.Linear(324, fakness_labels)

        nn.init.xavier_normal_(self.c1.weight)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
        """Return (logits, hidden) for a batch of token ids.

        Args:
            input_ids: LongTensor of token ids, right-padded with id 0.
            token_type_ids: optional segment ids forwarded to BERT.
            attention_mask: optional mask forwarded to BERT. When omitted, a
                mask is derived from `input_ids` so padding is not attended to.
            labels: unused; kept for interface compatibility.

        Returns:
            classifier_output: logits from the final linear layer.
            classifier_prev_output: ReLU activations of the first head layer.
        """
        if attention_mask is None:
            # convert_lines pads with id 0. Without an explicit mask BertModel
            # attends to every position, padding included.
            attention_mask = (input_ids > 0).long()

        _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False)
        pooled_output = self.dropout(pooled_output)

        classifier_prev_output = F.relu(self.c1(pooled_output))
        classifier_output = self.c3(classifier_prev_output)

        return classifier_output, classifier_prev_output

class Adversary(nn.Module):
    """Two-layer MLP that predicts the protected attribute from a 324-d vector."""

    def __init__(self, identity_labels = 2):
        """Create the 324 -> 120 -> `identity_labels` layers."""
        super().__init__()

        # Creation order is kept (a1, a2, then Xavier on a1) so that seeded
        # initialisation draws random numbers in the same sequence.
        self.a1 = nn.Linear(324, 120)
        self.a2 = nn.Linear(120, identity_labels)
        nn.init.xavier_normal_(self.a1.weight)

    def forward(self, input_ids):
        """Return logits; despite its name, `input_ids` is a float feature tensor."""
        return self.a2(F.relu(self.a1(input_ids)))


In [None]:
def conduct_validation(net, data_loader, adv = False):
    """Run `net` over `data_loader` and report classification metrics.

    Predictions are the arg-max over the class dimension of the network
    output. Metrics are computed once over all collected samples via
    `get_metrics`, not as an average of per-batch scores: per-batch averaging
    over-weights a short final batch and is not the dataset-level
    precision/recall/F1.

    Args:
        net: model to evaluate. Its previous train/eval mode is restored on exit.
        data_loader: iterable of (text, fake_truth, female_truth) tensor batches.
        adv: when True, `net(text)` is expected to return a
            (logits, hidden) pair; otherwise it returns the logits directly.

    Returns:
        (predictions, truths, identities, accuracy) where the first three are
        1-D float numpy arrays covering every sample seen.

    Raises:
        ValueError: if `data_loader` yields no samples.
    """
    predictions_net = np.empty((0,))
    truths = np.empty((0,))
    identities = np.empty((0,))

    was_training = net.training
    net.eval()
    try:
        with torch.no_grad():
            for text, fake_truth, female_truth in data_loader:
                text = text.to(device)

                if adv:
                    net_outputs, _ = net(text)
                else:
                    net_outputs = net(text)
                _, net_predicted = torch.max(net_outputs, 1)

                predictions_net = np.concatenate((predictions_net, net_predicted.cpu().numpy()))
                truths = np.concatenate((truths, fake_truth.cpu().numpy()))
                identities = np.concatenate((identities, female_truth.cpu().numpy()))
    finally:
        # Put the model back in whatever mode the caller had it in.
        net.train(was_training)

    if truths.size == 0:
        raise ValueError("data_loader yielded no samples to validate on")

    # Local names deliberately differ from the sklearn scorer functions.
    acc_score, prec_value, recall_value, f1_value = get_metrics(
        truths.astype(int), predictions_net.astype(int))

    print("F1 Score: ", f1_value)
    print("Precision Score: ", prec_value)
    print("Recall Score: ", recall_value)
    print("Acc Score: ", acc_score, "\n\n")

    return (predictions_net, truths, identities, acc_score)

In [None]:
def pretrain_classifier(clf, optimizer_clf, train_loader, loss_criterion, epochs):
    """Warm up the classifier on the fake/real task only.

    Runs `epochs` full passes over `train_loader`, stepping `optimizer_clf` on
    `loss_criterion(logits, fake_true)` for every batch. The gender labels are
    moved to `device` with the rest of the batch but do not enter the loss.
    Prints the mean loss of each epoch and, at the end, the mean over all
    batches. Relies on the notebook-level `device`.

    Returns:
        `clf`, the same object that was passed in.
    """
    overall_loss = 0
    overall_batches = 0

    for epoch_number in range(1, epochs + 1):
        print("Epoch: ", epoch_number)
        running_loss = 0
        batches_seen = 0

        for batch in train_loader:
            # Each batch is (token ids, fake label, female label).
            inputs, fake_true, female_true = (tensor.to(device) for tensor in batch)

            optimizer_clf.zero_grad()
            fake_logits, _ = clf(inputs)
            batch_loss = loss_criterion(fake_logits, fake_true)
            batch_loss.backward()
            optimizer_clf.step()

            loss_value = batch_loss.item()
            overall_loss += loss_value
            running_loss += loss_value
            batches_seen += 1
            overall_batches += 1

        print("Average Pretrain Classifier epoch loss: ", running_loss / batches_seen)
    print("Average Pretrain Classifier batch loss: ", overall_loss / overall_batches)

    return clf

In [None]:
def pretrain_adversary(adv, clf, optimizer_adv, train_loader, loss_criterion, epochs):
    """Warm up the adversary on the classifier's hidden representation.

    For `epochs` passes over `train_loader`, the second output of
    `clf(inputs)` is fed to `adv`, and `optimizer_adv` is stepped on
    `loss_criterion(adversary_output, female_true)`. Only `optimizer_adv` is
    stepped here. Prints the mean loss per epoch and the mean over all
    batches. Relies on the notebook-level `device`.

    Returns:
        `adv`, the same object that was passed in.
    """
    overall_loss = 0
    overall_batches = 0

    for epoch_number in range(1, epochs + 1):
        print("Epoch: ", epoch_number)
        running_loss = 0
        batches_seen = 0

        for batch in train_loader:
            # Each batch is (token ids, fake label, female label).
            inputs, fake_true, female_true = (tensor.to(device) for tensor in batch)

            optimizer_adv.zero_grad()
            _, hidden_features = clf(inputs)
            gender_logits = adv(hidden_features)
            batch_loss = loss_criterion(gender_logits, female_true)
            batch_loss.backward()
            optimizer_adv.step()

            loss_value = batch_loss.item()
            overall_loss += loss_value
            running_loss += loss_value
            batches_seen += 1
            overall_batches += 1

        print("Average Pretrain Adversary epoch loss: ", running_loss / batches_seen)
    print("Average Pretrain Adversary batch loss: ", overall_loss / overall_batches)

    return adv

In [None]:
def train_adversary(adv, clf, optimizer_adv, train_loader, loss_criterion, epochs=1):
    """Update the adversary for `epochs` full passes over `train_loader`.

    Each batch is run through `clf`; its second output is passed to `adv`,
    and `optimizer_adv` is stepped on the gender-prediction loss. A single
    line with the mean batch loss over the whole call is printed at the end.
    Relies on the notebook-level `device`.

    Returns:
        `adv`, the same object that was passed in.
    """
    loss_total = 0
    batch_count = 0

    for _ in range(epochs):
        for batch in train_loader:
            # Each batch is (token ids, fake label, female label).
            inputs, fake_true, female_true = (tensor.to(device) for tensor in batch)

            optimizer_adv.zero_grad()
            _fake_logits, hidden_features = clf(inputs)
            gender_logits = adv(hidden_features)
            batch_loss = loss_criterion(gender_logits, female_true)
            batch_loss.backward()
            optimizer_adv.step()

            loss_total += batch_loss.item()
            batch_count += 1

    print("Average Adversary batch loss: ", loss_total / batch_count)

    return adv

In [None]:
def train_classifier(clf, optimizer_clf, adv, train_loader, loss_criterion, lbda):
    """Take ONE adversarially-regularised classifier step.

    Only the first batch drawn from `train_loader` is used. The objective is
    `classifier_loss - lbda * adversary_loss`, so the step lowers the
    fake/real loss while raising the adversary's gender-prediction loss.
    Only `optimizer_clf` is stepped. The three loss values are printed.
    Relies on the notebook-level `device`.

    Returns:
        `clf`, the same object that was passed in (untouched if the loader
        yields no batch).
    """
    first_batch = next(iter(train_loader), None)
    if first_batch is None:
        return clf

    # The batch is (token ids, fake label, female label).
    inputs, fake_true, female_true = (tensor.to(device) for tensor in first_batch)

    optimizer_clf.zero_grad()

    fake_logits, hidden_features = clf(inputs)
    gender_logits = adv(hidden_features)
    gender_loss = loss_criterion(gender_logits, female_true)
    fake_loss = loss_criterion(fake_logits, fake_true)
    combined_loss = fake_loss - lbda * gender_loss
    combined_loss.backward()

    optimizer_clf.step()

    print("Adversary Mini-Batch loss: ", gender_loss.item())
    print("Classifier Mini-Batch loss: ", fake_loss.item())
    print("Total Mini-Batch loss: ", combined_loss.item())

    return clf

In [None]:
# Sweep over the adversarial weight lambda. For every value a fresh
# Classifier/Adversary pair is pretrained, then trained adversarially, and
# accuracy plus fairness metrics are recorded.
# Fixed: the list was missing the comma between 0.5 and 1 (a SyntaxError).
lambda_params = [0.1, 0.2, 0.3, 0.5, 1, 2, 3, 5, 7, 10]

# One entry per lambda. The *_accs lists hold a list of accuracies (one per
# evaluation round); the others hold the final-iteration validation metric.
lbda_train_accs = []
lbda_valid_accs = []
protected_fake_rates = []
unprotected_fake_rates = []
protected_tp_rates = []
unprotected_tp_rates = []
protected_fp_rates = []
unprotected_fp_rates = []
demo_parity_scores = []
tp_parity_scores = []
fp_parity_scores = []
equ_odds_scores = []

for lbda in lambda_params:
  print('Lambda: ' + str(lbda))

  clf = Classifier(fakness_labels = 2)
  adv = Adversary(identity_labels = 2)

  loss_criterion = torch.nn.CrossEntropyLoss()

  optimizer_adv = optim.Adam(adv.parameters(), lr=0.001)

  # The BERT encoder gets a much smaller learning rate than the new head layers.
  lrlast = .001
  lrmain = .00001
  optimizer_clf = optim.Adam(
      [
          {"params":clf.bert.parameters(),"lr": lrmain},
          {"params":clf.c1.parameters(), "lr": lrlast},
          {"params":clf.c3.parameters(), "lr": lrlast}
      ])

  clf.to(device)
  adv.to(device)

  # Stage 1: pretrain the classifier alone (adversary frozen).
  for param in adv.parameters():
    param.requires_grad = False

  clf = pretrain_classifier(clf, optimizer_clf, train_loader, loss_criterion, 3)

  for param in adv.parameters():
    param.requires_grad = True

  # Stage 2: pretrain the adversary on the frozen classifier's features.
  for param in clf.parameters():
    param.requires_grad = False

  adv = pretrain_adversary(adv, clf, optimizer_adv, train_loader, loss_criterion, 3)

  for param in clf.parameters():
    param.requires_grad = True

  # Stage 3: alternate adversary and classifier updates.
  train_accs = []
  valid_accs = []
  iterations = 20

  for iteration in range(iterations):
      print("Iteration: ", iteration)

      # Adversary update: one pass over the data with the classifier frozen.
      for param in clf.parameters():
        param.requires_grad = False

      adv = train_adversary(adv, clf, optimizer_adv, train_loader, loss_criterion, epochs=1)

      for param in clf.parameters():
        param.requires_grad = True

      # Classifier update with the adversary frozen.
      for param in adv.parameters():
        param.requires_grad = False

      clf = train_classifier(clf, optimizer_clf, adv, train_loader, loss_criterion, lbda)

      for param in adv.parameters():
        param.requires_grad = True

      # Evaluate on both splits every second iteration.
      if (iteration + 1) % 2 == 0:

        # Validation comes last, so the metric variables left bound after this
        # loop are the validation ones (recorded below on the final iteration).
        for split_title, split_name, loader, accs, trailer in (
            ('Training metrics:', "Train", train_loader, train_accs, "\n"),
            ('Validation metrics:', "Validation", val_loader, valid_accs, "\n\n\n__________________"),
        ):
          print(split_title)
          y_pred, actual_labels, protected_labels, acc_score = conduct_validation(clf, loader, adv = True)
          accs.append(acc_score)

          print("\n")
          print("Fairness Metrics on " + split_name + ":")
          non_protected_labels = np.asarray(get_unprotected_class(protected_labels))
          thres = 0.5
          female_tox_rate, nf_tox_rate, female_tp_rate, nf_tp_rate, female_fp_rate, nf_fp_rate, demo_parity, tp_parity, fp_parity, equ_odds =\
          get_fairness_metrics(actual_labels, y_pred, protected_labels, non_protected_labels, thres)

          print("Fake Prediction Rates: ", "Female -", female_tox_rate, "Non-Female - ", nf_tox_rate)
          print("True Positive Prediction Rates: ", "Female -", female_tp_rate, "Non-Female - ", nf_tp_rate)
          print("False Positive Prediction Rates: ", "Female -", female_fp_rate, "Non-Female - ", nf_fp_rate)
          print("Demographic Parity: ", demo_parity)
          print("True Positive Parity: ", tp_parity)
          print("False Positive Parity: ", fp_parity)
          print("Equalized Odds: ", equ_odds)
          print(trailer)

        if iteration == iterations -1:
          protected_fake_rates.append(female_tox_rate)
          unprotected_fake_rates.append(nf_tox_rate)
          protected_tp_rates.append(female_tp_rate)
          unprotected_tp_rates.append(nf_tp_rate)
          protected_fp_rates.append(female_fp_rate)
          unprotected_fp_rates.append(nf_fp_rate)
          demo_parity_scores.append(demo_parity)
          tp_parity_scores.append(tp_parity)
          fp_parity_scores.append(fp_parity)
          equ_odds_scores.append(equ_odds)

  lbda_train_accs.append(train_accs)
  lbda_valid_accs.append(valid_accs)



print('Finished Training')

In [None]:
# Persist the weights of the `clf` and `adv` currently bound at notebook level.
# The lambda sweep rebinds both names on every lambda, so these are the models
# trained with the LAST lambda value only.
# NOTE(review): the file names end in `_covid` although this section loads
# FakeNewsNet2.csv -- confirm the intended names.
torch.save(clf.state_dict(), "/content/drive/MyDrive/Gender Bias/saved_models/SC_Classifier_Final_covid")
torch.save(adv.state_dict(), "/content/drive/MyDrive/Gender Bias/saved_models/SC_Adversary_Final_covid")

# ISOT

In [None]:
# Load the ISOT dataset. This rebinds `df`, replacing the FakeNewsNet frame
# loaded earlier in the notebook.
df = pd.read_csv ('/content/drive/MyDrive/Gender Bias/isot.csv')

In [None]:
# Encode the string labels as integers: 'fake' -> 1, 'real' -> 0.
# NOTE(review): Series.map yields NaN for values missing from the dict, so
# re-running this cell on the already-mapped column would presumably blank
# the labels -- confirm before re-executing.
label_mapping = {'fake': 1, 'real': 0}
df['label'] = df['label'].map(label_mapping)

In [None]:
# Binary gender indicator columns: 1 when `gender` equals the given string
# exactly, 0 for every other value.
df['female'] = df['gender'].apply(lambda x: 1 if x == 'female' else 0)
df['male'] = df['gender'].apply(lambda x: 1 if x == 'male' else 0)

In [None]:
# Shuffled 80/20 train/validation split with a fixed random_state.
df_train_total , df_val_total = train_test_split(df, test_size=0.2, random_state=42, shuffle=True)

In [None]:
# Display the training split for a visual check.
df_train_total

In [None]:
# Use the full splits as-is. These are aliases of the same frames, not copies.
df_train = df_train_total
df_val = df_val_total

In [None]:
# For ISOT the text fed to the tokenizer is the `tweet` column.
comments_train = df_train.tweet
comments_val = df_val.tweet

In [None]:
def extract_female_gender(x):
  """Binarise the `female` attribute of a row.

  Returns 0 when `x.female` is NaN or below 0.5, and 1 otherwise.
  """
  score = x.female
  is_female = not (np.isnan(score) or score < 0.5)
  return 1 if is_female else 0

In [None]:
def get_unprotected_class(list_of_protected):
  """Return the complement of a 0/1 membership sequence as a list.

  Every element equal to 0 becomes 1; every other element becomes 0.
  """
  complement = []
  for flag in list_of_protected:
    if flag == 0:
      complement.append(1)
    else:
      complement.append(0)
  return complement

In [None]:
def get_metrics(labels, preds):
  """Score predictions against labels with the imported sklearn metrics.

  Both arguments must expose `.flatten()` and are flattened before scoring.

  Returns:
    `(accuracy, precision, recall, f1)`. Precision and recall use the metric
    functions' default settings, while F1 is requested with
    `average="weighted"`.
  """
  y_hat = preds.flatten()
  y_true = labels.flatten()

  return (
      accuracy_score(y_true, y_hat),
      precision_score(y_true, y_hat),
      recall_score(y_true, y_hat),
      f1_score(y_true, y_hat, average="weighted"),
  )

In [None]:
# Build plain Python lists of 0/1 targets for both splits.
# Fake label: 1 when `label` >= 0.5, else 0 (a NaN label also falls to 0
# because the comparison is False).
fakness_labels_train = list(df_train.label.apply(lambda x: 1 if x >= 0.5 else 0))
# Protected attribute: row-wise 0/1 from extract_female_gender.
identity_labels_train = list(df_train.apply(extract_female_gender, axis = 1))
fakness_labels_val = list(df_val.label.apply(lambda x: 1 if x >= 0.5 else 0))
identity_labels_val = list(df_val.apply(extract_female_gender, axis = 1))
# Complement of the protected flags (1 where the protected flag is 0).
unprotected_labels_train = get_unprotected_class(identity_labels_train)
unprotected_labels_val = get_unprotected_class(identity_labels_val)

In [None]:
# Sanity check: the text and label counts should match; peek at the first ten of each.
print(len(comments_train), len(fakness_labels_train))
print(comments_train[:10])
print(fakness_labels_train[:10])
print(identity_labels_train[:10])

In [None]:
# Tokenisation / batching configuration for the ISOT run.
MAX_SEQUENCE_LENGTH = 128  # token cap passed to convert_lines
# NOTE(review): SEED is not referenced anywhere in the visible part of this
# section -- verify that it is actually applied to the RNGs.
SEED = 519
BATCH_SIZE = 32
BERT_MODEL_PATH = 'bert-base-uncased'
# Lower-casing tokenizer loaded from the pretrained model name above.
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_PATH, cache_dir=None,do_lower_case=True)

In [None]:
def convert_lines(example, max_seq_length,tokenizer):
    """Tokenise texts into a fixed-width matrix of token ids.

    Each text is tokenised, cut to at most `max_seq_length` tokens, wrapped
    in "[CLS]" ... "[SEP]", converted to ids and right-padded with 0.
    Because the two special tokens are added after truncation, every row is
    `max_seq_length + 2` ids long. The number of truncated texts is printed.

    Args:
        example: iterable of strings (iterated through `tqdm`).
        max_seq_length: maximum number of word-piece tokens kept per text.
        tokenizer: object providing `tokenize` and `convert_tokens_to_ids`.

    Returns:
        `np.array` built from the list of id rows.
    """
    truncated_count = 0
    id_rows = []
    for text in tqdm(example):
        pieces = tokenizer.tokenize(text)
        if len(pieces) > max_seq_length:
            pieces = pieces[:max_seq_length]
            truncated_count += 1
        wrapped = ["[CLS]"] + pieces + ["[SEP]"]
        padding = [0] * (max_seq_length - len(pieces))
        id_rows.append(tokenizer.convert_tokens_to_ids(wrapped) + padding)
    print("Tokens longer than max_length: ", truncated_count)
    return np.array(id_rows)


In [None]:
# Tokenise both splits (missing texts are replaced by the literal
# "DUMMY_VALUE") and turn the label lists into tensors.
# Note: `fakness_labels_*` are rebound here from Python lists to tensors.
input_train = convert_lines(comments_train.fillna("DUMMY_VALUE"), MAX_SEQUENCE_LENGTH, tokenizer)
fakness_labels_train = torch.tensor(fakness_labels_train)
female_labels_train = torch.tensor(identity_labels_train)

input_val = convert_lines(comments_val.fillna("DUMMY_VALUE"), MAX_SEQUENCE_LENGTH, tokenizer)
fakness_labels_val = torch.tensor(fakness_labels_val)
female_labels_val = torch.tensor(identity_labels_val)


In [None]:
# Class-balance check: labels are 0/1, so each sum is the number of
# positive (fake / female) examples in that split.
print(torch.sum(fakness_labels_train).data)
print(torch.sum(female_labels_train).data)

print(torch.sum(fakness_labels_val).data)
print(torch.sum(female_labels_val).data)


In [None]:
# Wrap the token-id matrices and label tensors into datasets and loaders.
# Each item is (token ids, fake label, female label).
# The batch size comes from the BATCH_SIZE constant defined with the tokenizer
# config instead of a repeated literal 32.
X_train = torch.utils.data.TensorDataset(torch.tensor(input_train, dtype=torch.long), fakness_labels_train, female_labels_train)
train_loader = torch.utils.data.DataLoader(X_train, batch_size=BATCH_SIZE, shuffle=True)

# Validation data is not shuffled: the order has no training effect, and a
# fixed order makes repeated evaluations see identical batches.
X_val = torch.utils.data.TensorDataset(torch.tensor(input_val, dtype=torch.long), fakness_labels_val, female_labels_val)
val_loader = torch.utils.data.DataLoader(X_val, batch_size=BATCH_SIZE, shuffle=False)


In [None]:
def get_fairness_metrics(actual_labels, y_pred, protected_labels, non_protected_labels, thres):
    """Compute group-wise positive-prediction rates and parity gaps.

    A "rate" is the mean of `y_pred` over a subgroup of examples:
      * fake rate: all examples of the group,
      * true-positive rate: group examples with `actual_labels == 1`,
      * false-positive rate: group examples with `actual_labels == 0`.

    An empty subgroup yields NaN for its rate (and for every gap derived from
    it) instead of raising ZeroDivisionError.

    Args:
        actual_labels: array-like of 0/1 ground-truth labels.
        y_pred: array-like of predictions, same length.
        protected_labels: array-like; entries equal to 1 mark the protected group.
        non_protected_labels: array-like; entries equal to 1 mark the other group.
        thres: accepted for interface compatibility; it is not used, the
            predictions are averaged as given.

    Returns:
        10-tuple `(protected_fake_rate, non_protected_fake_rate,
        protected_tp_rate, non_protected_tp_rate, protected_fp_rate,
        non_protected_fp_rate, demographic_parity, tp_parity, fp_parity,
        equalized_odds)`. The six rates are rounded to two decimals; the
        parities are absolute differences of the unrounded rates, and
        equalized odds is `tp_parity + fp_parity`.
    """
    actual_labels = np.asarray(actual_labels)
    y_pred = np.asarray(y_pred)
    in_protected = np.asarray(protected_labels) == 1
    in_non_protected = np.asarray(non_protected_labels) == 1

    def positive_rate(mask):
        # Mean prediction over the rows selected by `mask`; NaN if none are selected.
        selected = y_pred[mask]
        if selected.size == 0:
            return np.nan
        return selected.sum() / selected.size

    def group_rates(condition=None):
        # (protected, non-protected) rates, optionally restricted by `condition`.
        if condition is None:
            return positive_rate(in_protected), positive_rate(in_non_protected)
        return (positive_rate(in_protected & condition),
                positive_rate(in_non_protected & condition))

    def rounded(pair):
        return np.round(pair[0], 2), np.round(pair[1], 2)

    def gap(pair):
        return abs(pair[0] - pair[1])

    fake_rates = group_rates()
    tp_rates = group_rates(actual_labels == 1)
    fp_rates = group_rates(actual_labels == 0)

    female_tox_rate, nf_tox_rate = rounded(fake_rates)
    female_tp_rate, nf_tp_rate = rounded(tp_rates)
    female_fp_rate, nf_fp_rate = rounded(fp_rates)

    # Gaps are taken on the unrounded rates.
    demo_parity = gap(fake_rates)
    tp_parity = gap(tp_rates)
    fp_parity = gap(fp_rates)
    equ_odds = tp_parity + fp_parity

    return female_tox_rate, nf_tox_rate, female_tp_rate, nf_tp_rate, female_fp_rate, nf_fp_rate, demo_parity, tp_parity, fp_parity, equ_odds



In [None]:
# Notebook-level BERT configuration object. In this cell, Classifier reads
# only `hidden_size` and `hidden_dropout_prob` from it; the encoder itself is
# loaded with BertModel.from_pretrained.
# NOTE(review): the vocab size given here presumably does not need to match
# the pretrained checkpoint for that reason -- confirm.
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072,
        hidden_dropout_prob=0.1)

class Classifier(nn.Module):
    """Pretrained BERT encoder with a two-layer fake/real classification head.

    The 324-unit hidden activation is returned next to the logits so that an
    adversary can be trained on it.
    """

    def __init__(self, fakness_labels = 2):
        super().__init__()

        # Layers are created in this order: encoder, dropout, hidden, output.
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.c1 = nn.Linear(config.hidden_size, 324)
        self.c3 = nn.Linear(324, fakness_labels)

        # Only the hidden layer's weight is re-initialised (Xavier normal).
        nn.init.xavier_normal_(self.c1.weight)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
        """Return `(logits, hidden)` for a batch of token ids.

        `labels` is accepted but not used.
        """
        _sequence_output, pooled = self.bert(
            input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False
        )
        hidden = F.relu(self.c1(self.dropout(pooled)))
        logits = self.c3(hidden)
        return logits, hidden

class Adversary(nn.Module):
    """Two-layer MLP that predicts the identity label from 324-dim features."""

    def __init__(self, identity_labels = 2):
        super().__init__()

        # 324 -> 120 -> identity_labels
        self.a1 = nn.Linear(324,120)
        self.a2 = nn.Linear(120, identity_labels)

        # Only the first layer's weight is re-initialised (Xavier normal).
        nn.init.xavier_normal_(self.a1.weight)

    def forward(self, input_ids):
        """Return identity logits; `input_ids` holds feature vectors despite its name."""
        return self.a2(F.relu(self.a1(input_ids)))


In [None]:
def conduct_validation(net, data_loader, adv = False):
    """Evaluate `net` on every batch of `data_loader` and print summary metrics.

    Accuracy, precision, recall and F1 are computed once over all collected
    predictions. (They used to be averaged per mini-batch, which gave a short
    final batch the same weight as a full one and scored precision/recall on
    batches that might contain no positive example.)

    Relies on the notebook-level names `device` and `get_metrics`.

    Args:
        net: model called as `net(text)`. It is switched to eval mode for the
            pass and is always left in train mode afterwards.
        data_loader: iterable yielding `(text, fake_truth, female_truth)`
            tensor batches.
        adv: when True, `net(text)` must return a two-element
            `(logits, previous_layer_output)` pair and only the logits are
            used; when False it must return the logits directly.

    Returns:
        Tuple `(predictions, truths, identities, acc_score)`. The first three
        are 1-D float NumPy arrays with one entry per example, in the order
        the loader produced them. `acc_score` is the accuracy over all
        examples, or NaN when the loader yields no batch.
    """
    batch_predictions, batch_truths, batch_identities = [], [], []

    net.eval()
    with torch.no_grad():
        for text, fake_truth, female_truth in data_loader:
            text = text.to(device)

            if adv:
                net_outputs, _ = net(text)
            else:
                net_outputs = net(text)
            # Predicted class = index of the largest logit in each row.
            _, net_predicted = torch.max(net_outputs.data, 1)

            # Collect per-batch arrays and concatenate once after the loop
            # (re-concatenating the running array every batch is quadratic).
            batch_predictions.append(net_predicted.cpu().numpy())
            batch_truths.append(fake_truth.cpu().numpy())
            batch_identities.append(female_truth.cpu().numpy())

    # Seeding with an empty float array keeps the float dtype callers already
    # receive and makes the concatenation valid for an empty loader.
    empty = np.empty((0,))
    predictions_net = np.concatenate([empty] + batch_predictions)
    truths = np.concatenate([empty] + batch_truths)
    identities = np.concatenate([empty] + batch_identities)

    if truths.size:
        acc_score, prec_score, rec_score, f1 = get_metrics(
            truths.astype(int), predictions_net.astype(int)
        )
    else:
        acc_score = prec_score = rec_score = f1 = float("nan")

    print("F1 Score: ", f1)
    print("Precision Score: ", prec_score)
    print("Recall Score: ", rec_score)
    print("Acc Score: ", acc_score, "\n\n")

    net.train()

    return (predictions_net, truths, identities, acc_score)

In [None]:
def pretrain_classifier(clf, optimizer_clf, train_loader, loss_criterion, epochs):
    """Warm up the classifier on the fake/real task only.

    Runs `epochs` full passes over `train_loader`, stepping `optimizer_clf` on
    `loss_criterion(logits, fake_true)` for every batch. The gender labels are
    moved to `device` with the rest of the batch but do not enter the loss.
    Prints the mean loss of each epoch and, at the end, the mean over all
    batches. Relies on the notebook-level `device`.

    Returns:
        `clf`, the same object that was passed in.
    """
    overall_loss = 0
    overall_batches = 0

    for epoch_number in range(1, epochs + 1):
        print("Epoch: ", epoch_number)
        running_loss = 0
        batches_seen = 0

        for batch in train_loader:
            # Each batch is (token ids, fake label, female label).
            inputs, fake_true, female_true = (tensor.to(device) for tensor in batch)

            optimizer_clf.zero_grad()
            fake_logits, _ = clf(inputs)
            batch_loss = loss_criterion(fake_logits, fake_true)
            batch_loss.backward()
            optimizer_clf.step()

            loss_value = batch_loss.item()
            overall_loss += loss_value
            running_loss += loss_value
            batches_seen += 1
            overall_batches += 1

        print("Average Pretrain Classifier epoch loss: ", running_loss / batches_seen)
    print("Average Pretrain Classifier batch loss: ", overall_loss / overall_batches)

    return clf

In [None]:
def pretrain_adversary(adv, clf, optimizer_adv, train_loader, loss_criterion, epochs):
    """Warm up the adversary on the classifier's hidden representation.

    For `epochs` passes over `train_loader`, the second output of
    `clf(inputs)` is fed to `adv`, and `optimizer_adv` is stepped on
    `loss_criterion(adversary_output, female_true)`. Only `optimizer_adv` is
    stepped here. Prints the mean loss per epoch and the mean over all
    batches. Relies on the notebook-level `device`.

    Returns:
        `adv`, the same object that was passed in.
    """
    overall_loss = 0
    overall_batches = 0

    for epoch_number in range(1, epochs + 1):
        print("Epoch: ", epoch_number)
        running_loss = 0
        batches_seen = 0

        for batch in train_loader:
            # Each batch is (token ids, fake label, female label).
            inputs, fake_true, female_true = (tensor.to(device) for tensor in batch)

            optimizer_adv.zero_grad()
            _, hidden_features = clf(inputs)
            gender_logits = adv(hidden_features)
            batch_loss = loss_criterion(gender_logits, female_true)
            batch_loss.backward()
            optimizer_adv.step()

            loss_value = batch_loss.item()
            overall_loss += loss_value
            running_loss += loss_value
            batches_seen += 1
            overall_batches += 1

        print("Average Pretrain Adversary epoch loss: ", running_loss / batches_seen)
    print("Average Pretrain Adversary batch loss: ", overall_loss / overall_batches)

    return adv

In [None]:
def train_adversary(adv, clf, optimizer_adv, train_loader, loss_criterion, epochs=1):
    """Update the adversary for `epochs` full passes over `train_loader`.

    Each batch is run through `clf`; its second output is passed to `adv`,
    and `optimizer_adv` is stepped on the gender-prediction loss. A single
    line with the mean batch loss over the whole call is printed at the end.
    Relies on the notebook-level `device`.

    Returns:
        `adv`, the same object that was passed in.
    """
    loss_total = 0
    batch_count = 0

    for _ in range(epochs):
        for batch in train_loader:
            # Each batch is (token ids, fake label, female label).
            inputs, fake_true, female_true = (tensor.to(device) for tensor in batch)

            optimizer_adv.zero_grad()
            _fake_logits, hidden_features = clf(inputs)
            gender_logits = adv(hidden_features)
            batch_loss = loss_criterion(gender_logits, female_true)
            batch_loss.backward()
            optimizer_adv.step()

            loss_total += batch_loss.item()
            batch_count += 1

    print("Average Adversary batch loss: ", loss_total / batch_count)

    return adv

In [None]:
def train_classifier(clf, optimizer_clf, adv, train_loader, loss_criterion, lbda):
    """Take ONE adversarially-regularised classifier step.

    Only the first batch drawn from `train_loader` is used. The objective is
    `classifier_loss - lbda * adversary_loss`, so the step lowers the
    fake/real loss while raising the adversary's gender-prediction loss.
    Only `optimizer_clf` is stepped. The three loss values are printed.
    Relies on the notebook-level `device`.

    Returns:
        `clf`, the same object that was passed in (untouched if the loader
        yields no batch).
    """
    first_batch = next(iter(train_loader), None)
    if first_batch is None:
        return clf

    # The batch is (token ids, fake label, female label).
    inputs, fake_true, female_true = (tensor.to(device) for tensor in first_batch)

    optimizer_clf.zero_grad()

    fake_logits, hidden_features = clf(inputs)
    gender_logits = adv(hidden_features)
    gender_loss = loss_criterion(gender_logits, female_true)
    fake_loss = loss_criterion(fake_logits, fake_true)
    combined_loss = fake_loss - lbda * gender_loss
    combined_loss.backward()

    optimizer_clf.step()

    print("Adversary Mini-Batch loss: ", gender_loss.item())
    print("Classifier Mini-Batch loss: ", fake_loss.item())
    print("Total Mini-Batch loss: ", combined_loss.item())

    return clf

In [None]:
# Sweep over the adversarial weight lambda. For every value a fresh
# Classifier/Adversary pair is pretrained, then trained adversarially, and
# accuracy plus fairness metrics are recorded.
# Fixed: the list was missing the comma between 0.5 and 1 (a SyntaxError).
lambda_params = [0.1, 0.2, 0.3, 0.5, 1, 2, 3, 5, 7, 10]

# One entry per lambda. The *_accs lists hold a list of accuracies (one per
# evaluation round); the others hold the final-iteration validation metric.
lbda_train_accs = []
lbda_valid_accs = []
protected_fake_rates = []
unprotected_fake_rates = []
protected_tp_rates = []
unprotected_tp_rates = []
protected_fp_rates = []
unprotected_fp_rates = []
demo_parity_scores = []
tp_parity_scores = []
fp_parity_scores = []
equ_odds_scores = []

for lbda in lambda_params:
  print('Lambda: ' + str(lbda))

  clf = Classifier(fakness_labels = 2)
  adv = Adversary(identity_labels = 2)

  loss_criterion = torch.nn.CrossEntropyLoss()

  optimizer_adv = optim.Adam(adv.parameters(), lr=0.001)

  # The BERT encoder gets a much smaller learning rate than the new head layers.
  lrlast = .001
  lrmain = .00001
  optimizer_clf = optim.Adam(
      [
          {"params":clf.bert.parameters(),"lr": lrmain},
          {"params":clf.c1.parameters(), "lr": lrlast},
          {"params":clf.c3.parameters(), "lr": lrlast}
      ])

  clf.to(device)
  adv.to(device)

  # Stage 1: pretrain the classifier alone (adversary frozen).
  for param in adv.parameters():
    param.requires_grad = False

  clf = pretrain_classifier(clf, optimizer_clf, train_loader, loss_criterion, 3)

  for param in adv.parameters():
    param.requires_grad = True

  # Stage 2: pretrain the adversary on the frozen classifier's features.
  for param in clf.parameters():
    param.requires_grad = False

  adv = pretrain_adversary(adv, clf, optimizer_adv, train_loader, loss_criterion, 3)

  for param in clf.parameters():
    param.requires_grad = True

  # Stage 3: alternate adversary and classifier updates.
  train_accs = []
  valid_accs = []
  iterations = 20

  for iteration in range(iterations):
      print("Iteration: ", iteration)

      # Adversary update: one pass over the data with the classifier frozen.
      for param in clf.parameters():
        param.requires_grad = False

      adv = train_adversary(adv, clf, optimizer_adv, train_loader, loss_criterion, epochs=1)

      for param in clf.parameters():
        param.requires_grad = True

      # Classifier update with the adversary frozen.
      for param in adv.parameters():
        param.requires_grad = False

      clf = train_classifier(clf, optimizer_clf, adv, train_loader, loss_criterion, lbda)

      for param in adv.parameters():
        param.requires_grad = True

      # Evaluate on both splits every second iteration.
      if (iteration + 1) % 2 == 0:

        # Validation comes last, so the metric variables left bound after this
        # loop are the validation ones (recorded below on the final iteration).
        for split_title, split_name, loader, accs, trailer in (
            ('Training metrics:', "Train", train_loader, train_accs, "\n"),
            ('Validation metrics:', "Validation", val_loader, valid_accs, "\n\n\n__________________"),
        ):
          print(split_title)
          y_pred, actual_labels, protected_labels, acc_score = conduct_validation(clf, loader, adv = True)
          accs.append(acc_score)

          print("\n")
          print("Fairness Metrics on " + split_name + ":")
          non_protected_labels = np.asarray(get_unprotected_class(protected_labels))
          thres = 0.5
          female_tox_rate, nf_tox_rate, female_tp_rate, nf_tp_rate, female_fp_rate, nf_fp_rate, demo_parity, tp_parity, fp_parity, equ_odds =\
          get_fairness_metrics(actual_labels, y_pred, protected_labels, non_protected_labels, thres)

          print("Fake Prediction Rates: ", "Female -", female_tox_rate, "Non-Female - ", nf_tox_rate)
          print("True Positive Prediction Rates: ", "Female -", female_tp_rate, "Non-Female - ", nf_tp_rate)
          print("False Positive Prediction Rates: ", "Female -", female_fp_rate, "Non-Female - ", nf_fp_rate)
          print("Demographic Parity: ", demo_parity)
          print("True Positive Parity: ", tp_parity)
          print("False Positive Parity: ", fp_parity)
          print("Equalized Odds: ", equ_odds)
          print(trailer)

        if iteration == iterations -1:
          protected_fake_rates.append(female_tox_rate)
          unprotected_fake_rates.append(nf_tox_rate)
          protected_tp_rates.append(female_tp_rate)
          unprotected_tp_rates.append(nf_tp_rate)
          protected_fp_rates.append(female_fp_rate)
          unprotected_fp_rates.append(nf_fp_rate)
          demo_parity_scores.append(demo_parity)
          tp_parity_scores.append(tp_parity)
          fp_parity_scores.append(fp_parity)
          equ_odds_scores.append(equ_odds)

  lbda_train_accs.append(train_accs)
  lbda_valid_accs.append(valid_accs)


print('Finished Training')

In [None]:
# Persist the weights of the `clf` and `adv` currently bound at notebook level.
# The lambda sweep rebinds both names on every lambda, so these are the models
# trained with the LAST lambda value only.
torch.save(clf.state_dict(), "/content/drive/MyDrive/Gender Bias/saved_models/SC_Classifier_Final_ISOT")
torch.save(adv.state_dict(), "/content/drive/MyDrive/Gender Bias/saved_models/SC_Adversary_Final_ISOT")