In [None]:
import os
import re
import time
import ast
import warnings
import math
import copy
import matplotlib.pyplot as plt
from xgboost import plot_importance
import seaborn as sns

# data
import pandas as pd
import numpy as np
import csv

# ML
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn import metrics

!pip install scikit-plot
import scikitplot as skplt
import xgboost 

In [None]:
# Spreadsheet with the MBIC data (absolute path; looks like a Colab Drive mount).
PATH = "/content/drive/MyDrive/Fairness/data/MBIC.xlsx"

# Load the sheet and apply the column renames in one chained expression.
# NOTE(review): `df` is not referenced again in the visible cells; the cells
# below work on `dt` instead — confirm which frame is intended.
df = pd.read_excel(PATH).rename(
    columns={'sentence': 'sentence', 'label_bias': 'Label_bias'})
df.head()

In [None]:
# Report every column of `dt` that contains at least one missing value.
# NOTE(review): `dt` is not assigned by any earlier cell shown in this notebook
# (the load cell binds `df`) — confirm where it is built.
print('Number of missing values:')
null_counts = dt.isnull().sum()
for column_name, n_missing in null_counts.items():
    if n_missing > 0:
        print(column_name, n_missing)

# Keep only rows that have a value in `tfidf_art`.
dt = dt.dropna(subset=['tfidf_art'])

In [None]:
# Preview the first five rows of the working frame.
dt.head(n=5)

In [None]:
# Number of distinct values in the `sentence` column.
len(pd.unique(dt['sentence']))

In [None]:
# Feature matrix: every column except the target and the raw sentence text.
# `columns=` is spelled out because passing `axis` positionally to
# DataFrame.drop was deprecated in pandas 1.3 and is rejected from pandas 2.0.
x = dt.drop(columns=['label4', 'sentence'])

# Target and sentence text are kept as single-column DataFrames (double
# brackets) so they can be split alongside `x` and indexed by column name later.
y4 = dt[['label4']]
sentence = dt[['sentence']]

In [None]:
# Two successive 90/10 random splits with a fixed seed:
#   1) all rows               -> train / test
#   2) the train part of (1)  -> train1 / validation
# Features, labels and sentence text are passed together so the three outputs
# of each split stay row-aligned.
# NOTE(review): the split is row-wise; later cells group rows by `sentence`,
# so rows of one sentence may fall on both sides of a split — confirm this is
# acceptable for the sentence-level evaluation.
(train_features, test_features,
 train_labels, test_labels,
 train_sentences, test_sentences) = train_test_split(
    x, y4, sentence, test_size=0.10, random_state=42)

(train_features1, val_features,
 train_labels1, val_labels,
 train_sentences1, val_sentences) = train_test_split(
    train_features, train_labels, train_sentences,
    test_size=0.10, random_state=42)

In [None]:
# Column names of the feature matrix, as a plain list.
feature_names = [*x.columns]

In [None]:
def _as_dmatrix(features, labels):
    """Wrap one feature/label pair in an xgboost.DMatrix with the shared feature names."""
    return xgboost.DMatrix(features, label=labels, feature_names=feature_names)


# Full training part and held-out test part (first split).
dtrain = _as_dmatrix(train_features, train_labels)
dtest = _as_dmatrix(test_features, test_labels)
# Reduced training part and validation part (second split).
dtrain1 = _as_dmatrix(train_features1, train_labels1)
dval = _as_dmatrix(val_features, val_labels)

In [None]:
# Print the shape of every split, one line per object, in a fixed order.
for caption, split_part in (
    ('Training Features Shape:', train_features),
    ('Training Labels Shape:', train_labels),
    ('Testing Features Shape:', test_features),
    ('Testing Labels Shape:', test_labels),
    ('Training Features for final model Shape:', train_features1),
    ('Training Labels for final model Shape:', train_labels1),
    ('Validation Features Shape:', val_features),
    ('Validation Labels Shape:', val_labels),
):
    print(caption, split_part.shape)

##  Baselines

### B1: Random guesser

In [None]:
# Random-guess baseline: one uniformly drawn 0/1 label per test row.
# A dedicated generator seeded with 42 (the seed used by the splits) makes the
# baseline scores reproducible; the unseeded global RNG used before produced
# different numbers on every run.
b1_rng = np.random.RandomState(42)
b1_pred = pd.Series(b1_rng.randint(2, size=len(test_features)))

In [None]:
# Score the random baseline against the test labels; every score is rounded to
# two decimals and the confusion matrix is printed transposed.
print('Performance of b1, test:')
for caption, scorer in (
    ('F1:', metrics.f1_score),
    ('Precision:', metrics.precision_score),
    ('Recall:', metrics.recall_score),
    ('AUC:', metrics.roc_auc_score),
    ('Accuracy:', metrics.accuracy_score),
):
    print(caption, round(scorer(test_labels, b1_pred), 2))
print('Confusion matrix:\n',
      metrics.confusion_matrix(test_labels, b1_pred).transpose())

### B2: Negative sentiment lexicon

In [None]:
# Baseline 2 uses the `negative_conc` feature column directly as the prediction.
# assumes the column already holds 0/1 values — TODO confirm
b2_pred = test_features['negative_conc']

In [None]:
# Score baseline 2 against the test labels; every score is rounded to two
# decimals and the confusion matrix is printed transposed.
print('Performance of b2, test:')
for caption, scorer in (
    ('F1:', metrics.f1_score),
    ('Precision:', metrics.precision_score),
    ('Recall:', metrics.recall_score),
    ('AUC:', metrics.roc_auc_score),
    ('Accuracy:', metrics.accuracy_score),
):
    print(caption, round(scorer(test_labels, b2_pred), 2))
print('Confusion matrix:\n', metrics.confusion_matrix(test_labels, b2_pred).transpose())

In [None]:
# Confusion matrix of baseline 2. sklearn lays it out with rows = actual class
# and columns = predicted class, i.e. [[TN, FP], [FN, TP]].
cf_matrix = metrics.confusion_matrix(test_labels,b2_pred)
TN = cf_matrix[0][0]
FN = cf_matrix[1][0]
TP = cf_matrix[1][1]
FP = cf_matrix[0][1]
# Cell order of the plotted grid, row by row: rows = predicted, columns = actual.
fl = [TN, FN, FP, TP]
fig, ax = plt.subplots(figsize=(8,6))

sns.set(font_scale=2)
group_names = ['TN','FN','FP','TP']
group_counts = ["{0:0.0f}".format(value) for value in fl]
group_percentages = ["{0:.2%}".format(value) for value in
                     np.asarray(fl) / np.sum(cf_matrix)]
labels = [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in
          zip(group_names,group_counts,group_percentages)]
labels = np.asarray(labels).reshape(2,2)

# Plot the transposed matrix so that the cell colours agree with the
# annotations and with the axis titles (y = predicted, x = actual). Plotting
# `cf_matrix` itself coloured the FP cell with the FN annotation and vice versa.
sns.heatmap(cf_matrix.transpose(), annot=labels, fmt='', cmap="BuPu", ax=ax)
ax.set_ylabel('Predicted', fontsize=20)
ax.set_xlabel('Actual', fontsize=20)
# Pin the y-range to the two rows, then flip it so row 0 is drawn at the top.
ax.set_ylim([0,2])
ax.invert_yaxis()

In [None]:
# ROC curve of baseline 2 with a dashed reference line.
plt.rcParams['font.size'] = 16
roc_fig, roc_ax = plt.subplots(1, 1, figsize=(5,5))

# Reference curve: a constant score of 0 for every test row.
rand_probs = [0] * len(test_labels)
rand_fpr, rand_tpr, thresholds = metrics.roc_curve(test_labels, rand_probs)

# Curve of the baseline itself, with class 1 as the positive label.
fpr, tpr, thresholds = metrics.roc_curve(test_labels, b2_pred,
                                         pos_label=1)

roc_ax.plot(fpr, tpr, linewidth=3, color='purple')
roc_ax.plot(rand_fpr, rand_tpr, linestyle='--', linewidth=3, color='lightblue')
roc_ax.set_title("ROC Curve", fontsize=20)
roc_ax.set_xlabel("FP Rate", fontsize=20)
roc_ax.set_ylabel("TP Rate", fontsize=20)
plt.show()

### B3: Negative and positive sentiment lexicon

In [None]:
# Baseline 3 predicts 1 when either lexicon flag equals 1, otherwise 0.
has_negative = test_features['negative_conc'] == 1
has_positive = test_features['positive_conc'] == 1
b3_pred = (has_negative | has_positive).astype(int)

In [None]:
# Score baseline 3 against the test labels; every score is rounded to two
# decimals and the confusion matrix is printed transposed.
print('Performance of b3, test:')
for caption, scorer in (
    ('F1:', metrics.f1_score),
    ('Precision:', metrics.precision_score),
    ('Recall:', metrics.recall_score),
    ('AUC:', metrics.roc_auc_score),
    ('Accuracy:', metrics.accuracy_score),
):
    print(caption, round(scorer(test_labels, b3_pred), 2))
print('Confusion matrix:\n',
      metrics.confusion_matrix(test_labels, b3_pred).transpose())

### B4: Semi-automated bias lexicon

In [None]:
# Baseline 4 uses the `bias_lexicon` feature column directly as the prediction.
# assumes the column already holds 0/1 values — TODO confirm
b4_pred = test_features['bias_lexicon']

In [None]:
# Score baseline 4 against the test labels; every score is rounded to two
# decimals and the confusion matrix is printed transposed.
print('Performance of b4, test:')
for caption, scorer in (
    ('F1:', metrics.f1_score),
    ('Precision:', metrics.precision_score),
    ('Recall:', metrics.recall_score),
    ('AUC:', metrics.roc_auc_score),
    ('Accuracy:', metrics.accuracy_score),
):
    print(caption, round(scorer(test_labels, b4_pred), 2))
print('Confusion matrix:\n',
      metrics.confusion_matrix(test_labels, b4_pred).transpose())

## 3 XGBoost optimization

In [None]:
# Share of rows per target class, as a percentage rounded to whole numbers.
n_rows_total = len(y4)
for caption, class_value in (('Biased words:', 1), ('Neutral words:', 0)):
    n_rows_in_class = len(y4[y4['label4'] == class_value])
    print(caption, round(n_rows_in_class / n_rows_total * 100, 0), '%')

In [None]:
def f1_eval(predt: np.ndarray, dtrain: xgboost.DMatrix):
    """Custom evaluation metric for xgboost: F1 at a 0.5 cut-off.

    Args:
        predt: Predictions handed in by xgboost for the rows of ``dtrain``.
        dtrain: Matrix whose labels are the ground truth.

    Returns:
        The pair ``("F1_score", value)``.

    NOTE(review): the 0.5 cut-off presumes ``predt`` holds probabilities;
    depending on the xgboost version, ``feval`` may receive raw margins
    instead — confirm against the installed version.
    """
    true_labels = dtrain.get_label()
    hard_predictions = (predt > 0.5).astype(int)
    score = metrics.f1_score(y_true=true_labels, y_pred=hard_predictions)
    return "F1_score", score

### 3.1 Hyper-parameters tuning

In [None]:
# Ratio of class-0 rows to class-1 rows in the training labels.
n_class0 = len(train_labels[train_labels['label4'] == 0])
n_class1 = len(train_labels[train_labels['label4'] == 1])
scale_pos_weight = n_class0 / n_class1
scale_pos_weight

In [None]:
# Starting configuration for the tuning cells below; the search loops
# overwrite max_depth, min_child_weight, subsample, colsample_bytree and eta.
params = dict(
    max_depth=6,
    min_child_weight=1,
    eta=.3,
    subsample=1,
    colsample_bytree=1,
    objective="binary:logistic",
    # The built-in metric is switched off; the cells below pass f1_eval instead.
    disable_default_eval_metric=1,
    seed=42,
    tree_method='hist',
    scale_pos_weight=scale_pos_weight,
)

# Upper bound on boosting rounds (the training calls also use early stopping).
num_boost_round = 999

In [None]:
# NOTE(review): this re-defines f1_eval with the same logic as the definition
# earlier in this section; one of the two cells can be dropped.
def f1_eval(predt: np.ndarray, dtrain: xgboost.DMatrix):
    """Custom evaluation metric for xgboost: F1 at a 0.5 cut-off.

    Args:
        predt: Predictions handed in by xgboost for the rows of ``dtrain``.
        dtrain: Matrix whose labels are the ground truth.

    Returns:
        The pair ``("F1_score", value)``.
    """
    hard_predictions = (predt > 0.5).astype(int)
    return "F1_score", metrics.f1_score(y_true=dtrain.get_label(),
                                        y_pred=hard_predictions)

In [None]:
# Candidate (max_depth, min_child_weight) pairs: depth 6..12 in steps of 1,
# child weight 4..40 in steps of 2, depth varying slowest.
depth_grid = range(6, 13)
child_weight_grid = range(4, 41, 2)
gridsearch_params = [
    (depth, child_weight)
    for depth in depth_grid
    for child_weight in child_weight_grid
]

In [None]:
# Grid search over (max_depth, min_child_weight) with 5-fold CV, scored by the
# custom F1 metric. `params` is updated in place for every candidate; the best
# pair is kept in `best_params` and applied by the next cell.
start_time = time.time()
# Start below any attainable F1 so the first candidate is always recorded.
# With the previous start value of 0, a run in which every candidate scored
# exactly 0 left `best_params` as None and the summary print below failed.
max_f1 = float('-inf')
best_params = None
for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                             min_child_weight))

    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight

    cv_results = xgboost.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        feval=f1_eval,
        early_stopping_rounds = 10,
        maximize=True)

    f1_by_round = cv_results['test-F1_score-mean']
    mean_f1 = f1_by_round.max()
    # argmax() is a 0-based row position, and row i holds the score after
    # i + 1 boosting rounds, so shift by one to report a round count.
    boost_rounds = f1_by_round.argmax() + 1
    print("\tF1 {} for {} rounds".format(mean_f1, boost_rounds))
    if mean_f1 > max_f1:
        max_f1 = mean_f1
        best_params = (max_depth,min_child_weight)
print("Best params: {}, {}, F1: {}".format(best_params[0], best_params[1], max_f1))
end_time = time.time()
print('time:', end_time - start_time)

In [None]:
# Write the winning (max_depth, min_child_weight) pair back into `params`.
params.update(max_depth=best_params[0], min_child_weight=best_params[1])

In [None]:
# Candidate (subsample, colsample) pairs: subsample 0.6..1.0 and colsample
# 0.4..1.0, both in steps of 0.2, subsample varying slowest.
subsample_grid = [tenths / 10. for tenths in range(6, 11, 2)]
colsample_grid = [tenths / 10. for tenths in range(4, 11, 2)]
gridsearch_params = [
    (row_share, col_share)
    for row_share in subsample_grid
    for col_share in colsample_grid
]

In [None]:
# Grid search over (subsample, colsample_bytree) with 5-fold CV, scored by the
# custom F1 metric. Candidates are visited in reverse grid order. `params` is
# updated in place; the best pair is applied by the next cell.
start_time = time.time()
# Start below any attainable F1 so the first candidate is always recorded.
# With the previous start value of 0, a run in which every candidate scored
# exactly 0 left `best_params` as None and the summary print below failed.
max_f1 = float('-inf')
best_params = None

for subsample, colsample in reversed(gridsearch_params):
    print("CV with subsample={}, colsample={}".format(
                             subsample,
                             colsample))

    params['subsample'] = subsample
    params['colsample_bytree'] = colsample

    cv_results = xgboost.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        feval=f1_eval,
        early_stopping_rounds = 10,
        maximize=True)

    f1_by_round = cv_results['test-F1_score-mean']
    mean_f1 = f1_by_round.max()
    # argmax() is a 0-based row position, and row i holds the score after
    # i + 1 boosting rounds, so shift by one to report a round count.
    boost_rounds = f1_by_round.argmax() + 1
    print("\tF1 {} for {} rounds".format(mean_f1, boost_rounds))
    if mean_f1 > max_f1:
        max_f1 = mean_f1
        best_params = (subsample,colsample)
print("Best params: {}, {}, F1: {}".format(best_params[0], best_params[1], max_f1))
end_time = time.time()
print('time:', end_time - start_time)

In [None]:
# Write the winning (subsample, colsample_bytree) pair back into `params`.
params.update(subsample=best_params[0], colsample_bytree=best_params[1])

In [None]:
# Search over the learning rate with 5-fold CV, scored by the custom F1
# metric. `params` is updated in place; the best eta is applied by the next cell.
start_time = time.time()
# Start below any attainable F1 so the first candidate is always recorded.
# With the previous start value of 0, a run in which every candidate scored
# exactly 0 left `best_params` as None, which the next cell would then have
# written into params['eta'].
max_f1 = float('-inf')
best_params = None

for eta in [0.3, 0.2, 0.1, 0.01, 0.005]:
    print("CV with eta={}".format(eta))

    params['eta'] = eta

    cv_results = xgboost.cv(
            params,
            dtrain,
            num_boost_round=num_boost_round,
            seed=42,
            nfold=5,
            feval=f1_eval,
            early_stopping_rounds = 10,
            maximize=True)

    f1_by_round = cv_results['test-F1_score-mean']
    mean_f1 = f1_by_round.max()
    # argmax() is a 0-based row position, and row i holds the score after
    # i + 1 boosting rounds, so shift by one to report a round count.
    boost_rounds = f1_by_round.argmax() + 1
    print("\tF1 {} for {} rounds\n".format(mean_f1, boost_rounds))
    if mean_f1 > max_f1:
        max_f1 = mean_f1
        best_params = eta
print("Best params: {}, F1: {}".format(best_params, max_f1))
end_time = time.time()
print('time:', end_time - start_time)

In [None]:
# Write the winning learning rate back into `params`.
params.update(eta=best_params)

### 3.2 Final model training and evaluation

In [None]:
# Fixed configuration for the final model. This rebinds `params`, replacing
# whatever values the tuning cells above left in it.
params = dict(
    max_depth=10,
    min_child_weight=20,
    eta=0.3,
    subsample=1.0,
    colsample_bytree=1.0,
    objective='binary:logistic',
    disable_default_eval_metric=1,
    seed=42,
    tree_method='hist',
    scale_pos_weight=scale_pos_weight,
)
num_boost_round = 999

In [None]:
# Train on the reduced training part and monitor both it and the validation
# part. Listed last, `dval` is the set xgboost uses for early stopping
# (10 rounds without improvement, metric maximised).
watchlist = [(dtrain1, 'dtrain'), (dval, 'dval')]
xgboost_tuned_weighted = xgboost.train(
    params,
    dtrain1,
    num_boost_round=num_boost_round,
    evals=watchlist,
    feval=f1_eval,
    early_stopping_rounds=10,
    maximize=True,
)

In [None]:
# Evaluate the final model on the held-out test part.
# Keep the raw model output (the objective is binary:logistic) next to the
# 0/1 labels obtained with a 0.5 cut-off.
xgboost_tuned_weighted_proba = xgboost_tuned_weighted.predict(dtest)
xgboost_tuned_weighted_predt = np.where(xgboost_tuned_weighted_proba > 0.5, 1, 0)
test_true = dtest.get_label()

print('Performance of xgboost_tuned_weighted, test:')
print('F1:', round(metrics.f1_score(test_true, xgboost_tuned_weighted_predt),2))
print('Precision:', round(metrics.precision_score(test_true, xgboost_tuned_weighted_predt),2))
print('Recall:', round(metrics.recall_score(test_true, xgboost_tuned_weighted_predt),2))
# ROC AUC is a ranking metric and needs the continuous scores. Feeding it the
# thresholded labels, as before, collapses the curve to a single operating point.
print('AUC:', round(metrics.roc_auc_score(test_true, xgboost_tuned_weighted_proba),2))
print('Accuracy:', round(metrics.accuracy_score(test_true, xgboost_tuned_weighted_predt),2))
print('Confusion matrix:\n',
      metrics.confusion_matrix(test_true, xgboost_tuned_weighted_predt).transpose())

In [None]:
# Attach the 0/1 predictions to the test sentences (same row order as dtest).
# `assign` builds a new frame instead of writing a column into a frame that
# train_test_split derived from `sentence`, which avoids pandas'
# SettingWithCopyWarning and keeps the cell safe to re-run.
test_sentences = test_sentences.assign(pred=xgboost_tuned_weighted_predt)

In [None]:
# Number of distinct sentences represented in the test part.
len(pd.unique(test_sentences['sentence']))

In [None]:
# Collapse the row-level predictions to one row per sentence by summing `pred`.
test_sentences_hat = test_sentences.groupby('sentence', as_index=False)['pred'].sum()
print('Number of predicted biased words per sentence:\n',test_sentences_hat['pred'].value_counts())

# Rule: if one or more biased words are in a sentence, the sentence is labelled as biased.
test_sentences_hat['Label_bias_hat'] = (test_sentences_hat['pred'] > 0).astype(int)
print('Prediction on the sentence level:\n', test_sentences_hat['Label_bias_hat'].value_counts())

In [None]:
# Sentence-level gold labels.
PATH_sg2 = "data/final_labels_SG2.xlsx"
ground_truth_raw = pd.read_excel(PATH_sg2).rename(
    columns={'text': 'sentence', 'label_bias': 'Label_bias'})

# Drop sentences without annotator agreement; `.copy()` makes the result an
# independent frame so the column assignment below is not a chained assignment.
ground_truth_labels = ground_truth_raw[
    ground_truth_raw['Label_bias'] != 'No agreement'].copy()

# Encode the label column only. The previous frame-wide
# `replace(..., inplace=True)` rewrote matching strings in every column, ran on
# a filtered slice, and relied on pandas silently downcasting the result.
# Any value other than the two listed maps to NaN and is removed by the
# not-null filter after the merge.
ground_truth_labels['Label_bias'] = ground_truth_labels['Label_bias'].map(
    {'Biased': 1, 'Non-biased': 0})

# Attach gold labels to the predicted sentences and keep only labelled ones.
sentences_ground_truth = test_sentences_hat.merge(
    right=ground_truth_labels, how='left', on='sentence')
sentences_ground_truth = sentences_ground_truth[
    sentences_ground_truth['Label_bias'].notna()]
sentences_ground_truth

In [None]:
# Sentence-level scores: gold label vs. the label derived from word predictions.
# Every score is rounded to two decimals; the confusion matrix is printed transposed.
gold_sentence_labels = sentences_ground_truth['Label_bias']
predicted_sentence_labels = sentences_ground_truth['Label_bias_hat']

print('Performance of xgboost_tuned_weighted, test:')
for caption, scorer in (
    ('F1:', metrics.f1_score),
    ('Precision:', metrics.precision_score),
    ('Recall:', metrics.recall_score),
    ('AUC:', metrics.roc_auc_score),
    ('Accuracy:', metrics.accuracy_score),
):
    print(caption, round(scorer(gold_sentence_labels, predicted_sentence_labels), 2))
print('Confusion matrix:\n',
      metrics.confusion_matrix(gold_sentence_labels, predicted_sentence_labels).transpose())

In [None]:
from sklearn.model_selection import StratifiedKFold

# 5-fold stratified CV of the final configuration, scored on the sentence level.
# NOTE: the loop rebinds train_features, val_features, train_labels, val_labels,
# val_sentences, dtrain, dval and xgboost_tuned_weighted from earlier cells.
# NOTE(review): folds are drawn over rows of `x`; rows of one sentence may end
# up in both the training and the validation part of a fold — confirm intended.

# The gold labels are identical for every fold, so the spreadsheet is read and
# cleaned once here instead of once per fold.
PATH_sg2 = "data/final_labels_SG2.xlsx"
sentences_ground_truth = pd.read_excel(PATH_sg2).rename(
    columns={'text': 'sentence', 'label_bias': 'Label_bias'})
# `.copy()` makes the filtered result an independent frame, so the column
# assignment below is not a chained assignment.
sentences_ground_truth = sentences_ground_truth[
    sentences_ground_truth['Label_bias'] != 'No agreement'].copy()
# Encode the label column only, rather than replacing strings frame-wide.
# Any other value (including the 99 code that used to be filtered out after the
# merge) maps to NaN and is dropped by the not-null filter inside the loop.
sentences_ground_truth['Label_bias'] = sentences_ground_truth['Label_bias'].map(
    {'Biased': 1, 'Non-biased': 0})

skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
k = 1  # fold counter; incremented per fold but not read in this cell
val_acc = []
val_f1 = []
val_f1_micro = []
val_f1_wmacro = []
val_recall = []
val_precision = []

for train_index, val_index in skfold.split(x,y4):

    train_features = x.iloc[train_index]
    val_features = x.iloc[val_index]
    train_labels = y4.iloc[train_index]
    val_labels = y4.iloc[val_index]
    val_sentences = sentence.iloc[val_index]

    dtrain = xgboost.DMatrix(train_features, label=train_labels, feature_names=feature_names)
    dval = xgboost.DMatrix(val_features, label=val_labels, feature_names=feature_names)

    xgboost_tuned_weighted = xgboost.train(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        evals=[(dtrain, 'dtrain'), (dval, 'dval')],
        feval=f1_eval,
        early_stopping_rounds = 10,
        maximize=True)

    xgboost_tuned_weighted_predt = np.where(xgboost_tuned_weighted.predict(dval) > 0.5, 1, 0)

    # `assign` returns a new frame; writing a column into the iloc slice
    # directly triggers pandas' SettingWithCopyWarning.
    val_sentences = val_sentences.assign(pred=xgboost_tuned_weighted_predt)
    # One row per sentence; a sentence is predicted biased when at least one
    # of its rows was predicted 1.
    val_sentences_hat = val_sentences.groupby('sentence', as_index=False).agg({'pred': 'sum'})
    val_sentences_hat['Label_bias_hat'] = val_sentences_hat['pred'].apply(lambda x: 1 if x > 0 else 0)

    # Attach gold labels and keep only sentences that have one.
    sentences_merged = val_sentences_hat.merge(right=sentences_ground_truth, how='left', on='sentence')
    sentences_merged = sentences_merged[sentences_merged['Label_bias'].notna()]

    fold_true = sentences_merged['Label_bias']
    fold_pred = sentences_merged['Label_bias_hat']
    val_f1.append(metrics.f1_score(fold_true, fold_pred))
    val_f1_micro.append(metrics.f1_score(fold_true, fold_pred, average='micro'))
    val_f1_wmacro.append(metrics.f1_score(fold_true, fold_pred, average='weighted'))
    val_precision.append(metrics.precision_score(fold_true, fold_pred))
    val_recall.append(metrics.recall_score(fold_true, fold_pred))
    val_acc.append(metrics.accuracy_score(fold_true, fold_pred))

    k += 1

In [None]:
# Mean of each per-fold score list collected by the cross-validation loop.
for template, fold_scores in (
    ('5-Fold CV Accuracy: {}', val_acc),
    ('5-Fold CV Precision: {}', val_precision),
    ('5-Fold CV Recall: {}', val_recall),
    ('5-Fold CV F1 Score: {}', val_f1),
    ('5-Fold CV Micro F1 Score: {}', val_f1_micro),
    ('5-Fold CV Weighted Macro F1 Score: {}', val_f1_wmacro),
):
    print(template.format(np.mean(fold_scores)))