#Using BERT architecture to measure ESG qualities of a tweet.

### Importing Libraries

In [0]:
!git clone https://github.com/google-research/bert.git
import tensorflow as tf
import pandas as pd
import tensorflow_hub as hub
import os 
import re
import numpy as np
from bert.tokenization import FullTokenizer
from tqdm import tqdm_notebook
from tensorflow.keras import backend as K
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import regularizers
from keras.regularizers import l2
from keras.layers import Lambda
import math
import io
from matplotlib import pyplot
import copy

# One TF session, reused by the tokenizer setup and by the variable
# initialization for the Keras model further down.
sess = tf.Session() #initializes a session

# Parameters for the model and the tokenizer.
bert_path = 'https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1'  # TF-Hub module handle
max_seq_length = 256  # length every tokenized example is truncated / zero-padded to
n_classes = 13  # number of target columns per tweet

###Import Data from Local Drive

In [0]:
# The data set has 13 separate targets per tweet.
from google.colab import files
tweets = files.upload()  # indexed by file name when the CSV is loaded below



###Load Data
(This will get rid of neutral polarities.)

(Do not use this one for now; it doesn't work with multiple targets.)

In [0]:
import io
import random
# Load the uploaded CSV with pandas.
df = pd.read_csv(io.BytesIO(tweets['Twitter_Data.csv']))
typeList = ['Business Ethics - S','Business Ethics - G', 'Anti-Competitive Practice', 'Corruption & Instability',
           'Privacy & Data Security', 'Discrimination','Toxic Emissions & Waste',  'Health & Demographic Risk',
           'Supply Chain Labour Standards or Labour Management', 'Carbon Emissions', 'Product Quality & Safety',
           'Polarity', 'Related To Company or Not'] #Thirteen classes

df = df.drop(columns = {'Student','company','ESG','keyword','keyword mapping'})
# Each row becomes a plain list: index 0 is the tweet text and indices
# 1..n_classes are the targets (controversy flags first, then polarity as the
# second-to-last entry and "related to company" as the last one).
df = np.array(df).tolist()

train_data = []
test_data = []
train_text = []
train_label = []
test_text = []
test_label = []

for twt in df:
  # notempty records whether the tweet has at least one non-missing target.
  # It is not used for filtering yet (see the note on the split below).
  notempty = False
  for i in range(1, n_classes + 1):
    if math.isnan(twt[i]):
      twt[i] = 0  # a missing target counts as "not flagged"
    else:
      notempty = True
  # Rescale the polarity column; presumably it is coded -1/0/1 in the CSV, which
  # this maps to 0/0.5/1 -- TODO confirm against the data.
  twt[12] = (twt[12] + 1) / 2

  # Random ~80/20 train/test split, one draw per tweet.
  if random.random() < 0.8:
    train_data.append(twt)
  else:       #add notempty==True to remove empty ones
    test_data.append(twt)

# Skip rows without text. pandas represents a missing cell as NaN rather than
# None, so the check uses pd.isna (which covers both); comparing against None
# alone would let "nan" strings through to the tokenizer.
for tweet in train_data:
  if not pd.isna(tweet[0]):
    train_text.append(tweet[0])
    # NOTE: training labels are wrapped in an extra list; listtypes() indexes
    # them as label[0][k], so the nesting is kept as-is.
    train_label.append([tweet[1:n_classes + 1]])

for tweet in test_data:
  if not pd.isna(tweet[0]):
    test_text.append(tweet[0])
    test_label.append(tweet[1:n_classes + 1])

###Get Info About the Data Set
(This function counts the number of tweets for each controversy)

In [0]:
def listtypes(train_label,test_label):
  """Count and print how many tweets carry each target, for both splits.

  Args:
    train_label: training labels; each entry wraps its label vector in an
      extra list, so the vector is read as entry[0].
    test_label: test labels; each entry is the label vector itself.

  Returns:
    (traincount, testcount). Each is a list of n_classes + 2 integers: slot k
    (k < n_classes) counts rows whose k-th label equals 1, slot n_classes
    counts rows whose polarity label equals 0 (negative) and slot
    n_classes + 1 counts rows whose polarity label equals 0.5 (neutral).
  """
  polarity_idx = n_classes - 2   # second-to-last label
  related_idx = n_classes - 1    # last label
  negative_slot = n_classes
  neutral_slot = n_classes + 1

  def count(rows):
    # Tally one split; rows must be flat label vectors.
    counts = [0] * (n_classes + 2)
    for row in rows:
      for k in range(n_classes):
        if row[k] == 1:
          counts[k] += 1
      if row[polarity_idx] == 0:
        counts[negative_slot] += 1
      elif row[polarity_idx] == 0.5:
        counts[neutral_slot] += 1
    return counts

  def report(header, counts):
    # Print the same summary for either split so that both list every
    # controversy type (the test report used to stop one type short).
    print(header)
    for i in range(polarity_idx):
      print("For controversy type: " + typeList[i] + ' There are ' + str(counts[i]) + ' tweets.')
    print("Number of tweets with positive polarity: " + str(counts[polarity_idx]))
    print('Number of tweets with negative polarity: ' + str(counts[negative_slot]))
    print('Number of tweets with neutral polarity: ' + str(counts[neutral_slot]))
    print('Number of tweets that are related to company: ' + str(counts[related_idx]))

  traincount = count(entry[0] for entry in train_label)
  testcount = count(test_label)

  report("For Training Data:", traincount)
  report("\nFor Testing Data:", testcount)
  return traincount,testcount

# Print the per-target counts for both splits; the returned counts are discarded.
_ = listtypes(train_label,test_label)

###Tokenizing Data

In [0]:
#Seperate the data into words

class PaddingInputExample(object):
    """Fake example so the num input examples is a multiple of the batch size.

    When running eval/predict on the TPU, we need to pad the number of examples
    to be a multiple of the batch size, because the TPU requires a fixed batch
    size. The alternative is to drop the last batch, which is bad because it means
    the entire output data won't be generated.

    We use this class instead of `None` because treating `None` as padding
    batches could cause silent errors.
    """

class InputExample(object):
    """Container for one training/test example of a sequence classification task."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Store the fields of a single example.

        Args:
          guid: Unique id for the example.
          text_a: The untokenized text of the first sequence. This is the only
            text needed for single-sequence tasks.
          text_b: (Optional) The untokenized text of the second sequence; only
            needed for sequence-pair tasks.
          label: (Optional) The label of the example. Expected for train and
            dev examples, not for test examples.
        """
        self.guid, self.text_a = guid, text_a
        self.text_b, self.label = text_b, label

def create_tokenizer_from_hub_module():
    """Build a FullTokenizer from the vocab file and casing flag exposed by the Hub module."""
    module = hub.Module(bert_path)
    info = module(signature="tokenization_info", as_dict=True)

    # Both values are graph tensors; evaluate them together in the shared session.
    fetches = [info["vocab_file"], info["do_lower_case"]]
    vocab_path, lower_case = sess.run(fetches)

    return FullTokenizer(vocab_file=vocab_path, do_lower_case=lower_case)

def convert_single_example(tokenizer, example, max_seq_length=256):
    """Turn one example into fixed-length BERT inputs.

    Args:
      tokenizer: object providing `tokenize` and `convert_tokens_to_ids`.
      example: an `InputExample`, or a `PaddingInputExample` placeholder.
      max_seq_length: length of every returned sequence.

    Returns:
      (input_ids, input_mask, segment_ids, label). A padding placeholder yields
      all-zero sequences and the label 0.
    """
    if isinstance(example, PaddingInputExample):
        return (
            [0] * max_seq_length,
            [0] * max_seq_length,
            [0] * max_seq_length,
            0,
        )

    # Two positions are reserved for the [CLS] and [SEP] markers.
    room = max_seq_length - 2
    word_pieces = tokenizer.tokenize(example.text_a)
    if len(word_pieces) > room:
        word_pieces = word_pieces[:room]

    tokens = ["[CLS]", *word_pieces, "[SEP]"]
    # Single-sequence input: every token belongs to segment 0.
    segment_ids = [0] * len(tokens)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # 1 marks a real token, 0 marks padding; only real tokens are attended to.
    input_mask = [1] * len(input_ids)

    # Zero-pad all three sequences up to max_seq_length.
    missing = max_seq_length - len(input_ids)
    if missing > 0:
        filler = [0] * missing
        input_ids.extend(filler)
        input_mask.extend(filler)
        segment_ids.extend(filler)

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length

    return input_ids, input_mask, segment_ids, example.label

def convert_examples_to_features(tokenizer, examples, max_seq_length=256):
    """Convert examples into four arrays: input ids, input masks, segment ids, labels.

    The labels array is reshaped to (-1, n_classes).
    """
    rows = [
        convert_single_example(tokenizer, example, max_seq_length)
        for example in tqdm_notebook(examples, desc="Converting examples to features")
    ]

    ids = [input_id for input_id, _, _, _ in rows]
    masks = [input_mask for _, input_mask, _, _ in rows]
    segments = [segment_id for _, _, segment_id, _ in rows]
    labels = [label for _, _, _, label in rows]

    return (
        np.array(ids),
        np.array(masks),
        np.array(segments),
        np.array(labels).reshape(-1, n_classes),
    )

def convert_text_to_examples(texts, labels):
    """Pair each text with its label and wrap the pair in an InputExample.

    Every text is converted with str() and prefixed with a single space.
    (An earlier version used " ".join(text) instead.)
    """
    return [
        InputExample(guid=None, text_a=" " + str(text), text_b=None, label=label)
        for text, label in zip(texts, labels)
    ]

# Instantiate the tokenizer from the vocab file / casing flag of the Hub module.
tokenizer = create_tokenizer_from_hub_module()

# Wrap each (text, label) pair of both splits in an InputExample.
train_examples = convert_text_to_examples(train_text, train_label)
test_examples = convert_text_to_examples(test_text, test_label)

# Convert the examples to arrays of ids / masks / segment ids / labels,
# each sequence truncated or zero-padded to max_seq_length.
(train_input_ids, train_input_masks, train_segment_ids, train_labels 
) = convert_examples_to_features(tokenizer, train_examples, max_seq_length=max_seq_length)
(test_input_ids, test_input_masks, test_segment_ids, test_labels
) = convert_examples_to_features(tokenizer, test_examples, max_seq_length=max_seq_length)

###Defining the BERT class

In [0]:
class BertLayer(tf.keras.layers.Layer):
    """Keras layer that wraps a TF-Hub BERT module and returns one pooled vector per example.

    Pooling modes:
      "first": returns the module's "pooled_output".
      "mean":  returns the mask-weighted mean of the module's "sequence_output".
    """

    def __init__(
        self,
        n_fine_tune_layers=10,
        pooling="mean",
        bert_path="https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1",
        **kwargs,
    ):
        """Store the configuration and validate the pooling mode.

        Args:
          n_fine_tune_layers: how many encoder layers, counted downwards from
            "encoder/layer_11", have their variables marked trainable in build().
          pooling: "first" or "mean" (see the class docstring).
          bert_path: TF-Hub module handle to load.
          **kwargs: forwarded to the Keras Layer constructor.

        Raises:
          NameError: if pooling is neither "first" nor "mean".
        """
        self.n_fine_tune_layers = n_fine_tune_layers
        self.trainable = True
        self.output_size = 768  # reported as the last dimension by compute_output_shape()
        self.pooling = pooling
        self.bert_path = bert_path
        if self.pooling not in ["first", "mean"]:
            # NOTE(review): the message below is missing its closing parenthesis.
            raise NameError(
                f"Undefined pooling type (must be either first or mean, but is {self.pooling}"
            )

        super(BertLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        """Load the Hub module and register its variables as trainable / non-trainable weights."""
        self.bert = hub.Module(
            self.bert_path, trainable=self.trainable, name=f"{self.name}_module"
        )

        # Remove unused layers
        # Variables whose name contains "/cls/" are never trained; in "mean"
        # mode the "/pooler/" variables are excluded as well.
        trainable_vars = self.bert.variables
        if self.pooling == "first":
            trainable_vars = [var for var in trainable_vars if not "/cls/" in var.name]
            trainable_layers = ["pooler/dense"]

        elif self.pooling == "mean":
            trainable_vars = [
                var
                for var in trainable_vars
                if not "/cls/" in var.name and not "/pooler/" in var.name
            ]
            trainable_layers = []
        else:
            raise NameError(
                f"Undefined pooling type (must be either first or mean, but is {self.pooling}"
            )

        # Select how many layers to fine tune
        # Names are generated as encoder/layer_11, encoder/layer_10, ...
        for i in range(self.n_fine_tune_layers):
            trainable_layers.append(f"encoder/layer_{str(11 - i)}")

        # Update trainable vars to contain only the specified layers
        # (substring match of the layer name against the variable name).
        trainable_vars = [
            var
            for var in trainable_vars
            if any([l in var.name for l in trainable_layers])
        ]

        # Add to trainable weights
        for var in trainable_vars:
            self._trainable_weights.append(var)

        # Every other module variable is registered as non-trainable.
        for var in self.bert.variables:
            if var not in self._trainable_weights:
                self._non_trainable_weights.append(var)

        super(BertLayer, self).build(input_shape)

    def call(self, inputs):
        """Run BERT on [input_ids, input_mask, segment_ids] and return the pooled output."""
        # All three inputs are cast to int32 before being handed to the module.
        inputs = [K.cast(x, dtype="int32") for x in inputs]
        input_ids, input_mask, segment_ids = inputs
        bert_inputs = dict(
            input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids
        )
        if self.pooling == "first":
            pooled = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True)[
                "pooled_output"
            ]
        elif self.pooling == "mean":
            result = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True)[
                "sequence_output"
            ]

            # Zero out the padded positions, sum over axis 1 and divide by the
            # number of unmasked positions; 1e-10 guards against an all-zero mask.
            mul_mask = lambda x, m: x * tf.expand_dims(m, axis=-1)
            masked_reduce_mean = lambda x, m: tf.reduce_sum(mul_mask(x, m), axis=1) / (
                    tf.reduce_sum(m, axis=1, keepdims=True) + 1e-10)
            input_mask = tf.cast(input_mask, tf.float32)
            pooled = masked_reduce_mean(result, input_mask)
        else:
            raise NameError(f"Undefined pooling type (must be either first or mean, but is {self.pooling}")

        return pooled

    def compute_output_shape(self, input_shape):
        """Return (input_shape[0], output_size)."""
        return (input_shape[0], self.output_size)

###Building the model

In [0]:
# Build model
def build_model(max_seq_length): 
    """Assemble, compile and summarize the classifier: BERT -> Dense(256) -> Dense(64) -> sigmoid.

    Args:
      max_seq_length: length of each of the three input sequences.

    Returns:
      The compiled Keras model with one sigmoid output per class.
    """
    layers = tf.keras.layers

    # The three BERT inputs share the same shape and differ only by name.
    input_names = ("input_ids", "input_masks", "segment_ids")
    bert_inputs = [
        layers.Input(shape=(max_seq_length,), name=input_name)
        for input_name in input_names
    ]

    features = BertLayer(n_fine_tune_layers=3, pooling="first")(bert_inputs)
    for width in (256, 64):
        features = layers.Dense(width, activation='relu')(features)
    scores = layers.Dense(n_classes, activation='sigmoid')(features)

    model = tf.keras.models.Model(inputs=bert_inputs, outputs=scores)
    # The original author marked this compile step with "Fix this".
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()

    return model

def initialize_vars(sess):
    """Run the local-variable, global-variable and table initializers in `sess`, then hand `sess` to Keras."""
    initializer_factories = (
        tf.local_variables_initializer,
        tf.global_variables_initializer,
        tf.tables_initializer,
    )
    # Each initializer op is created right before it is run, in this order.
    for make_initializer in initializer_factories:
        sess.run(make_initializer())
    K.set_session(sess)
  

In [0]:
# Build and compile the model.
model = build_model(max_seq_length)

# Run the TF initializers in the shared session and register it with Keras.
initialize_vars(sess)

#Add in early stopping
es = EarlyStopping(monitor = 'val_loss')  # monitors validation loss; all other settings are left at their defaults
cb_list=[es]

# Train for up to 5 epochs; the held-out split doubles as validation data.
ESGmodelhistory = model.fit(
  [train_input_ids, train_input_masks, train_segment_ids],
  train_labels,
  validation_data = ([test_input_ids, test_input_masks, test_segment_ids], test_labels),
  epochs = 5,
  batch_size = 32,
  callbacks = cb_list
)

# Plot training loss against validation loss per epoch.
pyplot.plot(ESGmodelhistory.history['loss'], label='train')
pyplot.plot(ESGmodelhistory.history['val_loss'], label='test')
pyplot.legend()
pyplot.show()

###Predict and Evaluate



In [0]:
# Per-class sigmoid outputs of the model for every example in the test split.
pred = model.predict([test_input_ids, test_input_masks, test_segment_ids])

####Find Precision and Recall for a Single Threshold

In [0]:
# Per-class decision thresholds: 0.1 everywhere except the polarity label.
thresholds = np.full(n_classes, 0.1)
thresholds[n_classes-2] = 0.33 #n_classes-2 is the polarity label

# Binarize into a separate array. `pred` keeps the raw scores, because the
# ROC / PR cells sweep their own thresholds over `pred` and would otherwise
# only ever see 0/1 values.
binary_pred = (np.asarray(pred) > thresholds).astype(int)

# measures[i] holds [tp,fp,tn,fn] for class i.
measures = [[0,0,0,0] for _ in range(n_classes)]

for prediction,label in zip(binary_pred,test_label):
  for i in range(n_classes):
    # Labels other than exactly 0 or 1 (e.g. a 0.5 polarity) fall in no bucket.
    if prediction[i] == 1 and label[i] == 1:
      measures[i][0] += 1

    if prediction[i] == 1 and label[i] == 0:
      measures[i][1] += 1

    if prediction[i] == 0 and label[i] == 0:
      measures[i][2] += 1

    if prediction[i] == 0 and label[i] == 1:
      measures[i][3] += 1

precisions = []
recalls = []
accuracies = []

for tp, fp, tn, fn in measures:
  # max(1, ...) turns an empty denominator into a score of 0 instead of an error.
  precisions.append(tp/max(1,(tp + fp))) #tp/(tp+fp)
  recalls.append(tp/max(1,(tp + fn)))    #tp/(tp+fn)
  accuracies.append((tp+tn)/len(pred))
print(precisions)
print(recalls)
print(accuracies)

####Create ROC curves

In [0]:
# Inspect a single score: test example 30, class index 4.
print(pred[30][4])

In [0]:
def genROC(pred, labels, n, min_thresh, max_thresh, classid, n_classes):
  """Compute ROC points (fpr, tpr) for one class over n evenly spaced thresholds.

  Only computes the values; it does not display the graph.

  Args:
    pred: per-example score vectors; pred[j][classid] is the score used.
    labels: per-example label vectors aligned with pred. Only labels equal to
      exactly 0 or 1 are counted; any other value is ignored.
    n: number of thresholds to evaluate.
    min_thresh: first threshold.
    max_thresh: upper end of the sweep (exclusive; the last threshold is one
      step below it).
    classid: index of the class to evaluate.
    n_classes: total number of classes, used to validate classid.

  Returns:
    (fprlist, tprlist), one entry per threshold. A rate whose denominator is
    zero (no negatives / no positives) is reported as 0.0. Returns
    (False, False) after printing an error if classid > n_classes - 1 or n < 0.
  """
  if classid > n_classes-1 or n<0:
    print('ERROR: classid is out of range')
    return False, False

  # Only one column is thresholded, so extract it once instead of copying
  # the whole prediction matrix for every threshold. pred is left untouched.
  scores = [p[classid] for p in pred]
  truths = [label[classid] for label in labels]

  fprlist = [] #x-axis values
  tprlist = [] #y-axis values

  for step in range(0,n):
    thresh = (max_thresh - min_thresh) * float(step)/n
    thresh = thresh + min_thresh

    tp = fp = tn = fn = 0
    for score, truth in zip(scores, truths):
      positive = score > thresh
      if truth == 1:
        if positive:
          tp += 1
        else:
          fn += 1
      elif truth == 0:
        if positive:
          fp += 1
        else:
          tn += 1

    tprlist.append(tp / (tp + fn) if (tp + fn) else 0.0)  #tp/(tp+fn)
    fprlist.append(fp / (fp + tn) if (fp + tn) else 0.0)  #fp/(fp+tn)

  return fprlist,tprlist
    
# ROC points for class index 4, using 2000 thresholds between 0.0308 and 0.0309.
fpr,tpr = genROC(pred,test_label, 2000, 0.0308, 0.0309, 4, n_classes)

# Plot true positive rate against false positive rate.
pyplot.plot(fpr,tpr)
pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')
pyplot.title('ROC curve')
pyplot.show()

####Create PR curves to find AUPRC

In [0]:
def genPRC(pred, labels, n, min_thresh, max_thresh, classid, n_classes):
  """Compute precision-recall points for one class and the threshold with the best F1.

  Only computes the values; it does not display the graph.

  Args:
    pred: per-example score vectors; pred[j][classid] is the score used.
    labels: per-example label vectors aligned with pred. Only labels equal to
      exactly 0 or 1 are counted; any other value is ignored.
    n: number of thresholds to evaluate.
    min_thresh: first threshold.
    max_thresh: upper end of the sweep (exclusive).
    classid: index of the class to evaluate.
    n_classes: total number of classes, used to validate classid.

  Returns:
    (recalls, precisions, bestthresh): x-axis values, y-axis values and the
    threshold with the highest F1 score (the first one on ties; 0 if no
    threshold reaches an F1 above 0). Precision, recall and F1 are 0.0 when
    their denominator is zero. Returns (False, False) after printing an error
    if classid > n_classes - 1 or n < 0.
  """
  if classid > n_classes-1 or n<0:
    print('ERROR: classid is out of range')
    return False, False

  # Only one column is thresholded, so extract it once; pred is left untouched.
  scores = [p[classid] for p in pred]
  truths = [label[classid] for label in labels]

  recalls = []    #x-axis values
  precisions = [] #y-axis values
  bestthresh = 0  #Will choose the best threshold based on F1 score
  bestf1 = 0

  for step in range(0,n):
    thresh = (max_thresh - min_thresh) * float(step)/n
    thresh = thresh + min_thresh

    tp = fp = fn = 0
    for score, truth in zip(scores, truths):
      positive = score > thresh
      if truth == 1:
        if positive:
          tp += 1
        else:
          fn += 1
      elif truth == 0 and positive:
        fp += 1

    precision = tp / (tp + fp) if (tp + fp) else 0.0  #tp/(tp+fp)
    recall = tp / (tp + fn) if (tp + fn) else 0.0     #tp/(tp+fn)
    if precision + recall:
      f1 = 2 * precision * recall / (precision + recall)
    else:
      f1 = 0.0

    recalls.append(recall)
    precisions.append(precision)

    if f1>bestf1:
      bestthresh = thresh
      bestf1 = f1

  return recalls,precisions,bestthresh
    
# NOTE(review): this cell sits under the PR-curve heading but calls genROC and
# labels the plot as a ROC curve, exactly like the ROC cell -- presumably it
# was meant to plot precision against recall. TODO confirm.
fpr,tpr = genROC(pred,test_label, 2000, 0.0308, 0.0309, 4, n_classes)

pyplot.plot(fpr,tpr)
pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')
pyplot.title('ROC curve')
pyplot.show()