In [0]:
%tensorflow_version 1.x #version 1.0
import os
import re
import sys
import json
import pprint
import random
import string
import datetime
import warnings
import tensorflow as tf
from tensorflow import keras
warnings.filterwarnings('ignore')
from tensorflow.contrib import rnn

In [0]:
#TPU SET UP
# Fail fast when the Colab runtime exposes no TPU address; every later cell needs TPU_ADDRESS.
assert 'COLAB_TPU_ADDR' in os.environ, 'NO TPU IS CONNECTED'
TPU_ADDRESS = 'grpc://' + os.environ['COLAB_TPU_ADDR']
print('TPU address is', TPU_ADDRESS)
#Google Cloud Access authentication
from google.colab import auth
auth.authenticate_user()
#creating session to run in TPU
# The credentials JSON is read locally and handed to the session opened against the
# TPU address via configure_gcs.
# NOTE(review): /content/adc.json is presumably written by auth.authenticate_user() above - confirm.
with tf.Session(TPU_ADDRESS) as session:
  with open('/content/adc.json', 'r') as f:
    auth_info = json.load(f)
  tf.contrib.cloud.configure_gcs(session, credentials=auth_info)

In [0]:
#IMPORTING BERT PYTHON MODULES
import sys
!test -d bert_repo || git clone https://github.com/chandu7077/mybert.git bert_repo
# Make the cloned repo importable; the guard keeps re-running this cell idempotent.
if 'bert_repo' not in sys.path:
  sys.path.append('bert_repo')

In [0]:
# import python modules defined by BERT
import modeling
import optimization
import run_classifier
import run_classifier_with_tfhub
import tokenization
# import tfhub for loading the model
import tensorflow_hub as hub

In [0]:
#SETTING OUTPUT DIRECTORY AND TFHUB FOR LOADING PRETRAINED MODEL
# NOTE: despite its name, BUCKET is only the run-specific folder; the GCS bucket itself
# is the hard-coded "olidtaskb" in the format call below.
BUCKET = 'bertbase_lstm10'
OUTPUT_DIR = 'gs://{}/{}'.format("olidtaskb",BUCKET)
tf.gfile.MakeDirs(OUTPUT_DIR)
print('MODEL OUTPUT DIRECTORY : {0}'.format(OUTPUT_DIR))
# Available pretrained model checkpoints:
#   uncased_L-12_H-768_A-12: uncased BERT base model
#   uncased_L-24_H-1024_A-16: uncased BERT large model
#   cased_L-12_H-768_A-12: cased BERT base model
BERT_MODEL = 'uncased_L-12_H-768_A-12' 
# The hub handle is derived from the checkpoint name chosen above.
BERT_MODEL_HUB = 'https://tfhub.dev/google/bert_' + BERT_MODEL + '/1'

In [0]:
#bert tokenizer
# Created from the same hub handle as the model (BERT_MODEL_HUB); later passed to
# run_classifier.convert_examples_to_features in the train and test cells.
tokenizer = run_classifier_with_tfhub.create_tokenizer_from_hub_module(BERT_MODEL_HUB)

In [0]:
#IMPORT DATASETS
import pandas as pd
# Tab-separated train/test splits; later cells read their `tweet` and `label` columns.
train=pd.read_csv("olidtrainc.tsv",sep="\t")
test=pd.read_csv("olidtestc.tsv",sep="\t")
# Bare trailing expression so the notebook renders a preview of the training frame.
train.head()

In [0]:
#OPTIONAL WE CAN USE / NOT USE
!pip3 install emoji
import emoji
def remove_noise(tweet):
    """Remove placeholder tokens and English clitic suffixes from a tweet.

    The value is converted with ``str()`` first, so non-string inputs (for
    example a NaN from pandas) are cleaned as their string representation.

    Args:
        tweet: the raw tweet; any object accepted by ``str()``.

    Returns:
        The tweet text with every occurrence of 'URL', '@USER', "'ve",
        "n't", "'s" and "'m" deleted (removed in that order). Surrounding
        whitespace is left untouched.
    """
    noises = ('URL', '@USER', '\'ve', 'n\'t', '\'s', '\'m')
    # The string conversion does not depend on the loop variable: do it once.
    cleaned = str(tweet)
    for noise in noises:
        cleaned = cleaned.replace(noise, '')
    return cleaned

# Clean the tweet column of both splits; remove_noise already takes a single tweet,
# so it is handed to apply() directly.
train["tweet"] = train["tweet"].apply(remove_noise)
test["tweet"] = test["tweet"].apply(remove_noise)

In [0]:
# Column names and label set used when building BERT examples.
# NOTE: an InputExample class is defined locally just below; run_classifier.InputExample
# is used separately in the test cell.
DATA_COLUMN = 'tweet'
LABEL_COLUMN = 'label'
# NOTE(review): ids 0/1/2 presumably correspond to IND/GRP/OTH (the target_names used in
# the classification report) - confirm against the dataset.
label_list=[0,1,2]       # For task-c

class InputExample(object):
    """A single text example (optionally a sentence pair) with its class label.

    Both ``label`` and the older ``labels`` keyword are accepted and both
    attributes are populated with the same value. This keeps positional /
    ``labels=`` callers working while allowing the ``label=`` keyword that the
    rest of this notebook passes; ``.label`` is the attribute name used by
    ``run_classifier.InputExample``, which the test cell builds directly.

    Args:
        guid: identifier of the example (may be None).
        text_a: text of the first (or only) sequence.
        text_b: optional text of the second sequence.
        labels: class label (legacy keyword).
        label: class label; takes precedence over ``labels`` when both are given.
    """

    def __init__(self, guid, text_a, text_b=None, labels=None, label=None):
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        # Compare against None explicitly: 0 is a valid class id.
        if label is None:
            label = labels
        self.label = label
        self.labels = label


def _row_to_example(row):
    """Wrap one DataFrame row (tweet text + label) in an InputExample."""
    return InputExample(guid=None,
                        text_a=row[DATA_COLUMN],
                        text_b=None,
                        label=row[LABEL_COLUMN])


# One InputExample per row, for each split (row-wise apply yields a Series of examples).
train_examples = train.apply(_row_to_example, axis=1)

test_examples = test.apply(_row_to_example, axis=1)

In [0]:
# NEEDED FOR BERT SEQUENCE OUTPUT WITH ATTENTION
class Attention(tf.keras.Model):
    """Additive attention over a sequence of feature vectors, queried by one state vector.

    Per the shape notes in the notebook, `features` is expected to be
    (batch, steps, dim) and `hidden` (batch, hidden_size) - TODO confirm
    against the caller.
    """

    def __init__(self, units):
        super(Attention, self).__init__()
        # Trainable projections, created in a fixed order: features, query, scalar score.
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, features, hidden):
        """Return (context_vector, attention_weights) for the given features and query."""
        # Insert an axis at position 1 so the query broadcasts against every step.
        query = tf.expand_dims(hidden, 1)

        projected_features = self.W1(features)
        projected_query = self.W2(query)
        energy = tf.nn.tanh(projected_features + projected_query)

        # Softmax over axis 1 normalises the scalar scores across the steps.
        attention_weights = tf.nn.softmax(self.V(energy), axis=1)

        # Weighted sum of the features over axis 1.
        context_vector = tf.reduce_sum(attention_weights * features, axis=1)
        return context_vector, attention_weights

In [0]:
def create_attention_wce_model(is_training, input_ids, input_mask, segment_ids, labels,num_labels, bert_hub_module_handle, class_weights=None):
  """Build the BERT + LSTM + attention classifier with class-weighted cross entropy.

  The BERT sequence output for the 20 positions following [CLS] is run through
  an LSTM; the final LSTM hidden state queries an Attention layer over the same
  token embeddings, and the resulting context vector is averaged with the [CLS]
  embedding before the softmax layer.

  Args:
    is_training: Python bool; enables the hub module's "train" tag and dropout.
    input_ids: token ids passed to the hub module's "tokens" signature.
    input_mask: input mask passed to the hub module.
    segment_ids: segment ids passed to the hub module.
    labels: integer class ids (one-hot encoded internally).
    num_labels: number of classes.
    bert_hub_module_handle: TF-Hub handle of the BERT module.
    class_weights: optional per-class loss weights of length `num_labels`.
      When omitted, a notebook-level variable named `class_weights` is used if
      one is defined; otherwise every class gets weight 1 (plain cross entropy).

  Returns:
    Tuple `(loss, per_example_loss, logits, probabilities)`.
  """
  if class_weights is None:
    # Keep working for notebooks that define `class_weights` in another cell,
    # without raising NameError when no such variable exists.
    class_weights = globals().get("class_weights")

  tags = set()
  if is_training:
    tags.add("train")
  #loading weights from hub
  bert_module = hub.Module(bert_hub_module_handle, tags=tags, trainable=True)

  #structuring inputs
  bert_inputs = dict(input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids)

  #giving inputs to BERT layers
  bert_outputs = bert_module(inputs=bert_inputs,signature="tokens",as_dict=True)

  #per-token output of BERT
  output_layer=bert_outputs["sequence_output"]

  #size of the last axis of the BERT output
  hidden_size = output_layer.shape[-1].value

  #setup softmax weights and biases
  output_weights = tf.get_variable(
      "output_weights", [num_labels, hidden_size],
      initializer=tf.truncated_normal_initializer(stddev=0.02))

  output_bias = tf.get_variable(
      "output_bias", [num_labels], initializer=tf.zeros_initializer())

  # Number of token positions after [CLS] that feed the LSTM and the attention layer.
  num_context_tokens = 20

  with tf.variable_scope("loss"):
    #token embeddings of the positions right after [CLS]
    t1=output_layer[:,1:1+num_context_tokens]

    #token embedding of [CLS] token
    t2=output_layer[:,0]

    #LSTM cell sized like the BERT hidden state (768 for BERT base)
    lstmcell =  tf.nn.rnn_cell.LSTMCell(hidden_size, state_is_tuple=True)

    # Every example uses all positions. An explicit length list needs a static
    # batch size; when it is unknown, passing None makes dynamic_rnn process
    # the full time axis, which is the same computation.
    batch_size = t1.shape[0].value
    sequence_length = None if batch_size is None else [num_context_tokens]*batch_size
    _, states = tf.nn.dynamic_rnn(lstmcell, t1, sequence_length=sequence_length, dtype=tf.float32)

    #attention over the token embeddings, queried by the final LSTM hidden state
    context_vector, attention_weights = Attention(1)(t1, states.h)

    #mean of context vector and [CLS] token
    output_layer=tf.reduce_mean([t2,context_vector],0)

    if is_training:
      #dropout for regularisation: keep_prob=0.9, i.e. 10% of the units are dropped
      output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

    # computing logits W.X + b
    logits = tf.matmul(output_layer, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)

    #calculating softmax probabilities
    probabilities = tf.nn.softmax(logits, axis=-1)

    #convert true labels to one hot
    one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)

    if class_weights is None:
      # No weights supplied anywhere: weight every class equally.
      class_weights = tf.ones([num_labels], dtype=tf.float32)

    # per-example weight = weight of that example's true class
    weights = tf.reduce_sum(class_weights * one_hot_labels, axis=1)

    # unweighted softmax cross entropy per example
    unweighted_losses = tf.nn.softmax_cross_entropy_with_logits(labels=one_hot_labels, logits=logits)

    # apply the weights
    per_example_loss = unweighted_losses * weights

    # mean over the batch gives the final loss
    loss = tf.reduce_mean(per_example_loss)

    return (loss, per_example_loss, logits, probabilities)

In [0]:
def create_lstm_wce_model(is_training, input_ids, input_mask, segment_ids, labels,num_labels, bert_hub_module_handle, class_weights=None):
  """Build the BERT + LSTM classifier with class-weighted cross entropy.

  The BERT sequence output for the 20 positions following [CLS] is run through
  an LSTM; the final LSTM hidden state is averaged with the [CLS] embedding
  before the softmax layer.

  Args:
    is_training: Python bool; enables the hub module's "train" tag and dropout.
    input_ids: token ids passed to the hub module's "tokens" signature.
    input_mask: input mask passed to the hub module.
    segment_ids: segment ids passed to the hub module.
    labels: integer class ids (one-hot encoded internally).
    num_labels: number of classes.
    bert_hub_module_handle: TF-Hub handle of the BERT module.
    class_weights: optional per-class loss weights of length `num_labels`.
      When omitted, a notebook-level variable named `class_weights` is used if
      one is defined; otherwise every class gets weight 1 (plain cross entropy).

  Returns:
    Tuple `(loss, per_example_loss, logits, probabilities)`.
  """
  if class_weights is None:
    # Keep working for notebooks that define `class_weights` in another cell,
    # without raising NameError when no such variable exists.
    class_weights = globals().get("class_weights")

  tags = set()
  if is_training:
    tags.add("train")
  #loading weights from hub
  bert_module = hub.Module(bert_hub_module_handle, tags=tags, trainable=True)

  #structuring inputs
  bert_inputs = dict(input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids)

  #giving inputs to BERT layers
  bert_outputs = bert_module(inputs=bert_inputs,signature="tokens",as_dict=True)

  #per-token output of BERT
  output_layer=bert_outputs["sequence_output"]

  #size of the last axis of the BERT output
  hidden_size = output_layer.shape[-1].value

  #setup softmax weights and biases
  output_weights = tf.get_variable(
      "output_weights", [num_labels, hidden_size],
      initializer=tf.truncated_normal_initializer(stddev=0.02))

  output_bias = tf.get_variable(
      "output_bias", [num_labels], initializer=tf.zeros_initializer())

  # Number of token positions after [CLS] that feed the LSTM.
  num_context_tokens = 20

  with tf.variable_scope("loss"):
    #token embeddings of the positions right after [CLS]
    t1=output_layer[:,1:1+num_context_tokens]

    #token embedding of [CLS] token
    t2=output_layer[:,0]

    # The LSTM state is averaged with the [CLS] embedding below, so its width
    # must equal the BERT hidden size (768 for BERT base).
    lstmcell =  tf.nn.rnn_cell.LSTMCell(hidden_size, state_is_tuple=True)

    # Every example uses all positions. An explicit length list needs a static
    # batch size; when it is unknown, passing None makes dynamic_rnn process
    # the full time axis, which is the same computation.
    batch_size = t1.shape[0].value
    sequence_length = None if batch_size is None else [num_context_tokens]*batch_size
    _, states = tf.nn.dynamic_rnn(lstmcell, t1, sequence_length=sequence_length, dtype=tf.float32)

    #mean of final LSTM hidden state and [CLS] token
    output_layer=tf.reduce_mean([t2,states.h],0)

    if is_training:
      #dropout for regularisation: keep_prob=0.9, i.e. 10% of the units are dropped
      output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

    # computing logits W.X + b
    logits = tf.matmul(output_layer, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)

    #calculating softmax probabilities
    probabilities = tf.nn.softmax(logits, axis=-1)

    #convert true labels to one hot
    one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)

    if class_weights is None:
      # No weights supplied anywhere: weight every class equally.
      class_weights = tf.ones([num_labels], dtype=tf.float32)

    # per-example weight = weight of that example's true class
    weights = tf.reduce_sum(class_weights * one_hot_labels, axis=1)

    # unweighted softmax cross entropy per example
    unweighted_losses = tf.nn.softmax_cross_entropy_with_logits(labels=one_hot_labels, logits=logits)

    # apply the weights
    per_example_loss = unweighted_losses * weights

    # mean over the batch gives the final loss
    loss = tf.reduce_mean(per_example_loss)

    return (loss, per_example_loss, logits, probabilities)

In [0]:
def create_simple_model(is_training, input_ids, input_mask, segment_ids, labels,num_labels, bert_hub_module_handle, class_weights=None):
  """Build the plain BERT classifier (pooled output + softmax) with class-weighted cross entropy.

  Args:
    is_training: Python bool; enables the hub module's "train" tag and dropout.
    input_ids: token ids passed to the hub module's "tokens" signature.
    input_mask: input mask passed to the hub module.
    segment_ids: segment ids passed to the hub module.
    labels: integer class ids (one-hot encoded internally).
    num_labels: number of classes.
    bert_hub_module_handle: TF-Hub handle of the BERT module.
    class_weights: optional per-class loss weights of length `num_labels`.
      When omitted, a notebook-level variable named `class_weights` is used if
      one is defined; otherwise every class gets weight 1 (plain cross entropy).

  Returns:
    Tuple `(loss, per_example_loss, logits, probabilities)`.
  """
  if class_weights is None:
    # Keep working for notebooks that define `class_weights` in another cell,
    # without raising NameError when no such variable exists.
    class_weights = globals().get("class_weights")

  tags = set()
  if is_training:
    tags.add("train")
  #loading weights from hub
  bert_module = hub.Module(bert_hub_module_handle, tags=tags, trainable=True)

  #structuring inputs
  bert_inputs = dict(input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids)

  #giving inputs to BERT layers
  bert_outputs = bert_module(inputs=bert_inputs,signature="tokens",as_dict=True)

  #pooled (sentence-level) output of BERT
  output_layer=bert_outputs["pooled_output"]

  #size of the last axis of the BERT output
  hidden_size = output_layer.shape[-1].value

  #setup softmax weights and biases
  output_weights = tf.get_variable(
      "output_weights", [num_labels, hidden_size],
      initializer=tf.truncated_normal_initializer(stddev=0.02))

  output_bias = tf.get_variable(
      "output_bias", [num_labels], initializer=tf.zeros_initializer())

  with tf.variable_scope("loss"):

    if is_training:
      #dropout for regularisation: keep_prob=0.9, i.e. 10% of the units are dropped
      output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

    # computing logits W.X + b
    logits = tf.matmul(output_layer, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)

    #calculating softmax probabilities
    probabilities = tf.nn.softmax(logits, axis=-1)

    #convert true labels to one hot
    one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)

    if class_weights is None:
      # No weights supplied anywhere: weight every class equally.
      class_weights = tf.ones([num_labels], dtype=tf.float32)

    # per-example weight = weight of that example's true class
    weights = tf.reduce_sum(class_weights * one_hot_labels, axis=1)

    # unweighted softmax cross entropy per example
    unweighted_losses = tf.nn.softmax_cross_entropy_with_logits(labels=one_hot_labels, logits=logits)

    # apply the weights
    per_example_loss = unweighted_losses * weights

    # mean over the batch gives the final loss
    loss = tf.reduce_mean(per_example_loss)

    return (loss, per_example_loss, logits, probabilities)

In [0]:
#MODEL FUNCTION BUILDER
train_hook_list= []

# model_type values understood by model_fn_builder.
_SUPPORTED_MODEL_TYPES = ("lstm", "attention", "simple")

def model_fn_builder(num_labels, learning_rate, num_train_steps,num_warmup_steps, use_tpu, bert_hub_module_handle,model_type="lstm"):
  """Return a `model_fn` closure suitable for `TPUEstimator`.

  Args:
    num_labels: number of target classes.
    learning_rate: learning rate handed to `optimization.create_optimizer`.
    num_train_steps: total number of training steps.
    num_warmup_steps: number of warm-up steps.
    use_tpu: whether the optimizer should be built for TPU execution.
    bert_hub_module_handle: TF-Hub handle of the BERT module.
    model_type: which head to build - "lstm", "attention" or "simple".

  Returns:
    A function `model_fn(features, labels, mode, params)` returning a
    `TPUEstimatorSpec` for TRAIN, EVAL or PREDICT mode.

  Raises:
    ValueError: if `model_type` is not one of the supported values. This is
      checked when the builder is called, so a typo fails immediately instead
      of surfacing later as an unbound local inside the estimator.
  """
  if model_type not in _SUPPORTED_MODEL_TYPES:
    raise ValueError(
        "Unknown model_type %r; expected one of %s" % (model_type, _SUPPORTED_MODEL_TYPES))

  def model_fn(features, labels, mode, params):
    """Build the graph for one estimator mode and wrap it in a TPUEstimatorSpec."""
    # Log the name and shape of every incoming feature tensor.
    for name in sorted(features.keys()):
      tf.logging.info("  name = %s, shape = %s" % (name, features[name].shape))

    # input ids which are obtained from bert tokenizer
    input_ids = features["input_ids"]

    # input masks to represent the actual sentence (1's along length of sentence , 0's for padding)
    input_mask = features["input_mask"]

    # segment ids to distinguish in case of two sentence tasks (not needed for this task)
    segment_ids = features["segment_ids"]

    # label for sentence
    label_ids = features["label_ids"]

    is_training = (mode == tf.estimator.ModeKeys.TRAIN)

    # model_type was validated above, so the final branch is "simple".
    # The creator functions are looked up here (not at build time) so the
    # builder itself does not require them to be defined yet.
    if model_type=="lstm":
      create_model = create_lstm_wce_model
    elif model_type=="attention":
      create_model = create_attention_wce_model
    else:
      create_model = create_simple_model

    (total_loss, per_example_loss, logits, probabilities) = create_model(is_training,
      input_ids, input_mask, segment_ids, label_ids, num_labels,bert_hub_module_handle)

    #output specifications
    output_spec = None

    if mode == tf.estimator.ModeKeys.TRAIN:
      train_op = optimization.create_optimizer(total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu)

      output_spec = tf.contrib.tpu.TPUEstimatorSpec(mode=mode,loss=total_loss,train_op=train_op)

    elif mode == tf.estimator.ModeKeys.EVAL:

      def metric_fn(per_example_loss, label_ids, logits):
        """Return the evaluation metrics as a dict of (value, update_op) pairs."""
        predicted_labels = tf.argmax(logits, axis=-1, output_type=tf.int32)
        accuracy = tf.metrics.accuracy(label_ids, predicted_labels)
        loss = tf.metrics.mean(per_example_loss)
        # The dict must be returned for the estimator to see the metrics.
        # "eval_loss" is used as the key because Estimator already reports a
        # default metric named "loss".
        return {
            "eval_accuracy": accuracy,
            "eval_loss": loss,
        }

      eval_metrics = (metric_fn, [per_example_loss, label_ids, logits])
      output_spec = tf.contrib.tpu.TPUEstimatorSpec(mode=mode,loss=total_loss,eval_metrics=eval_metrics)

    elif mode == tf.estimator.ModeKeys.PREDICT:
      output_spec = tf.contrib.tpu.TPUEstimatorSpec(
          mode=mode, predictions={"probabilities": probabilities})
    return output_spec

  return model_fn

In [0]:
# Batch sizes handed to the TPUEstimator for each mode.
TRAIN_BATCH_SIZE = 8
EVAL_BATCH_SIZE = 8
PREDICT_BATCH_SIZE = 8
LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS =3.0
# Sequence length passed to feature conversion and the input function.
MAX_SEQ_LENGTH = 128
#increasing learning rate gradually
WARMUP_PROPORTION = 0.1
SAVE_CHECKPOINTS_STEPS = 10000 
# NOTE(review): SAVE_SUMMARY_STEPS is not referenced anywhere else in the visible notebook.
SAVE_SUMMARY_STEPS = 1000

# Steps = (examples / batch size) * epochs, truncated to an int; warm-up is a fixed
# fraction of that.
num_train_steps = int(len(train) / TRAIN_BATCH_SIZE * NUM_TRAIN_EPOCHS)
num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)

#TPU CLUSTER SETUP FOR 8 TPU CORES
tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(TPU_ADDRESS)
NUM_TPU_CORES = 8
ITERATIONS_PER_LOOP = 1000

def get_run_config(output_dir):
  """Build the TPU RunConfig that stores checkpoints under `output_dir`.

  Cluster, checkpoint interval, shard count and iterations per loop are taken
  from the notebook-level settings defined in this cell.
  """
  tpu_settings = tf.contrib.tpu.TPUConfig(
      iterations_per_loop=ITERATIONS_PER_LOOP,
      num_shards=NUM_TPU_CORES,
      per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2)
  return tf.contrib.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      model_dir=output_dir,
      save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS,
      tpu_config=tpu_settings)
print("no of train steps:",num_train_steps)

In [0]:
#make tensorflow hub write to gcs bucket
# NOTE(review): the tokenizer was already created from the hub module in an earlier cell,
# i.e. before this cache directory is set - confirm that ordering is intended.
import os
os.environ['TFHUB_CACHE_DIR'] = OUTPUT_DIR

# Same label set as defined next to the example-building code.
label_list=[0,1,2]

#setup model type
# One of "lstm", "attention" or "simple" (the branches handled in model_fn).
# NOTE(review): the output folder name above contains "lstm" while "attention" is selected
# here - confirm the checkpoint directory matches the intended model.
model_type="attention"

#setup modelfunction ==> create model, evaluation metrics, predictions
model_fn = model_fn_builder(
  num_labels=len(label_list),
  learning_rate=LEARNING_RATE,
  num_train_steps=num_train_steps,
  num_warmup_steps=num_warmup_steps,
  use_tpu=True,
  bert_hub_module_handle=BERT_MODEL_HUB,
  model_type=model_type
)

# Estimator shared by the training and prediction cells.
estimator_from_tfhub = tf.contrib.tpu.TPUEstimator(
  use_tpu=True,
  model_fn=model_fn,
  config=get_run_config(OUTPUT_DIR),
  train_batch_size=TRAIN_BATCH_SIZE,
  eval_batch_size=EVAL_BATCH_SIZE,
  predict_batch_size=PREDICT_BATCH_SIZE,
)

In [0]:
#Training the model
def oli_train():
  """Convert the training examples to BERT features and train the estimator.

  Reads the notebook-level `train_examples`, `label_list`, `tokenizer` and
  `estimator_from_tfhub`; trains until `num_train_steps` is reached.
  """
  # Examples -> BERT input features.
  features = run_classifier.convert_examples_to_features(
      train_examples, label_list, MAX_SEQ_LENGTH, tokenizer)
  input_fn = run_classifier.input_fn_builder(
      features=features,
      seq_length=MAX_SEQ_LENGTH,
      is_training=True,
      drop_remainder=True)
  estimator_from_tfhub.train(input_fn=input_fn, max_steps=num_train_steps)
oli_train()

In [0]:
#testing the model
import numpy as np

def oli_test(test):
  """Run the trained estimator on the tweets of `test` and return its predictions.

  The input function drops incomplete batches (`drop_remainder=True`), so the
  tweet list is padded with empty dummy tweets up to a multiple of
  PREDICT_BATCH_SIZE and the predictions for the padding are discarded. This
  yields exactly one prediction per row of `test`.

  Args:
    test: DataFrame (or mapping) with a "tweet" column of texts.

  Returns:
    A list with one prediction dict per tweet, in input order; each dict holds
    the "probabilities" entry produced in PREDICT mode.
  """
  texts = list(test["tweet"])
  num_real = len(texts)
  # Number of dummy tweets needed to fill the last batch (0 when already aligned).
  num_padding = (-num_real) % PREDICT_BATCH_SIZE
  texts.extend([""] * num_padding)

  # Labels are unknown at prediction time; 0 is only a placeholder.
  test_examples = [run_classifier.InputExample(guid="", text_a = x, text_b = None, label = 0) for x in texts]
  test_features = run_classifier.convert_examples_to_features(test_examples, label_list, MAX_SEQ_LENGTH, tokenizer)
  predict_input_fn = run_classifier.input_fn_builder(features=test_features, seq_length=MAX_SEQ_LENGTH, is_training=False, drop_remainder=True)
  predictions = estimator_from_tfhub.predict(predict_input_fn)
  # Consume the whole generator, then drop the entries that belong to the padding.
  return list(predictions)[:num_real]
predictions=oli_test(test)


def test_predictions():
  """Turn the raw estimator predictions into labels and class-1 probabilities.

  Reads the notebook-level `predictions` and `test`. Predictions are paired
  with the test tweets, so the result is no longer than either of them.

  Returns:
    (pred, probs): the argmax class per example and the probability stored at
    index 1 of each example's "probabilities" vector.
  """
  results = list(predictions)
  paired = list(zip(test["tweet"], results))
  pred = [np.argmax(result["probabilities"]) for _, result in paired]
  probs = [result["probabilities"][1] for _, result in paired]
  return pred, probs

prediction_labels,pos_probabilities=test_predictions()
true_labels=list(test.label)

In [0]:
#confusion matrix
from sklearn.metrics import confusion_matrix
# Arguments are (true labels, predicted labels). The matrix is only stored in `cm`;
# this cell has no trailing expression, so nothing is rendered.
cm=confusion_matrix(true_labels,prediction_labels)

In [0]:
#classification report
from sklearn.metrics import classification_report,confusion_matrix
# Per-class precision/recall/F1, computed from the true_labels / prediction_labels
# lists built in the testing cell.
print(classification_report(true_labels,prediction_labels,target_names=["IND","GRP","OTH"]))

In [0]:
# AUC AND PRECISION RECALL CURVE
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import auc,roc_auc_score
from matplotlib import pyplot

# Precision-recall analysis for the class whose probability is stored in
# pos_probabilities (index 1 of the softmax output), treated one-vs-rest.
# precision_recall_curve and the default (binary) f1_score only accept two
# classes, so labels and predictions are binarised first; for a task that is
# already binary with positive label 1 this is a no-op.
POSITIVE_LABEL = 1
pr_true = [1 if label == POSITIVE_LABEL else 0 for label in true_labels]
pr_pred = [1 if label == POSITIVE_LABEL else 0 for label in prediction_labels]

lr_probs = pos_probabilities
lr_precision, lr_recall, _ = precision_recall_curve(pr_true, lr_probs)
lr_f1, lr_auc = f1_score(pr_true, pr_pred), auc(lr_recall, lr_precision)
print('BERT: f1=%.3f auc=%.3f' % (lr_f1, lr_auc))
# A no-skill classifier has precision equal to the prevalence of the positive class.
no_skill = sum(pr_true) / len(pr_true)
pyplot.plot([1, 0], [no_skill, no_skill], linestyle='--', label='No Skill')
# Label the curve with the checkpoint actually loaded rather than a fixed name.
pyplot.plot(lr_recall, lr_precision, marker='.', label=BERT_MODEL)
pyplot.xlabel('Recall')
pyplot.ylabel('Precision')
pyplot.legend()