In [0]:
import csv
import pandas as pd
import numpy as np
import os
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
from datetime import datetime
from sklearn.metrics import roc_auc_score, f1_score

from tensorflow.keras import backend as K

In [0]:
import bert
from bert import run_classifier
from bert import optimization
from bert import tokenization

In [0]:
# When True, the model is fit and its weights are written to MODEL_OUTPUT_DIR.
# When False, training is skipped and previously saved weights are loaded instead.
# The NYT evaluation cell further down is also gated on this flag.
RETRAIN_MODEL = False

In [0]:
# Hparams

# Passed to BertLayer as n_fine_tune_layers.
TRAINABLE_LAYERS = 6
# Units of each fully connected layer stacked on top of the BERT output, in order.
FC_LAYERS = [256, 128, 64]
# Dropout rate applied after every fully connected layer.
DROPOUT_RATE = 0.5
# Number of epochs passed to model.fit.
EPOCHS = 2

In [0]:
# Initialize session
# This session is later used to run the variable/table initializers and is then
# registered with the Keras backend (see initialize_vars).
sess = tf.Session()

# This is a path to an uncased (all lowercase) version of BERT
BERT_MODEL_PATH = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"

# We'll set sequences to be at most 256 tokens long.
max_seq_length = 256

In [0]:
# TODO(varada): Please change the paths in this cell to the appropriate local path.
# The values below are the paths that were previously commented out; every name
# must be bound to a string, otherwise this cell does not even parse.
TRAIN_DATA = 'gs://sfu-jigsaw/new/C3_train.csv'
TEST_DATA = 'gs://sfu-jigsaw/new/C3_test.csv'
NYT_DATA = 'gs://sfu-jigsaw/new/NYT_YNACC_feats_preprocessed.csv'
SOCC_DATA = 'gs://sfu-jigsaw/new/SOCC_constructiveness_annotations_feats_preprocessed.csv'
# Directory the weights file is written to / read from.
# NOTE(review): the original left this unspecified; '.' (the working directory) is a
# placeholder — point it at the directory that holds the saved .h5 weights.
MODEL_OUTPUT_DIR = '.'

In [0]:
def df_from_path(path):
  """Read the CSV file at `path` into a pandas DataFrame.

  The file is opened through tf.gfile, and the handle is closed as soon as
  the CSV has been parsed.

  Args:
    path: location of the CSV file, as accepted by tf.gfile.Open.

  Returns:
    A pandas DataFrame with the parsed contents of the file.
  """
  # The context manager guarantees the handle is released even if parsing fails.
  with tf.gfile.Open(path, 'r') as csv_file:
    return pd.read_csv(csv_file)

In [0]:
# Load the train and test splits into DataFrames.
train = df_from_path(TRAIN_DATA)
test = df_from_path(TEST_DATA)

In [0]:
# Column holding the comment text that is fed to BERT.
DATA_COLUMN = 'comment_text'
# Column holding the binary target.
LABEL_COLUMN = 'constructive_binary'
# Label values handed to convert_examples_to_features.
label_list = [0, 1]

In [0]:
def row_to_input_example(row):
  """Wrap one DataFrame row in a BERT InputExample.

  Only text_a is populated (from DATA_COLUMN); text_b and guid are left as None.
  The label is read from LABEL_COLUMN.
  """
  return bert.run_classifier.InputExample(guid=None,
                                          text_a=row[DATA_COLUMN],
                                          text_b=None,
                                          label=row[LABEL_COLUMN])

# The first name used to be misspelled `rain_InputExamples`, which left
# `train_InputExamples` undefined for the feature-conversion cell below.
train_InputExamples = train.apply(row_to_input_example, axis=1)

test_InputExamples = test.apply(row_to_input_example, axis=1)

In [0]:
def create_tokenizer_from_hub_module(model_path):
  """Build a FullTokenizer from the vocab file and casing flag of a Hub module.

  Args:
    model_path: handle of the TF-Hub BERT module to query.

  Returns:
    A bert.tokenization.FullTokenizer configured with the module's vocab file
    and do_lower_case setting.
  """
  # The lookup runs in its own graph and a short-lived session.
  with tf.Graph().as_default():
    hub_module = hub.Module(model_path)
    info = hub_module(signature="tokenization_info", as_dict=True)
    fetches = [info["vocab_file"], info["do_lower_case"]]
    with tf.Session() as lookup_session:
      vocab_file, do_lower_case = lookup_session.run(fetches)

  return bert.tokenization.FullTokenizer(
      vocab_file=vocab_file, do_lower_case=do_lower_case)

tokenizer = create_tokenizer_from_hub_module(BERT_MODEL_PATH)

In [0]:
%%time
# Convert our train and test InputExamples to InputFeatures that BERT understands,
# using the tokenizer built above and sequences of max_seq_length tokens.
train_features = bert.run_classifier.convert_examples_to_features(train_InputExamples, label_list, max_seq_length, tokenizer)
test_features = bert.run_classifier.convert_examples_to_features(test_InputExamples, label_list, max_seq_length, tokenizer)

In [0]:
class BertLayer(tf.keras.layers.Layer):
    """Keras layer wrapping the TF-Hub BERT module at BERT_MODEL_PATH.

    The layer is called on ``[input_ids, input_mask, segment_ids]`` and returns
    the module's ``"pooled_output"`` tensor.

    Args:
        n_fine_tune_layers: number of module variables, counted from the end of
            the variable list after the ``"/cls/"`` variables are dropped, that
            are registered as trainable. ``0`` registers none; ``None`` registers
            all non-``"/cls/"`` variables. Negative values make ``build`` raise
            ``ValueError``.
        **kwargs: forwarded to ``tf.keras.layers.Layer``.

    NOTE(review): despite its name, ``n_fine_tune_layers`` slices a flat list of
    variables, not transformer layers. This is kept as-is because changing it
    alters which weights are trained — confirm the intent before touching it.
    """

    def __init__(self, n_fine_tune_layers, **kwargs):
        self.n_fine_tune_layers = n_fine_tune_layers
        self.trainable = True
        # Width reported by compute_output_shape.
        self.output_size = 768
        super(BertLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        """Instantiate the Hub module and register its variables as weights."""
        self.bert = hub.Module(
            BERT_MODEL_PATH,
            trainable=self.trainable,
            name="{}_module".format(self.name)
        )

        # Remove unused layers: "/cls/" variables are never made trainable.
        trainable_vars = [var for var in self.bert.variables if "/cls/" not in var.name]

        # Select how many variables to fine tune (see the class-level note).
        if self.n_fine_tune_layers is not None:
            if self.n_fine_tune_layers > 0:
                trainable_vars = trainable_vars[-self.n_fine_tune_layers:]
            elif self.n_fine_tune_layers == 0:
                trainable_vars = []
            else:
                raise ValueError('n_fine_tune_layers must be >= 0 or None.')

        # Add to trainable weights
        for var in trainable_vars:
            self._trainable_weights.append(var)

        # Every remaining module variable (including "/cls/") is tracked as frozen.
        for var in self.bert.variables:
            if var not in self._trainable_weights:
                self._non_trainable_weights.append(var)

        super(BertLayer, self).build(input_shape)

    def call(self, inputs):
        """Run BERT on ``[input_ids, input_mask, segment_ids]`` and return the pooled output."""
        # All three inputs are cast to int32 before being handed to the module.
        input_ids, input_mask, segment_ids = [K.cast(x, dtype="int32") for x in inputs]
        bert_inputs = dict(
            input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids
        )
        outputs = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True)
        return outputs["pooled_output"]

    def get_config(self):
        """Return the base layer config extended with ``n_fine_tune_layers``."""
        config = super(BertLayer, self).get_config()
        config['n_fine_tune_layers'] = self.n_fine_tune_layers
        return config

    def compute_output_shape(self, input_shape):
        """Output shape is ``(input_shape[0], output_size)``."""
        return (input_shape[0], self.output_size)

In [0]:
# Build model
def build_model(max_seq_length):
    """Assemble and compile the BERT-based binary classifier.

    Three inputs of length `max_seq_length` (ids, masks, segment ids) feed a
    BertLayer; its output goes through one Dense(relu) + Dropout pair per entry
    of FC_LAYERS and ends in a single sigmoid unit. The model is compiled with
    binary cross-entropy, the 'adam' optimizer and an accuracy metric, and its
    summary is printed before it is returned.
    """
    bert_inputs = [
        tf.keras.layers.Input(shape=(max_seq_length,), name=input_name)
        for input_name in ("input_ids", "input_masks", "segment_ids")
    ]

    hidden = BertLayer(n_fine_tune_layers=TRAINABLE_LAYERS)(bert_inputs)
    for units in FC_LAYERS:
        hidden = tf.keras.layers.Dense(units, activation='relu')(hidden)
        hidden = tf.keras.layers.Dropout(DROPOUT_RATE)(hidden)
    pred = tf.keras.layers.Dense(1, activation='sigmoid')(hidden)

    model = tf.keras.models.Model(inputs=bert_inputs, outputs=pred)
    model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    model.summary()

    return model

def initialize_vars(sess):
    """Run the local, global and table initializers in `sess`, then hand `sess` to Keras."""
    initializer_factories = (
        tf.local_variables_initializer,
        tf.global_variables_initializer,
        tf.tables_initializer,
    )
    # Each initializer op is created and run in turn, in the order listed above.
    for make_initializer in initializer_factories:
        sess.run(make_initializer())
    K.set_session(sess)

In [0]:
def get_features(features):
  """Split an iterable of feature objects into four numpy arrays.

  Each item must expose `input_ids`, `input_mask`, `segment_ids` and `label_id`.

  Returns:
    A tuple (input_ids, input_masks, segment_ids, labels) of numpy arrays; the
    labels array is reshaped into a single column.
  """
  columns = ([], [], [], [])
  # Single pass over `features`, so one-shot iterables are supported.
  for feature in features:
    row = (feature.input_ids, feature.input_mask, feature.segment_ids, feature.label_id)
    for column, value in zip(columns, row):
      column.append(value)
  ids, masks, segments, labels = [np.array(column) for column in columns]
  return ids, masks, segments, labels.reshape(-1, 1)

In [0]:
# Unpack the converted features into the arrays the Keras model consumes.
train_input_ids, train_input_masks, train_segment_ids, train_labels = get_features(train_features)
test_input_ids, test_input_masks, test_segment_ids, test_labels = get_features(test_features)

# Build and compile the classifier (this also prints the model summary).
model = build_model(max_seq_length)

In [0]:
# Instantiate variables: run the initializers in the session created at the top
# of the notebook and register that session with the Keras backend.
initialize_vars(sess)

In [0]:
%%time
# Training only happens when RETRAIN_MODEL is set; otherwise this cell is a no-op
# and the weights are loaded from disk in the next cell.
# NOTE(review): the test split is passed as validation_data here.
if RETRAIN_MODEL:
  model.fit(
      [train_input_ids, train_input_masks, train_segment_ids], 
      train_labels,
      validation_data=([test_input_ids, test_input_masks, test_segment_ids], test_labels),
      epochs=EPOCHS,
      batch_size=32
  )

In [0]:
# The weights file name encodes the hyperparameters of this configuration.
model_filename = 'BertModel_B%s_F%s_D%s_E%s.h5' % (TRAINABLE_LAYERS, len(FC_LAYERS), DROPOUT_RATE, EPOCHS)
weights_path = os.path.join(MODEL_OUTPUT_DIR, model_filename)
# A fresh training run saves its weights; otherwise the stored weights are loaded.
weights_io = model.save_weights if RETRAIN_MODEL else model.load_weights
weights_io(weights_path)

### Eval on Test Sets

In [0]:
def predictions_from_df(bert_model, df, data_col, label_col):
  """Score every row of `df` with `bert_model`.

  Args:
    bert_model: model exposing `predict` on [input_ids, input_masks, segment_ids].
    df: DataFrame containing the text and label columns.
    data_col: name of the column holding the text.
    label_col: name of the column holding the label.

  Returns:
    A tuple (predictions, labels): the raw output of `bert_model.predict` and
    the `label_col` column of `df`.
  """
  def to_example(row):
    return bert.run_classifier.InputExample(guid=None,
                                            text_a=row[data_col],
                                            text_b=None,
                                            label=row[label_col])

  examples = df.apply(to_example, axis=1)
  features = bert.run_classifier.convert_examples_to_features(
      examples, label_list, max_seq_length, tokenizer)
  # The label array from get_features is not needed; labels are read from df instead.
  input_ids, input_masks, segment_ids, _ = get_features(features)
  scores = bert_model.predict([input_ids, input_masks, segment_ids])
  return scores, df[label_col]

In [0]:
def metrics_from_df(bert_model, df, data_col = DATA_COLUMN, label_col = LABEL_COLUMN):
  """Return (ROC AUC, F1) of `bert_model` on `df`.

  AUC is computed on the raw predictions; F1 on the predictions rounded with np.round.
  """
  scores, truth = predictions_from_df(bert_model, df, data_col, label_col)
  return roc_auc_score(truth, scores), f1_score(truth, np.round(scores))

In [0]:
# C3 test split, read again from TEST_DATA for evaluation.
c3_test = df_from_path(TEST_DATA)

In [0]:
# (AUC, F1) on the C3 test split.
metrics_from_df(model, c3_test)

In [0]:
# NYT evaluation set.
nyt_test = df_from_path(NYT_DATA)

In [0]:
nyt_test.shape

In [0]:
%%time
# (AUC, F1) on the NYT set, still gated on RETRAIN_MODEL as before. Written as a
# single expression so the cell actually displays the result: a call nested inside
# an `if` body is evaluated but its value is never shown. When the flag is off the
# expression is None and nothing is displayed.
metrics_from_df(model, nyt_test) if RETRAIN_MODEL else None

In [0]:
# SOCC evaluation set.
socc_test = df_from_path(SOCC_DATA)

In [0]:
# Derive the binary label column expected by metrics_from_df from the
# 'constructive' column.
# NOTE(review): np.round rounds halves to the nearest even value, so a score of
# exactly 0.5 becomes 0 — confirm that is the intended threshold.
socc_test['constructive_binary'] = np.round(socc_test['constructive'])

In [0]:
socc_test.shape

In [0]:
%%time
# (AUC, F1) on the SOCC set.
metrics_from_df(model, socc_test)

### Save Predictions for Length Analysis

In [0]:
predictions, labels = predictions_from_df(model, c3_test, DATA_COLUMN, LABEL_COLUMN)

In [0]:
output_df = c3_test[['comment_text', 'constructive_binary']]

In [0]:
output_df['comment_len'] = output_df.comment_text.apply(lambda x: len(x.strip().split()))

In [0]:
# Sanity Check
assert all(output_df['constructive_binary'] == labels)

In [0]:
output_df['prediction proba'] = predictions

In [0]:
output_df['prediction'] = np.round(predictions)

In [0]:
output_df.to_csv(tf.gfile.Open('gs://sfu-jigsaw/new/bert_scored_c3_test.csv', 'w'), index=False)