Code adapted from [this BERT notebook](https://github.com/google-research/bert/blob/master/predicting_movie_reviews_with_bert_on_tf_hub.ipynb).

In [5]:
from sklearn.model_selection import train_test_split
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
from datetime import datetime
from tensorflow import keras
import os
import re
import bert
from bert import run_classifier
from bert import optimization
from bert import tokenization
from bert_serving.client import BertClient

In [6]:
# --- Input data ---------------------------------------------------------
# Path of the raw training file handed to load_dataset().
train_data_path = "Data/Train/mturk_train.txt"

# Column holding the text (x) and the column holding the count of positive
# ratings (y); load_dataset() thresholds the latter to derive `label_col`.
train_data_x_col, train_data_y_col = "inputtext", "count_pos"

# Name of the derived binary label column.
label_col = "polarity"

# --- Output locations ---------------------------------------------------
# NOTE(review): not referenced by the cells visible here; presumably used
# further down for checkpoints / predictions.
output_dir, models_dir = "Output", "Models"

# --- Labels -------------------------------------------------------------
# Label ids handed to BERT's feature conversion: 0 and 1.
label_list = list(range(2))

# Load data, calculate polarity, shuffle, and split into test/train.
# Polarity rule: if two or more positive ratings, then positive; otherwise, not positive.
def load_dataset(directory, test_size=.1, random_state=None):
    """Read the raw file and return a shuffled (train, test) pair of DataFrames.

    Args:
        directory: Path (or anything `pd.read_csv` accepts) of the data file.
            Despite the name, this is passed straight to `pd.read_csv`.
        test_size: Share of rows held out for the test split; forwarded to
            `train_test_split`. Defaults to the previously hard-coded 0.1.
        random_state: Seed forwarded to both the shuffle and the split. The
            default `None` keeps the original behaviour (a different
            partition on every call); pass an int for a reproducible split.

    Returns:
        `(train, test)`: two DataFrames containing only the text column
        (`train_data_x_col`) and the binary label column (`label_col`).
    """
    df = pd.read_csv(directory)

    # Binary label: 1 when the positive-rating count is at least 2, else 0.
    df[label_col] = (df[train_data_y_col] >= 2).astype(int)

    # Keep only the columns the model needs, shuffle all rows, renumber the index.
    shuffled_df = (
        df[[train_data_x_col, label_col]]
        .sample(frac=1, random_state=random_state)
        .reset_index(drop=True)
    )

    train, test = train_test_split(
        shuffled_df, test_size=test_size, random_state=random_state
    )
    return train, test

In [7]:
train, test = load_dataset(train_data_path)

# Report the class balance of each split. The message is labelled "ratio", so
# print the actual ratio next to the raw counts (positives / rows in split).
for split_name, split_df in (("Train", train), ("Test", test)):
    n_pos = int(split_df[label_col].sum())
    n_rows = len(split_df.index)
    print("{} set positive polarity ratio: {}/{} = {:.3f}".format(
        split_name, n_pos, n_rows, n_pos / n_rows))

Train set positive polarity ratio: 719 1240
Test set positive polarity ratio: 82 138


In [8]:
# Wrap every row of each split in the InputExample class from BERT's
# run_classifier code, which is what the feature-conversion step consumes.
def _row_to_input_example(row):
    """Build a single-text InputExample from one DataFrame row."""
    return bert.run_classifier.InputExample(
        guid=None,  # Globally unique ID for bookkeeping, unused in this example
        text_a=row[train_data_x_col],
        text_b=None,  # single-text task, so there is no second segment
        label=row[label_col],
    )


train_InputExamples = train.apply(_row_to_input_example, axis=1)

test_InputExamples = test.apply(_row_to_input_example, axis=1)


Make sure you start the bert-serving server first (the `BertClient` cell further down needs a running server):

In [11]:
# Preprocess data using BERT's own functions.

# Vocabulary file for the tokenizer. The original machine-specific path is
# kept as the default so existing runs are unchanged; set the
# SCIBERT_VOCAB_FILE environment variable to point elsewhere instead of
# editing this cell.
vocab_dir = os.environ.get(
    "SCIBERT_VOCAB_FILE",
    "/Users/guanzhi0/Documents/Anita_Rao_RP/NLP/scibert_scivocab_uncased/vocab.txt",
)
MAX_SEQ_LENGTH = 500  # TODO: figure out how to optimize this parameter

tokenizer = bert.tokenization.FullTokenizer(vocab_file=vocab_dir)  # Lowercase = True by default

# Convert both splits with the same label list, sequence length and tokenizer.
train_features = bert.run_classifier.convert_examples_to_features(
    train_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)
test_features = bert.run_classifier.convert_examples_to_features(
    test_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)

INFO:tensorflow:Writing example 0 of 1240
INFO:tensorflow:*** Example ***
INFO:tensorflow:guid: None
INFO:tensorflow:tokens: [CLS] the objectives of this research were to determine the phenolic contents , oxygen radical absorbance capacities ( or ##ac ) , cellular antioxidant activities ( ca ##a ) , and anti ##proliferative capacities of nine oa ##t varieties and four bran ##s in china . of all varieties , long ##yan 3 and bei ##yan 1 exhibited the highest total ave ##nan - thr ##amide ##s ( 146 . 94 + / - 7 . 31 and 120 . 95 + / - 6 . 66 mu g / g , respectively ) and or ##ac values ( 21 . 03 + / - 0 . 56 and 21 . 18 + / - 1 . 45 mu m tro ##lox / g , respectively ) , while sha ##oto ##ng exhibited the highest total phenolic acids ( 143 . 52 + / - 9 . 42 mu g / g ) and ca ##a values ( 33 . 38 + / - 1 . 74 mu m quercetin / 100 g ) . the ec ##50 of anti ##proliferative capacities ranged from 167 . 31 + / - 6 . 42 to 233 . 42 + / - 21 . 31 mu g / ml , with the lowest in bei ##xia ##o 8 whi

INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

INFO:tensorflow:label: 1 (id = 1)
INFO:tensorflow:*** Example ***
INFO:tensorflow:guid: None
INFO:tensorflow:tokens: [CLS] oa ##t kernels were extracted with methanol , and glycol ##ip ##id - enriched fractions were prepared using silica solid phase extraction . using direct infusion electros ##pray ionization ( esi ) tandem mass spectrometry ( ms ) , high performance liquid chromatography ( hplc ) - esi - ms , and hplc - atmospheric pressure chemical ionization ( apc ##i ) - ms , we confirmed previous reports that dig ##ala ##ct ##osyl ##dia ##cy ##lg ##lycer ##ol ( dg ##dg ) was the most abundant glycol ##ip ##id in oa ##t kernels and confirmed a previous report of the presence of a dg ##dg mono - est ##oli ##de in oa ##t kernels . in the current study we also identified several additional natural galact ##olipid est ##oli ##des : two new dg ##dg est ##oli ##des ( di - and tri - est ##oli ##des ) , two trig ##ala ##ct ##osyl ##dia ##cy ##lg ##lycer ##ol ( trig ##dg ) est ##oli ##des 

INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

INFO:tensorflow:input_ids: 102 22757 24999 6407 4694 241 501 147 23533 787 579 791 6622 205 502 16093 137 20552 248 7423 791 191 1676 1827 263 3008 3336 6024 30108 787 234 106 6622 2418 130 357 9862 131 28393 579 370 29412 114 241 1815 205 256 241 2861 198 16093 5796 434 2731 2973 6940 137 4487 1784 1031 147 494 787 324 7423 205 147 1658 5796 2087 422 244 205 305 422 158 422 137 158 205 305 1863 145 4466 1352 4466 546 131 111 29412 114 267 8284 690 111 5796 205 1425 147 111 545 422 5796 13601 121 1506 2664 969 633 3858 137 1506 12766 9905 137 4487 1784 267 1357 3007 205 1074 147 111 1735 131 158 205 305 1863 145 4466 1352 4466 546 131 29412 114 422 111 6845 11986 1175 1357 137 422 1104 422 5860 3101 1633 145 7720 546 131 20552 248 5796 241 469 1175 205 867 131 29412 270 1544 302 1357 2606 3451 1784 205 111 545 405 198 10265 131 22757 24999 28393 29412 270 690 106 5796 1151 263 106 2404 4089 552 405 14783 137 8112 633 1784 205 4619 2040 11139 16146 131 643 7423 137 2702 131 547 1882 220

INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

INFO:tensorflow:input_ids: 102 111 2141 131 5046 6024 30108 234 21491 6130 579 2972 133 28724 8284 235 643 1049 145 158 205 239 579 239 205 514 1863 3799 6130 579 2972 133 1840 422 22363 2525 546 690 5050 117 579 2159 145 10622 546 8058 579 791 18845 30117 12121 14064 121 1506 2118 145 8765 579 20522 1863 422 22363 2525 546 191 18845 30117 6746 665 145 4808 137 4900 2732 546 137 16310 8650 1150 434 528 3003 205 111 907 131 111 14576 140 624 191 111 2118 137 2271 1929 131 6130 579 2972 133 121 111 2531 16310 434 528 469 2840 205 111 6746 872 1784 131 111 18845 30117 267 10627 6452 214 18845 30117 1506 2118 1814 259 205 139 205 3378 131 18845 30117 20149 165 3462 131 1916 3243 191 6017 10622 16310 1671 205 111 16310 1154 1894 241 8307 3968 190 111 18845 30117 6845 11986 145 159 2505 546 137 111 11212 145 365 30110 145 244 546 546 422 137 7189 190 111 1738 16186 205 235 8041 20149 422 111 6746 872 1784 131 21491 6130 579 2972 323 579 10879 18845 4516 137 111 1671 5734 131 16310 30113 2865

In [None]:
def create_model(is_predicting, input_ids, input_mask, segment_ids, labels,
                 num_labels):
  """Creates a classification model: a BERT hub module plus one dense layer.

  Args:
    is_predicting: Python bool. When true, only predictions are returned and
      dropout is skipped; otherwise the loss is computed as well.
    input_ids: Token ids fed to the module's "tokens" signature.
    input_mask: Attention mask fed to the module's "tokens" signature.
    segment_ids: Segment ids fed to the module's "tokens" signature.
    labels: Integer class ids; one-hot encoded with depth `num_labels`.
    num_labels: Number of output classes.

  Returns:
    `(predicted_labels, log_probs)` when `is_predicting` is true, otherwise
    `(loss, predicted_labels, log_probs)`.
  """

  # NOTE(review): BERT_MODEL_HUB is not defined in the cells visible here; it
  # must be set (hub module URL or path) before this function is called.
  bert_module = hub.Module(
      BERT_MODEL_HUB,
      trainable=True)
  bert_inputs = dict(
      input_ids=input_ids,
      input_mask=input_mask,
      segment_ids=segment_ids)
  bert_outputs = bert_module(
      inputs=bert_inputs,
      signature="tokens",
      as_dict=True)

  # Use "pooled_output" for classification tasks on an entire sentence.
  # Use "sequence_output" for token-level output.
  output_layer = bert_outputs["pooled_output"]

  hidden_size = output_layer.shape[-1].value

  # Create our own layer to tune for the polarity classification task.
  output_weights = tf.get_variable(
      "output_weights", [num_labels, hidden_size],
      initializer=tf.truncated_normal_initializer(stddev=0.02))

  output_bias = tf.get_variable(
      "output_bias", [num_labels], initializer=tf.zeros_initializer())

  with tf.variable_scope("loss"):

    # Dropout helps prevent overfitting, but it must not run at prediction
    # time: it would randomly zero activations and make inference stochastic.
    if not is_predicting:
      output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

    logits = tf.matmul(output_layer, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    log_probs = tf.nn.log_softmax(logits, axis=-1)

    # Convert labels into one-hot encoding
    one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)

    predicted_labels = tf.squeeze(tf.argmax(log_probs, axis=-1, output_type=tf.int32))
    # If we're predicting, we want predicted labels and the probabilities.
    if is_predicting:
      return (predicted_labels, log_probs)

    # If we're train/eval, compute loss between predicted and actual label
    # (cross-entropy per example, then averaged over the batch).
    per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
    loss = tf.reduce_mean(per_example_loss)
    return (loss, predicted_labels, log_probs)


In [17]:
# Preprocess data using bert-as-service.

# Connect to the bert-serving server and encode a few sample sentences; the
# cell's displayed value is the array returned by `encode`.
bc = BertClient()
sample_sentences = ['First do it', 'then do it right', 'then do it better']
bc.encode(sample_sentences)


array([[-0.2021648 ,  0.5860976 ,  0.3007005 , ..., -0.22786932,
        -1.118817  , -1.6218876 ],
       [-0.9322241 ,  0.3870859 ,  0.25843135, ..., -0.17596966,
        -0.4444375 , -1.8124986 ],
       [-0.959994  ,  0.2677812 ,  0.06218748, ..., -0.33035463,
        -0.5957392 , -1.4234552 ]], dtype=float32)

OSError: SavedModel file does not exist at: /Users/guanzhi0/Documents/Anita_Rao_RP/NLP/scibert_scivocab_uncased//{saved_model.pbtxt|saved_model.pb}