<a href="https://colab.research.google.com/github/hossein20s/tutorial/blob/master/BERT_TripAdvisor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

First, make sure you are connected to Google Drive and Cloud Storage; they are used to read the data, the Python source files and the model.

In [0]:
import tensorflow as tf
import os

# Fail fast when the runtime has no TPU attached: the TPU endpoint is read
# from the COLAB_TPU_ADDR environment variable.
assert 'COLAB_TPU_ADDR' in os.environ, 'ERROR: Not connected to a TPU runtime; please see the first cell in this notebook for instructions!'

# gRPC target used later for tf.Session and the TPU cluster resolver.
TPU_ADDRESS = 'grpc://{}'.format(os.environ['COLAB_TPU_ADDR'])

print('TPU address is', TPU_ADDRESS)
print(tf.__version__)

In [0]:
# Mount Google Drive at /content/gdrive; the data and source paths used
# below all live under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

In [0]:
import sys

SRC_DIR='/content/gdrive/My Drive/src/'
REPO_DIR=SRC_DIR +  '/bert_repo/'
SRC_DIR_SHELL='/content/gdrive/My\ Drive/src/'

!mkdir $SRC_DIR_SHELL
!cd $SRC_DIR_SHELL; git clone https://github.com/google-research/bert bert_repo
if not REPO_DIR in sys.path:
  sys.path.append(REPO_DIR)

In [0]:
# Task key; it is lower-cased later to pick the matching run_classifier processor.
TASK = 'COLA'

from google.colab import auth
import pprint
import json

# Authenticate the Colab user, then hand the credentials to the TPU so it can
# read from and write to GCS.
auth.authenticate_user()
with tf.Session(TPU_ADDRESS) as session:
  print('TPU devices:')
  pprint.pprint(session.list_devices())

  # Upload credentials to TPU.
  # NOTE(review): /content/adc.json is presumably written by
  # authenticate_user() above — confirm.
  with open('/content/adc.json', 'r') as f:
    auth_info = json.load(f)
  tf.contrib.cloud.configure_gcs(session, credentials=auth_info)
  # Now credentials are set for all future sessions on this TPU.
  
  
# GCS bucket and sub-folder used to build the model output location.
BUCKET = 'medicalblockchain_dev' #@param {type:"string"}
MODEL_OUT_DIR = 'test' #@param {type:"string"}
assert BUCKET, 'Must specify an existing GCS bucket name'
# gs://<BUCKET>/bert-tfhub/models/<MODEL_OUT_DIR>; created if it does not exist.
OUTPUT_DIR_TFHUB = 'gs://{}/bert-tfhub/models/{}'.format(BUCKET, MODEL_OUT_DIR)
tf.gfile.MakeDirs(OUTPUT_DIR_TFHUB)
print('***** Model output directory: {} *****'.format(OUTPUT_DIR_TFHUB))

In [5]:
import pandas as pd

# from https://www.kaggle.com/madhab/jobposts & https://photos.app.goo.gl/Tqz5jvH8uhsMY94KA
#TRAIN_FILE = 'data job posts.csv.gz'

# Data set source:
# https://appliedmachinelearning.blog/2017/12/21/predict-the-happiness-on-tripadvisor-reviews-using-dense-neural-network-with-keras-hackerearth-challenge/
DATA_DIR = '/content/gdrive/My Drive/data/'
DATA_FILE = 'trip_advisor_hackerearth_data.train.csv.gz'

# Read the gzip-compressed training CSV from Drive.
data = pd.read_csv('{}{}'.format(DATA_DIR, DATA_FILE), compression='gzip')

# Show the first three rows as a quick sanity check.
print(data.head(3))

   User_ID                                        Description  \
0  id10326  The room was kind of clean but had a VERY stro...   
1  id10327  I stayed at the Crown Plaza April -- - April -...   
2  id10328  I booked this hotel through Hotwire at the low...   

        Browser_Used Device_Used Is_Response  
0               Edge      Mobile   not happy  
1  Internet Explorer      Mobile   not happy  
2            Mozilla      Tablet   not happy  


********************************************
You can jump ahead to loading the model from a checkpoint
*********************************************

In [0]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Fixed seed so the train/dev split is identical on every run.
SPLIT_SEED = 42

# Encode the string responses in 'Is_Response' as integer class ids.
lb = LabelEncoder()
label = lb.fit_transform(data['Is_Response'])

# The frames are later written as tab-separated files, one example per line,
# so tabs and carriage returns are replaced as well as newlines.
text = data['Description'].replace(r'[\t\r\n]', ' ', regex=True)

# Column order is pinned explicitly: train/dev files are written without a
# header, so whatever reads them can only rely on column position.
# NOTE(review): run_classifier.ColaProcessor is expected to read the label
# from column 1 and the text from column 3 — verify against bert_repo.
df_bert = pd.DataFrame(
    {'user_id': data['User_ID'],
     'label': label,
     'alpha': ['a'] * data.shape[0],
     'text': text},
    columns=['user_id', 'label', 'alpha', 'text'])

# Hold out 1% of the rows as the dev set.
df_bert_train, df_bert_dev = train_test_split(
    df_bert, test_size=0.01, random_state=SPLIT_SEED)

In [0]:
# NOTE: DATA_FILE is re-pointed at the test archive from here on.
DATA_FILE = 'trip_advisor_hackerearth_data.test.csv.gz'

df_test = pd.read_csv(DATA_DIR + DATA_FILE, compression='gzip')

# Keep only the id and the review text. Tabs and carriage returns are replaced
# as well as newlines, because the frame is later written as a tab-separated
# file with one example per line. Column order is pinned explicitly.
df_bert_test = pd.DataFrame(
    {'User_ID': df_test['User_ID'],
     'text': df_test['Description'].replace(r'[\t\r\n]', ' ', regex=True)},
    columns=['User_ID', 'text'])


In [0]:
# Saving dataframes to .tsv format as required by BERT
# Saving dataframes to .tsv format as required by BERT.
# All three paths are built the same way; DATA_DIR already ends with '/', so
# os.path.join avoids the doubled separator that DATA_DIR + '/train.tsv' gave.
# train/dev are written without a header row, test with one.
# NOTE(review): to_csv may quote fields that contain '"' characters; check
# that the TSV reader in bert_repo handles quoting the same way.
df_bert_train.to_csv(os.path.join(DATA_DIR, 'train.tsv'), sep='\t', index=False, header=False)
df_bert_dev.to_csv(os.path.join(DATA_DIR, 'dev.tsv'), sep='\t', index=False, header=False)
df_bert_test.to_csv(os.path.join(DATA_DIR, 'test.tsv'), sep='\t', index=False, header=True)

In [0]:
# Available pretrained model checkpoints:
#   uncased_L-12_H-768_A-12: uncased BERT base model
#   uncased_L-24_H-1024_A-16: uncased BERT large model
#   cased_L-12_H-768_A-12: cased BERT base model
# NOTE(review): the tokenizer created from the checkpoint further down uses
# do_lower_case=True; revisit that if a cased model is selected here.
BERT_MODEL = 'uncased_L-12_H-768_A-12' #@param {type:"string"}
# TF-Hub handle for the same model, used by the TF-Hub variant of model_fn.
BERT_MODEL_HUB = 'https://tfhub.dev/google/bert_' + BERT_MODEL + '/1'

# GCS folder from which bert_config.json, vocab.txt and bert_model.ckpt are read.
BERT_PRETRAINED_DIR = 'gs://cloud-tpu-checkpoints/bert/' + BERT_MODEL 
print('***** BERT pretrained directory: {} *****'.format(BERT_PRETRAINED_DIR))
# List the folder as a sanity check that it exists and is readable.
!gsutil ls $BERT_PRETRAINED_DIR


*******************
Loading the model from a checkpoint
********************

*******  Run the cells from here onward to load the model *******

In [0]:
# Training schedule; together with the number of training examples these
# determine num_train_steps.
NUM_TRAIN_EPOCHS = 1.0
TRAIN_BATCH_SIZE = 8
# Warmup is a period of time where the learning rate
# is small and gradually increases--usually helps training.
# num_warmup_steps is this fraction of num_train_steps.
WARMUP_PROPORTION = 0.1

# Passed to TPUConfig as num_shards and iterations_per_loop respectively.
NUM_TPU_CORES = 8
ITERATIONS_PER_LOOP = 1000
LEARNING_RATE = 2e-5

EVAL_BATCH_SIZE = 8
PREDICT_BATCH_SIZE = 8
# Passed to RunConfig as save_checkpoints_steps.
SAVE_CHECKPOINTS_STEPS = 1000

In [0]:
import run_classifier

# Task name -> processor class from the BERT repo; TASK (lower-cased) selects
# which one is instantiated.
processors = {
  "cola": run_classifier.ColaProcessor,
  "mnli": run_classifier.MnliProcessor,
  "mrpc": run_classifier.MrpcProcessor,
}
processor = processors[TASK.lower()]()
label_list = processor.get_labels()

# Compute number of train and warmup steps from batch size
# NOTE(review): the examples are presumably read from train.tsv inside
# DATA_DIR — verify in run_classifier.
train_examples = processor.get_train_examples(DATA_DIR)
num_train_steps = int(len(train_examples) / TRAIN_BATCH_SIZE * NUM_TRAIN_EPOCHS)
num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)

In [0]:
# Locations of the pretrained model's config, vocabulary and checkpoint inside
# BERT_PRETRAINED_DIR (a gs:// path).
# NOTE: INIT_CHECKPOINT is overwritten in the checkpoint-loading cell below.
CONFIG_FILE = os.path.join(BERT_PRETRAINED_DIR, 'bert_config.json')
VOCAB_FILE = os.path.join(BERT_PRETRAINED_DIR, 'vocab.txt') 
INIT_CHECKPOINT = os.path.join(BERT_PRETRAINED_DIR, 'bert_model.ckpt')

In [29]:
import modeling
import run_classifier
import tokenization

# Checkpoints and eval results go to a sibling "bert-checkpoints" folder of the
# TF-Hub output directory.
OUTPUT_DIR = OUTPUT_DIR_TFHUB.replace('bert-tfhub', 'bert-checkpoints')
tf.gfile.MakeDirs(OUTPUT_DIR)
# Initialise from the TF-Hub output directory instead of the stock pretrained
# checkpoint selected earlier.
# NOTE(review): this is a directory, not a checkpoint prefix; confirm that the
# variables stored there match the names run_classifier's model expects.
INIT_CHECKPOINT=OUTPUT_DIR_TFHUB

bert_config = modeling.BertConfig.from_json_file(CONFIG_FILE)

tokenizer = tokenization.FullTokenizer(vocab_file=VOCAB_FILE, do_lower_case=True)

# The original passed INIT_CHECKPOINT as a second print argument, so the '{}'
# placeholder was printed literally; format it into the message instead.
print('loading model from checkpoint directory {}'.format(INIT_CHECKPOINT))

# Build the TPU model_fn for the classifier.
model_fn = run_classifier.model_fn_builder(
  num_labels=len(label_list),
  bert_config=bert_config,
  init_checkpoint=INIT_CHECKPOINT,
  learning_rate=LEARNING_RATE,
  num_train_steps=num_train_steps,
  num_warmup_steps=num_warmup_steps,
  use_tpu=True,
  use_one_hot_embeddings=True)


loading model from checkpoint directoryy {} gs://medicalblockchain_dev/bert-tfhub/models/test


*****************************
You can jump ahead to evaluating the model
********************************




In [0]:
# Alternative to the checkpoint-based cell above: build the tokenizer and
# model_fn from the TF-Hub module. Running this cell replaces the `tokenizer`
# and `model_fn` defined there.
# Force TF Hub writes to the GS bucket we provide.
os.environ['TFHUB_CACHE_DIR'] = OUTPUT_DIR
import run_classifier_with_tfhub
tokenizer = run_classifier_with_tfhub.create_tokenizer_from_hub_module(BERT_MODEL_HUB)

model_fn = run_classifier_with_tfhub.model_fn_builder(
  num_labels=len(label_list),
  learning_rate=LEARNING_RATE,
  num_train_steps=num_train_steps,
  num_warmup_steps=num_warmup_steps,
  use_tpu=True,
  bert_hub_module_handle=BERT_MODEL_HUB
)


In [0]:
import datetime
import run_classifier

MAX_SEQ_LENGTH = 400

# Fine-tuning entry point
def model_train(estimator):
  """Train `estimator` on the notebook-level `train_examples`.

  The examples are converted to features of length MAX_SEQ_LENGTH, wrapped in
  a training input function, and fed to `estimator.train` for at most
  `num_train_steps` steps. Start and finish timestamps are printed.

  Args:
    estimator: object exposing `train(input_fn=..., max_steps=...)`.
  """
  print('MRPC/CoLA on BERT base model normally takes about 2-3 minutes. Please wait...')
  # Sequences are padded/truncated to MAX_SEQ_LENGTH tokens.
  features = run_classifier.convert_examples_to_features(
      train_examples, label_list, MAX_SEQ_LENGTH, tokenizer)

  banner = (
      '***** Started training at {} *****'.format(datetime.datetime.now()),
      '  Num examples = {}'.format(len(train_examples)),
      '  Batch size = {}'.format(TRAIN_BATCH_SIZE),
  )
  for line in banner:
    print(line)
  tf.logging.info("  Num steps = %d", num_train_steps)

  input_fn = run_classifier.input_fn_builder(
      features=features,
      seq_length=MAX_SEQ_LENGTH,
      is_training=True,
      drop_remainder=True)
  estimator.train(input_fn=input_fn, max_steps=num_train_steps)
  print('***** Finished training at {} *****'.format(datetime.datetime.now()))


def model_eval(estimator):
  """Evaluate `estimator` on the dev examples and persist the metrics.

  Dev examples are loaded through the notebook-level `processor` from
  DATA_DIR, converted to features of length MAX_SEQ_LENGTH and evaluated for
  a whole number of EVAL_BATCH_SIZE batches. Every metric is printed and also
  written to `eval_results.txt` inside OUTPUT_DIR.

  Args:
    estimator: object exposing `evaluate(input_fn=..., steps=...)`.
  """
  dev_examples = processor.get_dev_examples(DATA_DIR)
  dev_features = run_classifier.convert_examples_to_features(
      dev_examples, label_list, MAX_SEQ_LENGTH, tokenizer)

  banner = (
      '***** Started evaluation at {} *****'.format(datetime.datetime.now()),
      '  Num examples = {}'.format(len(dev_examples)),
      '  Batch size = {}'.format(EVAL_BATCH_SIZE),
  )
  for line in banner:
    print(line)

  # Only full batches are evaluated (the trailing partial batch is dropped),
  # so the metrics on the TPU are slightly off.
  full_batches = int(len(dev_examples) / EVAL_BATCH_SIZE)
  input_fn = run_classifier.input_fn_builder(
      features=dev_features,
      seq_length=MAX_SEQ_LENGTH,
      is_training=False,
      drop_remainder=True)
  result = estimator.evaluate(input_fn=input_fn, steps=full_batches)
  print('***** Finished evaluation at {} *****'.format(datetime.datetime.now()))

  results_path = os.path.join(OUTPUT_DIR, "eval_results.txt")
  with tf.gfile.GFile(results_path, "w") as writer:
    print("***** Eval results *****")
    for key in sorted(result):
      value = str(result[key])
      print('  {} = {}'.format(key, value))
      writer.write("%s = %s\n" % (key, value))

In [0]:
# Setup TPU related config
tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(TPU_ADDRESS)


def get_run_config(output_dir):
  """Build the TPU RunConfig that stores checkpoints in `output_dir`.

  Args:
    output_dir: model directory handed to the RunConfig as `model_dir`.

  Returns:
    A `tf.contrib.tpu.RunConfig` bound to `tpu_cluster_resolver` and using the
    notebook-level SAVE_CHECKPOINTS_STEPS, ITERATIONS_PER_LOOP and
    NUM_TPU_CORES settings.
  """
  tpu_settings = tf.contrib.tpu.TPUConfig(
      iterations_per_loop=ITERATIONS_PER_LOOP,
      num_shards=NUM_TPU_CORES,
      per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2)
  return tf.contrib.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      model_dir=output_dir,
      save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS,
      tpu_config=tpu_settings)


# Estimator used for both training and evaluation below. It wraps whichever
# `model_fn` was defined last (the checkpoint-based or the TF-Hub one) and
# keeps its checkpoints in OUTPUT_DIR.
estimator_from_checkpoints = tf.contrib.tpu.TPUEstimator(
  use_tpu=True,
  model_fn=model_fn,
  config=get_run_config(OUTPUT_DIR),
  train_batch_size=TRAIN_BATCH_SIZE,
  eval_batch_size=EVAL_BATCH_SIZE,
  predict_batch_size=PREDICT_BATCH_SIZE,
)

In [0]:
# Run training; skip this cell to evaluate the existing state in OUTPUT_DIR.
model_train(estimator_from_checkpoints)

************************************
Evaluating the model
***************************

In [34]:
# Evaluate on the dev set; metrics are printed and written to eval_results.txt.
model_eval(estimator_from_checkpoints)

INFO:tensorflow:Writing example 0 of 390
INFO:tensorflow:*** Example ***
INFO:tensorflow:guid: dev-0
INFO:tensorflow:tokens: [CLS] the hotel its own is not bad at all . the rooms are big enough and pretty clean . the staff ##s are nice and helpful , but please don ' t expect them to deal your request without letting you to wait . as a priority club member you can have the buffet breakfast for no charge . although there is nothing in buffet to be specially recommended , you can at least fill up yourself and prepare to leave the hotel for whole day , as there is nothing interesting in the hotel and in the area . the underground is very near to the hotel , and union square is also in walking distance , but walking in that area is really uncomfortable . there are too many beg ##gar ##s and homeless people . on the way to union square i even sm ##elt the disgusting stink from some of those people . although they did not attacked me , i would suggest to keep distance from them . never walk t

With 1 epoch, this is the eval result from the TF Hub model:

* eval_accuracy = 0.8932292
* eval_loss = 0.5588442
* global_step = 4817
* loss = 0.60744804


Eval result from checkpoint:

 *  eval_accuracy = 0.9010417
 *  eval_loss = 0.4951627
 *  global_step = 4817
 *  loss = 0.4208469

Finally worked when loading from gs://medicalblockchain_dev/bert-tfhub/models/test:  
  * eval_accuracy = 0.6744792
  * eval_loss = 0.68638927
  * global_step = 0
  * loss = 0.68546134

In [0]:
# Alternative to the in-notebook estimator: run the stock run_classifier.py
# script (train + eval + predict) from the cloned repo. Unlike the cells above
# it trains for 3 epochs and writes to the .../bert-checkpoints/models/COLA
# folder.
!cd '/content/gdrive/My Drive/src/bert_repo'; python run_classifier.py \
--task_name=cola \
--do_train=true \
--do_eval=true \
--do_predict=true \
--data_dir='/content/gdrive/My Drive/data/' \
--vocab_file='gs://cloud-tpu-checkpoints/bert/uncased_L-12_H-768_A-12/vocab.txt' \
--bert_config_file='gs://cloud-tpu-checkpoints/bert/uncased_L-12_H-768_A-12/bert_config.json' \
--init_checkpoint='gs://cloud-tpu-checkpoints/bert/uncased_L-12_H-768_A-12/bert_model.ckpt' \
--max_seq_length=400 \
--train_batch_size=8 \
--learning_rate=2e-5 \
--num_train_epochs=3.0 \
--output_dir='gs://medicalblockchain_dev/bert-checkpoints/models/COLA' \
--do_lower_case=True

In [0]:
# Re-import `modeling` in the running kernel; presumably used to pick up edits
# made to modeling.py on Drive without restarting the runtime.
from importlib import reload  # Python 3.4+ only.
import modeling
reload(modeling)