In [None]:
!pip install -q git+https://github.com/dzlab/tfnlp

[K     |████████████████████████████████| 849kB 4.7MB/s 
[K     |████████████████████████████████| 36.6MB 83kB/s 
[K     |████████████████████████████████| 358kB 54.8MB/s 
[K     |████████████████████████████████| 174kB 55.4MB/s 
[K     |████████████████████████████████| 102kB 12.1MB/s 
[K     |████████████████████████████████| 1.1MB 49.2MB/s 
[?25h  Building wheel for tfnlp (setup.py) ... [?25l[?25hdone
  Building wheel for py-cpuinfo (setup.py) ... [?25l[?25hdone


In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_datasets as tfds
# Turn off the TFDS progress bars so dataset download/preparation output stays compact
tfds.disable_progress_bar()

In [None]:
from tfnlp.classification import ClassificationModel

## Data
Download the IMDb dataset from TensorFlow Datasets.

In [None]:
# Load the plain-text IMDb reviews; with_info=True makes tfds.load also return the dataset metadata (`info`).
# batch_size=-1 asks TFDS for each split as one full batch rather than an iterable dataset,
# which is why load() below can index imdb[split]['text'] / ['label'] directly.
imdb, info = tfds.load('imdb_reviews/plain_text', with_info=True, batch_size=-1)

[1mDownloading and preparing dataset imdb_reviews/plain_text/1.0.0 (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...[0m
Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incomplete0L8WMY/imdb_reviews-train.tfrecord
Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incomplete0L8WMY/imdb_reviews-test.tfrecord
Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incomplete0L8WMY/imdb_reviews-unsupervised.tfrecord
[1mDataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.[0m


In [None]:
def load(imdb, set_type):
  """Turn one split of the loaded IMDb data into a pandas DataFrame.

  Args:
    imdb: mapping from split name to a mapping holding a 'text' entry and,
      for labelled splits, a 'label' entry.
    set_type: name of the split to read. 'unsupervised' is treated as
      unlabelled; every other value is treated as labelled.

  Returns:
    A DataFrame with a `text_a` column (the review bytes decoded as UTF-8).
    Labelled splits additionally get a leading `label` column whose values
    are converted to strings.
  """
  is_labelled = set_type != 'unsupervised'
  split = imdb[set_type]
  if is_labelled:
    frame = pd.DataFrame({'label': split['label'], 'text_a': split['text']})
    # Labels are kept as strings rather than integers.
    frame['label'] = frame['label'].map(str)
  else:
    # The unlabelled split only contributes its text.
    frame = pd.DataFrame({'text_a': split['text']})
  # Reviews arrive as bytes; decode them into regular strings.
  frame['text_a'] = frame['text_a'].str.decode('utf-8')
  return frame

Load the dataset into a DataFrame with two columns: `text_a`, `label` (the unlabelled `unsupervised` split only has `text_a`).

Also, save the DataFrames as CSV files for training/validation and testing.

In [None]:
# Build one DataFrame per TFDS split. Note the naming used in this notebook:
# the labelled IMDb 'test' split serves as the validation set, while the
# unlabelled 'unsupervised' split is what is called the test set here.
train_df = load(imdb, 'train')
valid_df = load(imdb, 'test')
test_df = load(imdb, 'unsupervised')
# Write the splits to the working directory. These are presumably the file
# names that the data_path='.' training cells look for — confirm against tfnlp.
# The encoding is stated on every call so the three files are written
# consistently (previously only train.csv specified it).
train_df.to_csv("./train.csv", index=False, encoding='UTF-8')
valid_df.to_csv("./dev.csv", index=False, encoding='UTF-8')
test_df.to_csv("./test.csv", index=False, encoding='UTF-8')

## Training

Define training parameters

In [None]:
# Settings dictionary passed as the second argument to
# ClassificationModel.from_checkpoint / from_tfhub in the cells below.
# NOTE(review): the meaning of each key is defined by tfnlp, not by this
# notebook — the remarks here are read off the key names; confirm against tfnlp.
train_args={
  'max_seq_length': 128,
  'num_labels': 2,
  # string labels, in line with load() converting the `label` column to str
  'labels': ['0', '1'],
  'num_train_epochs': 1,
  'batch_size': 32,
  'eval_batch_size': 32,
  # number of rows in the training DataFrame built earlier
  'train_data_size': len(train_df),
  'init_lr': 2e-5,
  # presumably must agree with the "uncased" BERT variants selected below — TODO confirm
  'do_lower_case': True,
  # optimizer params
  'use_float16': False,
  'use_graph_rewrite': False,
  # distribution strategy params
  'distribution_strategy': 'one_device',
  'num_gpus': 1,
  'tpu': None,
}

### Load model from checkpoint

In [None]:
# Google Cloud Storage folder handed to ClassificationModel.from_checkpoint below;
# the path names the uncased L-12 / H-768 / A-12 BERT variant.
gs_folder_bert = "gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-12_H-768_A-12"

#### Train a classifier with data from CSV

In [None]:
# Create a ClassificationModel from the checkpoint folder, configured by train_args
model = ClassificationModel.from_checkpoint(gs_folder_bert, train_args)
# Train the model from data_path; '.' is where train.csv / dev.csv / test.csv
# were written earlier (presumably tfnlp picks them up by name — confirm)
model.train(data_path='.')

# Test the classifier from the same directory; the call returns a (labels, label_ids) pair
labels, label_ids = model.evaluate(data_path='.')



#### Train a classifier with data from DataFrame

In [None]:
# Create a classifier from the checkpoint folder, configured by train_args
model = ClassificationModel.from_checkpoint(gs_folder_bert, train_args)
# Train the classifier directly on the in-memory DataFrames (no CSV round-trip)
model.train(train_df=train_df, valid_df=valid_df)
# Test the classifier
# NOTE(review): test_df comes from the 'unsupervised' split, so it only has a
# `text_a` column — confirm evaluate() accepts a DataFrame without labels.
labels, label_ids = model.evaluate(eval_df=test_df)



### Load model from TF Hub

In [None]:
# Name of the TF Hub BERT model; it is interpolated into the tfhub.dev URL in the cells below.
# The trailing #@param annotation lists the alternative model names (rendered as a form field in Colab).
hub_model_name = "bert_en_uncased_L-12_H-768_A-12" #@param ["bert_en_uncased_L-24_H-1024_A-16", "bert_en_wwm_cased_L-24_H-1024_A-16", "bert_en_uncased_L-12_H-768_A-12", "bert_en_wwm_uncased_L-24_H-1024_A-16", "bert_en_cased_L-24_H-1024_A-16", "bert_en_cased_L-12_H-768_A-12", "bert_zh_L-12_H-768_A-12", "bert_multi_cased_L-12_H-768_A-12"]

#### Train a classifier with data from CSV

In [None]:
# Build the TF Hub URL for the model chosen in hub_model_name (the trailing /2
# is presumably the model version) and create the classifier from it
hub_url_bert = f"https://tfhub.dev/tensorflow/{hub_model_name}/2"
model = ClassificationModel.from_tfhub(hub_url_bert, train_args)
# Train the model from data_path; '.' is where train.csv / dev.csv / test.csv
# were written earlier (presumably tfnlp picks them up by name — confirm)
model.train(data_path='.')
# Test the classifier from the same directory; the call returns a (labels, label_ids) pair
labels, label_ids = model.evaluate(data_path='.')



#### Train a classifier with data from DataFrame

In [None]:
# Build the TF Hub URL for the model chosen in hub_model_name (the trailing /2
# is presumably the model version) and create the classifier from it
hub_url_bert = f"https://tfhub.dev/tensorflow/{hub_model_name}/2"
model = ClassificationModel.from_tfhub(hub_url_bert, train_args)
# Train the model directly on the in-memory DataFrames (no CSV round-trip)
model.train(train_df=train_df, valid_df=valid_df)
# Test the classifier
# NOTE(review): test_df comes from the 'unsupervised' split, so it only has a
# `text_a` column — confirm evaluate() accepts a DataFrame without labels.
labels, label_ids = model.evaluate(eval_df=test_df)

