In [2]:
import json
import logging
import os
import pickle
import random
import sys

# Make the local `bert/` directory importable *before* anything is imported
# from it. In the original cell this append ran after the `from bert ...`
# imports, so it could not help them resolve on a fresh kernel.
# NOTE(review): presumably `bert/` is a checkout whose modules import each other
# by bare name (e.g. `import modeling`) — confirm against the vendored code.
sys.path.append("bert")

import nltk
import sentencepiece as spm
import tensorflow as tf
from nltk.tokenize import sent_tokenize, word_tokenize
from tensorflow.keras.utils import Progbar

from bert import modeling, optimization, tokenization
from bert.run_pretraining import input_fn_builder, model_fn_builder

# Fetch the punkt tokenizer data (no-op when it is already up to date).
nltk.download('punkt')

In [3]:
import pickle
sharepoint_text_dir = '/mnt/Vanguard_text_rob/'
# Load one pickled file from the SharePoint text directory into `df`.
# The file is opened in a `with` block so the handle is closed as soon as the
# load finishes instead of being left to the garbage collector.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open(sharepoint_text_dir+'all_2018-06-25_w2v_all_items_tokenized_list','rb') as fi:
    df = pickle.load(fi)


In [7]:
# Load every pickle file found in sharepoint_text_dir and write its items to a
# single plain-text file: each item is space-joined and written on its own line.

sharepoint_text_dir = '/mnt/Vanguard_text_rob/'
with open('./Data/sharepoint_text_bert_pretrain.txt', 'w', encoding="utf-8", errors='ignore') as fo:
    for filename in os.listdir(sharepoint_text_dir):
        # Open each pickle in its own `with` so the handle is released before
        # moving on; the original leaked one open file per directory entry.
        # NOTE: pickle.load can execute arbitrary code; the directory must be trusted.
        with open(sharepoint_text_dir + filename,'rb') as fi:
            sharepoint_text_list = pickle.load(fi)
        for e in sharepoint_text_list:
            sent = ' '.join(e)
            fo.write(sent+'\n')


[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
# Corpus-size switch: with DEMO_MODE on, CORPUS_SIZE is 1,000,000 lines;
# otherwise it is 100,000,000 lines.
DEMO_MODE = False #@param {type:"boolean"}
if DEMO_MODE:
    CORPUS_SIZE = 1000000
else:
    CORPUS_SIZE = 100000000 #@param {type: "integer"}
 
# Shell step: copy the first CORPUS_SIZE lines of the input file into
# subdataset_vanguard.txt.
# NOTE(review): this reads transcript_text_bert_pretrain.txt while the other
# visible cells work on sharepoint_text_bert_pretrain.txt, and
# subdataset_vanguard.txt is not read by any visible cell — confirm intent.
!(head -n $CORPUS_SIZE './Data/transcript_text_bert_pretrain.txt') > subdataset_vanguard.txt

In [4]:
# Tokenizer that keeps runs of word characters only. The pattern is a raw
# string: "\w" in a normal string literal is an invalid escape sequence, which
# newer Python versions warn about.
regex_tokenizer = nltk.RegexpTokenizer(r"\w+")

def normalize_text(text):
    """Return ``text`` lowercased and reduced to space-separated word tokens.

    The input is coerced with ``str()``, lowercased, round-tripped through
    UTF-8 (characters that cannot be encoded are dropped), and tokenized with
    the module-level ``regex_tokenizer``. The tokens are joined with single
    spaces, so anything the tokenizer does not match (e.g. punctuation) is
    discarded.
    """
    lowered = str(text).lower()
    # Round-trip through UTF-8, silently dropping anything that cannot be encoded.
    utf8_safe = lowered.encode("utf-8", "ignore").decode()
    # Keep only what the regex tokenizer matches; punctuation falls away here.
    tokens = regex_tokenizer.tokenize(utf8_safe)
    return " ".join(tokens)

def count_lines(filename, encoding="utf-8"):
    """Count the lines of a text file.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the count is taken the same way the normalization loop reads the
            corpus, instead of depending on the platform's locale encoding.

    Returns:
        The number of lines yielded when iterating over the file (0 for an
        empty file).
    """
    with open(filename, encoding=encoding) as fi:
        return sum(1 for _ in fi)

In [2]:
# Tokenizer-training configuration.
# RAW_DATA_FPATH: corpus text file read by the normalization cell.
RAW_DATA_FPATH = './Data/sharepoint_text_bert_pretrain.txt' #@param {type: "string"}
# PRC_DATA_FPATH: normalized corpus; written by the normalization cell and
# passed to SentencePiece as --input.
PRC_DATA_FPATH = "proc_dataset_sharepoint.txt" #@param {type: "string"}
# MODEL_PREFIX: SentencePiece --model_prefix; the vocabulary is later read
# from "<MODEL_PREFIX>.vocab".
MODEL_PREFIX = "tokenizer_sharepoint" #@param {type: "string"}
# VOC_SIZE: final size the BERT vocabulary is padded up to.
VOC_SIZE = 32000 #@param {type:"integer"}
# SUBSAMPLE_SIZE: passed to SentencePiece as --input_sentence_size.
SUBSAMPLE_SIZE = 12800000 #@param {type:"integer"}
# NUM_PLACEHOLDERS: subtracted from VOC_SIZE to get the SentencePiece
# --vocab_size, leaving room for control symbols and unused placeholder tokens.
NUM_PLACEHOLDERS = 256 #@param {type:"integer"}

In [3]:
# Build the SentencePiece trainer flag string and train the tokenizer.
# The learnt vocabulary is VOC_SIZE - NUM_PLACEHOLDERS entries; BOS/EOS ids
# are disabled with -1.
# NOTE(review): the input is PRC_DATA_FPATH, which is produced by the
# normalization cell that appears further down — that cell must run first.
SPM_COMMAND = (
    '--input={} --model_prefix={} --vocab_size={} --input_sentence_size={} '
    '--shuffle_input_sentence=true --bos_id=-1 --eos_id=-1'
).format(
    PRC_DATA_FPATH,
    MODEL_PREFIX,
    VOC_SIZE - NUM_PLACEHOLDERS,
    SUBSAMPLE_SIZE,
)
spm.SentencePieceTrainer.Train(SPM_COMMAND)

True

In [6]:
# Normalize the raw corpus line by line and write the result to PRC_DATA_FPATH.
# Expect this to run for a minute or two; a progress bar tracks the lines done.

total_lines = count_lines(RAW_DATA_FPATH)
bar = Progbar(total_lines)

with open(RAW_DATA_FPATH,encoding="utf-8") as fi, open(PRC_DATA_FPATH, "w",encoding="utf-8") as fo:
    for raw_line in fi:
        # One normalized output line per input line.
        fo.write(normalize_text(raw_line)+"\n")
        bar.add(1)



In [7]:
!mkdir ./shards_vanguard_sharepoint
!split -a 4 -l 256000 -d $PRC_DATA_FPATH ./shards_vanguard_sharepoint/shard_
!ls ./shards_vanguard_sharepoint/

shard_0000  shard_0005	shard_0010  shard_0015	shard_0020  shard_0025
shard_0001  shard_0006	shard_0011  shard_0016	shard_0021  shard_0026
shard_0002  shard_0007	shard_0012  shard_0017	shard_0022  shard_0027
shard_0003  shard_0008	shard_0013  shard_0018	shard_0023  shard_0028
shard_0004  shard_0009	shard_0014  shard_0019	shard_0024  shard_0029


In [4]:
def read_sentencepiece_vocab(filepath):
    """Read the token column of a tab-separated vocabulary file.

    Each line contributes the text before its first tab. The first entry
    (the <unk> token) is dropped from the returned list.
    """
    with open(filepath, encoding='utf-8') as fi:
        tokens = [row.split("\t", 1)[0] for row in fi]
    # Everything except the leading <unk> entry.
    return tokens[1:]


# Load the learnt SentencePiece vocabulary and print its size plus a small
# random sample of tokens.
snt_vocab = read_sentencepiece_vocab("{}.vocab".format(MODEL_PREFIX)) 
print("Learnt vocab size: {}".format(len(snt_vocab)))
# random.sample raises ValueError when asked for more items than the population
# holds, so cap the sample at the vocabulary size.
print("Sample tokens: {}".format(random.sample(snt_vocab, min(10, len(snt_vocab)))))

def parse_sentencepiece_token(token):
    """Rewrite one SentencePiece token in the "##"-prefixed continuation style.

    A token that starts with the "▁" marker has the marker removed; any other
    token is returned with "##" prepended.
    """
    is_word_start = token.startswith("▁")
    return token[1:] if is_word_start else "##" + token



# Convert every SentencePiece token, then put the control symbols in front.
bert_vocab = list(map(parse_sentencepiece_token, snt_vocab))
ctrl_symbols = ["[PAD]","[UNK]","[CLS]","[SEP]","[MASK]"]
bert_vocab = ctrl_symbols + bert_vocab
# Pad with [UNUSED_i] placeholders until the vocabulary has VOC_SIZE entries
# (nothing is added if it is already that long or longer).
bert_vocab += ["[UNUSED_{}]".format(i) for i in range(VOC_SIZE - len(bert_vocab))]
print(len(bert_vocab))
VOC_FNAME = "vocab_sharepoint_new.txt" #@param {type:"string"}

# Write one token per line. The encoding is pinned to UTF-8 because the tokens
# were read from a UTF-8 file and may be non-ASCII; the platform default
# encoding could fail on them or write different bytes.
with open(VOC_FNAME, "w", encoding="utf-8") as fo:
    for token in bert_vocab:
        fo.write(token+"\n")

Learnt vocab size: 31743
Sample tokens: ['submitted', 'started', 'fe', '▁lationship', '646', '▁alwine', '▁enhancing', '▁zookeeper', '▁undermine', '▁february']
32000


In [4]:
# Settings forwarded to bert/create_pretraining_data.py via the command below.
MAX_SEQ_LENGTH = 128 #@param {type:"integer"}
MASKED_LM_PROB = 0.15 #@param
MAX_PREDICTIONS = 20 #@param {type:"integer"}
DO_LOWER_CASE = True #@param {type:"boolean"}
# Number of parallel processes handed to `xargs -P`.
PROCESSES = 8 #@param {type:"integer"}
# Directory that receives one <shard>.tfrecord file per input shard.
PRETRAINING_DIR = "pretraining_data_sharepoint" #@param {type:"string"}

# Shell pipeline template: list the shard files and run
# create_pretraining_data.py once per shard through xargs.
# Every "{}" here is a str.format placeholder; the ones that must survive as a
# literal "{}" for xargs (-I{} and the per-shard file names) are filled with
# the string '{}' in the .format() call below.
XARGS_CMD = ("ls ./shards_vanguard_sharepoint/ | "
             "xargs -n 1 -P {} -I{} "
             "python3 bert/create_pretraining_data.py "
             "--input_file=./shards_vanguard_sharepoint/{} "
             "--output_file={}/{}.tfrecord "
             "--vocab_file={} "
             "--do_lower_case={} "
             "--max_predictions_per_seq={} "
             "--max_seq_length={} "
             "--masked_lm_prob={} "
             "--random_seed=34 "
             "--dupe_factor=5")

# Positional fill order: -P count, xargs placeholder, input shard placeholder,
# output directory, output shard placeholder, vocab file, lower-case flag,
# max predictions, max sequence length, masked-LM probability.
# VOC_FNAME must already be defined when this runs (the recorded NameError
# below this cell came from running it before the vocabulary cell).
XARGS_CMD = XARGS_CMD.format(PROCESSES, '{}', '{}', PRETRAINING_DIR, '{}', 
                             VOC_FNAME, DO_LOWER_CASE, 
                             MAX_PREDICTIONS, MAX_SEQ_LENGTH, MASKED_LM_PROB)

NameError: name 'VOC_FNAME' is not defined

In [7]:
# Create the output directory for the .tfrecord files, then run the xargs
# pipeline stored in XARGS_CMD as a shell command.
tf.gfile.MkDir(PRETRAINING_DIR)
!$XARGS_CMD

INFO:tensorflow:*** Reading from input files ***
INFO:tensorflow:*** Reading from input files ***
INFO:tensorflow:  ./shards_vanguard_sharepoint/shard_0006
INFO:tensorflow:  ./shards_vanguard_sharepoint/shard_0005
INFO:tensorflow:*** Reading from input files ***
INFO:tensorflow:*** Reading from input files ***
INFO:tensorflow:*** Reading from input files ***
INFO:tensorflow:  ./shards_vanguard_sharepoint/shard_0001
INFO:tensorflow:  ./shards_vanguard_sharepoint/shard_0004
INFO:tensorflow:  ./shards_vanguard_sharepoint/shard_0003
INFO:tensorflow:*** Reading from input files ***
INFO:tensorflow:  ./shards_vanguard_sharepoint/shard_0000
INFO:tensorflow:*** Reading from input files ***
INFO:tensorflow:*** Reading from input files ***
INFO:tensorflow:  ./shards_vanguard_sharepoint/shard_0002
INFO:tensorflow:  ./shards_vanguard_sharepoint/shard_0007
INFO:tensorflow:*** Writing to output files ***
INFO:tensorflow:  pretraining_data_sharepoint/shard_0007.tfrecord
INFO:tensorflow:*** Example **

In [6]:
# Persist the model configuration and the vocabulary inside MODEL_DIR.
# NOTE(review): this cell uses MODEL_DIR and bert_base_config, which are
# assigned in the cell that follows it, and bert_vocab from the vocabulary
# cell — on a fresh kernel those cells must run first.
with open("{}/bert_config.json".format(MODEL_DIR), "w") as fo:
    json.dump(bert_base_config, fo, indent=2)

VOC_FNAME = "vocab_sharepoint_new.txt" #@param {type:"string"}

# One token per line. UTF-8 is pinned explicitly: the tokens come from a UTF-8
# vocabulary file and may be non-ASCII, so the platform default encoding is
# not safe to rely on.
with open("{}/{}".format(MODEL_DIR, VOC_FNAME), "w", encoding="utf-8") as fo:
    for token in bert_vocab:
        fo.write(token+"\n")

NameError: name 'bert_vocab' is not defined

In [7]:
# Directory that holds bert_config.json, the vocabulary copy and the training
# checkpoints.
MODEL_DIR = "bert_model_sharepoint_large" #@param {type:"string"}

# Re-declared here with the same values as in the tokenizer configuration cell.
VOC_SIZE = 32000 #@param {type:"integer"}


NUM_PLACEHOLDERS = 256 #@param {type:"integer"}
tf.gfile.MkDir(MODEL_DIR)

# Model hyper-parameters, dumped to bert_config.json by another cell and read
# back through modeling.BertConfig.from_json_file.
# NOTE(review): the variable is named "base", but hidden_size 1024 / 24 layers /
# 16 heads look like the BERT-Large dimensions — confirm the naming is intended.
bert_base_config = {
  "attention_probs_dropout_prob": 0.1, 
  "directionality": "bidi", 
  "hidden_act": "gelu", 
  "hidden_dropout_prob": 0.1, 
  "hidden_size": 1024, 
  "initializer_range": 0.02, 
  "intermediate_size": 4096, 
  "max_position_embeddings": 512, 
  "num_attention_heads": 16, 
  "num_hidden_layers": 24, 
  "pooler_fc_size": 1024, 
  "pooler_num_attention_heads": 12, 
  "pooler_num_fc_layers": 3, 
  "pooler_size_per_head": 128, 
  "pooler_type": "first_token_transform", 
  "type_vocab_size": 2, 
  # Must match the number of lines in the vocabulary file.
  "vocab_size": VOC_SIZE
  
}



In [11]:
# Training-run configuration. PRETRAINING_DIR and VOC_FNAME repeat the values
# used by the data-generation cells so this cell can run without them.
PRETRAINING_DIR = "pretraining_data_sharepoint" #@param {type:"string"}

VOC_FNAME = "vocab_sharepoint_new.txt" #@param {type:"string"}

# Input data pipeline config
# MAX_PREDICTIONS and MAX_SEQ_LENGTH are passed to input_fn_builder and should
# agree with the values used when the .tfrecord files were generated.
TRAIN_BATCH_SIZE = 16 #@param {type:"integer"}
MAX_PREDICTIONS = 20 #@param {type:"integer"}
MAX_SEQ_LENGTH = 128 #@param {type:"integer"}
MASKED_LM_PROB = 0.15 #@param

# Training procedure config
EVAL_BATCH_SIZE = 64
LEARNING_RATE = 2e-5
#TRAIN_STEPS = 1000000 #@param {type:"integer"}
TRAIN_STEPS = 1000000 #@param {type:"integer"}
# Also used as the TPU iterations_per_loop value in the estimator cell.
SAVE_CHECKPOINTS_STEPS = 25000 #@param {type:"integer"}

# NOTE(review): VOCAB_FILE is not referenced by any visible cell — confirm it is needed.
VOCAB_FILE = VOC_FNAME
CONFIG_FILE = os.path.join(MODEL_DIR, "bert_config.json")

# Start from the most recent checkpoint found in MODEL_DIR.
# NOTE(review): presumably this is None when no checkpoint exists yet, giving a
# from-scratch run — confirm against the TensorFlow docs.
INIT_CHECKPOINT = tf.train.latest_checkpoint(MODEL_DIR)
bert_config = modeling.BertConfig.from_json_file(CONFIG_FILE)
# All generated pre-training records in PRETRAINING_DIR.
input_files = tf.gfile.Glob(os.path.join(PRETRAINING_DIR, '*tfrecord'))

In [15]:
# Build the model function, run configuration, estimator and training input
# function. TPU use is switched off here, so no cluster resolver is created.
USE_TPU = False
NUM_TPU_CORES = 4

# NOTE(review): num_warmup_steps is hard-coded to 10 while TRAIN_STEPS is
# configured separately — confirm such a short warm-up is intended.
model_fn = model_fn_builder(
      bert_config=bert_config,
      init_checkpoint=INIT_CHECKPOINT,
      learning_rate=LEARNING_RATE,
      num_train_steps=TRAIN_STEPS,
      num_warmup_steps=10,
      use_tpu=USE_TPU,
      use_one_hot_embeddings=True)

tpu_cluster_resolver = None

# Checkpoints are written to MODEL_DIR every SAVE_CHECKPOINTS_STEPS steps; the
# same value is reused as the TPU iterations_per_loop.
run_config = tf.contrib.tpu.RunConfig(
    cluster=tpu_cluster_resolver,
    #model_dir=BERT_GCS_DIR,
    model_dir=MODEL_DIR,
    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS,
    tpu_config=tf.contrib.tpu.TPUConfig(
        iterations_per_loop=SAVE_CHECKPOINTS_STEPS,
        num_shards=NUM_TPU_CORES,
        per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2))

estimator = tf.contrib.tpu.TPUEstimator(
    use_tpu=USE_TPU,
    model_fn=model_fn,
    config=run_config,
    train_batch_size=TRAIN_BATCH_SIZE,
    eval_batch_size=EVAL_BATCH_SIZE)
  
# Input function over the generated .tfrecord files, in training mode.
train_input_fn = input_fn_builder(
        input_files=input_files,
        max_seq_length=MAX_SEQ_LENGTH,
        max_predictions_per_seq=MAX_PREDICTIONS,
        is_training=True)


For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

INFO:tensorflow:Using config: {'_model_dir': 'bert_model_sharepoint', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 25000, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7ff525961550>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is

In [None]:
# Run pre-training with the estimator, with TRAIN_STEPS as the max_steps limit.
estimator.train(input_fn=train_input_fn, max_steps=TRAIN_STEPS)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use `tf.data.experimental.parallel_interleave(...)`.
Instructions for updating:
Use `tf.data.experimental.map_and_batch(...)`.
Instructions for updating:
Use tf.cast instead.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Running train on CPU
INFO:tensorflow:*** Features ***
INFO:tensorflow:  name = input_ids, shape = (16, 128)
INFO:tensorflow:  name = input_mask, shape = (16, 128)
INFO:tensorflow:  name = masked_lm_ids, shape = (16, 20)
INFO:tensorflow:  name = masked_lm_positions, shape = (16, 20)
INFO:tensorflow:  name = masked_lm_weights, shape = (16, 20)
INFO:tensorflow:  name = next_sentence_labels, shape = (16, 1)
INFO:tensorflow:  name = segment_ids, shape = (16, 128)
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use keras.layers.dense instead.
INFO:tensorflow:**** Trainable Variable

In [15]:
# List the devices TensorFlow can see. The session is opened in a `with` block
# so it is closed again once the device list has been fetched, rather than
# being left open under a name (`tf2`) that reads like a module alias.
# NOTE: the recorded AttributeError below this cell comes from a kernel in
# which `tf` had been rebound to a Session object; re-import tensorflow
# (or restart the kernel) before running this.
with tf.Session(config=tf.ConfigProto(log_device_placement=True)) as device_session:
    available_devices = device_session.list_devices()
available_devices

AttributeError: 'Session' object has no attribute 'Session'