# PRETRAINING BERT WITH CLOUD TPU

---

KAYNAK: https://towardsdatascience.com/pre-training-bert-from-scratch-with-cloud-tpu-6e2f71028379

In [0]:
!pip install sentencepiece
!git clone https://github.com/google-research/bert

import os
import sys
import json
import nltk
import random
import logging
import tensorflow as tf
import sentencepiece as spm

from glob import glob
from google.colab import auth, drive
from tensorflow.keras.utils import Progbar

sys.path.append("bert")

from bert import modeling, optimization, tokenization
from bert.run_pretraining import input_fn_builder, model_fn_builder

# Authenticate this Colab session with Google (google.colab.auth).
auth.authenticate_user()
  
# configure logging
# Reuse the 'tensorflow' logger so our messages and TF's share one handler.
log = logging.getLogger('tensorflow')
log.setLevel(logging.INFO)

# create formatter and add it to the handlers
formatter = logging.Formatter('%(asctime)s :  %(message)s')
sh = logging.StreamHandler()
sh.setLevel(logging.INFO)
sh.setFormatter(formatter)
# Replace (not append to) the existing handlers so each record is emitted once.
log.handlers = [sh]

# A TPU runtime is detected through the COLAB_TPU_ADDR environment variable.
# NOTE: TPU_ADDRESS is only defined in the TPU branch below; any code that
# reads it must check USE_TPU first.
if 'COLAB_TPU_ADDR' in os.environ:
  log.info("Using TPU runtime")
  USE_TPU = True
  TPU_ADDRESS = 'grpc://' + os.environ['COLAB_TPU_ADDR']

  with tf.Session(TPU_ADDRESS) as session:
    log.info('TPU address is ' + TPU_ADDRESS)
    # Upload credentials to TPU.
    # NOTE(review): /content/adc.json is presumably written by
    # auth.authenticate_user() above — confirm.
    with open('/content/adc.json', 'r') as f:
      auth_info = json.load(f)
    tf.contrib.cloud.configure_gcs(session, credentials=auth_info)
    
else:
  log.warning('Not connected to TPU runtime')
  USE_TPU = False

## VERİ SETİ: 

---
http://opus.nlpl.eu/OpenSubtitles-v2016.php

Bu veri setinin 65 farklı dil seçeneği vardır. Veriler satır satır olacak şekildedirler.

In [0]:
# Language codes offered by the OpenSubtitles v2016 monolingual corpus
# (65 entries). Kept for reference: nothing in the visible code reads this set.
AVAILABLE =  {'af','ar','bg','bn','br','bs','ca','cs',
              'da','de','el','en','eo','es','et','eu',
              'fa','fi','fr','gl','he','hi','hr','hu',
              'hy','id','is','it','ja','ka','kk','ko',
              'lt','lv','mk','ml','ms','nl','no','pl',
              'pt','pt_br','ro','ru','si','sk','sl','sq',
              'sr','sv','ta','te','th','tl','tr','uk',
              'ur','vi','ze_en','ze_zh','zh','zh_cn',
              'zh_en','zh_tw','zh_zh'}

# Language of the corpus to download; interpolated into the wget URL below.
LANG_CODE = "tr" 

# Download the gzipped raw corpus for LANG_CODE, unpack it to dataset.txt and
# print its last lines as a quick sanity check.
!wget http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2016/mono/OpenSubtitles.raw.'$LANG_CODE'.gz -O dataset.txt.gz
!gzip -d dataset.txt.gz
!tail dataset.txt

In [0]:
CORPUS_SIZE = 700000 # number of leading corpus lines used for training

# NOTE(review): the two splits overlap if dataset.txt has fewer than
# CORPUS_SIZE + 200000 lines — confirm the corpus size.
!(head -n $CORPUS_SIZE dataset.txt) > train.txt # training set: the first CORPUS_SIZE lines
!(tail -n 200000 dataset.txt) > test.txt # test set: the last 200000 lines


## PREPROCESSING RAW DATA:

---



1.   harfleri küçük harfe çevirme
2.   noktalama işaretlerini silme
3. rakamları silme
4. türkçe karakter dönüşümü yapma



In [0]:
# Tokenizer that extracts runs of word characters. The pattern is a raw string
# because "\w" in a plain string literal is an invalid escape sequence.
regex_tokenizer = nltk.RegexpTokenizer(r"\w+")
import re

# Upper-case "I" / "İ" are mapped to ASCII "i" *before* lower-casing, because
# str.lower() turns "İ" into "i" followed by a combining dot (U+0307).
_UPPER_I_TABLE = str.maketrans({'I': 'i', 'İ': 'i'})

# Punctuation characters that are blanked out.
_PUNCTUATION = '&+#*·–’“•\'"?!,.():;><_/-'

# One translation table applied after lower-casing: punctuation and ASCII
# digits become spaces, Turkish letters become their ASCII look-alikes.
_CLEANUP_MAP = dict.fromkeys(_PUNCTUATION + '0123456789', ' ')
_CLEANUP_MAP.update({'ş': 's', 'ı': 'i', 'ö': 'o', 'ü': 'u', 'ğ': 'g', 'ç': 'c'})
_CLEANUP_TABLE = str.maketrans(_CLEANUP_MAP)

# Runs of word characters; everything else acts as a separator.
_WORD_RE = re.compile(r"\w+")


def normalize_text(text):
  """Return *text* lower-cased, ASCII-folded and stripped of punctuation/digits.

  Steps, in order:
    1. map "I" and "İ" to "i", then lower-case the string;
    2. drop anything that cannot be encoded as UTF-8 (lone surrogates);
    3. replace the listed punctuation characters and ASCII digits by spaces
       and fold ş/ı/ö/ü/ğ/ç to s/i/o/u/g/c;
    4. keep only runs of word characters, joined by single spaces.

  Args:
    text: the input line (a ``str``).

  Returns:
    The normalized line, without leading/trailing whitespace or newline.
  """
  text = text.translate(_UPPER_I_TABLE).lower()
  # The encode/decode round trip silently removes unencodable code points.
  text = text.encode("utf-8", "ignore").decode()
  text = text.translate(_CLEANUP_TABLE)
  # Tokenize on word characters; this also collapses the inserted spaces.
  return " ".join(_WORD_RE.findall(text))

def count_lines(filename):
  """Return the number of lines in the text file *filename*.

  The file is decoded as UTF-8 rather than with the platform default
  encoding, so the count does not depend on the locale and matches how the
  corpus files are opened when they are processed.

  Args:
    filename: path of the file to scan.

  Returns:
    The line count as an ``int`` (0 for an empty file).
  """
  with open(filename, encoding="utf-8") as fi:
    # Stream the file; only a counter is kept in memory.
    return sum(1 for _ in fi)

In [0]:
normalize_text('İSTANBUL ben geldim. 39 :)))')

In [0]:
normalize_text('ISTANBUL ben geldim. 39 :)))')

In [0]:
normalize_text('iSTANBUL ben geldim. 39 :)))')

In [0]:
normalize_text('ıSTANBUL ben geldim. 39 :)))')

In [0]:
RAW_DATA_FPATH = "train.txt"
PRC_DATA_FPATH = "cleaned_train.txt"

# Normalize the training corpus line by line; the progress bar is sized from
# a first pass that counts the input lines.
total_lines = count_lines(RAW_DATA_FPATH)
bar = Progbar(total_lines)

with open(RAW_DATA_FPATH, encoding="utf-8") as fi, \
     open(PRC_DATA_FPATH, "w", encoding="utf-8") as fo:
  for raw_line in fi:
    cleaned = normalize_text(raw_line)
    fo.write("{}\n".format(cleaned))
    bar.add(1)

In [0]:
RAW_DATA_TEST_FPATH = "test.txt"
PRC_DATA_TEST_FPATH = "cleaned_test.txt"

# Normalize the test corpus line by line; the progress bar is sized from a
# first pass that counts the input lines.
test_lines = count_lines(RAW_DATA_TEST_FPATH)
test_bar = Progbar(test_lines)

with open(RAW_DATA_TEST_FPATH, encoding="utf-8") as fi, \
     open(PRC_DATA_TEST_FPATH, "w", encoding="utf-8") as fo:
  for raw_line in fi:
    cleaned = normalize_text(raw_line)
    fo.write("{}\n".format(cleaned))
    test_bar.add(1)

## VOCABULARY OLUŞTURMA:

---
BERT makalesinde tokenization için WordPiece tokenizer kullanılmaktadır, ancak bunun açık kaynak kodu bulunmuyor; bu yüzden SentencePiece tokenizer (unigram modda) kullanıldı. WordPiece'te bir kelimenin devamı olan parçaların (eklerin) başına ## gelirken, SentencePiece'te kelime başlangıcındaki parçaların başına ▁ gelmektedir. Bu yüzden ▁ ile başlayan parçalardaki ▁ silinmiş, ▁ ile başlamayan parçaların başına ise ## konulmuştur. Placeholder token sayısı, vocabulary'yi daha sonra güncellemek veya eğitimden sonra fine-tuning yapmak istersek kullanılmak üzere belirtilmiştir: bu durumda placeholder tokenların yerine task-specific tokenlar gelir, pre-training verisi yeniden oluşturulur ve model yeni verilere göre ayarlanır.


In [0]:
MODEL_PREFIX = "tokenizer"
VOC_SIZE = 40000
SUBSAMPLE_SIZE = 1280000
NUM_PLACEHOLDERS = 128

# Flag string handed to the SentencePiece trainer. The learnt vocabulary is
# NUM_PLACEHOLDERS entries smaller than VOC_SIZE so that the remaining slots
# can be filled later.
# NOTE(review): --bos_id=-1 / --eos_id=-1 presumably disable the BOS/EOS
# tokens — confirm against the SentencePiece documentation.
SPM_COMMAND = " ".join([
    "--input={}".format(PRC_DATA_FPATH),
    "--model_prefix={}".format(MODEL_PREFIX),
    "--vocab_size={}".format(VOC_SIZE - NUM_PLACEHOLDERS),
    "--input_sentence_size={}".format(SUBSAMPLE_SIZE),
    "--shuffle_input_sentence=true",
    "--bos_id=-1",
    "--eos_id=-1",
])

spm.SentencePieceTrainer.Train(SPM_COMMAND)

In [0]:
def read_sentencepiece_vocab(filepath):
  """Load the token column of a SentencePiece ``.vocab`` file.

  Every line is cut at its first tab and only the part before it is kept.
  The first line of the file (the ``<unk>`` entry) is left out of the result.

  Args:
    filepath: path of the UTF-8 encoded vocabulary file.

  Returns:
    A list with one token string per remaining line, in file order.
  """
  with open(filepath, encoding='utf-8') as vocab_file:
    # Consume the leading <unk> line; a no-op on an empty file.
    next(vocab_file, None)
    return [entry.partition("\t")[0] for entry in vocab_file]

snt_vocab = read_sentencepiece_vocab("{}.vocab".format(MODEL_PREFIX))
print("Learnt vocab size: {}".format(len(snt_vocab)))
print("Sample tokens: {}".format(random.sample(snt_vocab, 10)))

In [0]:
def parse_sentencepiece_token(token):
    """Convert one SentencePiece token to the "##" continuation convention.

    A token that begins with the "▁" marker is returned without that marker;
    any other token is returned with "##" prepended.
    """
    has_marker = token.startswith("▁")
    return token[1:] if has_marker else "##" + token

In [0]:
bert_vocab = list(map(parse_sentencepiece_token, snt_vocab))

In [0]:
ctrl_symbols = ["[PAD]","[UNK]","[CLS]","[SEP]","[MASK]"]
bert_vocab = ctrl_symbols + bert_vocab

In [0]:
bert_vocab += ["[UNUSED_{}]".format(i) for i in range(VOC_SIZE - len(bert_vocab))]
print(len(bert_vocab))

In [0]:
VOC_FNAME = "vocab.txt"

# Write the vocabulary, one token per line. The encoding is pinned to UTF-8
# (the encoding the SentencePiece vocab is read with) so that non-ASCII
# tokens are written identically regardless of the platform default.
with open(VOC_FNAME, "w", encoding="utf-8") as fo:
  for token in bert_vocab:
    fo.write(token+"\n")

## PRE-TRAINING VERİSİ OLUŞTURMA:

---
Veri, BERT modelinin eğitimi için uygun hale getiriliyor.


In [0]:
!mkdir ./shards # the corpus is large, so it is split into shards
# NOTE(review): the original note said this example has only 100,000 lines and
# therefore yields a single shard; with CORPUS_SIZE = 700000 and 256000 lines
# per shard, 3 shards are expected — confirm against the `ls` output below.
!split -a 4 -l 256000 -d $PRC_DATA_FPATH ./shards/shard_
!ls ./shards/

In [0]:
# Parameters for building the pre-training data with
# bert/create_pretraining_data.py.
MAX_SEQ_LENGTH = 128  # passed as --max_seq_length
MASKED_LM_PROB = 0.15  # passed as --masked_lm_prob
MAX_PREDICTIONS = 20  # passed as --max_predictions_per_seq
DO_LOWER_CASE = True  # passed as --do_lower_case
PROCESSES = 2  # number of parallel xargs workers (-P)
PRETRAINING_DIR = "pretraining_data"  # directory that receives the .tfrecord files

In [0]:
# Run BERT's create_pretraining_data.py once per shard with the parameters
# chosen above; xargs runs up to PROCESSES invocations in parallel.
XARGS_CMD = ("ls ./shards/ | "
             "xargs -n 1 -P {} -I{} "
             "python3 bert/create_pretraining_data.py "
             "--input_file=./shards/{} "
             "--output_file={}/{}.tfrecord "
             "--vocab_file={} "
             "--do_lower_case={} "
             "--max_predictions_per_seq={} "
             "--max_seq_length={} "
             "--masked_lm_prob={} "
             "--random_seed=34 "
             "--dupe_factor=5")

# The placeholders are filled positionally. The three literal '{}' arguments
# re-emit "{}" unchanged, so it survives as the xargs -I replacement string
# (the shard file name) in -I{}, --input_file and --output_file.
XARGS_CMD = XARGS_CMD.format(PROCESSES, '{}', '{}', PRETRAINING_DIR, '{}', 
                             VOC_FNAME, DO_LOWER_CASE, 
                             MAX_PREDICTIONS, MAX_SEQ_LENGTH, MASKED_LM_PROB)

In [0]:
tf.gfile.MkDir(PRETRAINING_DIR)
!$XARGS_CMD

## DEPOLAMAYI AYARLAMA: GOOGLE CLOUD STORAGE

---

CONSOLE: https://console.cloud.google.com/storage/
KULLANILMASININ SEBEBİ: VERİ SETİNİN BERT İLE EĞİTİMİ SAATLER SÜRDÜĞÜ İÇİN EĞİTİME KALDIĞI YERDEN DEVAM EDEBİLMEK. VERİLERİ KAYDETMEK İÇİN OLUŞTURULAN PAKETTE (BUCKET) EĞİTİM İÇİN KULLANILAN VERİLER VE ELDE EDİLEN AĞIRLIKLAR SAKLANMAKTADIR; AYRICA SONRAKİ KULLANIMLAR İÇİN DE FAYDALI OLACAKTIR.

In [0]:
project_id ='arctic-analyzer-254614'

In [0]:
!gcloud config set project {project_id}

In [0]:
!gsutil mb gs://bert_resourse/

In [0]:
BUCKET_NAME = "bert_resourse" 
MODEL_DIR = "bert_model"
tf.gfile.MkDir(MODEL_DIR)

if not BUCKET_NAME:
  log.warning("WARNING: BUCKET_NAME is not set. "
              "You will not be able to train the model.")

In [0]:
# BERT-base hyper-parameter configuration. vocab_size is tied to VOC_SIZE so
# the config always agrees with the vocabulary size chosen for this run.

bert_base_config = {
  "attention_probs_dropout_prob": 0.1, 
  "directionality": "bidi", 
  "hidden_act": "gelu", 
  "hidden_dropout_prob": 0.1, 
  "hidden_size": 768, 
  "initializer_range": 0.02, 
  "intermediate_size": 3072, 
  "max_position_embeddings": 512, 
  "num_attention_heads": 12, 
  "num_hidden_layers": 12, 
  "pooler_fc_size": 768, 
  "pooler_num_attention_heads": 12, 
  "pooler_num_fc_layers": 3, 
  "pooler_size_per_head": 128, 
  "pooler_type": "first_token_transform", 
  "type_vocab_size": 2, 
  "vocab_size": VOC_SIZE
}

# Store the model config next to the vocabulary inside MODEL_DIR.
with open("{}/bert_config.json".format(MODEL_DIR), "w") as fo:
  json.dump(bert_base_config, fo, indent=2)

# Write the vocabulary, one token per line. The encoding is pinned to UTF-8
# (the encoding the SentencePiece vocab is read with) so that non-ASCII
# tokens are written identically regardless of the platform default.
with open("{}/{}".format(MODEL_DIR, VOC_FNAME), "w", encoding="utf-8") as fo:
  for token in bert_vocab:
    fo.write(token+"\n")

In [0]:
if BUCKET_NAME:
  !gsutil -m cp -r $MODEL_DIR $PRETRAINING_DIR gs://$BUCKET_NAME

# TRAINING 


In [0]:
# Same values as in the earlier cells (presumably repeated so that training
# can be started from a fresh session — confirm).
BUCKET_NAME = "bert_resourse" 
MODEL_DIR = "bert_model" 
PRETRAINING_DIR = "pretraining_data" 
VOC_FNAME = "vocab.txt" 

# Input data pipeline config
# MAX_PREDICTIONS / MAX_SEQ_LENGTH / MASKED_LM_PROB carry the same values that
# were used when the pre-training tfrecords were generated.
TRAIN_BATCH_SIZE = 128 
MAX_PREDICTIONS = 20 
MAX_SEQ_LENGTH = 128 
MASKED_LM_PROB = 0.15 

# Training procedure config
EVAL_BATCH_SIZE = 64
LEARNING_RATE = 2e-5
TRAIN_STEPS = 700000 
SAVE_CHECKPOINTS_STEPS = 2500 
NUM_TPU_CORES = 8

# With a bucket configured everything lives under gs://<bucket>; otherwise
# the paths fall back to the current working directory.
if BUCKET_NAME:
  BUCKET_PATH = "gs://{}".format(BUCKET_NAME)
else:
  BUCKET_PATH = "."

BERT_GCS_DIR = "{}/{}".format(BUCKET_PATH, MODEL_DIR)
DATA_GCS_DIR = "{}/{}".format(BUCKET_PATH, PRETRAINING_DIR)

VOCAB_FILE = os.path.join(BERT_GCS_DIR, VOC_FNAME)
CONFIG_FILE = os.path.join(BERT_GCS_DIR, "bert_config.json")

# Most recent checkpoint in the model directory, used to resume training.
# NOTE(review): presumably None on a fresh run with no checkpoint — confirm.
INIT_CHECKPOINT = tf.train.latest_checkpoint(BERT_GCS_DIR)

bert_config = modeling.BertConfig.from_json_file(CONFIG_FILE)
# All pre-training shards: every file in DATA_GCS_DIR whose name ends in "tfrecord".
input_files = tf.gfile.Glob(os.path.join(DATA_GCS_DIR,'*tfrecord'))

log.info("Using checkpoint: {}".format(INIT_CHECKPOINT))
log.info("Using {} data shards".format(len(input_files)))

In [0]:
# Model function built by BERT's run_pretraining.py, resuming from
# INIT_CHECKPOINT.
model_fn = model_fn_builder(
      bert_config=bert_config,
      init_checkpoint=INIT_CHECKPOINT,
      learning_rate=LEARNING_RATE,
      num_train_steps=TRAIN_STEPS,
      num_warmup_steps=10,
      use_tpu=USE_TPU,
      use_one_hot_embeddings=True)

# Resolve the TPU cluster only when a TPU runtime was detected: TPU_ADDRESS is
# defined solely in that case, so reading it unconditionally raises NameError
# on a CPU/GPU runtime. RunConfig accepts cluster=None.
tpu_cluster_resolver = (
    tf.contrib.cluster_resolver.TPUClusterResolver(TPU_ADDRESS)
    if USE_TPU else None)

# Checkpoints are written to BERT_GCS_DIR every SAVE_CHECKPOINTS_STEPS steps;
# the same value is used for iterations_per_loop.
run_config = tf.contrib.tpu.RunConfig(
    cluster=tpu_cluster_resolver,
    model_dir=BERT_GCS_DIR,
    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS,
    tpu_config=tf.contrib.tpu.TPUConfig(
        iterations_per_loop=SAVE_CHECKPOINTS_STEPS,
        num_shards=NUM_TPU_CORES,
        per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2))

estimator = tf.contrib.tpu.TPUEstimator(
    use_tpu=USE_TPU,
    model_fn=model_fn,
    config=run_config,
    train_batch_size=TRAIN_BATCH_SIZE,
    eval_batch_size=EVAL_BATCH_SIZE)
  
# Input pipeline over the pre-training tfrecord shards.
train_input_fn = input_fn_builder(
        input_files=input_files,
        max_seq_length=MAX_SEQ_LENGTH,
        max_predictions_per_seq=MAX_PREDICTIONS,
        is_training=True)

Fire!

In [0]:
estimator.train(input_fn=train_input_fn, max_steps=TRAIN_STEPS)