<a href="https://colab.research.google.com/github/felipefreitas93/Colab_Notebooks/blob/master/XLNet_TPU.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Fraction of the IMDB training set that is kept; the step counts below are
# scaled by the same factor.
FRAC = 0.4
# Both values are later rendered into command-line flags (--train_steps,
# --warmup_steps). Keep them as ints so they print as "1600"/"200" rather than
# "1600.0"/"200.0", which an integer flag parser would reject.
NUM_TRAIN_STEPS = int(round(4000 * FRAC))
WARMUP_STEPS = int(round(500 * FRAC))

## Install sentencepiece


In [0]:
# Install the sentencepiece package; presumably needed by run_classifier.py to
# read the spiece.model file passed via --spiece_model_file further down.
# TODO: pin a version so the notebook stays reproducible.
!pip install sentencepiece

## Import dependencies

In [0]:
#install dependencies
import os
import csv
import tensorflow as tf
import pandas as pd  
import subprocess
import sys

## Set up the TPU and connect to Cloud Bucket

In [0]:
import datetime
import json
import pprint
import random
import string
import sys
import tensorflow as tf

# NOTE: the full os.environ is intentionally not printed here; it can contain
# tokens or other sensitive values that would end up in the saved notebook output.

# Fail early with a readable message when the runtime has no TPU attached.
assert 'COLAB_TPU_ADDR' in os.environ, 'ERROR: Not connected to a TPU runtime; please see the "Instructions" section in this notebook!'
TPU_ADDRESS = 'grpc://' + os.environ['COLAB_TPU_ADDR']
print('TPU address is', TPU_ADDRESS)

# Interactive step: asks the user to authenticate with their Google account.
from google.colab import auth
auth.authenticate_user()
with tf.Session(TPU_ADDRESS) as session:
  print('TPU devices:')
  pprint.pprint(session.list_devices())

  # Upload credentials to TPU.
  # Assumes authenticate_user() left the credentials at /content/adc.json -- TODO confirm.
  with open('/content/adc.json', 'r') as f:
    auth_info = json.load(f)
  tf.contrib.cloud.configure_gcs(session, credentials=auth_info)
  # Now credentials are set for all future sessions on this TPU.
  # Now credentials are set for all future sessions on this TPU.

## Download GitHub Repository

In [0]:
git_url = "https://github.com/aditya-malte/Colab-XLNet-FineTuning.git"  #@param {type:"string"}
os.system("git clone "+git_url)
%cd Colab-XLNet-FineTuning

In [0]:
# Optional: run this cell when the upstream repository changed after it was
# cloned and the local checkout should pick up those changes.
!git pull origin master

## Download the IMDB dataset

In [0]:
# Fetch the IMDB review archive and unpack it in the current directory.
# The code below expects the extracted data under ./aclImdb (train/pos, train/neg).
!wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar zxf aclImdb_v1.tar.gz

def get_to_keep(frac, data_dir='aclImdb/train'):
    """Subsample the training reviews in place, keeping `frac` of each class.

    For each of ``<data_dir>/pos`` and ``<data_dir>/neg`` a seeded random
    sample of the file names is drawn and every file outside the sample is
    deleted from disk.

    Args:
        frac: Fraction of files to keep per class directory; forwarded to
            ``DataFrame.sample(frac=...)``.
        data_dir: Directory that contains the ``pos`` and ``neg``
            sub-directories. Defaults to the location used by this notebook.

    Returns:
        None. The function only has the side effect of deleting files.

    Note:
        The operation is destructive and not idempotent: calling it again
        samples from the files that survived the previous call.
    """
    for label in ('pos', 'neg'):
        class_dir = os.path.join(data_dir, label)
        # os.listdir returns names in arbitrary order; sorting makes the seeded
        # sample select the same files on every machine.
        file_names = sorted(os.listdir(class_dir))
        if not file_names:
            # Nothing to sample from (an empty frame has no column 0).
            continue
        sampled = pd.DataFrame(file_names).sample(frac=frac, random_state=1)[0]
        # A set gives O(1) membership tests instead of scanning a list per file.
        keep = set(sampled.values.tolist())
        for file_name in file_names:
            if file_name not in keep:
                os.remove(os.path.join(class_dir, file_name))
            
# Destructively trim aclImdb/train: only a FRAC share of the review files per
# class is kept, every other file is deleted from disk.
get_to_keep(FRAC)

In [0]:
repo_name = 'Colab-XLNet-FineTuning' #@param {type:"string"}
%ls
%cd {repo_name}
!ls

# XLNet End to End (Fine-tuning + Evaluation) in 5 minutes with Cloud TPU

## Instructions

<h3><a href="https://cloud.google.com/tpu/"><img valign="middle" src="https://raw.githubusercontent.com/GoogleCloudPlatform/tensorflow-without-a-phd/master/tensorflow-rl-pong/images/tpu-hexagon.png" width="50"></a>  &nbsp;&nbsp;Train on TPU</h3>

   1. Create a Cloud Storage bucket at http://console.cloud.google.com/storage for the pretrained model files, the training output and your TensorBoard logs, then fill in the `BUCKET` parameter in the code cell directly below these instructions.
 
   1. On the main menu, click Runtime and select **Change runtime type**. Set "TPU" as the hardware accelerator.
   1. Click Runtime again and select **Runtime > Run All** (watch out: the "Set up the TPU and connect to Cloud Bucket" cell asks you to authenticate and therefore requires user input). You can also run the cells manually with Shift-ENTER.

In [0]:
TASK = 'IMDB' #@param {type:"string"}

TASK_DATA_DIR = 'aclImdb' #@param {type:"string"}
print('***** Task data directory: {} *****'.format(TASK_DATA_DIR))
!ls $TASK_DATA_DIR

BUCKET = 'deep_learning_bucket_mestrado' #@param {type:"string"}
assert BUCKET, 'Must specify an existing GCS bucket name'
OUTPUT_DIR = 'gs://{}/xlnet/output/{}'.format(BUCKET, TASK)
MODEL_DIR = 'gs://{}/xlnet/model/{}'.format(BUCKET, TASK)

tf.gfile.MakeDirs(OUTPUT_DIR)
tf.gfile.MakeDirs(MODEL_DIR)

print('***** Model output directory: {} *****'.format(OUTPUT_DIR))



## Download the XLNet-Large model

In [0]:
os.system("wget https://storage.googleapis.com/xlnet/released_models/cased_L-24_H-1024_A-16.zip")
os.system("unzip cased_L-24_H-1024_A-16.zip")
!ls

In [0]:
# Enter the extracted model folder: the following cells list the current
# directory and upload its files by bare name.
%cd xlnet_cased_L-24_H-1024_A-16
!ls

In [0]:
# Snapshot the entries of the current directory (the extracted model folder at
# this point); the upload cell iterates over this list.
file_names = os.listdir(os.curdir)
print(file_names)

## Copy the weights to Google Cloud Bucket

In [0]:
for file_name in file_names:
  print(file_name)
  os.system("gsutil cp "+ file_name + " " + MODEL_DIR)
os.system("gsutil ls " + MODEL_DIR)
%cd ..

## Copy the spiece.model file to local directory

In [0]:
os.system("gsutil cp -r " + MODEL_DIR + "/spiece.model spiece.model")
!ls

## Choose Hyperparameters

In [0]:
# Optimisation / input settings forwarded to run_classifier.py as flags.
TRAIN_BATCH_SIZE = 32
EVAL_BATCH_SIZE = 8
PREDICT_BATCH_SIZE = 8  # not referenced by the train/eval commands in this notebook
LEARNING_RATE = 2e-5    # defined once; the duplicate assignment was removed
MAX_SEQ_LENGTH = 256

# Model configs
SAVE_CHECKPOINTS_STEPS = 4500  # forwarded as --save_steps
NUM_ITERATIONS = 500           # forwarded as --iterations

## Run Training

In [0]:
# Assemble the fine-tuning invocation of run_classifier.py as one shell command.
# Flags are kept in a list (one per line) and joined with single spaces.
train_flags = [
    "--use_tpu=True",
    "--use_colab_tpu=True",
    "--do_train=True",
    "--do_eval=False",
    "--task_name=" + TASK.lower(),
    "--data_dir=./" + TASK_DATA_DIR,
    "--output_dir=" + OUTPUT_DIR,
    "--model_dir=" + MODEL_DIR,
    "--uncased=False",
    "--tpu_address=" + TPU_ADDRESS,
    "--spiece_model_file=./spiece.model",
    "--model_config_path=" + MODEL_DIR + "/xlnet_config.json",
    "--init_checkpoint=" + MODEL_DIR + "/xlnet_model.ckpt",
    "--max_seq_length=" + str(MAX_SEQ_LENGTH),
    "--train_batch_size=" + str(TRAIN_BATCH_SIZE),
    "--eval_batch_size=" + str(EVAL_BATCH_SIZE),
    "--num_hosts=1",
    "--num_core_per_host=8",
    "--learning_rate=" + str(LEARNING_RATE),
    # The step counts are derived from the float FRAC; render them as whole
    # numbers ("1600", not "1600.0") because step flags are expected to be integers.
    "--train_steps=" + str(int(NUM_TRAIN_STEPS)),
    "--warmup_steps=" + str(int(WARMUP_STEPS)),
    "--save_steps=" + str(SAVE_CHECKPOINTS_STEPS),
    "--iterations=" + str(NUM_ITERATIONS),
]
train_command = " ".join(["python run_classifier.py"] + train_flags)

# Echo the final command so the exact flag values are visible in the output.
print(train_command)


In [0]:
# Execute the assembled fine-tuning command in a shell; IPython substitutes the
# value of train_command for the {...} placeholder.
!{train_command}

## Run Evaluation

In [0]:
# Assemble the evaluation invocation of run_classifier.py as one shell command.
# NOTE(review): training is started with --model_dir=MODEL_DIR while evaluation
# uses --model_dir=OUTPUT_DIR; confirm that the checkpoints written during
# training are located where --eval_all_ckpt looks for them.
eval_flags = [
    "--use_tpu=True",
    "--use_colab_tpu=True",
    "--do_train=False",
    "--do_eval=True",
    "--eval_all_ckpt=True",
    "--task_name=" + TASK.lower(),
    "--data_dir=./" + TASK_DATA_DIR,
    "--output_dir=" + OUTPUT_DIR,
    "--model_dir=" + OUTPUT_DIR,
    "--uncased=False",
    "--tpu_address=" + TPU_ADDRESS,
    "--spiece_model_file=./spiece.model",
    "--model_config_path=" + MODEL_DIR + "/xlnet_config.json",
    "--init_checkpoint=" + MODEL_DIR + "/xlnet_model.ckpt",
    "--max_seq_length=" + str(MAX_SEQ_LENGTH),
    "--train_batch_size=" + str(TRAIN_BATCH_SIZE),
    "--eval_batch_size=" + str(EVAL_BATCH_SIZE),
    "--num_hosts=1",
    "--num_core_per_host=8",
    # Use the shared constant instead of a second hard-coded 2e-5 so training
    # and evaluation cannot drift apart.
    "--learning_rate=" + str(LEARNING_RATE),
    # The step counts are derived from the float FRAC; render them as whole
    # numbers ("1600", not "1600.0") because step flags are expected to be integers.
    "--train_steps=" + str(int(NUM_TRAIN_STEPS)),
    "--warmup_steps=" + str(int(WARMUP_STEPS)),
    "--save_steps=" + str(SAVE_CHECKPOINTS_STEPS),
    "--iterations=" + str(NUM_ITERATIONS),
]
eval_command = " ".join(["python run_classifier.py"] + eval_flags)

# Echo the final command so the exact flag values are visible in the output.
print(eval_command)


In [0]:
# Execute the assembled evaluation command in a shell; IPython substitutes the
# value of eval_command for the {...} placeholder.
!{eval_command}