<a href="https://colab.research.google.com/github/diego-feijo/bertpt/blob/master/Fine_tuning_ALBERT_for_Fake_News_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fine-tuning ALBERT for Fake News Detection
This kernel shows how to fine-tune an ALBERT model for a simple binary classification task.

It uses the Fake.BR dataset to distinguish fake news from true news.

This is our script:
1. Setting Up the Environment
2. Download and Prepare Fake.BR Dataset
3. Create K-Folds for Evaluation
4. Fine-Tuning (Training)
5. Saving the Results to a Google Spreadsheet

Colab TPU requires a [Google Cloud Storage bucket](https://cloud.google.com/tpu/docs/quickstart). New users have [$300 free credit](https://cloud.google.com/free/) for one year.

It is mandatory to set BUCKET_NAME to be able to run on TPU. 
BUCKET_NAME is not needed if running on GPU.

MIT License

Copyright (c) [2019] [Diego de Vargas Feijo]

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

## Step 1: Setting Up the Environment
Install dependencies, import libraries and authorize using Google Account.

In [0]:
!pip install --upgrade -q gspread sentencepiece

import json
import os
import logging
import tensorflow as tf
import pandas as pd
import numpy as np
import sys
import re
import unicodedata
import sklearn
import gspread

# configure logging
# Log through the 'tensorflow' logger and show INFO messages and above.
log = logging.getLogger('tensorflow')
log.setLevel(logging.INFO)

# Authenticate the Colab user with their Google account.
from google.colab import auth
auth.authenticate_user()

# Authorize gspread with the application-default credentials; `gc` is the
# client used later to open/create the results spreadsheet.
from oauth2client.client import GoogleCredentials
gc = gspread.authorize(GoogleCredentials.get_application_default())

# COLAB_TPU_ADDR is treated as the marker of a TPU runtime. Note that
# TPU_ADDRESS is only defined in the TPU branch; USE_TPU is defined in both.
if 'COLAB_TPU_ADDR' in os.environ:
  log.info("Using TPU runtime")
  USE_TPU = True
  TPU_ADDRESS = 'grpc://' + os.environ['COLAB_TPU_ADDR']

  with tf.Session(TPU_ADDRESS) as session:
    log.info('TPU address is ' + TPU_ADDRESS)
    # Upload credentials to TPU.
    # NOTE(review): /content/adc.json is presumably written by
    # auth.authenticate_user() above - confirm.
    with open('/content/adc.json', 'r') as f:
      auth_info = json.load(f)
    tf.contrib.cloud.configure_gcs(session, credentials=auth_info)
    
else:
  log.warning('Not connected to TPU runtime')
  USE_TPU = False

In [0]:
# Clone the ALBERT sources unless an ALBERT directory already exists.
!test -d ALBERT || git clone https://github.com/google-research/ALBERT.git ALBERT

# Avoid deprecated warnings
# NOTE(review): these sed expressions have no `g` flag, so only the first
# match on each line is rewritten.
!sed -i 's/tf.logging/tf.compat.v1.logging/' ALBERT/*.py
!sed -i 's/tf.app.run/tf.compat.v1.app.run/' ALBERT/*.py

# Remove invalid import
!sed -i 's/from __future__ import google_type_annotations/# from __future__ import google_type_annotations/' ALBERT/classifier_utils.py
# Mute too verbose output
# The pattern is the already-rewritten `tf.compat.v1.logging.info`, so this
# line has to run after the `tf.logging` substitution above.
!sed -i 's/tf.compat.v1.logging.info/# tf.compat.v1.logging.info/' ALBERT/tokenization.py

In [0]:
# Make the cloned ALBERT sources importable; re-running the cell must not
# add the entry a second time.
if 'ALBERT' not in sys.path:
  sys.path.append('ALBERT')

import tokenization

In [0]:
# Short task name; it is used to build OUTPUT_DIR and the title of the
# results spreadsheet ("albert_" + NAME).
NAME = "fakebr"
# Name of the fine-tuning output folder inside gs://BUCKET_NAME/MODEL_NAME/.
OUTPUT_DIR = "{}_output".format(NAME)

# Whether texts are lower-cased; passed to normalize_text(), to the
# tokenizer (do_lower_case) and to run_classifier.py (--do_lower_case).
LOWER = False # @param {type: "boolean"}

# Bucket and model folder that form the gs://BUCKET_NAME/MODEL_NAME path
# from which the tokenizer, config and checkpoint are read.
BUCKET_NAME = "<Insert Bucket Name Here>" # @param {type: "string"}
MODEL_NAME = "albert_cased_L-12_H-768_A-12" # @param {type: "string"}

## Step 2: Download and Prepare Fake.BR Dataset


In [0]:
!test -d Fake.br-Corpus || git clone https://github.com/roneysco/Fake.br-Corpus.git

In [0]:
DATA_DIR = 'Fake.br-Corpus/size_normalized_texts/'

def read_dir(directory, news_set):
  """Load every ``.txt`` file of one class folder as ``[text, label]`` pairs.

  Args:
    directory: Root directory that contains one sub-directory per class.
    news_set: Name of the sub-directory to read; it is also stored as the
      label of every text found there.

  Returns:
    A list of ``[text, news_set]`` lists, ordered by file name. The list is
    empty when the folder holds no ``.txt`` file.
  """
  src_dir = os.path.join(directory, news_set)
  # os.listdir() returns entries in arbitrary order; sorting keeps the
  # dataset order stable, so any seeded shuffling or splitting done on the
  # result is reproducible across machines and runs.
  filenames = sorted(
      name for name in os.listdir(src_dir) if name.endswith('.txt'))
  news_list = []
  for filename in filenames:
    # Decode explicitly as UTF-8 instead of relying on the platform locale.
    with open(os.path.join(src_dir, filename), encoding='utf-8') as f:
      news_list.append([f.read(), news_set])
  return news_list

# Load both classes; the sub-directory name doubles as the label.
fake_list = read_dir(DATA_DIR, 'fake')
true_list = read_dir(DATA_DIR, 'true')
dataset = fake_list + true_list

# One row per news text: column 'text' holds the file content and column
# 'label' holds 'fake' or 'true'.
df = pd.DataFrame(dataset, columns=['text', 'label'])

In [0]:
import re
import unicodedata

def unicode_to_ascii(s):
    """Strip diacritics from ``s``.

    The string is decomposed with NFD normalization and every nonspacing
    combining mark (Unicode category ``Mn``) is dropped. Characters without
    such a decomposition are returned unchanged, so the result is not
    guaranteed to be pure ASCII.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)
  

# Patterns used by normalize_text(), compiled once at import time.

# A single tab, newline or carriage return.
re_line_control = re.compile(r'[\t\n\r]')
# A run of one or more whitespace characters.
re_trim = re.compile(r'\s+', re.UNICODE)
# A single-quote-like character at the start of the text or right after a
# non-word character; group 1 captures what precedes the quote.
re_quotes_1 = re.compile(r"(?u)(^|\W)[‘’′`']", re.UNICODE)
# A single-quote-like character right before a non-word character or the end
# of the text; group 1 captures what follows the quote.
re_quotes_2 = re.compile(r"(?u)[‘’`′'](\W|$)", re.UNICODE)
# Any remaining typographic quote, prime or backtick (single or double).
re_quotes_3 = re.compile(r'(?u)[‘’`′“”]', re.UNICODE)
# Two consecutive double quotes.
re_doublequotes_1 = re.compile(r'(\"\")')
# Two consecutive single quotes.
re_doublequotes_2 = re.compile(r'(\'\')')


def normalize_text(text, lowercase=False):
  """Normalize whitespace and quote characters of a text.

  Steps, in order: convert to ``str`` and strip the ends; turn tabs and line
  breaks into spaces and collapse whitespace runs into one space; optionally
  lower-case; round-trip through UTF-8 ignoring encoding errors; replace
  non-breaking spaces; rewrite quote-like characters as ``"`` and then remove
  every ``"``; finally collapse doubled quotes.

  Args:
    text: Value to normalize; it is converted with ``str()`` first.
    lowercase: When true, the text is lower-cased.

  Returns:
    The normalized text.
  """
  cleaned = str(text).strip()
  cleaned = re_trim.sub(' ', re_line_control.sub(' ', cleaned))
  if lowercase:
    cleaned = cleaned.lower()
  # Characters that cannot be encoded as UTF-8 are dropped by the round trip.
  cleaned = cleaned.encode("utf-8", "ignore").decode()
  cleaned = cleaned.replace('\xa0', ' ')
  # Quote-like characters first become plain double quotes ...
  for pattern, replacement in (
      (re_quotes_1, r'\1"'),
      (re_quotes_2, r'"\1'),
      (re_quotes_3, '"')):
    cleaned = pattern.sub(replacement, cleaned)
  # ... and then every double quote is removed from the text.
  cleaned = cleaned.replace('"', '')
  cleaned = re_doublequotes_1.sub('"', cleaned)
  cleaned = re_doublequotes_2.sub("'", cleaned)
  return cleaned


In [0]:
# Numeric encoding of the two classes and its inverse mapping.
category_to_id = {'true': 0, 'fake': 1}
id_to_category = {label_id: name for name, label_id in category_to_id.items()}

# Column order matters: the frame is later written to TSV without a header.
# NOTE(review): 'dummy' is presumably the unused placeholder column of the
# CoLA TSV layout - confirm against the ALBERT CoLA processor.
train_x = pd.DataFrame({
    'id': list(range(len(df))),
    'label': df['label'].replace(category_to_id),
    'dummy': ['a'] * len(df),
    'text': df['text'].apply(normalize_text, args=(LOWER,))})

# Shuffle all rows with a fixed seed and renumber the index from zero.
train_x = train_x.sample(frac=1, random_state=42)
train_x = train_x.reset_index(drop=True)
train_x.head()

In [0]:
# Vocabulary and SentencePiece model files handed to the tokenizer and to
# run_classifier.py.
VOC_FNAME = 'tokenizer.vocab'
MDL_FNAME = "tokenizer.model"

# Copy the tokenizer archive from the bucket unless it is already present
# locally, then unpack it unless tokenizer.model already exists.
# NOTE(review): the archive presumably contains both files named above - confirm.
!test -f tokenizer.tar.gz || gsutil -m cp gs://{BUCKET_NAME}/{MODEL_NAME}/tokenizer.tar.gz .
!test -f tokenizer.model || tar xzvf tokenizer.tar.gz

In [0]:
# Quick sanity check of the tokenizer on a short informal sample.
testcase = 'kkkkk boooa tarde'
bert_tokenizer = tokenization.FullTokenizer(
    VOC_FNAME,
    do_lower_case=LOWER,
    spm_model_file=MDL_FNAME)
print(bert_tokenizer.tokenize(testcase))

# Token count of every text; `stats` (count, mean, max, ...) is read again
# later when the maximum sequence length is chosen.
a = pd.Series([len(bert_tokenizer.tokenize(text)) for text in train_x['text']])
stats = a.describe()
print(stats)

## Step 3: Create K-Folds for Evaluation
We will split our training set into five partitions. For each fold, four partitions will be used for training and one for evaluation.

In [0]:
from sklearn.model_selection import StratifiedKFold, train_test_split

# Seed shared by the fold splitter and by the train/validation split.
random_state = 42
n_folds = 5

skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=random_state)

# Materialise the (train_index, test_index) pair of every fold up front so
# the training loop can simply index into `folds`.
folds = [
    (train_index, test_index)
    for train_index, test_index in skf.split(train_x, train_x['label'])]

In [0]:
def generate_tsv(X_train, X_test):
  """ Create TSV files in the format expected by BERT """
  tf.gfile.MakeDirs('data/CoLA')
  !test -f data/CoLA/train.tsv || rm data/CoLA/*.tsv

  # We also separate 15% for evaluation during the training
  X_train, X_val = train_test_split(X_train, test_size=0.15, random_state=random_state)
  # Saving dataframes to .tsv format as required by BERT
  X_train.to_csv('data/CoLA/train.tsv', sep='\t', index=False, header=False)
  X_val.to_csv('data/CoLA/dev.tsv', sep='\t', index=False, header=False)
  X_test[['id', 'text']].to_csv('data/CoLA/test.tsv', sep='\t', index=False, header=True)
  # Remove previous fine tuning training data
  !gsutil -m rm -rf gs://{BUCKET_NAME}/{MODEL_NAME}/{OUTPUT_DIR}/

In [0]:
def train():
  BERT_MODEL_PATH = "gs://{}/{}".format(BUCKET_NAME, MODEL_NAME)
  checkpoint = tf.train.latest_checkpoint(BERT_MODEL_PATH)
  CMD = "python3 ALBERT/run_classifier.py " + \
    "--vocab_file={} ".format(VOC_FNAME) + \
    "--spm_model_file={} ".format(MDL_FNAME) + \
    "--albert_config_file={}/albert_config.json ".format(BERT_MODEL_PATH) + \
    "--init_checkpoint={} ".format(checkpoint) + \
    "--task_name=cola " + \
    "--do_train=true " + \
    "--do_eval=true " + \
    "--do_predict=true " + \
    "--data_dir=data/ " + \
    "--max_seq_length={} ".format(SEQ_LEN) + \
    "--train_batch_size={} ".format(BATCH_SIZE) + \
    "--learning_rate={} ".format(LEARNING_RATE) + \
    "--num_train_epochs={} ".format(EPOCH) + \
    "--warmup_proportion=0.1 " + \
    "--save_checkpoint_steps=1000 " + \
    "--iterations_per_loop=1000 " + \
    "--output_dir={}/{}/ ".format(BERT_MODEL_PATH, OUTPUT_DIR) + \
    "--use_tpu={} ".format(USE_TPU) + \
    "--tpu_name={} ".format(TPU_ADDRESS) + \
    "--do_lower_case={} ".format(LOWER)
  !$CMD

In [0]:
def print_divergences(test_x, y_pred, y_true):
  """Print every misclassified text followed by a summary line.

  Args:
    test_x: DataFrame whose ``text`` column holds the evaluated texts, in
      the same order as the predictions.
    y_pred: Array of predicted class ids.
    y_true: Array of expected class ids.

  Class ids are translated to names with the global ``id_to_category``.
  The three inputs must have the same number of rows.
  """
  total = test_x.shape[0]
  assert total == y_pred.shape[0]
  assert total == y_true.shape[0]
  counter = 0
  for i in range(total):
    if y_true[i] != y_pred[i]:
      counter += 1
      print(test_x['text'].iloc[i])
      print('Predicted: {} ; Should be: {}\n'.format(
          id_to_category[y_pred[i]],
          id_to_category[y_true[i]]))
  # The rate used to be written as `1(counter/total)`, which calls the
  # integer 1 and raises TypeError. An empty input reports a rate of zero
  # instead of dividing by zero.
  rate = counter / total if total else 0.0
  print('Misclassifications: {}\tTotal: {}\tRate: {:8.2%}'.format(counter, total, rate))

In [0]:
def prepare_results(X_test):
  """ Generate/Print results according to the task
  Logistic Regression: Accuracy, Weighted F1"""
  !test -d bert_output || mkdir bert_output
  !gsutil -m mv gs://{BUCKET_NAME}/{MODEL_NAME}/{OUTPUT_DIR}/test_results.tsv bert_output/
  df_results = pd.read_csv('bert_output/test_results.tsv', sep='\t', header=None)
  y_pred = df_results.idxmax(axis=1).to_numpy()
  y_true = X_test['label'].to_numpy()

  # print_divergences(X_test, y_pred, y_true)

  weighted_f1 = sklearn.metrics.f1_score(y_true, y_pred, average='weighted')
  accuracy = sklearn.metrics.accuracy_score(y_true, y_pred)
  print('\nFold {:3}\tEpoch {:4}'.format(n+1, EPOCH))
  print('Accuracy\tWeighted F1')
  print('--------\t-----------')
  print('{:8.2%}\t{:11.3f}\n\n'.format(accuracy, weighted_f1))
  return accuracy, weighted_f1

In [0]:
# NOTE(review): this cell repeats the authentication and TPU setup of Step 1;
# presumably it is re-run to refresh credentials before training - confirm.
from google.colab import auth
auth.authenticate_user()

# Authorize gspread with the application-default credentials.
from oauth2client.client import GoogleCredentials
gc = gspread.authorize(GoogleCredentials.get_application_default())

# TPU_ADDRESS is only defined in the TPU branch; USE_TPU is defined in both.
if 'COLAB_TPU_ADDR' in os.environ:
  log.info("Using TPU runtime")
  USE_TPU = True
  TPU_ADDRESS = 'grpc://' + os.environ['COLAB_TPU_ADDR']

  with tf.Session(TPU_ADDRESS) as session:
    log.info('TPU address is ' + TPU_ADDRESS)
    # Upload credentials to TPU.
    with open('/content/adc.json', 'r') as f:
      auth_info = json.load(f)
    tf.contrib.cloud.configure_gcs(session, credentials=auth_info)
    
else:
  log.warning('Not connected to TPU runtime')
  USE_TPU = False

## Step 4: Fine-Tuning (Training)
For each fold, we will train a different model and evaluate it.
Then we will take the average from the results.

In [0]:
BATCH_SIZE = 32
LEARNING_RATE = 4e-5
SEQ_LEN = min(int(stats['max']), 512)

results = []

for n in range(n_folds):
  # 5-fold cross-validation
  train_index, test_index = folds[n]      # Folds were already split
  X_train = train_x.loc[train_index]      # Training is 4/5 of total training
  X_test = train_x.loc[test_index]        # Test is the remaining 1/5 split

  # Generate TSV files
  generate_tsv(X_train, X_test)

  # Repeat training for this range of epochs
  for EPOCH in range(3, 4):
    # Training will be resumed from last trained epoch
    train()

    metric1, metric2 = prepare_results(X_test)
    results.append([n+1, EPOCH, metric1, metric2])

!gsutil -m rm -r gs://{BUCKET_NAME}/{MODEL_NAME}/{OUTPUT_DIR}/

## Step 5: Saving the Results to a Google Spreadsheet
Create a SpreadSheet called "albert_fakebr" and save our results. If the spreadsheet already exists, it will just append a row.

In [0]:
# Turn the per-fold result rows into a table, one column per position of a
# row, then average accuracy and weighted F1 over the folds for each epoch.
result_columns = ['n', 'epoch', 'accuracy', 'weighted_f1']
df_final = pd.DataFrame({
    column: np.array([row[position] for row in results])
    for position, column in enumerate(result_columns)})
sum_df = df_final.groupby('epoch')[['accuracy', 'weighted_f1']].mean()
print(sum_df)

In [0]:
import string

# Just in case the authentication has expired
auth.authenticate_user()
gc = gspread.authorize(GoogleCredentials.get_application_default())

# Spreadsheet column letters 'A'..'Z'; enough for the values below.
column_names = list(string.ascii_uppercase)
# One spreadsheet row: the insertion order of this dict is the column order.
column_values = {
    'Type': 'Albert',
    'Seq Len': SEQ_LEN,
    'Folds': n_folds,
    'Fold': 'N/A',
    'BS': BATCH_SIZE,
    'LR': LEARNING_RATE,
    # Record the model that was actually used instead of a hard-coded copy
    # of its default name.
    'Model': MODEL_NAME,
    'Casing': 'Uncased' if LOWER else 'Cased',
    'Accuracy': 'N/A',
    'Weighted F1': 'N/A',
  }
# Letter of the last column in use. Indexing with len(column_values) picked
# one column too many (e.g. 'K' for 10 values), so the ranges below included
# an extra cell.
last_column = column_names[len(column_values) - 1]

try:
  worksheet = gc.open("albert_" + NAME).sheet1
except gspread.SpreadsheetNotFound:
  worksheet = gc.create("albert_" + NAME).sheet1

  # Brand-new spreadsheet: write the header row.
  cell_list = worksheet.range('A1:{}1'.format(last_column))
  for cell, key in zip(cell_list, column_values):
    cell.value = key
  # Update in batch
  worksheet.update_cells(cell_list)

def next_available_row(worksheet):
  """Return one more than the number of values gspread reports for column 1."""
  cells = worksheet.col_values(1)
  return len(cells) + 1

for index, row in sum_df.iterrows():
  metric1 = row['accuracy']
  metric2 = row['weighted_f1']

  # NOTE(review): sum_df is grouped by 'epoch', so `index` is the epoch
  # number even though it is stored under 'Fold' - confirm this is intended.
  column_values['Fold'] = index
  column_values['Accuracy'] = metric1
  column_values['Weighted F1'] = metric2

  next_row = next_available_row(worksheet)

  cell_list = worksheet.range('A{}:{}{}'.format(next_row, last_column, next_row))
  for cell, value in zip(cell_list, column_values.values()):
    cell.value = value
  # Update in batch
  worksheet.update_cells(cell_list)