<a href="https://colab.research.google.com/github/fawazshah/News-Media-Reliability/blob/master/train_bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Imports

In [1]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/f9/54/5ca07ec9569d2f232f3166de5457b63943882f7950ddfcc887732fc7fb23/transformers-4.3.3-py3-none-any.whl (1.9MB)
[K     |████████████████████████████████| 1.9MB 8.4MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/71/23/2ddc317b2121117bf34dd00f5b0de194158f2a44ee2bf5e47c7166878a97/tokenizers-0.10.1-cp37-cp37m-manylinux2010_x86_64.whl (3.2MB)
[K     |████████████████████████████████| 3.2MB 32.7MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 39.6MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp37-none-any.whl size=893262 sha256=f4c26

In [2]:
import json
import numpy as np
import pandas as pd
import random
import requests
from sklearn.metrics import f1_score, accuracy_score, classification_report
import time
import transformers
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
import torch
import torch.nn as nn
from torch.utils.data import Dataset, TensorDataset, DataLoader, RandomSampler, SequentialSampler

import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer

In [3]:
# Reproducibility: seed every RNG the notebook relies on, then pick the device
SEED = 42

for seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    seed_rng(SEED)

# Ask cuDNN to stick to deterministic algorithms
torch.backends.cudnn.deterministic = True

# Use the first GPU when CUDA is available, otherwise fall back to the CPU
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0") if use_cuda else torch.device("cpu")

### Utils

In [4]:
import os, sys

class HiddenPrints:
    """Context manager that silences ``print`` output inside a ``with`` block.

    On entry ``sys.stdout`` is redirected to ``os.devnull``. On exit the
    previous stream is restored and the devnull handle is closed, so repeated
    use does not leak open file descriptors. Exceptions raised inside the
    block are not suppressed.
    """

    def __enter__(self):
        self._original_stdout = sys.stdout
        # Keep a reference to the sink so __exit__ can close it
        self._devnull = open(os.devnull, 'w')
        sys.stdout = self._devnull
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Restore stdout first so nothing is ever written to a closed stream
        sys.stdout = self._original_stdout
        self._devnull.close()

### Loading data

In [5]:
# Source-level corpus: one row per news outlet, read from a tab-separated file
corpus_url = 'https://raw.githubusercontent.com/fawazshah/News-Media-Reliability/master/data/emnlp18/corpus-modified.tsv'
corpus = pd.read_csv(corpus_url, sep='\t')

# Outlet URLs and their ground-truth bias labels as plain arrays
urls, biases = (corpus[column].values for column in ('source_url_normalized', 'bias'))

In [6]:
article_data_json_url = 'https://raw.githubusercontent.com/fawazshah/News-Media-Reliability/master/data/scraped_articles.json'

# Fail fast on network problems: the timeout stops the cell from hanging
# indefinitely, and raise_for_status() surfaces HTTP errors (404/5xx) directly
# instead of as a confusing JSON decode failure on an error page.
r = requests.get(article_data_json_url, timeout=60)
r.raise_for_status()
article_data = r.json()

In [7]:
# Flatten the scraped JSON into one row per article, labelled with the bias of
# the news source that published it. Rows are collected in a list and turned
# into a DataFrame once: DataFrame.append copied the whole frame on every call
# (quadratic) and no longer exists in pandas >= 2.0.
article_rows = []

news_sources_scraped = 0

for row in corpus.itertuples():
    url = row.source_url_normalized
    bias = row.bias
    source_entry = article_data["newspapers"][url]
    # Sources with no scraped entry are stored as None
    if source_entry is None:
        continue
    articles = source_entry.get("articles", [])
    if len(articles) > 0:
        news_sources_scraped += 1
        article_rows.extend(
            {'article headline': article['title'],
             'article body': article['text'],
             'bias': bias}
            for article in articles
        )

# Passing `columns` keeps the expected schema even if no articles were found
all_data_df = pd.DataFrame(article_rows, columns=['article headline', 'article body', 'bias'])

In [8]:
# Preview the assembled per-article dataset (headline, body, source bias)
all_data_df

Unnamed: 0,article headline,article body,bias
0,On the Ground at the Inauguration: The Only Th...,"Will Sennott\n\nWEDNESDAY, JANUARY 20, 2021, W...",left
1,"Under President Biden, Will the Yankees Return...",Thurman Munson and Reggie Jackson in 1977 From...,left
2,Gun Rights Absolutists Celebrate Martin Luther...,"Will Sennott\n\nMONDAY, JANUARY 18, 2021, RICH...",left
3,Thugs in Blue,"THE BEAT GOES ON … AND ON\n\nOnce Again, Polic...",left
4,HELL YEAH! Sheriff Clark Publicly DISEMBOWELS ...,Al Sharpton always has had a couple screws loo...,right
...,...,...,...
1649,UK Educators Rank-and-File Safety Committee di...,The UK Educators Rank-and-File Safety Committe...,left
1650,Make It Sing,Before I lay into the Democrats for missed opp...,left
1651,Bill Maher: The SPIN Interview,If you care at all about democracy and the way...,left
1652,Stephan Jenkins on What Culture Truly Means,"“When bad men combine, the good must associate...",left


### Text preprocessing

In [9]:
# Text preprocessing preparation

# Hand-picked function words that preprocess() drops from the text
stop_words = (
    "the a an as this that is and or on "
    "at to in by than of for be i you "
    "he she his her do it with"
).split()

def nltk_tag_to_wordnet_tag(nltk_tag):
    """Translate an NLTK POS tag into the matching WordNet POS constant.

    Only the first letter of the tag is inspected: J -> adjective, V -> verb,
    N -> noun, R -> adverb. Any other tag yields None.
    """
    leading = nltk_tag[:1]
    if leading not in ('J', 'V', 'N', 'R'):
        return None

    return {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }[leading]

# Fetch the NLTK resources used below, in order:
#   wordnet                      - lemmatization
#   punkt                        - required for tokenization
#   averaged_perceptron_tagger   - required for POS tagging
for nltk_resource in ('wordnet', 'punkt', 'averaged_perceptron_tagger'):
    nltk.download(nltk_resource)

lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [10]:
# Text preprocessing performed on both article headline and article body

def preprocess(sentence):
    """Normalise a piece of raw text (headline or article body).

    Steps, in order:
      1. lowercase the text;
      2. delete punctuation and turn newlines/tabs into spaces;
      3. drop every word found in the module-level ``stop_words`` list;
      4. tokenize and POS-tag what is left, lemmatizing tokens tagged as
         adjective, verb, noun or adverb (other tokens are kept unchanged).

    Args:
        sentence: the text to clean (str).

    Returns:
        str: the cleaned tokens joined by single spaces.
    """

    # Lowercase
    sentence = sentence.lower()

    # Punctuation, whitespace removal.
    # The backslash is escaped explicitly: the previous `\,` spelling relied on
    # an invalid escape sequence, which Python warns about.
    punctuations = '''!()-—[]{};:'"“”‘’\\,<>./?@#$%^&*_~'''
    whitespace = '''\n\t'''

    # A single translate() pass deletes punctuation and maps newline/tab to a
    # space, replacing one full-string str.replace() per character of input.
    cleanup_table = str.maketrans(whitespace, " " * len(whitespace), punctuations)
    sentence = sentence.translate(cleanup_table)

    # Stop word removal
    remaining_words = [word for word in sentence.split() if word not in stop_words]
    sentence = " ".join(remaining_words)

    # Lemmatization: in order to lemmatise we must first POS-tag each sentence
    tokens = nltk.word_tokenize(sentence)
    tagged = nltk.pos_tag(tokens)

    lemmatized_words = []
    for word, tag in tagged:
        pos = nltk_tag_to_wordnet_tag(tag)
        # Tags with no WordNet equivalent leave the token as it is
        if pos is not None:
            word = lemmatizer.lemmatize(word, pos=pos)
        lemmatized_words.append(word)

    return " ".join(lemmatized_words)

In [11]:
# Clean both text columns with preprocess(), timing each pass separately.
# The columns are overwritten in place, so re-running this cell would
# preprocess text that has already been cleaned.
start = time.time()
all_data_df['article headline'] = all_data_df['article headline'].apply(preprocess)
print(f"Preprocessing headlines took {time.time() - start} seconds")

# Restart the timer for the (much longer) article bodies
start = time.time()
all_data_df['article body'] = all_data_df['article body'].apply(preprocess)
print(f"Preprocessing article bodies took {time.time() - start} seconds")

Preprocessing headlines took 3.0772948265075684 seconds
Preprocessing article bodies took 61.31266236305237 seconds


In [12]:
# Encode labels as numbers
# center == 0
# left == 1
# right == 2

def encode_labels(label):
    """Map a bias label to its integer class id.

    "center" -> 0 and "left" -> 1; every other value (e.g. "right") -> 2.
    """
    if label == "center":
        return 0
    return 1 if label == "left" else 2

# Replace the string labels with their integer ids, overwriting the column.
# NOTE: not idempotent - encode_labels returns 2 for anything that is not
# "center"/"left", so re-running this cell on encoded labels maps every row to 2.
all_data_df['bias'] = all_data_df['bias'].apply(encode_labels)

In [13]:
# Class balance after encoding (0 = center, 1 = left, 2 = right)
all_data_df['bias'].value_counts()

2    657
0    567
1    430
Name: bias, dtype: int64

In [14]:
# Randomly shuffle rows in dataset before splitting into folds.
# reset_index() returns a new frame rather than modifying in place, so its
# result has to be assigned; otherwise the shuffled frame keeps its old,
# scrambled index labels and only the displayed copy looks renumbered.
all_data_df = all_data_df.sample(frac=1, random_state=1).reset_index(drop=True)
all_data_df

Unnamed: 0,article headline,article body,bias
0,bidens america one nation us versus them,president joe biden sworn 46th president janua...,2
1,how get covid19 vaccine miamidade broward,keep new time free support us local community ...,1
2,arm mob storm capitol building during electora...,day will go down infamy arm mob storm united s...,1
3,frontier ebook release january 2021,download month new release include late specia...,0
4,change date vaccine news angry cricket coach,clancy overell wendell hussey kick off another...,0
...,...,...,...
1649,legal liability loom orgs behind rally incite ...,legal liability loom orgs behind rally incite ...,1
1650,merck france pasteur institute end development...,covid19 pandemic underscore need our company o...,0
1651,anthony mackie responsibility message captain ...,anthony mackie clear not all say he new captai...,1
1652,union just get rare bit good news from supreme...,supreme court announce monday will not hear bl...,1


### Splitting data into folds

In [15]:
# 5 folds, each with 70% training, 10% validation, 20% test

num_folds = 5

# Ceiling division so the folds always cover every row: round() can round
# down (e.g. 1652 rows / 5 -> 330), which would silently drop the last rows.
# When the row count is not divisible by num_folds the final fold is shorter.
fold_size = -(-len(all_data_df) // num_folds)
fold_dfs = [all_data_df.iloc[i*fold_size:(i+1)*fold_size].copy() for i in range(num_folds)]

In [16]:
# Cut every fold, in row order, into train (first 70%), validation (next 10%)
# and test (remaining rows) frames
folds = {}

for fold_idx, fold_df in enumerate(fold_dfs):
    n_rows = len(fold_df)
    train_end, val_end = int(0.7*n_rows), int(0.8*n_rows)
    folds[fold_idx] = {
        "train_df": fold_df.iloc[:train_end].copy(),
        "val_df": fold_df.iloc[train_end:val_end].copy(),
        "test_df": fold_df.iloc[val_end:].copy(),
    }

In [17]:
# Report the split sizes; they are read from fold 0 only
print(f"Number of folds: {num_folds}")
print(f"Size of each training set: {len(folds[0]['train_df'])}")
print(f"Size of each validation set: {len(folds[0]['val_df'])}")
print(f"Size of each test set: {len(folds[0]['test_df'])}")

Number of folds: 5
Size of each training set: 231
Size of each validation set: 33
Size of each test set: 67


### BERT model

In [21]:
# Cross-entropy criterion, placed on the selected device; evaluate_BERT uses
# it to score the model's logits
loss_fn = nn.CrossEntropyLoss().to(device)

In [22]:
# Tokenizer matching the 'bert-base-uncased' checkpoint that is fine-tuned below
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [23]:
# Compute the length of the longest sentence in particular column out of
# all train, val and test data
def compute_max_length(col_to_encode):
    """Return the largest token count found in one column of ``all_data_df``.

    Every entry of the column is encoded with the module-level ``tokenizer``,
    with the `[CLS]` and `[SEP]` special tokens included in the count.
    Returns 0 when the column has no entries.
    """
    token_counts = (
        len(tokenizer.encode(text, add_special_tokens=True))
        for text in all_data_df[col_to_encode].values
    )
    return max(token_counts, default=0)

In [24]:
# Longest tokenized headline / body over the whole dataset; these values are
# passed to create_bert_dataset further down as the padding length.
max_len_headline = compute_max_length('article headline')
max_len_body = compute_max_length('article body')

# NOTE(review): the recorded run reports a body maximum far above the 512-token
# limit quoted in the tokenizer warning - confirm the padding length is capped
# before these sequences reach the model.
print(f"Max headline length across all folds: {max_len_headline}")
print(f"Max article body length across all folds: {max_len_body}")

Token indices sequence length is longer than the specified maximum sequence length for this model (693 > 512). Running this sequence through the model will result in indexing errors


Max headline length across all folds: 103
Max article body length across all folds: 14373


In [25]:
def create_bert_dataset(df, col_to_encode, max_sequence_len):
    """Tokenize one text column of ``df`` and wrap it in a ``TensorDataset``.

    Args:
        df: DataFrame holding the text column and an integer ``'bias'`` column.
        col_to_encode: name of the text column to tokenize.
        max_sequence_len: requested padded length. It is capped at
            ``tokenizer.model_max_length`` because the model cannot index
            positions beyond that; longer texts are truncated.

    Returns:
        TensorDataset of (token ids, token type ids, attention masks, labels),
        one row per row of ``df``, in the same order.
    """
    # The tokenizer itself warns that sequences longer than the model maximum
    # (512 for this checkpoint) cause indexing errors in the model, so never
    # pad beyond it. Callers pass the corpus-wide maximum, which for article
    # bodies is far larger.
    max_sequence_len = min(max_sequence_len, tokenizer.model_max_length)

    sentences = df[col_to_encode].values.tolist()

    # Encode the whole column in one call; every row is padded/truncated to the
    # same length, so the results come back as stacked (rows x length) tensors.
    encoding = tokenizer(sentences,
                         add_special_tokens=True,
                         max_length=max_sequence_len,
                         padding='max_length',
                         truncation=True,
                         return_token_type_ids=True,  # segment ids
                         return_attention_mask=True,
                         return_tensors='pt'
                         )

    labels = torch.tensor(df['bias'].values)

    return TensorDataset(encoding['input_ids'],
                         encoding['token_type_ids'],
                         encoding['attention_mask'],
                         labels)

In [26]:
dataloaders = {}
batch_size = 10

# Note we manually shuffled the dataset earlier, so we can use SequentialSampler to
# sample instead of RandomSampler

# (dataloaders key, DataFrame column, padding length) for each text field.
# One loop replaces the previously copy-pasted headline/body and
# train/val/test stanzas; the resulting structure is
# dataloaders[data_type][fold]['train' | 'val' | 'test'].
text_fields = [
    ('headlines', 'article headline', max_len_headline),
    ('bodies', 'article body', max_len_body),
]

for data_type, column, max_len in text_fields:
    dataloaders[data_type] = {}
    for i in range(num_folds):
        dataloaders[data_type][i] = {}
        for split in ('train', 'val', 'test'):
            split_dataset = create_bert_dataset(folds[i][f'{split}_df'], column, max_len)
            dataloaders[data_type][i][split] = DataLoader(split_dataset,
                                                          sampler=SequentialSampler(split_dataset),
                                                          batch_size=batch_size)

In [27]:
def train_BERT(train_dataloader, val_dataloader, model, number_epoch):
    """Fine-tune `model` for `number_epoch` epochs, validating after each one.

    Uses AdamW (lr 2e-5, eps 1e-8) with a linear learning-rate schedule and no
    warmup steps. After every epoch the model is scored on `val_dataloader`
    via evaluate_BERT and a one-line summary of loss/accuracy is printed.

    Args:
        train_dataloader: yields batches of (token ids, token type ids,
            attention masks, labels).
        val_dataloader: batches in the same layout, used for validation.
        model: the model to train; it is updated in place. Batches are moved
            to the module-level `device` before being passed to it.
        number_epoch: number of passes over `train_dataloader`.

    Returns:
        (train_loss, valid_loss): two lists with one mean loss per epoch.
    """

    train_loss = []
    valid_loss = []

    optimizer = AdamW(model.parameters(),
                    lr = 2e-5, 
                    eps = 1e-8 
                )

    # Create the learning rate scheduler.
    # The scheduler is stepped once per batch, so its horizon is batches * epochs
    total_steps = len(train_dataloader) * number_epoch
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, 
                                                num_training_steps=total_steps)

    for epoch in range(1, number_epoch+1):

        # TRAINING

        time0 = time.time()

        # evaluate_BERT switches the model to eval mode at the end of each
        # epoch, so training mode is re-enabled here every time
        model.train()

        epoch_train_loss = 0
        no_observations = 0
        epoch_train_predictions = []
        epoch_train_labels = []

        for batch in train_dataloader:

            # Each batch contains token ids, token type ids, attention masks and labels
            b_token_ids = batch[0].to(device)
            b_token_type_ids = batch[1].to(device)
            b_attention_masks = batch[2].to(device)
            b_labels = batch[3].to(device)

            no_observations = no_observations + b_labels.shape[0]
            
            # Passing labels makes the model return its own loss alongside the logits
            output = model(b_token_ids, 
                    token_type_ids=b_token_type_ids, 
                    attention_mask=b_attention_masks, 
                    labels=b_labels)

            # Clear gradients left over from the previous step before backward()
            model.zero_grad()

            loss = output.loss
            logits = output.logits

            # Training accuracy is computed from these training-mode forward passes
            predictions = np.argmax(logits.detach().cpu().numpy(), axis=1)
            labels = b_labels.detach().cpu().numpy()
            epoch_train_predictions.extend(predictions)
            epoch_train_labels.extend(labels)

            loss.backward()
            # Clip the norm of the gradients to 1 to prevent exploding gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step() 

            # Update the learning rate using the scheduler
            scheduler.step()  

            # Weight the batch loss by batch size so the epoch figure is a per-sample mean
            epoch_train_loss += loss.item()*b_labels.shape[0]

        epoch_train_loss, epoch_train_acc = epoch_train_loss / no_observations, accuracy_score(epoch_train_labels, epoch_train_predictions)

        # VALIDATION

        epoch_valid_loss, epoch_val_predictions, epoch_val_labels = evaluate_BERT(val_dataloader, model)
        epoch_valid_acc = accuracy_score(epoch_val_labels, epoch_val_predictions)

        # FINALLY

        # The reported time covers both the training pass and the validation pass
        print(f"Epoch took: {time.time() - time0}")

        print(f'| Epoch: {epoch:02} | Train Loss: {epoch_train_loss:.2f} | Train Accuracy: {epoch_train_acc:.2f} | \
        Val. Loss: {epoch_valid_loss:.2f} | Val. Accuracy: {epoch_valid_acc:.2f} |')

        train_loss.append(epoch_train_loss)
        valid_loss.append(epoch_valid_loss)
    
    return train_loss, valid_loss

In [31]:
def evaluate_BERT(test_dataloader, model):
    """Score `model` on every batch of `test_dataloader` without gradients.

    The model is switched to eval mode. Each batch is expected to hold
    (token ids, token type ids, attention masks, labels) and is moved to the
    module-level `device`. The loss is computed from the logits with the
    module-level `loss_fn`.

    Returns:
        (mean_loss, predictions, labels): the per-sample mean loss, followed by
        the predicted class ids and the true labels in dataloader order.
    """
    model.eval()

    running_loss = 0
    n_seen = 0
    predictions_all, labels_all = [], []

    with torch.no_grad():
        for batch in test_dataloader:
            token_ids, segment_ids, masks, targets = (batch[k].to(device) for k in range(4))

            batch_len = targets.shape[0]
            n_seen += batch_len

            logits = model(token_ids,
                           token_type_ids=segment_ids,
                           attention_mask=masks).logits
            batch_loss = loss_fn(logits, targets)

            # Predicted class = index of the largest logit in each row
            predictions_all.extend(np.argmax(logits.detach().cpu().numpy(), axis=1))
            labels_all.extend(targets.detach().cpu().numpy())

            # Weight by batch size so the final figure is a per-sample mean
            running_loss += batch_loss.item()*batch_len

    return running_loss / n_seen, predictions_all, labels_all

In [30]:
num_epochs = 5

data_types = ['headlines', 'bodies']

# For each text field and each fold: fine-tune a fresh BERT classifier on the
# fold's training split, then report metrics on the fold's test split.
for data_type in data_types:
    for i in range(num_folds):

        print(f"-------------------")
        print(f"{data_type.upper()} - FOLD {i}")
        print(f"-------------------")

        # Set up a new BERT model
        model = BertForSequenceClassification.from_pretrained(
            'bert-base-uncased',
            num_labels=3,
            output_attentions = False,
            output_hidden_states = False,
        )
        # Move the model to the device selected at the top of the notebook.
        # model.cuda() ignored that choice and crashed on CPU-only runtimes,
        # while the batches are already sent to `device` by the train/eval code.
        model.to(device)

        # Train model
        train_dataloader = dataloaders[data_type][i]['train']
        val_dataloader = dataloaders[data_type][i]['val']
        train_loss, valid_loss = train_BERT(train_dataloader, val_dataloader, model, num_epochs)

        # Test model
        test_dataloader = dataloaders[data_type][i]['test']
        _, predictions, labels = evaluate_BERT(test_dataloader, model)

        print(classification_report(labels, predictions))

-------------------
HEADLINES - FOLD 0
-------------------


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

NameError: ignored