<a href="https://colab.research.google.com/github/fawazshah/News-Media-Reliability/blob/master/train_bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Imports

In [81]:
!pip install transformers



In [82]:
import json
import time

import nltk
import pandas as pd
import requests
import stanza
import torch
import transformers
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from torch.utils.data import Dataset, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

### Utils

In [83]:
import os, sys

class HiddenPrints:
    """Context manager that silences ``print`` output inside a ``with`` block.

    On entry ``sys.stdout`` is replaced by a write handle on ``os.devnull``;
    on exit the previous stream is restored and the devnull handle is closed.
    Exceptions raised inside the block are not suppressed.
    """

    def __enter__(self):
        self._original_stdout = sys.stdout
        # Keep our own reference so __exit__ closes exactly the handle opened
        # here, even if code inside the block reassigns sys.stdout.
        self._devnull = open(os.devnull, 'w')
        sys.stdout = self._devnull
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Restore first so stdout never points at an already-closed file.
        sys.stdout = self._original_stdout
        self._devnull.close()

### Loading data

In [84]:
# Tab-separated corpus file; the columns read in this notebook are
# `source_url_normalized` and `bias`.
corpus_url = 'https://raw.githubusercontent.com/fawazshah/News-Media-Reliability/master/data/emnlp18/corpus-modified.tsv'

corpus = pd.read_csv(corpus_url, sep='\t')
urls = corpus['source_url_normalized'].values

# Ground truths
biases = corpus['bias'].values

In [85]:
article_data_json_url = 'https://raw.githubusercontent.com/fawazshah/News-Media-Reliability/master/data/scraped_articles.json'

# A timeout stops the cell hanging forever on a stalled connection, and
# raise_for_status() surfaces an HTTP error (404, rate limit, ...) directly
# instead of as a confusing JSON decoding failure on the error page.
r = requests.get(article_data_json_url, timeout=60)
r.raise_for_status()
article_data = r.json()

In [86]:
# Gather one record per scraped article and build the frame in a single call.
# Growing a DataFrame with .append() copies the whole frame for every row
# (quadratic), and DataFrame.append was removed in pandas 2.0.
article_records = []

news_sources_scraped = 0

for row in corpus.itertuples():
    url = row.source_url_normalized
    bias = row.bias
    # .get() tolerates a source that is absent from the scrape, not just one
    # whose entry is None.
    source_entry = article_data["newspapers"].get(url)
    if source_entry is None:
        continue
    articles = source_entry.get("articles") or []
    if len(articles) > 0:
        news_sources_scraped += 1
        for article in articles:
            # Every article inherits the bias label of its news source.
            article_records.append({'article headline': article['title'],
                                    'article body': article['text'],
                                    'bias': bias})

# Passing `columns` keeps the expected schema even when nothing was scraped.
all_data_df = pd.DataFrame(article_records,
                           columns=['article headline', 'article body', 'bias'])

In [87]:
# Show the assembled article table before any text preprocessing.
all_data_df

Unnamed: 0,article headline,article body,bias
0,On the Ground at the Inauguration: The Only Th...,"Will Sennott\n\nWEDNESDAY, JANUARY 20, 2021, W...",left
1,"Under President Biden, Will the Yankees Return...",Thurman Munson and Reggie Jackson in 1977 From...,left
2,Gun Rights Absolutists Celebrate Martin Luther...,"Will Sennott\n\nMONDAY, JANUARY 18, 2021, RICH...",left
3,Thugs in Blue,"THE BEAT GOES ON … AND ON\n\nOnce Again, Polic...",left
4,HELL YEAH! Sheriff Clark Publicly DISEMBOWELS ...,Al Sharpton always has had a couple screws loo...,right
...,...,...,...
1649,UK Educators Rank-and-File Safety Committee di...,The UK Educators Rank-and-File Safety Committe...,left
1650,Make It Sing,Before I lay into the Democrats for missed opp...,left
1651,Bill Maher: The SPIN Interview,If you care at all about democracy and the way...,left
1652,Stephan Jenkins on What Culture Truly Means,"“When bad men combine, the good must associate...",left


### Text preprocessing

In [88]:
# Text preprocessing preparation

# Hand-picked English function words that preprocess() drops from the text.
stop_words = [
    "the", "a", "an", "as", "this", "that", "is", "and", "or",
    "on", "at", "to", "in", "by", "than", "of", "for", "be",
    "i", "you", "he", "she", "his", "her", "do", "it", "with",
]

def nltk_tag_to_wordnet_tag(nltk_tag):
    """Translate an NLTK POS tag into the matching WordNet POS constant.

    Only the first character of the tag is significant: J -> adjective,
    V -> verb, N -> noun, R -> adverb. Any other tag (including an empty
    string) yields None.
    """
    first_letter_to_pos = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    # Slicing instead of indexing keeps an empty tag from raising.
    return first_letter_to_pos.get(nltk_tag[:1])

# NLTK resources used by preprocess(): WordNet for lemmatisation, punkt for
# tokenisation and the averaged perceptron tagger for POS tagging.
for nltk_resource in ('wordnet', 'punkt', 'averaged_perceptron_tagger'):
    nltk.download(nltk_resource)

lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [89]:
# Text preprocessing performed on both article headline and article body

# Translation table built once: newline/tab become a space, every listed
# punctuation character is deleted. The backslash is escaped explicitly
# (the previous literal relied on the invalid escape sequence "\,").
_PREPROCESS_TABLE = str.maketrans(
    "\n\t",
    "  ",
    "!()-—[]{};:'\"“”‘’\\,<>./?@#$%^&*_~",
)


def preprocess(sentence):
    """Normalise a headline or article body for modelling.

    Steps, in order:
      1. lowercase the text;
      2. delete punctuation and turn newlines/tabs into spaces;
      3. drop every word found in the module-level ``stop_words``;
      4. POS-tag the remaining tokens and lemmatise those whose tag maps to
         a WordNet part of speech (other tokens are kept unchanged).

    Args:
        sentence: the raw text as a ``str``.

    Returns:
        The cleaned text as a single space-separated string.
    """
    # One linear pass over the text. The earlier per-character
    # ``str.replace`` loop rescanned the whole string for each character,
    # which is quadratic on long article bodies.
    sentence = sentence.lower().translate(_PREPROCESS_TABLE)

    # Stop word removal (set lookup instead of scanning the list per word)
    stop_word_set = set(stop_words)
    sentence = " ".join(word for word in sentence.split()
                        if word not in stop_word_set)

    # Lemmatization: in order to lemmatise we must first POS-tag the tokens
    tokens = nltk.word_tokenize(sentence)
    lemmatized_words = []

    for word, tag in nltk.pos_tag(tokens):
        pos = nltk_tag_to_wordnet_tag(tag)
        if pos is not None:
            word = lemmatizer.lemmatize(word, pos=pos)

        lemmatized_words.append(word)

    return " ".join(lemmatized_words)

In [90]:
# Clean the headline column and report the wall-clock time taken.
# Note: the column is overwritten, so the raw headlines are no longer
# available in all_data_df after this cell runs.
start = time.time()
all_data_df['article headline'] = all_data_df['article headline'].apply(preprocess)
print(f"Preprocessing headlines took {time.time() - start} seconds")

# Same treatment for the article bodies (also overwritten in place).
start = time.time()
all_data_df['article body'] = all_data_df['article body'].apply(preprocess)
print(f"Preprocessing article bodies took {time.time() - start} seconds")

Preprocessing headlines took 1.429811716079712 seconds
Preprocessing article bodies took 66.38098430633545 seconds


In [91]:
# Show the same table after preprocess() has been applied to both text columns.
all_data_df

Unnamed: 0,article headline,article body,bias
0,ground inauguration only thing see be hope,will sennott wednesday january 20 2021 washing...,left
1,under president biden will yankees return thei...,thurman munson reggie jackson 1977 from villag...,left
2,gun right absolutists celebrate martin luther ...,will sennott monday january 18 2021 richmond v...,left
3,thug blue,beat go … once again police pummel plan reform...,left
4,hell yeah sheriff clark publicly disembowel ra...,al sharpton always have have couple screw loos...,right
...,...,...,...
1649,uk educator rankandfile safety committee discu...,uk educator rankandfile safety committee meeti...,left
1650,make sing,before lay into democrat miss opportunity hous...,left
1651,bill maher spin interview,if care all about democracy way our world work...,left
1652,stephan jenkins what culture truly mean,when bad men combine good must associate else ...,left


### Splitting data into folds

In [92]:
# 5 folds, each with 70% training, 10% validation, 20% test

num_folds = 5

fold_size = round(len(all_data_df) / num_folds)

# Consecutive, non-overlapping slices of the frame. The last fold is left
# open-ended so that when round() rounds fold_size down, the trailing rows
# are still included instead of being silently dropped.
fold_dfs = [
    all_data_df.iloc[i*fold_size:(None if i == num_folds - 1 else (i+1)*fold_size)].copy()
    for i in range(num_folds)
]

In [93]:
# For every fold keep the first 70% of rows for training, the next 10% for
# validation and the remaining rows for testing (positional split).
folds = {}

for fold_index, fold_df in enumerate(fold_dfs):
    train_end = int(0.7*len(fold_df))
    val_end = int(0.8*len(fold_df))
    folds[fold_index] = {
        "train_df": fold_df.iloc[:train_end].copy(),
        "val_df": fold_df.iloc[train_end:val_end].copy(),
        "test_df": fold_df.iloc[val_end:].copy(),
    }

In [94]:
# Sizes are read from fold 0 only; other folds may differ slightly when the
# number of rows is not an exact multiple of num_folds.
print(f"Number of folds: {num_folds}")
print(f"Size of each training set: {len(folds[0]['train_df'])}")
print(f"Size of each validation set: {len(folds[0]['val_df'])}")
print(f"Size of each test set: {len(folds[0]['test_df'])}")

Number of folds: 5
Size of each training set: 231
Size of each validation set: 33
Size of each test set: 67


### BERT model

In [95]:
class BertDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with per-example labels.

    Args:
        encodings: mapping from field name to an indexable holding one entry
            per example (nested lists or a tensor both work).
        labels: indexable of numeric labels, aligned with ``encodings``.

    Each item is a dict with one tensor per encoding field plus a ``'labels'``
    tensor. The returned tensors are copies, so mutating an item never
    changes the stored encodings.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _as_new_tensor(value):
        # torch.tensor(existing_tensor) still copies but emits a UserWarning
        # for every item; clone().detach() is the recommended equivalent.
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        item = {key: self._as_new_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._as_new_tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [96]:
# Load the tokenizer that belongs to the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [97]:
# Compute the length of the longest sentence in particular column out of
# all train, val and test data
def compute_max_length(col_to_encode):
    """Return the largest token count found in ``all_data_df[col_to_encode]``.

    Every entry is tokenized with the module-level ``tokenizer`` with the
    `[CLS]` and `[SEP]` special tokens included in the count. Returns 0 when
    the column is empty.
    """
    token_counts = (
        len(tokenizer.encode(text, add_special_tokens=True))
        for text in all_data_df[col_to_encode].values
    )
    return max(token_counts, default=0)

In [98]:
# Longest tokenized headline / body over the whole dataset (all folds).
# The tokenizer warning printed by this cell (sequence longer than 512) shows
# that article bodies exceed the model's maximum sequence length.
max_len_headline = compute_max_length('article headline')
max_len_body = compute_max_length('article body')

print(f"Max headline length across all folds: {max_len_headline}")
print(f"Max article body length across all folds: {max_len_body}")

Token indices sequence length is longer than the specified maximum sequence length for this model (910 > 512). Running this sequence through the model will result in indexing errors


Max headline length across all folds: 103
Max article body length across all folds: 14373


In [122]:
datasets = {}

def create_bert_dataset(fold, df, col_to_encode, max_sequence_len, label_to_id=None):
    """Build a BertDataset from the ``col_to_encode`` column of ``df``.

    Args:
        fold: index of the fold the frame belongs to (kept for call-site
            compatibility; not used in the computation).
        df: DataFrame with the text column and a ``'bias'`` label column.
        col_to_encode: name of the text column to tokenize.
        max_sequence_len: requested padded length. It is capped at the
            tokenizer's ``model_max_length``, because sequences longer than
            the model's limit cause indexing errors when run through BERT.
        label_to_id: optional mapping from bias label to integer class id.
            By default the ids follow the sorted unique labels of
            ``all_data_df['bias']``, so every fold and split shares the same
            encoding even if a split is missing a class.

    Returns:
        A BertDataset whose encodings are padded/truncated to the effective
        length and whose labels are integer class ids.
    """
    if label_to_id is None:
        label_to_id = {label: idx for idx, label
                       in enumerate(sorted(all_data_df['bias'].unique()))}

    # Never pad beyond what the model can consume (e.g. the article bodies'
    # maximum length is far above the model limit).
    model_limit = getattr(tokenizer, 'model_max_length', max_sequence_len)
    effective_len = min(max_sequence_len, model_limit)

    sentences = df[col_to_encode].values.tolist()
    encodings = tokenizer(sentences,
                          add_special_tokens=True,
                          max_length=effective_len,
                          padding='max_length',
                          truncation=True,
                          return_attention_mask = True,
                          return_tensors = 'pt'
                          )

    # String labels cannot be converted by torch.tensor, so map them to
    # integer class ids before handing them to the dataset.
    labels = [label_to_id[label] for label in df['bias'].values]
    return BertDataset(encodings, labels)

# Build the train/val/test datasets of every fold twice: once from the
# headlines and once from the article bodies, each padded to the maximum
# length computed for that column.
for dataset_key, text_column, column_max_len in (
        ('headlines', 'article headline', max_len_headline),
        ('bodies', 'article body', max_len_body)):
    datasets[dataset_key] = {}
    for i in range(num_folds):
        datasets[dataset_key][i] = {
            split_name: create_bert_dataset(i, folds[i][frame_key],
                                            text_column, column_max_len)
            for split_name, frame_key in (('train', 'train_df'),
                                          ('val', 'val_df'),
                                          ('test', 'test_df'))
        }