<a href="https://colab.research.google.com/github/fawazshah/News-Media-Reliability/blob/master/train_bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Imports

In [None]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/ed/d5/f4157a376b8a79489a76ce6cfe147f4f3be1e029b7144fa7b8432e8acb26/transformers-4.4.2-py3-none-any.whl (2.0MB)
[K     |████████████████████████████████| 2.0MB 16.2MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 52.0MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/71/23/2ddc317b2121117bf34dd00f5b0de194158f2a44ee2bf5e47c7166878a97/tokenizers-0.10.1-cp37-cp37m-manylinux2010_x86_64.whl (3.2MB)
[K     |████████████████████████████████| 3.2MB 57.3MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp37-none-any.whl size=893262 sha256=ce46

In [None]:
import json
import numpy as np
import pandas as pd
import random
import requests
from sklearn.metrics import f1_score, accuracy_score, classification_report
import time
import transformers
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
import torch
import torch.nn as nn
from torch.utils.data import Dataset, TensorDataset, DataLoader, RandomSampler, SequentialSampler

import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer

In [None]:
# Fix the seed of every random number generator in use, then pick the device
SEED = 42

for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    seed_fn(SEED)
torch.backends.cudnn.deterministic = True

# Run on the first GPU when one is visible, otherwise fall back to the CPU
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0") if use_cuda else torch.device("cpu")

### Utils

In [None]:
import os, sys

class HiddenPrints:
    """Context manager that silences ``print`` output inside a ``with`` block.

    On entry ``sys.stdout`` is redirected to ``os.devnull``; on exit the
    previous stream is restored and the devnull handle is closed so that
    repeated use does not leak file descriptors.
    """

    def __enter__(self):
        self._original_stdout = sys.stdout
        self._devnull = open(os.devnull, 'w')
        sys.stdout = self._devnull
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Restore stdout first so it is valid again even if close() raises
        sys.stdout = self._original_stdout
        self._devnull.close()

### Loading data

In [None]:
# Tab-separated corpus of news sources; the columns read from it in this
# notebook are 'source_url_normalized' and 'bias'.
corpus_url = 'https://raw.githubusercontent.com/fawazshah/News-Media-Reliability/master/data/emnlp18/corpus-modified.tsv'

corpus = pd.read_csv(corpus_url, sep='\t')
# Source URLs, one entry per corpus row
urls = corpus['source_url_normalized'].values

# Ground truths
biases = corpus['bias'].values

In [None]:
article_data_json_url = 'https://raw.githubusercontent.com/fawazshah/News-Media-Reliability/master/data/scraped_articles.json'

# Bound the request time and fail loudly on an HTTP error instead of trying
# to JSON-decode an error page.
r = requests.get(article_data_json_url, timeout=60)
r.raise_for_status()
article_data = r.json()

In [None]:
# Build one record per scraped article and create the DataFrame in a single
# call. Appending row-by-row copies the whole frame on every iteration and
# DataFrame.append no longer exists in pandas >= 2.0.
article_records = []

news_sources_scraped = 0

for row in corpus.itertuples():
    url = row.source_url_normalized
    bias = row.bias
    # Sources that are missing from the scrape, or stored as null, are skipped
    source_entry = article_data["newspapers"].get(url)
    if source_entry is None:
        continue
    articles = source_entry.get("articles", [])
    if articles:
        news_sources_scraped += 1
        for article in articles:
            # Every article inherits the bias label of its news source
            article_records.append({'article headline': article['title'],
                                    'article body': article['text'],
                                    'bias': bias})

all_data_df = pd.DataFrame(article_records,
                           columns=['article headline', 'article body', 'bias'])

In [None]:
all_data_df

Unnamed: 0,article headline,article body,bias
0,On the Ground at the Inauguration: The Only Th...,"Will Sennott\n\nWEDNESDAY, JANUARY 20, 2021, W...",left
1,"Under President Biden, Will the Yankees Return...",Thurman Munson and Reggie Jackson in 1977 From...,left
2,Gun Rights Absolutists Celebrate Martin Luther...,"Will Sennott\n\nMONDAY, JANUARY 18, 2021, RICH...",left
3,Thugs in Blue,"THE BEAT GOES ON … AND ON\n\nOnce Again, Polic...",left
4,HELL YEAH! Sheriff Clark Publicly DISEMBOWELS ...,Al Sharpton always has had a couple screws loo...,right
...,...,...,...
1649,UK Educators Rank-and-File Safety Committee di...,The UK Educators Rank-and-File Safety Committe...,left
1650,Make It Sing,Before I lay into the Democrats for missed opp...,left
1651,Bill Maher: The SPIN Interview,If you care at all about democracy and the way...,left
1652,Stephan Jenkins on What Culture Truly Means,"“When bad men combine, the good must associate...",left


### Text preprocessing

In [None]:
# Text preprocessing preparation

# Hand-picked list of lowercase stop words; preprocess() drops every
# whitespace-separated word that appears in this list.
stop_words = ["the", "a", "an", "as", "this", "that", "is", "and", "or", "on",
              "at", "to", "in", "by", "than", "of", "for", "be", "i", "you", 
              "he", "she", "his", "her", "do", "it", "with"]

def nltk_tag_to_wordnet_tag(nltk_tag):
    """Translate an NLTK POS tag into the matching WordNet POS constant.

    Tags starting with J, V, N or R map to wordnet.ADJ, VERB, NOUN and ADV
    respectively; any other tag yields None.
    """
    prefix_to_attr = (('J', 'ADJ'), ('V', 'VERB'), ('N', 'NOUN'), ('R', 'ADV'))
    for prefix, attr_name in prefix_to_attr:
        if nltk_tag.startswith(prefix):
            # Resolve the constant only for a matching tag
            return getattr(wordnet, attr_name)
    return None

# Fetch the NLTK resources needed below, in order: the WordNet corpus,
# 'punkt' (required for tokenization) and the perceptron tagger
# (required for POS tagging).
for nltk_resource in ('wordnet', 'punkt', 'averaged_perceptron_tagger'):
    nltk.download(nltk_resource)

lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [None]:
# Text preprocessing performed on both article headline and article body

# Punctuation characters that are deleted outright. The backslash is written
# as an explicit escape so the literal no longer relies on the invalid "\,"
# escape sequence.
_PUNCTUATIONS = '''!()-—[]{};:'"“”‘’\\,<>./?@#$%^&*_~'''

# Single translation table: newline and tab become a space, punctuation is
# removed. One translate() pass replaces the original per-character
# str.replace() loop, which rescanned the whole text for every matching
# character and was quadratic on long article bodies.
_CHAR_TABLE = str.maketrans("\n\t", "  ", _PUNCTUATIONS)


def preprocess(sentence):
    """Normalise a headline or article body.

    Steps, in order: lowercase; delete punctuation and turn newlines/tabs
    into spaces; drop words found in ``stop_words``; POS-tag the remaining
    tokens with NLTK and lemmatise every token whose tag maps to a WordNet
    part of speech.

    Args:
        sentence: the raw text (str).

    Returns:
        The processed tokens joined by single spaces.
    """
    # Lowercase, then strip punctuation / normalise whitespace in one pass
    sentence = sentence.lower().translate(_CHAR_TABLE)

    # Stop word removal
    remaining_words = [word for word in sentence.split() if word not in stop_words]
    sentence = " ".join(remaining_words)

    # In order to lemmatise we must first POS-tag each sentence
    tokens = nltk.word_tokenize(sentence)
    tagged = nltk.pos_tag(tokens)

    lemmatized_words = []
    for word, tag in tagged:
        pos = nltk_tag_to_wordnet_tag(tag)
        # Tokens whose tag has no WordNet equivalent are kept unchanged
        if pos is not None:
            word = lemmatizer.lemmatize(word, pos=pos)
        lemmatized_words.append(word)

    return " ".join(lemmatized_words)

In [None]:
# Preprocess the headlines in place and report how long it took
start = time.time()
headlines_processed = all_data_df['article headline'].apply(preprocess)
all_data_df['article headline'] = headlines_processed
print(f"Preprocessing headlines took {time.time() - start} seconds")

# Same for the article bodies
start = time.time()
bodies_processed = all_data_df['article body'].apply(preprocess)
all_data_df['article body'] = bodies_processed
print(f"Preprocessing article bodies took {time.time() - start} seconds")

Preprocessing headlines took 1.21817946434021 seconds
Preprocessing article bodies took 56.07073354721069 seconds


In [None]:
# Encode labels as numbers
# center == 0
# left == 1
# right == 2

def encode_labels(label):
    """Map a bias label to its class id: "center" -> 0, "left" -> 1, anything else -> 2."""
    if label == "center":
        return 0
    return 1 if label == "left" else 2

# NOTE: this cell is not idempotent. encode_labels sends every value other
# than "center"/"left" to 2, so running it again on the already-encoded
# integers turns the whole column into 2.
all_data_df['bias'] = all_data_df['bias'].apply(encode_labels)

### Resampling

In [None]:
RESAMPLE = False

In [None]:
# Distribution before
print(all_data_df['bias'].value_counts())

2    657
0    567
1    430
Name: bias, dtype: int64


In [None]:
# Look the counts up by class label. value_counts() is ordered by frequency,
# so unpacking it positionally silently swaps the names whenever the class
# frequencies change (e.g. after resampling or with new data).
count_0, count_1, count_2 = (
    all_data_df['bias'].value_counts().reindex([0, 1, 2], fill_value=0).tolist()
)

In [None]:
# One frame per class label (0 = center, 1 = left, 2 = right)
all_data_0, all_data_1, all_data_2 = (
    all_data_df[all_data_df['bias'] == class_id] for class_id in (0, 1, 2)
)

# Draw count_1 rows from class 0 and from class 2 so both match class 1
all_data_0_under = all_data_0.sample(n=count_1)
all_data_2_under = all_data_2.sample(n=count_1)

# The balanced frame only replaces the dataset when resampling is enabled
if RESAMPLE:
    balanced_frames = [all_data_1, all_data_0_under, all_data_2_under]
    all_data_df = pd.concat(balanced_frames)

In [None]:
# Distribution after
print(all_data_df['bias'].value_counts())

2    430
1    430
0    430
Name: bias, dtype: int64


In [None]:
# Shuffle rows in dataset to mix classes up again, then renumber the index.
# reset_index returns a new frame, so the result has to be assigned for the
# stored dataset to actually get the fresh 0..n-1 index.
all_data_df = all_data_df.sample(frac=1, random_state=1).reset_index(drop=True)
all_data_df

Unnamed: 0,article headline,article body,bias
0,ann coulter reveals way trump can bypass congr...,conservative firebrand ann coulter have never ...,2
1,india seventh index country impact climate cha...,damage house aftermath cyclone fani penthakata...,0
2,immigration detainee fear they could die behin...,campaigner say immigration detainee they be su...,1
3,gun store say wont sell firearm ammunition bid...,missouri gun store turn more few head after an...,2
4,octopuslike creature inhabit jupiter moon clai...,image base logarithmic map universe put togeth...,0
...,...,...,...
1285,va expand vaccination some nonhealth care staf...,veteran affair department have begin vaccinate...,0
1286,just another wordpress site,welcome wordpress your first post edit delete ...,2
1287,now supporters cancel culture be be cancel,freeze peach have long be mock infantile cry m...,2
1288,seth rogen ted cruz be each others throats twi...,sponsored link profane twitter feud between se...,1


### Selecting beginning vs middle vs end of body text

In [None]:
def get_middle_512(sentence):
    """Return the central window of at most 512 whitespace-separated tokens.

    Texts with 512 tokens or fewer are returned whole (re-joined with single
    spaces). For longer texts the window is centred on the middle of the
    token list.
    """
    toks = sentence.split()
    # Integer start index of a 512-token window centred in the text; the
    # previous `len(toks) / 512` produced a float, which cannot be used as a
    # slice index and was not the midpoint either.
    start = max((len(toks) - 512) // 2, 0)
    return ' '.join(toks[start:start + 512])

In [5]:
def get_end_512(sentence):
    """Keep only the last 512 whitespace-separated tokens of ``sentence``."""
    words = sentence.split()
    # Index of the first token to keep; 0 when the text is short enough
    first_kept = max(len(words) - 512, 0)
    return ' '.join(words[first_kept:])

In [None]:
# "BEGINNING" is true by default i.e. BERT will just truncate the body to the first 512 tokens.
# Set at most one of the flags below to use the middle or the end of the body instead.
MIDDLE = False
END = False

if MIDDLE:
    all_data_df['article body'] = all_data_df['article body'].apply(get_middle_512)
elif END:
    # This branch was previously guarded by MIDDLE as well and could never run
    all_data_df['article body'] = all_data_df['article body'].apply(get_end_512)

### Counting frequency of names in dataset

In [None]:
# Total occurrences of "biden" in the headlines, then in the bodies
for text_column in ('article headline', 'article body'):
    print(all_data_df[text_column].str.count("biden").sum())

112
1173


In [None]:
# Total occurrences of "trump" in the headlines, then in the bodies
for text_column in ('article headline', 'article body'):
    print(all_data_df[text_column].str.count("trump").sum())

171
2405


### Split data into train/val/test

In [None]:
# Train/val/test split

TRAIN = 0.7
VAL = 0.1
TEST = 0.2

In [None]:
# Row offsets that end the train and validation slices. int() truncates, so
# floating-point rounding in the products can shift a boundary by one row.
split_point_1 = int(TRAIN*len(all_data_df))
split_point_2 = int((TRAIN+VAL)*len(all_data_df))

# Positional slices of the dataset: everything after split_point_2 is the
# test set. .copy() gives each split its own independent frame.
all_data_train_df = all_data_df.iloc[:split_point_1].copy()
all_data_val_df = all_data_df.iloc[split_point_1:split_point_2].copy()
all_data_test_df = all_data_df.iloc[split_point_2:].copy()

In [None]:
print(f"Size of training set: {len(all_data_train_df)}")
print(f"Size of validation set: {len(all_data_val_df)}")
print(f"Size of test set: {len(all_data_test_df)}")

Size of training set: 902
Size of validation set: 130
Size of test set: 258


### BERT setup

In [None]:
# Cross-entropy criterion, placed on the selected device
loss_fn = nn.CrossEntropyLoss().to(device)

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




In [None]:
# Compute the length of the longest sentence in particular column out of
# all train, val and test data
def compute_max_length(df, bert_input_func):
    """Return the largest token count among the texts extracted from ``df``.

    Each text produced by ``bert_input_func(df)`` is encoded with the global
    tokenizer, `[CLS]` and `[SEP]` included; the result is 0 when there are
    no texts.
    """
    token_counts = (
        len(tokenizer.encode(text, add_special_tokens=True))
        for text in bert_input_func(df)
    )
    return max(token_counts, default=0)

In [None]:
def compute_sentences_headline(df):
    """Return the values of the 'article headline' column of ``df``."""
    headline_column = df['article headline']
    return headline_column.values

def compute_sentences_body(df):
    """Return the values of the 'article body' column of ``df``."""
    body_column = df['article body']
    return body_column.values

def compute_sentences_headline_body(df):
    """Return one string per row: headline, the text " [SEP] ", then body."""
    headlines = df['article headline'].values
    bodies = df['article body'].values

    combined = []
    for headline, body in zip(headlines, bodies):
        combined.append(headline + " [SEP] " + body)
    return combined

# The input variants that are trained and evaluated. Each entry has:
#   'name'    - label used as the dataloaders key and in printed headings
#   'func'    - function extracting the list/array of input texts from a frame
#   'max_len' - sequence length passed to the tokenizer (pad/truncate target)
# Note: the headline max_len is computed from all_data_df when this cell runs.
BERT_INPUTS = [
    {
        'name': 'headlines',
        'func': compute_sentences_headline,
        'max_len': compute_max_length(all_data_df, compute_sentences_headline)
    },
    {
        'name': 'bodies',
        'func': compute_sentences_body,
        'max_len': 512 # Max body length is always longer than 512
    },
    {
        'name': 'headlines + bodies',
        'func': compute_sentences_headline_body,
        'max_len': 512
    },
]

In [None]:
def create_bert_dataset(df, bert_input_func, max_sequence_len):
    """Tokenize the texts extracted from ``df`` and wrap them in a TensorDataset.

    Every text returned by ``bert_input_func(df)`` is encoded with the global
    tokenizer, padded/truncated to ``max_sequence_len``. The dataset yields
    (token ids, token type ids, attention mask, label) where the labels come
    from the 'bias' column of ``df``.
    """
    encodings = [
        tokenizer(text,
                  add_special_tokens=True,
                  max_length=max_sequence_len,
                  padding='max_length',
                  truncation=True,
                  return_token_type_ids=True,  # segment ids
                  return_attention_mask=True,
                  return_tensors='pt')
        for text in bert_input_func(df)
    ]

    # Each encoding holds one-row tensors; concatenate them row-wise per field
    stacked_fields = [
        torch.cat([encoding[field] for encoding in encodings], dim=0)
        for field in ('input_ids', 'token_type_ids', 'attention_mask')
    ]
    labels = torch.tensor(df['bias'].values)

    return TensorDataset(*stacked_fields, labels)

In [None]:
def train_BERT(train_dataloader, val_dataloader, model, number_epoch):
    """Fine-tune ``model`` for ``number_epoch`` epochs and report progress.

    Uses AdamW (lr 2e-5, eps 1e-8) with a linear learning-rate schedule
    without warmup. After each training epoch the model is evaluated on
    ``val_dataloader`` via ``evaluate_BERT`` and the epoch duration, losses
    and accuracies are printed.

    Args:
        train_dataloader: yields batches of (token ids, token type ids,
            attention masks, labels).
        val_dataloader: batches in the same layout, used for validation.
        model: model whose output exposes ``.loss`` and ``.logits``.
        number_epoch: number of passes over ``train_dataloader``.

    Returns:
        (train_loss, valid_loss): two lists holding the per-epoch average
        training loss and validation loss.
    """

    train_loss = []
    valid_loss = []

    optimizer = AdamW(model.parameters(),
                    lr = 2e-5, 
                    eps = 1e-8 
                )

    # Create the learning rate scheduler.
    # One scheduler step is taken per batch, hence batches * epochs steps.
    total_steps = len(train_dataloader) * number_epoch
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, 
                                                num_training_steps=total_steps)

    for epoch in range(1, number_epoch+1):

        # TRAINING

        time0 = time.time()

        # evaluate_BERT switches the model to eval mode, so training mode is
        # re-enabled at the start of every epoch
        model.train()

        epoch_train_loss = 0
        no_observations = 0
        epoch_train_predictions = []
        epoch_train_labels = []

        for batch in train_dataloader:

            # Each batch contains token ids, token type ids, attention masks and labels
            b_token_ids = batch[0].to(device)
            b_token_type_ids = batch[1].to(device)
            b_attention_masks = batch[2].to(device)
            b_labels = batch[3].to(device)

            no_observations = no_observations + b_labels.shape[0]
            
            # Passing labels makes the model return the loss with the logits
            output = model(b_token_ids, 
                    token_type_ids=b_token_type_ids, 
                    attention_mask=b_attention_masks, 
                    labels=b_labels)

            # Clear gradients left over from the previous batch before backward()
            model.zero_grad()

            loss = output.loss
            logits = output.logits

            # Predicted class = index of the largest logit in each row
            predictions = np.argmax(logits.detach().cpu().numpy(), axis=1)
            labels = b_labels.detach().cpu().numpy()
            epoch_train_predictions.extend(predictions)
            epoch_train_labels.extend(labels)

            loss.backward()
            # Clip the norm of the gradients to 1 to prevent exploding gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step() 

            # Update the learning rate using the scheduler
            scheduler.step()  

            # Weight the batch loss by batch size so the epoch figure is a per-example average
            epoch_train_loss += loss.item()*b_labels.shape[0]

        epoch_train_loss, epoch_train_acc = epoch_train_loss / no_observations, accuracy_score(epoch_train_labels, epoch_train_predictions)

        # VALIDATION

        epoch_valid_loss, epoch_val_predictions, epoch_val_labels = evaluate_BERT(val_dataloader, model)
        epoch_valid_acc = accuracy_score(epoch_val_labels, epoch_val_predictions)

        # FINALLY

        # The reported time covers both the training pass and the validation pass
        print(f"Epoch took: {time.time() - time0}")

        print(f'| Epoch: {epoch:02} | Train Loss: {epoch_train_loss:.2f} | Train Accuracy: {epoch_train_acc:.2f} | \
        Val. Loss: {epoch_valid_loss:.2f} | Val. Accuracy: {epoch_valid_acc:.2f} |')

        train_loss.append(epoch_train_loss)
        valid_loss.append(epoch_valid_loss)
    
    return train_loss, valid_loss

In [None]:
def evaluate_BERT(test_dataloader, model):
    """Run ``model`` over ``test_dataloader`` without gradient tracking.

    Returns a tuple (average loss per example, predicted class ids, true
    labels). The loss is computed with the global ``loss_fn``; the model is
    left in eval mode.
    """
    model.eval()

    loss_sum = 0
    n_seen = 0
    predictions_all, labels_all = [], []

    with torch.no_grad():
        for batch in test_dataloader:
            # Batch layout: token ids, token type ids, attention masks, labels
            input_ids, segment_ids, attn_mask, targets = [
                tensor.to(device) for tensor in batch[:4]
            ]
            batch_size = targets.shape[0]
            n_seen += batch_size

            logits = model(input_ids,
                           token_type_ids=segment_ids,
                           attention_mask=attn_mask).logits
            batch_loss = loss_fn(logits, targets)

            # Predicted class = index of the largest logit in each row
            predictions_all.extend(np.argmax(logits.detach().cpu().numpy(), axis=1))
            labels_all.extend(targets.detach().cpu().numpy())

            # Weight by batch size so the final figure is a per-example mean
            loss_sum += batch_loss.item() * batch_size

    return loss_sum / n_seen, predictions_all, labels_all

### Create BERT dataloaders

In [None]:
dataloaders = {}
BATCH_SIZE = 10

In [None]:
# The dataset rows were shuffled manually earlier, which is why training can
# read them with a SequentialSampler rather than a RandomSampler.

def _sequential_loader(dataset):
    """Wrap ``dataset`` in an in-order DataLoader with the notebook batch size."""
    return DataLoader(dataset, sampler=SequentialSampler(dataset), batch_size=BATCH_SIZE)

for bert_input in BERT_INPUTS:
    input_name = bert_input['name']
    input_func = bert_input['func']
    input_max_len = bert_input['max_len']

    # Using all data
    train_dataset = create_bert_dataset(all_data_train_df, input_func, input_max_len)
    val_dataset = create_bert_dataset(all_data_val_df, input_func, input_max_len)
    test_dataset = create_bert_dataset(all_data_test_df, input_func, input_max_len)

    dataloaders[input_name] = {
        'all_data': {
            'train': _sequential_loader(train_dataset),
            'val': _sequential_loader(val_dataset),
            'test': _sequential_loader(test_dataset),
        }
    }

### Run BERT models

In [None]:
NUM_EPOCHS = 4

In [None]:
# Evaluate BERT on non-folded data: for every input variant, fine-tune a
# fresh model on the train split and report test-set metrics.

for bert_input in BERT_INPUTS:

    print(f"---------")
    print(f"{bert_input['name'].upper()}")
    print(f"---------")

    # Set up a new BERT model
    model = BertForSequenceClassification.from_pretrained(
        'bert-base-uncased',
        num_labels=3,
        output_attentions = False,
        output_hidden_states = False,
    )
    # Move to the shared `device` (the batches are sent there too) so the
    # loop also runs on CPU-only machines; model.cuda() failed without a GPU.
    model.to(device)

    # Train model
    train_dataloader = dataloaders[bert_input['name']]['all_data']['train']
    val_dataloader = dataloaders[bert_input['name']]['all_data']['val']
    train_loss, valid_loss = train_BERT(train_dataloader, val_dataloader, model, NUM_EPOCHS)

    # Test model
    test_dataloader = dataloaders[bert_input['name']]['all_data']['test']
    _, predictions, labels = evaluate_BERT(test_dataloader, model)

    print(classification_report(labels, predictions))


---------
HEADLINES
---------


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch took: 7.528555154800415
| Epoch: 01 | Train Loss: 1.09 | Train Accuracy: 0.39 |         Val. Loss: 1.05 | Val. Accuracy: 0.45 |
Epoch took: 7.3016510009765625
| Epoch: 02 | Train Loss: 0.96 | Train Accuracy: 0.55 |         Val. Loss: 1.08 | Val. Accuracy: 0.48 |
Epoch took: 7.324841737747192
| Epoch: 03 | Train Loss: 0.80 | Train Accuracy: 0.67 |         Val. Loss: 1.11 | Val. Accuracy: 0.45 |
Epoch took: 7.308145046234131
| Epoch: 04 | Train Loss: 0.66 | Train Accuracy: 0.77 |         Val. Loss: 1.11 | Val. Accuracy: 0.51 |
              precision    recall  f1-score   support

           0       0.60      0.54      0.57        91
           1       0.35      0.45      0.40        75
           2       0.56      0.49      0.52        92

    accuracy                           0.50       258
   macro avg       0.50      0.49      0.50       258
weighted avg       0.51      0.50      0.50       258

---------
BODIES
---------


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch took: 29.84273648262024
| Epoch: 01 | Train Loss: 1.09 | Train Accuracy: 0.40 |         Val. Loss: 1.02 | Val. Accuracy: 0.52 |
Epoch took: 29.863569498062134
| Epoch: 02 | Train Loss: 0.97 | Train Accuracy: 0.56 |         Val. Loss: 0.99 | Val. Accuracy: 0.54 |
Epoch took: 29.97337317466736
| Epoch: 03 | Train Loss: 0.82 | Train Accuracy: 0.66 |         Val. Loss: 0.95 | Val. Accuracy: 0.57 |
Epoch took: 30.008955001831055
| Epoch: 04 | Train Loss: 0.65 | Train Accuracy: 0.78 |         Val. Loss: 0.93 | Val. Accuracy: 0.58 |
              precision    recall  f1-score   support

           0       0.65      0.65      0.65        91
           1       0.47      0.49      0.48        75
           2       0.61      0.59      0.60        92

    accuracy                           0.58       258
   macro avg       0.58      0.58      0.58       258
weighted avg       0.58      0.58      0.58       258

---------
HEADLINES + BODIES
---------


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch took: 30.02953577041626
| Epoch: 01 | Train Loss: 1.08 | Train Accuracy: 0.41 |         Val. Loss: 1.01 | Val. Accuracy: 0.52 |
Epoch took: 30.032389879226685
| Epoch: 02 | Train Loss: 0.96 | Train Accuracy: 0.54 |         Val. Loss: 0.98 | Val. Accuracy: 0.53 |
Epoch took: 30.020155906677246
| Epoch: 03 | Train Loss: 0.79 | Train Accuracy: 0.66 |         Val. Loss: 0.95 | Val. Accuracy: 0.55 |
Epoch took: 30.04426670074463
| Epoch: 04 | Train Loss: 0.65 | Train Accuracy: 0.76 |         Val. Loss: 0.94 | Val. Accuracy: 0.62 |
              precision    recall  f1-score   support

           0       0.71      0.64      0.67        91
           1       0.42      0.52      0.47        75
           2       0.61      0.55      0.58        92

    accuracy                           0.57       258
   macro avg       0.58      0.57      0.57       258
weighted avg       0.59      0.57      0.58       258



### Examining left vs center vs right (not in report yet)

In [None]:
# Split the dataset by encoded bias label (0 = center, 1 = left, 2 = right).
# NOTE(review): these frames are taken from all_data_df, i.e. they include the
# rows of the train and validation splits, so metrics computed on them are not
# held-out results — consider filtering all_data_test_df instead.
left_df = all_data_df[all_data_df['bias'] == 1]
center_df = all_data_df[all_data_df['bias'] == 0]
right_df = all_data_df[all_data_df['bias'] == 2]

In [None]:
# We train on article headlines here: the dataloaders selected below are the
# headline ones. (An earlier version of this comment said "article bodies",
# which did not match the code.)

model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=3,
    output_attentions = False,
    output_hidden_states = False,
)
# Use the shared `device` so this also works without a GPU
model.to(device)

# Train model
train_dataloader = dataloaders['headlines']['all_data']['train']
val_dataloader = dataloaders['headlines']['all_data']['val']
train_loss, valid_loss = train_BERT(train_dataloader, val_dataloader, model, NUM_EPOCHS)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch took: 9.474628448486328
| Epoch: 01 | Train Loss: 1.07 | Train Accuracy: 0.42 |         Val. Loss: 1.02 | Val. Accuracy: 0.49 |
Epoch took: 9.492562055587769
| Epoch: 02 | Train Loss: 0.91 | Train Accuracy: 0.58 |         Val. Loss: 1.01 | Val. Accuracy: 0.54 |
Epoch took: 9.4595468044281
| Epoch: 03 | Train Loss: 0.70 | Train Accuracy: 0.72 |         Val. Loss: 1.08 | Val. Accuracy: 0.58 |
Epoch took: 9.436475992202759
| Epoch: 04 | Train Loss: 0.54 | Train Accuracy: 0.80 |         Val. Loss: 1.07 | Val. Accuracy: 0.60 |


In [None]:
# Create dataloaders for left_df, center_df, right_df, based on just headline text.
# Headlines are padded to the same length the headline model was trained with
# instead of 512, which only added padding tokens to every batch. Evaluation
# does not need shuffling, so the loaders read sequentially like the other
# evaluation loaders in this notebook.
headline_max_len = next(
    bert_input['max_len'] for bert_input in BERT_INPUTS
    if bert_input['name'] == 'headlines'
)

left_dataset = create_bert_dataset(left_df, compute_sentences_headline, headline_max_len)
left_dataloader = DataLoader(left_dataset, sampler=SequentialSampler(left_dataset), batch_size=BATCH_SIZE)

center_dataset = create_bert_dataset(center_df, compute_sentences_headline, headline_max_len)
center_dataloader = DataLoader(center_dataset, sampler=SequentialSampler(center_dataset), batch_size=BATCH_SIZE)

right_dataset = create_bert_dataset(right_df, compute_sentences_headline, headline_max_len)
right_dataloader = DataLoader(right_dataset, sampler=SequentialSampler(right_dataset), batch_size=BATCH_SIZE)

In [None]:
# Report metrics for the left, center and right subsets, in that order
for class_dataloader in (left_dataloader, center_dataloader, right_dataloader):
    _, predictions, labels = evaluate_BERT(class_dataloader, model)
    print(classification_report(labels, predictions))

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.57      0.73       430
           2       0.00      0.00      0.00         0

    accuracy                           0.57       430
   macro avg       0.33      0.19      0.24       430
weighted avg       1.00      0.57      0.73       430

              precision    recall  f1-score   support

           0       1.00      0.82      0.90       567
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         0

    accuracy                           0.82       567
   macro avg       0.33      0.27      0.30       567
weighted avg       1.00      0.82      0.90       567

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.00      0.00      0.00         0
           2       1.00      0.87      0.93       657

    accuracy        