In [9]:
# Import libraries for generic data preprocessing
import os
from itertools import chain

import numpy as np
import pandas as pd

# Import libraries for model selection and accuracy measures
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Import BERT transformer libraries
import torch
from sentence_transformers import SentenceTransformer
from torch import tensor
from torch.utils.data import Dataset
from transformers import (
    DistilBertConfig,
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments
    )

### Set Random Seed

In [2]:
# Single seed shared by every random number generator used in this notebook
SEED = 10

# Seed NumPy (legacy global generator) and PyTorch. The PyTorch seed matters
# because the classifier head of the model is newly (randomly) initialised
# when the pre-trained checkpoint is loaded further down.
np.random.seed(SEED)
torch.manual_seed(SEED)

# Switch off parallelism in the tokenizers library via its environment flag;
# this is set before any model/tokenizer is loaded.
os.environ["TOKENIZERS_PARALLELISM"] = 'false'

### Load Preprocessed Data

In [3]:
# Read the preprocessed, clustered articles and renumber the rows from 0
articles = pd.read_parquet('../data/cluster_articles.gzip').reset_index(drop=True)

In [4]:
# Preview the first five rows to check the loaded columns
articles.head(n=5)

Unnamed: 0,title,author,publication,content,party,cluster
0,Breitbart Launches ’Border Wall Construction C...,Milo,Breitbart,last weekend church confessed sin personal van...,right,-1
1,IDF Airstrike Eliminates 4 Islamic State-Linke...,Breitbart Jerusalem,Breitbart,times israel reports israeli airstrike killed ...,right,4
2,Oracle Funds Anti-Google Effort that Outs Hill...,Chriss W. Street,Breitbart,oracle corporation using deep financial resour...,right,13
3,Silicon Valley Urges Giving Election Day Off t...,Chriss W. Street,Breitbart,apparently worried populist movement led donal...,right,-1
4,Illegal Migrant Abandoned in Desert Calls 911 ...,Bob Price,Breitbart,severely dehydrated illegal alien called 911 p...,right,5


### Data Preprocessing

In [5]:
# Keep only articles with a definitive cluster (a cluster value of -1 is dropped)
has_cluster = articles['cluster'].ne(-1)
articles = articles.loc[has_cluster].reset_index(drop=True)

### Tokenize Articles

In [6]:
# Load the pre-trained DistilBERT-based sentence-embedding model.
# DistilBERT is a small, fast, cheap and light Transformer model trained by
# distilling BERT base. It has 40% fewer parameters than bert-base-uncased and
# runs 60% faster while preserving over 95% of BERT's performance as measured
# on the GLUE language understanding benchmark.
# Note: despite its name, `tokenizer` holds a SentenceTransformer model; it is
# used below to encode article text into embeddings.
sentence_model_name = 'distilbert-base-nli-mean-tokens'
tokenizer = SentenceTransformer(sentence_model_name)

In [7]:
# Encode every article's content into a sentence embedding.
# The texts are handed over as a plain list of strings rather than a pandas
# Series, so encoding does not depend on the DataFrame's index labels.
article_texts = list(articles['content'])
embeddings = tokenizer.encode(article_texts, show_progress_bar=True)

Batches:   0%|          | 0/901 [00:00<?, ?it/s]

### Standardize Embeddings

In [10]:
# Standardize each embedding dimension: centre on the mean (with_mean=True);
# StandardScaler's default settings also scale to unit variance.
# NOTE(review): the scaler is fitted on all articles before the train/test
# split below, so test-set statistics feed into the training features --
# consider fitting on the training split only.
scaler = StandardScaler(with_mean=True)
std_embeddings = scaler.fit_transform(embeddings)

### Split Data into Training and Test Tensors

In [50]:
# Reformat embeddings as tensors
pt_embeddings = tensor(std_embeddings)

# Reformat cluster labels as tensors.
# The column is converted to a NumPy array first: building a tensor straight
# from a pandas Series reads it element by element through its index labels,
# which is slow and breaks whenever the index is not exactly 0..n-1.
pt_cluster = tensor(articles['cluster'].to_numpy())

In [52]:
# Split embeddings and labels into train / test partitions:
# 15% held out, shuffled with a fixed seed, stratified on the cluster label
split_options = dict(
    test_size=0.15,
    random_state=42,
    shuffle=True,
    stratify=articles['cluster'],
)
X_train, X_test, y_train, y_test = train_test_split(
    pt_embeddings, pt_cluster, **split_options
)

# Print the size of each partition (features and labels must match pairwise)
print(*map(len, (X_train, X_test, y_train, y_test)))

24498 4324 24498 4324


### Initialize BERT Classifier

In [6]:
# Checkpoint shared by the configuration and the model weights
pretrained_name = 'distilbert-base-uncased'

# Build the DistilBERT configuration with a 25-way classification head
# (presumably one output per article cluster -- TODO confirm against the data)
dist_config = DistilBertConfig.from_pretrained(pretrained_name, num_labels=25)

# Load the pre-trained weights into a sequence-classification model
model = DistilBertForSequenceClassification.from_pretrained(
    pretrained_name,
    config=dist_config,
)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

### Create PyTorch Dataset

In [41]:
# Create custom Dataset with articles
class ArticlesDataset(Dataset):
    """Map-style dataset pairing each article's model input with its cluster label.

    Parameters
    ----------
    embeddings : sized, indexable
        Per-article model inputs; element ``i`` belongs to article ``i``.
    clusters : sized, indexable
        Per-article cluster labels, aligned with ``embeddings``.

    Raises
    ------
    ValueError
        If ``embeddings`` and ``clusters`` do not have the same length.
    """

    def __init__(self, embeddings, clusters):
        # Fail early on misaligned inputs; otherwise samples would be silently
        # dropped or an IndexError would surface in the middle of training.
        if len(embeddings) != len(clusters):
            raise ValueError(
                'embeddings and clusters must have the same length, got '
                + str(len(embeddings)) + ' and ' + str(len(clusters))
            )
        self.encodings = embeddings
        self.labels = clusters

    def __getitem__(self, idx):
        """Return the sample at ``idx`` as a dict with 'input_ids' and 'labels'."""
        # NOTE(review): the model input is exposed under the key 'input_ids';
        # confirm that what is stored here is what the consuming model expects
        # under that name (token ids rather than float embeddings).
        return {'input_ids': self.encodings[idx], 'labels': self.labels[idx]}

    def __len__(self):
        """Return the number of samples (one per label)."""
        return len(self.labels)

# Number of samples used per split for a quick smoke-test run.
# Set either value to None to use the complete split.
N_TRAIN_SAMPLES = 5
N_TEST_SAMPLES = 1

# Reformat training data as PyTorch Dataset.
# X_train / X_test are the plain tensors returned by train_test_split, so they
# are sliced directly; indexing them with a string key such as 'input_ids'
# is not valid for a tensor and fails on a fresh kernel.
# NOTE(review): these tensors hold standardized float sentence embeddings, but
# ArticlesDataset exposes them as 'input_ids' to a DistilBERT classifier, which
# looks token ids up in an embedding table -- confirm the intended model input
# before running on the full data.
train_dataset = ArticlesDataset(X_train[:N_TRAIN_SAMPLES], y_train[:N_TRAIN_SAMPLES])

# Reformat test data as PyTorch Dataset
test_dataset = ArticlesDataset(X_test[:N_TEST_SAMPLES], y_test[:N_TEST_SAMPLES])

### Initialize BERT Evaluation

In [36]:
def compute_metrics(pred):
    """Compute classification metrics for one evaluation pass.

    Parameters
    ----------
    pred : object with ``label_ids`` and ``predictions`` attributes
        ``predictions`` holds per-class scores; the highest-scoring class along
        the last axis is taken as the predicted label.

    Returns
    -------
    dict
        ``accuracy``, ``f1``, ``precision`` and ``recall``; the last three are
        micro-averaged.
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)

    # Result layout: (precision, recall, f-score, support); support is unused
    prf_scores = precision_recall_fscore_support(y_true, y_hat, average='micro')

    metrics = {'accuracy': accuracy_score(y_true, y_hat)}
    metrics['f1'] = prf_scores[2]
    metrics['precision'] = prf_scores[0]
    metrics['recall'] = prf_scores[1]
    return metrics

### Train BERT Classifier

In [42]:
# Collect the training hyper-parameters in one place
training_config = dict(
    output_dir='../bert_results',     # output directory
    logging_dir='../bert_logs',       # logging directory
    num_train_epochs=3,               # total number of training epochs
    per_device_train_batch_size=16,   # batch size per device during training
    per_device_eval_batch_size=64,    # batch size for evaluation
    warmup_steps=500,                 # warmup steps for the learning rate scheduler
    weight_decay=0.01,                # strength of weight decay
)
training_args = TrainingArguments(**training_config)

# Wire the model, the datasets and the metric function into a Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

In [43]:
# Train the DistilBERT classifier on the Trainer's train_dataset.
# The returned TrainOutput (global step, training loss, runtime metrics) is
# left as the last expression so it is displayed as the cell output.
trainer.train()

Step,Training Loss


TrainOutput(global_step=3, training_loss=3.1354649861653647, metrics={'train_runtime': 18.0521, 'train_samples_per_second': 0.166, 'total_flos': 3086101877760.0, 'epoch': 3.0, 'init_mem_cpu_alloc_delta': 77385, 'init_mem_cpu_peaked_delta': 216227, 'train_mem_cpu_alloc_delta': 60733, 'train_mem_cpu_peaked_delta': 57912})

### Evaluate BERT Classifier

In [44]:
# Evaluate on the eval_dataset given to the Trainer (test_dataset); the values
# returned by compute_metrics are reported with an 'eval_' prefix.
# NOTE(review): scores from a very small evaluation set are not meaningful.
trainer.evaluate()

{'eval_loss': 3.0750811100006104,
 'eval_accuracy': 1.0,
 'eval_f1': 1.0,
 'eval_precision': 1.0,
 'eval_recall': 1.0,
 'eval_runtime': 0.3939,
 'eval_samples_per_second': 2.539,
 'epoch': 3.0,
 'eval_mem_cpu_alloc_delta': 8895,
 'eval_mem_cpu_peaked_delta': 20619}

### Predict using BERT Classifier

In [57]:
# Run inference once and reuse the result for both printed lines
test_output = trainer.predict(test_dataset)

# Take the argmax over the class axis so there is one label per sample;
# without an axis, np.argmax flattens the whole score matrix and is only
# correct when the test set holds a single sample.
predicted_labels = np.argmax(test_output.predictions, axis=-1)

# Compare the first test sample's predicted cluster with its true cluster
print('prediction: ' + str(predicted_labels[0]))
print('actual: ' + str(test_output.label_ids[0]))

prediction: 12
actual: 12


### Potential Fixes

- Go back and remove this: [0:5]
    - Try this to fix the kernel issue https://stackoverflow.com/a/59949321/12777044
- Go back and expand test dataset from a single point: [0:1]