In [1]:
# Import libraries for generic data preprocessing
import os
import numpy as np
import pandas as pd
from itertools import chain
from collections import Counter
from imblearn.over_sampling import SMOTEN

# Import libraries for model selection and accuracy measures
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Import BERT transformer libraries
from torch.utils.data import Dataset
from torch import tensor
import torch
from transformers import (
    DistilBertTokenizerFast,
    DistilBertConfig,
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments
    )

### Set Random Seed

In [2]:
# Seed every RNG the notebook draws from so a Restart & Run All is repeatable:
# NumPy drives the np.random.choice undersampling further down, and PyTorch
# drives the random initialisation of the newly added classification-head weights.
SEED = 10
np.random.seed(SEED)
torch.manual_seed(SEED)

# Turn off parallelism in the Hugging Face `tokenizers` backend
os.environ["TOKENIZERS_PARALLELISM"] = 'false'

### Load Preprocessed Data

In [3]:
# Read the clustered articles and give them a fresh 0..n-1 index in one chained step
articles = (
    pd.read_parquet('../data/cluster_articles.gzip')
    .reset_index(drop=True)
)

In [4]:
# Preview the first five rows of the loaded frame
articles.head(n=5)

Unnamed: 0,title,author,publication,content,party,cluster
0,Breitbart Launches ’Border Wall Construction C...,Milo,Breitbart,last weekend church confessed sin personal van...,right,-1
1,IDF Airstrike Eliminates 4 Islamic State-Linke...,Breitbart Jerusalem,Breitbart,times israel reports israeli airstrike killed ...,right,4
2,Oracle Funds Anti-Google Effort that Outs Hill...,Chriss W. Street,Breitbart,oracle corporation using deep financial resour...,right,13
3,Silicon Valley Urges Giving Election Day Off t...,Chriss W. Street,Breitbart,apparently worried populist movement led donal...,right,-1
4,Illegal Migrant Abandoned in Desert Calls 911 ...,Bob Price,Breitbart,severely dehydrated illegal alien called 911 p...,right,5


### Data Preprocessing

In [5]:
# Cluster id -1 marks articles without a definitive cluster; keep only the rest
has_cluster = articles['cluster'].ne(-1)
articles = articles.loc[has_cluster].reset_index(drop=True)

### Initialize DistilBERT Tokenizer and Classifier

In [6]:
# Pre-trained checkpoint shared by the tokenizer, the config and the model
MODEL_NAME = 'distilbert-base-uncased'

# Cluster ids are used directly as class indices, so the classification head
# needs (largest id + 1) outputs. Deriving this from the data keeps the head in
# sync if the clustering changes (it evaluates to 25 for cluster ids 0-24).
NUM_LABELS = int(articles['cluster'].max()) + 1

# Initialize DistilBERT tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_NAME)

# Initialize DistilBERT configuration with one output per cluster
dist_config = DistilBertConfig.from_pretrained(MODEL_NAME, num_labels=NUM_LABELS)

# Load the pre-trained DistilBERT weights under a freshly initialised classification head
model = DistilBertForSequenceClassification.from_pretrained(MODEL_NAME, config=dist_config)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

### Tokenize Data using DistilBERT

In [7]:
# Collect the article bodies as a plain Python list for the tokenizer
article_texts = articles['content'].tolist()

# Encode every article as PyTorch tensors: sequences longer than 512 tokens are
# truncated and the batch is padded to a common length
tokenized_articles = tokenizer(
    text=article_texts,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors='pt',
)

In [8]:
# Extract tokenized input IDs
# NOTE(review): only the token ids are kept; the tokenizer's attention mask is
# not carried forward from this point.
pt_articles = tokenized_articles['input_ids']

# Reformat cluster column as a tensor of class indices. Going through NumPy
# avoids relying on how torch iterates a pandas Series (label-based element
# access), and the explicit long dtype is what classification losses expect.
pt_clusters = tensor(articles['cluster'].to_numpy(), dtype=torch.long)

### Split Data into Training and Test Tensors

In [9]:
# Hold out 15% of the articles, shuffled with a fixed seed and stratified on the
# cluster label so both splits keep the original class proportions
split_options = {
    'test_size': 0.15,
    'random_state': 42,
    'shuffle': True,
    'stratify': pt_clusters,
}
X_train, X_test, y_train, y_test = train_test_split(pt_articles, pt_clusters, **split_options)

# Report the size of each split
print('Lengths: ', *(len(part) for part in (X_train, X_test, y_train, y_test)))

# Report the per-cluster counts; the classes are imbalanced, which is why the
# training data is resampled and the test data undersampled in later cells
print('Counts: ', Counter(pt_clusters.tolist()))

Lengths:  24498 4324 24498 4324
Counts:  Counter({13: 12238, 5: 4002, 4: 1969, 16: 1399, 24: 1127, 18: 1079, 1: 786, 22: 732, 23: 650, 10: 571, 7: 530, 0: 457, 3: 371, 11: 350, 17: 347, 21: 339, 9: 269, 8: 261, 15: 252, 6: 249, 14: 239, 19: 188, 20: 184, 2: 120, 12: 113})


### Undersample Test Labels

In [10]:
# View the held-out labels as a NumPy array so np.unique / boolean masks apply
test_labels_np = y_test.detach().cpu().numpy()

# Every class is cut down to the size of the rarest class in the test split
per_class = min(Counter(test_labels_np).values())

# Draw `per_class` row positions without replacement for each label, visiting
# the labels in the ascending order returned by np.unique
balanced_idx = list(chain.from_iterable(
    np.random.choice(np.flatnonzero(test_labels_np == label), per_class, replace=False)
    for label in np.unique(test_labels_np)
))

# Keep only the sampled rows so every label is equally frequent in the test data
X_test = X_test[balanced_idx]
y_test = y_test[balanced_idx]

In [11]:
# Print the label distribution of the full data, the training split and the
# test split, separated by a '---' rule
distribution_reports = [
    ('Frequency of each label in entire dataset:', pt_clusters),
    ('Frequency of each label in training set:', y_train),
    ('Frequency of each label in test set:', y_test),
]
for position, (heading, label_tensor) in enumerate(distribution_reports):
    if position > 0:
        print('---')
    print(heading)
    print()
    print(Counter(label_tensor.tolist()))

Frequency of each label in entire dataset:

Counter({13: 12238, 5: 4002, 4: 1969, 16: 1399, 24: 1127, 18: 1079, 1: 786, 22: 732, 23: 650, 10: 571, 7: 530, 0: 457, 3: 371, 11: 350, 17: 347, 21: 339, 9: 269, 8: 261, 15: 252, 6: 249, 14: 239, 19: 188, 20: 184, 2: 120, 12: 113})
---
Frequency of each label in training set:

Counter({13: 10402, 5: 3402, 4: 1674, 16: 1189, 24: 958, 18: 917, 1: 668, 22: 622, 23: 552, 10: 485, 7: 451, 0: 388, 3: 315, 11: 298, 17: 295, 21: 288, 9: 229, 8: 222, 15: 214, 6: 212, 14: 203, 19: 160, 20: 156, 2: 102, 12: 96})
---
Frequency of each label in test set:

Counter({0: 17, 1: 17, 2: 17, 3: 17, 4: 17, 5: 17, 6: 17, 7: 17, 8: 17, 9: 17, 10: 17, 11: 17, 12: 17, 13: 17, 14: 17, 15: 17, 16: 17, 17: 17, 18: 17, 19: 17, 20: 17, 21: 17, 22: 17, 23: 17, 24: 17})


### Synthetic Oversampling/Undersampling of Training Data

In [12]:
# Show the training label counts before undersampling the heavily
# over-represented political articles
train_label_counts = Counter(y_train.tolist())
print(train_label_counts)

Counter({13: 10402, 5: 3402, 4: 1674, 16: 1189, 24: 958, 18: 917, 1: 668, 22: 622, 23: 552, 10: 485, 7: 451, 0: 388, 3: 315, 11: 298, 17: 295, 21: 288, 9: 229, 8: 222, 15: 214, 6: 212, 14: 203, 19: 160, 20: 156, 2: 102, 12: 96})


In [13]:
# View the training labels as a NumPy array so boolean masks apply
train_labels_np = y_train.detach().cpu().numpy()

# Clusters holding at least `cap` articles are cut down to exactly `cap`;
# smaller clusters are kept in full
cap = 1000
class_counts = Counter(y_train.tolist())
oversized = [label for label, n_articles in class_counts.items() if n_articles >= cap]
within_cap = [label for label, n_articles in class_counts.items() if n_articles < cap]

# Start from every row of the small clusters, then add a random sample
# (without replacement) of `cap` rows from each oversized cluster
keep_idx = np.flatnonzero(np.isin(train_labels_np, within_cap)).tolist()
for label in oversized:
    keep_idx.extend(
        np.random.choice(np.flatnonzero(train_labels_np == label), cap, replace=False)
    )

# Restrict the training data to the selected rows
X_train = X_train[keep_idx]
y_train = y_train[keep_idx]

# Print undersampled counts of each label
print(Counter(y_train.tolist()))

Counter({13: 1000, 5: 1000, 4: 1000, 16: 1000, 24: 958, 18: 917, 1: 668, 22: 622, 23: 552, 10: 485, 7: 451, 0: 388, 3: 315, 11: 298, 17: 295, 21: 288, 9: 229, 8: 222, 15: 214, 6: 212, 14: 203, 19: 160, 20: 156, 2: 102, 12: 96})


In [14]:
# Build the SMOTEN oversampler (the SMOTE variant for nominal features) with a
# fixed seed so the synthetic samples are repeatable
smoten_seed = 12
sampler = SMOTEN(random_state=smoten_seed)

In [15]:
# Oversample the under-represented clusters of the training data with SMOTEN
resampled_features, resampled_labels = sampler.fit_resample(X_train, y_train)
X_res, y_res = resampled_features, resampled_labels

In [16]:
# Reformat the resampled data as int64 PyTorch tensors. torch.as_tensor accepts
# both NumPy arrays and tensors, so re-running this cell is harmless (no
# copy-construct warning) and no extra copy is made when the dtype already matches.
X_res = torch.as_tensor(X_res, dtype=torch.long)
y_res = torch.as_tensor(y_res, dtype=torch.long)

In [17]:
# Show how many training examples each label has after oversampling
resampled_counts = Counter(y_res.tolist())
print(resampled_counts)

Counter({11: 1000, 17: 1000, 22: 1000, 6: 1000, 18: 1000, 10: 1000, 1: 1000, 24: 1000, 15: 1000, 21: 1000, 3: 1000, 20: 1000, 7: 1000, 23: 1000, 19: 1000, 8: 1000, 14: 1000, 0: 1000, 12: 1000, 9: 1000, 2: 1000, 13: 1000, 5: 1000, 4: 1000, 16: 1000})


### Create PyTorch Dataset

In [18]:
# Create custom Dataset with articles
class ArticlesDataset(Dataset):
    """Map-style dataset pairing padded token-id sequences with cluster labels.

    Each item is a dict with the keys ``input_ids``, ``attention_mask`` and
    ``labels``. The attention mask is derived from the token ids: positions
    holding ``pad_token_id`` get 0 and every other position gets 1, so padding
    is not attended to by the model.

    Args:
        embeddings: Indexable collection of token-id tensors, one row per article.
        clusters: Indexable collection of labels aligned with ``embeddings``.
        pad_token_id: Token id used for padding. Defaults to 0, the [PAD] id of
            the ``distilbert-base-uncased`` vocabulary; pass
            ``tokenizer.pad_token_id`` when using a different tokenizer.
    """

    def __init__(self, embeddings, clusters, pad_token_id=0):
        self.encodings = embeddings
        self.labels = clusters
        self.pad_token_id = pad_token_id

    def __getitem__(self, idx):
        """Return the model inputs and label for the example at position ``idx``."""
        input_ids = self.encodings[idx]
        return {
            'input_ids': input_ids,
            # Without an explicit mask every position, padding included, is attended to
            'attention_mask': (input_ids != self.pad_token_id).long(),
            'labels': self.labels[idx],
        }

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)

# Wrap the resampled training tensors in the custom Dataset
train_dataset = ArticlesDataset(embeddings=X_res, clusters=y_res)

# Wrap the balanced test tensors in the custom Dataset
test_dataset = ArticlesDataset(embeddings=X_test, clusters=y_test)

### Initialize BERT Evaluation Metrics

In [19]:
def compute_metrics(pred):
    """Compute accuracy and macro-averaged precision/recall/F1 for the Trainer.

    Macro averaging is used because, for single-label multiclass predictions,
    micro-averaged precision, recall and F1 are all equal to accuracy and so
    add no information. Macro averaging weights every cluster equally.

    Args:
        pred: Prediction object exposing ``label_ids`` (true class indices) and
            ``predictions`` (per-class scores; the arg-max over the last axis
            is taken as the predicted class).

    Returns:
        dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # zero_division=0 scores a class that is never predicted as 0 without a warning
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='macro', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

### Train BERT Classifier

In [20]:
# Training configuration, grouped by concern
training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir='../bert_results',
    logging_dir='../bert_logs',
    # Length of training and optimisation schedule
    num_train_epochs=3,
    warmup_steps=500,
    weight_decay=0.01,
    # Batch sizes per device for training and for evaluation
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
)

# Trainer that fine-tunes the model on the resampled training set and scores it
# on the balanced test set with compute_metrics
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

In [1]:
# Fine-tune DistilBERT; keep the returned summary and show it as the cell output
train_output = trainer.train()
train_output

NameError: name 'trainer' is not defined

### Evaluate Model Trained on SMOTEN-Resampled Data

In [None]:
# Score the model trained on the SMOTEN-resampled data against the evaluation
# dataset; show the metrics as the cell output
eval_metrics = trainer.evaluate()
eval_metrics

In [None]:
# Run inference once and reuse the result for both lines
test_output = trainer.predict(test_dataset)

# Compare predicted and true label for the FIRST test example. The arg-max is
# taken over that example's scores only; an arg-max over the whole predictions
# matrix would return an index into the flattened array, not a class id.
print('prediction: ' + str(np.argmax(test_output.predictions[0])))
print('actual: ' + str(test_output.label_ids[0]))

In [None]:
# TODO: Grid-search the hyperparameters and output the model that produces the best accuracy using SMOTEN
# TODO: Then compare it against the model trained with oversampling