In [None]:
# Install dependencies for Python libraries
!pip install -r https://raw.githubusercontent.com/dkharazi/bert-news/main/requirements.txt

In [None]:
# Import libraries for generic data preprocessing
import os
from collections import Counter
from itertools import chain

import numpy as np
import pandas as pd

# Import libraries for model selection and accuracy measures
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Import library for synthetic oversampling of nominal features
from imblearn.over_sampling import SMOTEN

# Import BERT transformer libraries
import torch
from torch.utils.data import Dataset
from torch import tensor
from sentence_transformers import SentenceTransformer
from transformers import (
    DistilBertTokenizerFast,
    DistilBertConfig,
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments
    )

### Set Random Seed

In [None]:
# Single seed shared by every random number generator seeded in this cell
SEED = 10

# NumPy drives the np.random.choice calls used for undersampling further down
np.random.seed(SEED)

# PyTorch drives the random initialisation of the new classification head,
# which is created before the Trainer gets a chance to apply its own seed
torch.manual_seed(SEED)

# Turn off parallelism inside the Hugging Face tokenizers library
os.environ["TOKENIZERS_PARALLELISM"] = 'false'

### Load Preprocessed Data

In [None]:
# Remote location of the preprocessed articles (parquet file served raw from GitHub)
ARTICLES_URL = 'https://github.com/dkharazi/bert-news/blob/main/data/cluster_articles.gzip?raw=true'

# Read the parquet file and replace whatever index it was stored with by a fresh 0..n-1 index
articles = pd.read_parquet(ARTICLES_URL).reset_index(drop=True)

In [None]:
# Preview the first rows to sanity-check the loaded columns
articles.head()

### Data Preprocessing

In [None]:
# A cluster value of -1 marks an article without a definitive cluster; keep everything else
has_cluster = articles['cluster'].ne(-1)
articles = articles.loc[has_cluster].reset_index(drop=True)

### Initialize BERT Tokenizer and Classifier

In [None]:
# Size of the classification head. The cluster ids are used directly as class
# labels, so every id must lie in [0, NUM_LABELS).
NUM_LABELS = 25

# Fail early with a readable message instead of an index error deep inside training
assert articles['cluster'].between(0, NUM_LABELS - 1).all(), (
    f'cluster ids must lie in [0, {NUM_LABELS - 1}] to match num_labels={NUM_LABELS}'
)

# Use the GPU when one is available; otherwise fall back to the CPU instead of crashing
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Initialize BERT tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Initialize BERT configurations
dist_config = DistilBertConfig.from_pretrained('distilbert-base-uncased', num_labels=NUM_LABELS)

# Implement pre-trained BERT model
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', config=dist_config).to(device)

### Tokenize Data using DistilBERT

In [None]:
# Raw article bodies, one string per row
article_texts = articles['content'].tolist()

# Encode every article as PyTorch tensors: truncate at 512 tokens and pad
# so that all rows of the batch share one length
tokenized_articles = tokenizer(
    text=article_texts,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors='pt',
)

In [None]:
# Labels: the cluster column converted to a tensor
pt_clusters = tensor(articles['cluster'])

# Features: only the token ids are carried forward from the tokenizer output
pt_articles = tokenized_articles['input_ids']

### Split Data into Training and Test Tensors

In [None]:
# Hold out 15% of the articles, stratified on cluster so both splits keep the label mix
split_options = dict(test_size=0.15, shuffle=True, stratify=pt_clusters, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(pt_articles, pt_clusters, **split_options)

# Report the size of each split
split_sizes = [len(part) for part in (X_train, X_test, y_train, y_test)]
print('Lengths: ', *split_sizes)

# The label counts show the class imbalance that motivates the next sections:
# the training data is oversampled and the test data is undersampled
print('Counts: ', Counter(pt_clusters.tolist()))

### Undersample Test Labels

In [None]:
# Work on a NumPy copy of the held-out labels
yt = y_test.cpu().detach().numpy()

# The rarest label decides how many test rows every label gets to keep
test_labels, test_counts = np.unique(yt, return_counts=True)
min_lab = int(test_counts.min())

# Draw that many row positions per label, without replacement
idx = []
for label in test_labels:
    candidates = np.flatnonzero(yt == label)
    idx.extend(np.random.choice(candidates, min_lab, replace=False))

# Restrict the test split to the drawn rows so every label is equally frequent
X_test, y_test = X_test[idx], y_test[idx]

In [None]:
# Print the label distribution of each split, one section per split
label_sets = [
    ('Frequency of each label in entire dataset:', pt_clusters),
    ('Frequency of each label in training set:', y_train),
    ('Frequency of each label in test set:', y_test),
]
for position, (heading, label_tensor) in enumerate(label_sets):
    # Separator goes between sections only, never before the first one
    if position > 0:
        print('---')
    print(heading)
    print()
    print(Counter(label_tensor.tolist()))

### Synthetic Oversampling/Undersampling of Training Data

In [None]:
# Largest number of training articles any single cluster may keep
MAX_PER_CLUSTER = 1000

# Work on a NumPy copy of the training labels
yt = y_train.cpu().detach().numpy()

# Split the clusters into those at/above the cap and those below it
train_counts = Counter(y_train.tolist())
large_c = [c for c, count in train_counts.items() if count >= MAX_PER_CLUSTER]
small_c = [c for c, count in train_counts.items() if count < MAX_PER_CLUSTER]

# Small clusters keep every row; large clusters keep a random draw of MAX_PER_CLUSTER rows
idx = np.flatnonzero(np.isin(yt, small_c)).tolist()
for cluster in large_c:
    members = np.flatnonzero(yt == cluster)
    idx.extend(np.random.choice(members, MAX_PER_CLUSTER, replace=False))

# Restrict the training split to the selected rows
X_train, y_train = X_train[idx], y_train[idx]

# Print undersampled counts of each label
print(Counter(y_train.tolist()))

In [None]:
# Initialize SMOTEN object for oversampling nominal features
# (a fixed random_state is passed, presumably so the synthetic samples are repeatable between runs)
sampler = SMOTEN(random_state=12)

In [None]:
# Simulate oversampled data for imbalanced classes.
# Inputs are the undersampled training token ids and their cluster labels; the result is
# stored under new names (X_res, y_res), so X_train / y_train are left as they were.
X_res, y_res = sampler.fit_resample(X_train, y_train)

In [None]:
# Reformat the resampled numpy arrays as int64 PyTorch tensors
X_res, y_res = tensor(X_res).long(), tensor(y_res).long()

# The tensors are deliberately left on the CPU. The earlier code moved X_train / y_train
# to 'cuda' here, but those tensors are not used again (training runs on X_res / y_res),
# so the copy only consumed GPU memory and crashed on machines without CUDA.
# Moving batches to the training device is left to the Hugging Face Trainer.

In [None]:
# Print oversampled counts of each label (computed from the resampled labels y_res)
print(Counter(y_res.tolist()))

### Create PyTorch Dataset

In [None]:
# Create custom Dataset with articles
class ArticlesDataset(Dataset):
    """Map-style dataset pairing tokenized articles with their cluster labels.

    Each item is a dict holding the token ids, an attention mask derived from
    them, and the label, keyed by 'input_ids', 'attention_mask' and 'labels'.

    Args:
        embeddings: 2-D tensor of token ids, one padded article per row.
            Must support elementwise comparison and ``.long()`` (a torch tensor).
        clusters: per-article labels, indexable in parallel with ``embeddings``.
        pad_token_id: id of the padding token; positions holding this id are
            masked out. Defaults to 0, the [PAD] id of 'distilbert-base-uncased'.
    """

    def __init__(self, embeddings, clusters, pad_token_id=0):
        self.encodings = embeddings
        self.labels = clusters
        self.pad_token_id = pad_token_id

    def __getitem__(self, idx):
        input_ids = self.encodings[idx]
        return {
            'input_ids': input_ids,
            # Without a mask the model attends to padding positions as if they were
            # real tokens; rebuild it from the ids: 1 = real token, 0 = padding
            'attention_mask': (input_ids != self.pad_token_id).long(),
            'labels': self.labels[idx],
        }

    def __len__(self):
        return len(self.labels)

# Reformat training data as PyTorch Dataset
# (built from the SMOTEN-resampled tensors X_res / y_res, not from X_train / y_train)
train_dataset = ArticlesDataset(X_res, y_res)

# Reformat test data as PyTorch Dataset
# (built from the undersampled test tensors X_test / y_test)
test_dataset = ArticlesDataset(X_test, y_test)

### Initialize BERT Evaluation

In [None]:
def compute_metrics(pred):
    """Compute accuracy plus macro-averaged precision, recall and F1.

    Args:
        pred: prediction object handed over by the Trainer. Two attributes are
            read: ``label_ids`` (true class ids) and ``predictions`` (per-class
            scores with the class axis last).

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # Macro averaging weights every class equally. With micro averaging on a
    # single-label multiclass task, precision, recall and F1 all collapse to the
    # accuracy, so the four reported numbers carried no extra information.
    # zero_division=0 scores a never-predicted class as 0 instead of warning.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='macro', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

### Train BERT Classifier

In [None]:
# Hyperparameters and output locations for fine-tuning
training_args = TrainingArguments(
    # where results and logs are written
    output_dir='../bert_results',
    logging_dir='../bert_logs',
    # training schedule: total epochs and learning-rate warmup steps
    num_train_epochs=3,
    warmup_steps=500,
    # strength of weight decay
    weight_decay=0.01,
    # batch sizes per device for training and for evaluation
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
)

# Trainer that fine-tunes the model on the resampled training set and
# scores it on the balanced test set with compute_metrics
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
# Train BERT (fine-tunes `model` on train_dataset using the settings in training_args)
trainer.train()

### Evaluate SMOTEN BERT Classifier

In [None]:
# Score the fine-tuned model on the eval_dataset given to the Trainer (test_dataset)
trainer.evaluate()

In [None]:
# TODO: Run a grid search and output the model producing the best accuracy using SMOTEN.
# TODO: Then compare that model with one trained using oversampling.

### Predict using BERT Classifier

In [None]:
# Run inference once and reuse the result; every predict() call is a full pass over the test set
test_output = trainer.predict(test_dataset)

# predictions holds one row of class scores per article, so the argmax must be taken
# along the last axis. A plain np.argmax would flatten the whole matrix and return a
# position in it rather than a class id.
predicted_labels = np.argmax(test_output.predictions, axis=-1)

# Compare predicted and true label for the first test article
print('prediction: ' + str(predicted_labels[0]))
print('actual: ' + str(test_output.label_ids[0]))

### Potential Fixes

- Go back and remove this: [0:5]
    - Try this to fix the kernel issue https://stackoverflow.com/a/59949321/12777044
- Go back and expand test dataset from a single point: [0:1]