In [None]:
# Work from /data: the relative CSV paths used later in this notebook resolve against it.
# NOTE(review): absolute path and unpinned install - adjust both for your environment.
%cd /data
# KeyBERT is imported by the next cell; %pip installs it into the kernel's environment.
%pip install keybert

In [None]:
import random

import pandas as pd
import numpy as np
import re

from sklearn.utils import shuffle
import string
from transformers import DistilBertTokenizer, DistilBertModel
import torch
import time
from datetime import datetime
import ast
from torch.nn.utils.rnn import pad_sequence
from keybert import KeyBERT

from tqdm.notebook import tqdm

In [None]:
# Keyword extractor built with KeyBERT's default settings; used by bert_encoder.
kwd_model = KeyBERT()
# Tokenizer matching the 'distilbert-base-uncased' checkpoint that the classifier fine-tunes.
bert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

In [None]:
def bert_encoder(text):
    """ Prefix a text with its KeyBERT keyphrases and tokenize it for DistilBERT
    Parameters
    ----------
    text: str, the document to encode

    Returns
    -------
        the output of `bert_tokenizer(..., return_tensors='pt')` for the single
        keyphrase-prefixed sequence, truncated to at most 64 tokens; callers
        read its 'input_ids' and 'attention_mask' entries
    """
    # Only the first 2048 space-separated words take part in keyword extraction.
    words = text.split(" ")[:2048]
    # The number of requested keyphrases grows with log2 of the word count.
    # split(" ") never returns an empty list, so log2 is always defined, but a
    # single-word text would give 0 - always ask for at least one keyphrase.
    n_words = max(1, int(np.log2(len(words))))
    keywords = kwd_model.extract_keywords(" ".join(words), keyphrase_ngram_range=(1, 3), top_n=n_words)
    # Keep only the first element (the phrase) of each extraction result.
    keyphrases = [keyword[0] for keyword in keywords]
    # Keyphrases go in front of the full text so they survive the truncation below.
    s = " ".join(keyphrases) + " " + text
    tokens = bert_tokenizer(s, return_tensors='pt', padding=True, max_length=64, truncation=True)
    return tokens

In [None]:
def clean(seq):
    """ Preprocess sentences for BERT
    Parameters
    ----------
    seq: str, raw sentence

    Returns
    -------
    str, preprocessed sentence: lower-cased, camelCase words split apart,
    words containing digits dropped and ASCII punctuation removed
    """
    # Brackets, parentheses and the separators = , ; become spaces so the
    # tokens they join are split apart. Raw string: the previous non-raw
    # literal relied on invalid escape sequences such as '\]' and '\='.
    seq = re.sub(r'[\]\[)(=,;]', ' ', seq)
    # Lower-case fully upper-case words first so the camelCase split below
    # (which only matches capital letters) leaves them untouched.
    seq = " ".join(word.lower() if word.isupper() else word for word in seq.split())
    # Split camelCase / PascalCase: put a space before every run of capitals,
    # then before every Capitalised word.
    seq = re.sub(r'([A-Z]+)', r' \1', seq)
    seq = re.sub(r'([A-Z][a-z]+)', r' \1', seq)
    # Drop every word that contains a digit.
    seq = " ".join(word for word in seq.split() if not re.search(r'\d', word))
    # Delete all ASCII punctuation characters.
    seq = seq.translate(str.maketrans('', '', string.punctuation))
    return " ".join(word.lower() for word in seq.split())

In [None]:
def _read_labelled_csv(path):
    """ Read one CSV split and return (texts, labels) without unlabelled rows. """
    frame = pd.read_csv(path)
    # 'text' cells hold Python literals stored as strings; parse them back
    texts = np.array(frame['text'].apply(ast.literal_eval))
    labels = np.array(frame['ideology'].tolist())
    # keep only the rows whose ideology is not NaN
    labelled = ~np.isnan(labels)
    return texts[labelled], labels[labelled]


def load_data(train, validation):
    """ Load the training and validation splits from CSV files
    Arguments
    ---------
        train: str, the file path of training resolutions.
        validation: str, the file path of validation resolutions.
    Returns
    -------
        X_train, y_train: parsed 'text' entries and 'ideology' labels of the training split
        X_val, y_val: the same for the validation split
        Rows whose 'ideology' is NaN are dropped from both splits.
    """
    X_train, y_train = _read_labelled_csv(train)
    X_val, y_val = _read_labelled_csv(validation)
    return X_train, y_train, X_val, y_val

In [None]:
#load the parsed texts and ideology labels of the train and validation splits
#(relative paths: resolved against the directory selected by the %cd magic in the first cell)
X_train, y_t, X_val, y_v = load_data('train.csv', 'val.csv')

In [None]:
def _tokenize_split(texts, log_every=None):
    """ Clean, keyword-augment and tokenize every document of one split

    texts: iterable of documents, each a sequence of strings
    log_every: if set, print the running index every `log_every` documents

    Returns two lists with one 1-D tensor per document: input ids and attention masks
    """
    ids, masks = [], []
    for i, text in enumerate(texts):
        # clean() is applied here exactly as it is for the test split, so the
        # model is trained on the same kind of text it is later evaluated on
        s = clean(" ".join(text)).lower()
        token = bert_encoder(s)
        # drop only the leading batch dimension added by return_tensors='pt'
        ids.append(token['input_ids'].squeeze(0))
        masks.append(token['attention_mask'].squeeze(0))
        if log_every and i % log_every == 0:
            print(i)
    return ids, masks


#clean and tokenize X_train (progress is printed every 100 documents)
X_t, X_t_mask = _tokenize_split(X_train, log_every=100)

#clean and tokenize X_val
X_v, X_v_mask = _tokenize_split(X_val)

In [None]:
#right-pad every list of per-document tensors with zeros into one (n_docs, max_len) batch
X_t, X_t_mask, X_v, X_v_mask = (
    pad_sequence(sequences, batch_first=True, padding_value=0)
    for sequences in (X_t, X_t_mask, X_v, X_v_mask)
)

In [None]:
#the padded batches are already tensors: copy them with clone().detach(), which is
#what torch recommends instead of torch.tensor(existing_tensor) (that call warns)
X_train = X_t.clone().detach()
X_val = X_v.clone().detach()

X_train_mask = X_t_mask.clone().detach()
X_val_mask = X_v_mask.clone().detach()

In [None]:
#turn the numpy label arrays of both splits into tensors
y_train, y_val = map(torch.tensor, (y_t, y_v))

In [None]:
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import AdamW
from torch.utils.data import Dataset, TensorDataset, DataLoader
from sklearn.utils import shuffle

In [None]:
class TransformerClassifier(nn.Module):
    """DistilBERT encoder followed by masked mean pooling, dropout, a linear
    head and a sigmoid.

    forward() returns one value in (0, 1) per output unit, i.e. a tensor of
    shape (batch, num_classes). The values are sigmoid activations, not raw
    logits, even though callers name them `logits`.
    """

    def __init__(self, num_classes):
        super().__init__()
        self.bert = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.dropout = nn.Dropout(0.1)
        # Size the head from the encoder's own config instead of hard-coding 768.
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)
        self.sigmoid = nn.Sigmoid()

    @staticmethod
    def _masked_mean(hidden_states, attention_mask):
        """Average token vectors over the sequence axis, ignoring masked-out positions.

        hidden_states: (batch, seq_len, hidden) float tensor
        attention_mask: (batch, seq_len) tensor, non-zero for real tokens
        """
        mask = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
        summed = (hidden_states * mask).sum(dim=1)
        # clamp guards against division by zero for a fully masked row
        counts = mask.sum(dim=1).clamp(min=1.0)
        return summed / counts

    def forward(self, input_ids, attention_mask):
        output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # A plain mean over dim=1 would also average the vectors of padding
        # tokens; weight by the attention mask so only real tokens contribute.
        pooled_output = self._masked_mean(output.last_hidden_state, attention_mask)
        pooled_output = self.dropout(pooled_output)
        logits = self.fc(pooled_output)
        logits = self.sigmoid(logits)
        return logits

In [None]:
model = TransformerClassifier(num_classes=2)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
#the model's second output is regressed against the continuous ideology score
loss_fn = nn.MSELoss()

train_dataset = TensorDataset(X_train, X_train_mask, y_train)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

val_dataset = TensorDataset(X_val, X_val_mask, y_val)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)


for epoch in range(20):
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids, attention_mask, target = batch
        logits = model(input_ids, attention_mask)
        #only output column 1 enters the loss
        probs = logits[:,1].float()
        loss = loss_fn(probs, target.float())
        loss.backward()
        optimizer.step()

    model.eval()
    val_loss = 0
    val_acc = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, target = batch
            logits = model(input_ids, attention_mask)
            probs = logits[:,1].float()
            val_loss += loss_fn(probs, target.float()).item()
            #output column 0 never enters the loss, so an argmax over both columns
            #compares the trained score with an untrained one; round the trained
            #score instead, exactly as the target is rounded
            val_acc += (torch.round(probs) == torch.round(target.float())).sum().item()
    #val_loss: mean of the per-batch losses; val_acc: fraction of correctly rounded scores
    val_loss /= len(val_loader)
    val_acc /= len(val_loader.dataset)
    print(f"Epoch {epoch+1}, Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")

In [None]:
def load_test_data(test):
    """ Load the test split from a CSV file
    Arguments
    ---------
        test: str, the file path of test resolutions.
    Returns
    -------
        (X_test, y_test): parsed 'text' entries and 'ideology' labels of the
        test split; rows whose 'ideology' is NaN are dropped
    """
    test_data = pd.read_csv(test)

    #'text' cells hold Python literals stored as strings; parse them back
    X_test = np.array(test_data['text'].apply(ast.literal_eval))
    y_test = np.array(test_data['ideology'].tolist())

    #filter entries with no assigned ideology
    labelled = ~np.isnan(y_test)
    return X_test[labelled], y_test[labelled]

In [None]:
#load the parsed test texts and labels; note that X_t (the padded training ids
#in earlier cells) is rebound here to the raw test texts
X_t, y_test = load_test_data('test.csv')

In [None]:
X_test = []
X_test_mask = []
#clean and tokenize every raw test document held in X_t; X_test / X_test_mask
#collect the resulting input ids and attention masks, one tensor per document
for i, text in enumerate(X_t):
    #each entry is a sequence of strings; merge it into a single string
    seq = " ".join(text)
    s = clean(seq).lower()
    token = bert_encoder(s)
    #squeeze() removes the size-1 dimensions of the tokenizer output
    X_test.append(token['input_ids'].squeeze())
    X_test_mask.append(token['attention_mask'].squeeze())
    #progress indicator: print the index every 100 documents
    if i % 100 == 0:
        print(i)

In [None]:
#right-pad the per-document test tensors with zeros into one batch each
X_test, X_test_mask = (
    pad_sequence(sequences, batch_first=True, padding_value=0)
    for sequences in (X_test, X_test_mask)
)

In [None]:
#the padded batches are already tensors: copy them with clone().detach(), which is
#what torch recommends instead of torch.tensor(existing_tensor) (that call warns)
X_test = X_test.clone().detach()
X_test_mask = X_test_mask.clone().detach()

In [None]:
#convert the numpy test labels to a tensor (rebinds y_test in place)
y_test = torch.tensor(y_test)

In [None]:
test_dataset = TensorDataset(X_test, X_test_mask, y_test)
#evaluation does not need shuffling; a fixed order (as for val_loader) keeps the
#collected predictions aligned with the rows of the test set and reproducible
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [None]:
#per-example outputs, collected as 0-d tensors:
#  preds / true               - predicted score and ideology label
#  binary_preds / true_class  - the same two values rounded to a class index
preds = []
true = []
binary_preds = []
true_class = []
#inference only: switch dropout off explicitly and do not record autograd graphs,
#otherwise every stored prediction keeps its whole graph alive
model.eval()
with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, target = batch
        logits = model(input_ids, attention_mask)
        probs = logits[:,1].float()
        preds.extend(probs)
        true.extend(target)
        #output column 0 never enters the training loss, so an argmax over both
        #columns is not a valid prediction; round the trained score instead,
        #exactly as the target is rounded
        binary_preds.extend(torch.round(probs).long())
        true_class.extend(torch.round(target.float()).long())

In [None]:
def tensors_to_list(list_of_tensors):
    """Turn single-element tensors into plain Python numbers rounded to 4 decimals."""
    rounded_values = []
    for tensor in list_of_tensors:
        scalar = tensor.item()
        rounded_values.append(round(scalar, 4))
    return rounded_values


In [None]:
#replace each list of 0-d tensors by a list of rounded Python numbers
preds, true, binary_preds, true_class = map(
    tensors_to_list, (preds, true, binary_preds, true_class)
)

In [None]:
#print sklearn's classification report of binary_preds against true_class
from sklearn.metrics import classification_report
print(classification_report(true_class, binary_preds))