In [3]:
import argparse
import json
import random as python_random
from itertools import product

import numpy as np
import tensorflow as tf
from keras.initializers import Constant
from keras.layers import Dense, Embedding, LSTM
from keras.models import Sequential
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, accuracy_score, f1_score
from sklearn.preprocessing import LabelBinarizer, LabelEncoder, OneHotEncoder
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.losses import CategoricalCrossentropy, BinaryCrossentropy
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.optimizers.legacy import Adam
from tensorflow.keras.optimizers.schedules import PolynomialDecay, CosineDecay, CosineDecayRestarts
from transformers import AutoTokenizer
from transformers import TFAutoModelForSequenceClassification

In [2]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m32.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m30.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m32.3 MB/s[0m eta [36m0:00:00[0m
Col

In [4]:
# Mount Google Drive at /content/gdrive; the corpus files read below live under
# /content/gdrive/MyDrive. This only works inside Google Colab.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [5]:
def read_corpus(corpus_file):
    """Read a review data set and return its documents and labels.

    Each non-blank line is split on whitespace: the last field is the label
    and all preceding fields, re-joined with single spaces, form the document.

    Args:
        corpus_file: path of a UTF-8 encoded text file, one instance per line.

    Returns:
        A ``(documents, labels)`` tuple of two equally long lists of strings.
    """
    documents = []
    labels = []
    with open(corpus_file, encoding='utf-8') as f:
        for line in f:
            fields = line.split()
            # Blank lines (e.g. a trailing newline at the end of the file)
            # carry no instance; indexing the label on them would fail.
            if not fields:
                continue
            *text_tokens, label = fields
            documents.append(" ".join(text_tokens))
            labels.append(label)
    return documents, labels

In [7]:
# Read in the train / dev / test splits
X_train, Y_train = read_corpus('/content/gdrive/MyDrive/Colab Notebooks/ja/train.tsv')
X_dev, Y_dev = read_corpus('/content/gdrive/MyDrive/Colab Notebooks/ja/dev.tsv')
X_test, Y_test = read_corpus('/content/gdrive/MyDrive/Colab Notebooks/ja/test.tsv')
from sklearn.preprocessing import LabelBinarizer, LabelEncoder

# Transform string labels to binary indicator columns.
# The encoder is fitted on the training labels ONLY; dev and test are merely
# transformed so that all three splits share one label -> index mapping
# (re-fitting per split would silently change the mapping if a split had a
# different label set, and would make `labels` depend on the test data).
encoder = LabelBinarizer()
Y_train_bin = encoder.fit_transform(Y_train)  # Use encoder.classes_ to find mapping back
Y_dev_bin = encoder.transform(Y_dev)
Y_test_bin = encoder.transform(Y_test)
labels = encoder.classes_

# Transform the training labels so the model can be finetuned properly:
# expand the single indicator column y into the two columns [1 - y, y],
# one per output logit of the classifier.
Y_train_bin = np.hstack((1 - Y_train_bin, Y_train_bin))

In [8]:
def compute_word_frequency(X_train_vect, vocabulary):
  ''' Count how often every vocabulary entry occurs in the vectorized sentences.

  X_train_vect is an iterable of sentences, each an iterable of indexes into
  vocabulary. Index 0 is skipped and never counted. The returned dict has one
  key per vocabulary entry (in vocabulary order), mapped to its count.
  '''
  word_counts = dict.fromkeys(vocabulary, 0)
  for row in X_train_vect:
    for idx in row:
      if idx == 0:
        # Index 0 is not counted as a word occurrence
        continue
      word_counts[vocabulary[idx]] += 1
  return word_counts

In [9]:
def compute_word_frequency_in_offensive_instances(X_train_vect, y_pred_test, vocabulary):
  ''' Compute the word frequency in the samples that were predicted offensive.

  X_train_vect: iterable of sentences, each an iterable of indexes into vocabulary.
  y_pred_test:  one predicted label per sentence; a sentence counts as
                offensive when its label equals [1].
  vocabulary:   sequence mapping a word index to the word.

  Returns a dict with one key per vocabulary entry mapped to the number of
  times it occurs in offensive sentences (index 0 is never counted), ordered
  from the highest to the lowest count.
  '''
  word_counts = {word: 0 for word in vocabulary}
  for (sentence, predicted_label) in zip(X_train_vect, y_pred_test):
    # Only sentences predicted as offensive contribute their word occurrences
    if predicted_label == [1]:
      for word_index in sentence:
        if word_index != 0:
          word_counts[vocabulary[word_index]] += 1
  # sorted() is stable, so words with equal counts keep their vocabulary order
  return dict(sorted(word_counts.items(), key=lambda item: item[1], reverse=True))

In [10]:
def compute_normalized_word_frequency_in_offensive_instances(word_frequency, word_frequency_in_offensive_instances, voc, support_threshold):
  ''' Relate each word's count in offensive instances to its overall count.

  For every word in voc whose overall count is non-zero and at least
  support_threshold, the ratio offensive count / overall count is computed.
  Words below the threshold are left out. The result is a dict ordered from
  the highest to the lowest ratio.
  '''
  ratios = {
    term: word_frequency_in_offensive_instances[term] / word_frequency[term]
    for term in voc
    if word_frequency[term] != 0 and word_frequency[term] >= support_threshold
  }
  ranked = sorted(ratios.items(), key=lambda pair: pair[1], reverse=True)
  return dict(ranked)

In [11]:
def compute_offensiveness_metric(X, y_pred, voc, support_threshold=10):
  ''' Compute, per word, the share of its occurrences that fall in offensive instances.

  X:                 vectorized sentences (iterables of indexes into voc).
  y_pred:            one predicted label per sentence; [1] marks an offensive one.
  voc:               sequence mapping a word index to the word.
  support_threshold: minimum overall count a word needs to be scored
                     (defaults to 10, the value this function always used).

  Returns a dict word -> ratio, ordered from the highest to the lowest ratio.
  '''
  word_frequency = compute_word_frequency(X, voc)
  word_frequency_in_offensive_instances = compute_word_frequency_in_offensive_instances(X, y_pred, voc)
  return compute_normalized_word_frequency_in_offensive_instances(
    word_frequency, word_frequency_in_offensive_instances, voc, support_threshold)

In [12]:
def classify_based_on_word_list(X, word_list):
  ''' Label a sentence [1] when it contains at least one token from word_list.

  X:         iterable of sentences, each an iterable of (hashable) tokens.
  word_list: iterable of the tokens that trigger a positive prediction.

  Returns a list with one prediction per sentence: [1] on a match, else [0].
  '''
  # Materialise the word list once as a set: membership tests become O(1)
  # instead of a linear scan per token, and one-shot iterables are not
  # exhausted by the first sentence.
  flagged_tokens = set(word_list)
  # any() stops scanning a sentence at its first flagged token
  return [
    [1] if any(word_token in flagged_tokens for word_token in sentence) else [0]
    for sentence in X
  ]

In [13]:
def train_model(lm, tokens_train, Y_train_bin, num_labels, epochs, batch_size, learning_rate):
    ''' Fine-tune a pretrained sequence classifier and return it.

    Args:
        lm: model name or path handed to
            TFAutoModelForSequenceClassification.from_pretrained.
        tokens_train: tokenized training inputs passed to model.fit.
        Y_train_bin: training targets, one row per training instance.
        num_labels: number of output labels of the classification head.
        epochs: number of training epochs.
        batch_size: batch size passed to model.fit.
        learning_rate: either the string "PolynomialDecay" or "CosineDecay"
            (a schedule starting at 5e-5), or a value that Adam accepts as
            learning rate (e.g. a float).

    Returns:
        The compiled and fitted model.

    Raises:
        ValueError: if learning_rate is a string naming no known schedule.
    '''
    print("Loading model....")
    model = TFAutoModelForSequenceClassification.from_pretrained(lm, num_labels=num_labels)
    loss_function = BinaryCrossentropy(from_logits=True)
    # A schedule advances once per optimizer update, i.e. once per batch, so
    # the decay horizon is (batches per epoch) * epochs. Counting samples
    # instead would stretch the schedule by a factor of batch_size and the
    # learning rate would never get close to its end value.
    effective_batch_size = batch_size or 32  # Keras' fit() uses 32 when batch_size is None
    steps_per_epoch = -(-len(Y_train_bin) // effective_batch_size)  # ceiling division
    num_decay_steps = steps_per_epoch * epochs
    if learning_rate == "PolynomialDecay":
        lr_scheduler = PolynomialDecay(
            initial_learning_rate=5e-5, end_learning_rate=0.0, decay_steps=num_decay_steps
        )
    elif learning_rate == "CosineDecay":
        lr_scheduler = CosineDecay(
            initial_learning_rate=5e-5, decay_steps=num_decay_steps
        )
    elif isinstance(learning_rate, str):
        # Fail early with a clear message instead of deep inside the optimizer
        raise ValueError(
            "Unknown learning rate schedule %r; expected 'PolynomialDecay', "
            "'CosineDecay' or a numeric learning rate" % (learning_rate,)
        )
    else:
        lr_scheduler = learning_rate
    optim = Adam(learning_rate=lr_scheduler)
    print("Training model....")
    model.compile(loss=loss_function, optimizer=optim, metrics=['accuracy'])
    model.fit(tokens_train, Y_train_bin, verbose=1, epochs=epochs,
              batch_size=batch_size)
    print("Done!")
    return model

In [14]:
def evaluate_model(lm, tokens_dev, Y_dev_bin, labels):
    ''' Evaluate a trained model on a labelled set (e.g. the dev set).

    Prints a classification report and plots the confusion matrix.

    Args:
        lm: trained model; lm.predict(tokens_dev) must return a mapping with
            a "logits" entry holding one row of scores per instance.
        tokens_dev: tokenized inputs to predict on.
        Y_dev_bin: gold labels, given as class indexes (1-D), as a single
            0/1 indicator column, or as one indicator column per class.
        labels: class names, ordered by class index.

    Returns:
        A tuple (accuracy, macro-averaged F1).
    '''
    print("Evaluating model....")
    pred = lm.predict(tokens_dev)["logits"]
    # Convert the logits to numerical labels to get scores with sklearn
    Y_pred = np.argmax(pred, axis=1)

    gold = np.asarray(Y_dev_bin)
    if gold.ndim == 1:
        # Already class indexes
        Y_test = gold
    elif gold.shape[1] == 1:
        # A single indicator column (what LabelBinarizer yields for two
        # classes) already holds the class index; argmax over one column
        # would map every instance to class 0.
        Y_test = gold.ravel()
    else:
        Y_test = np.argmax(gold, axis=1)

    # Pass the class indexes explicitly so report and matrix keep one entry
    # per class even when a class is missing from gold labels or predictions.
    class_ids = list(range(len(labels)))
    report = classification_report(Y_test, Y_pred, labels=class_ids,
                                   target_names=labels, digits=3)
    print(report)
    cm = confusion_matrix(Y_test, Y_pred, labels=class_ids)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
    disp.plot()
    return accuracy_score(Y_test, Y_pred), f1_score(Y_test, Y_pred, average='macro')

In [21]:
def create_param_grid():
    ''' Build the list of all hyper-parameter combinations to search.

    Returns one dict per combination with the keys 'epochs', 'max_seq_len',
    'batches' and 'lr_schedulers'; the last key varies fastest.
    '''
    search_space = {
        'epochs': [1, 2, 3],
        'max_seq_len': [50, 100, 150],
        'batches': [16, 32, 64],
        'lr_schedulers': ["PolynomialDecay", "CosineDecay", 5e-5, 3e-5],
    }
    names = tuple(search_space)
    grid = []
    # Cartesian product over the value lists, in the order the keys are declared
    for combination in product(*search_space.values()):
        grid.append(dict(zip(names, combination)))
    return grid

In [22]:
# Perform the grid search: train one model per hyper-parameter combination
# (and per seed) and score it on the dev set.
lm = 'bert-base-uncased'
param_grid = create_param_grid()
performances = []
seeds = [1234]
# The tokenizer depends neither on the seed nor on the hyper-parameters, so it
# is loaded once instead of once per combination.
tokenizer = AutoTokenizer.from_pretrained(lm)
for seed in seeds:
    for parameters in param_grid:
        # Start every run from a clean Keras state and identical seeds
        tf.keras.backend.clear_session()
        np.random.seed(seed)
        tf.random.set_seed(seed)
        python_random.seed(seed)
        # Only train and dev are tokenized: the test set plays no role in the search
        tokens_train = tokenizer(X_train, padding=True, max_length=parameters['max_seq_len'],
                                 truncation=True, return_tensors="np").data
        tokens_dev = tokenizer(X_dev, padding=True, max_length=parameters['max_seq_len'],
                               truncation=True, return_tensors="np").data
        model = train_model(lm, tokens_train, Y_train_bin, len(labels),
                            epochs=parameters['epochs'], batch_size=parameters['batches'],
                            learning_rate=parameters['lr_schedulers'])

        print(parameters)
        acc, f1 = evaluate_model(model, tokens_dev, Y_dev_bin, labels)
        performances.append(acc)
        # Scores are stored on the combination's own dict, one entry per seed
        parameters.setdefault('accuracy', []).append(acc)
        parameters.setdefault('f1', []).append(f1)

In [24]:
def return_best_model(lm='bert-base-uncased', max_seq_len=100, epochs=2, batch_size=64, learning_rate=3e-5):
  ''' Train and return a model with the selected hyper-parameters.

  Called without arguments it uses the settings this function has always
  used: bert-base-uncased, sequences truncated to 100 tokens, 2 epochs,
  batch size 64 and a constant learning rate of 3e-5.

  Reads the module-level X_train, Y_train_bin and labels.
  '''
  tokenizer = AutoTokenizer.from_pretrained(lm)
  # Only the training split is needed for fitting, so only it is tokenized
  tokens_train = tokenizer(X_train, padding=True, max_length=max_seq_len,
                           truncation=True, return_tensors="np").data
  model = train_model(lm, tokens_train, Y_train_bin, len(labels),
                      epochs=epochs, batch_size=batch_size, learning_rate=learning_rate)
  return model

In [25]:
# Train one model with return_best_model's settings and keep it as `model`.
model = return_best_model()

Loading model....


All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training model....
Epoch 1/2
Epoch 2/2
Done!


In [31]:
# Word-index vectorizer: no text standardization, every document is
# padded / truncated to 50 indexes.
vectorizer = TextVectorization(standardize=None, output_sequence_length=50)
# The vocabulary is built from train and dev together - could also do just train
text_ds = tf.data.Dataset.from_tensor_slices(X_train + X_dev)
vectorizer.adapt(text_ds)
# voc[idx] is the word that the vectorizer maps to index idx
voc = vectorizer.get_vocabulary()
# Turn each split into a matrix of word indexes (one row per document)
X_train_vect, X_val_vect, X_test_vect = (
    vectorizer(np.array([[document] for document in split])).numpy()
    for split in (X_train, X_dev, X_test)
)

In [None]:
# Repeat training several times and, per run, score on the test set
#   (1) the fine-tuned model itself, and
#   (2) a word-list baseline derived from the model's predictions on train.
N_RUNS = 10
BEST_MODEL_NAME = 'bert-base-uncased'  # keep in sync with return_best_model
BEST_MAX_SEQ_LEN = 100                 # keep in sync with return_best_model
OFFENSIVENESS_CUTOFF = 0.5             # a word is listed when its ratio exceeds this

f1_scores_test_set = []            # macro-F1 of the fine-tuned model, one entry per run
f1_scores_word_list_test_set = []  # macro-F1 of the word-list baseline, one entry per run

bert_tokenizer = AutoTokenizer.from_pretrained(BEST_MODEL_NAME)
# Word -> index lookup matching the TextVectorization indexes in X_*_vect
word_to_index = {word: index for index, word in enumerate(voc)}
y_true_test = np.ravel(Y_test_bin)


def predict_binary_labels(trained_model, documents):
  ''' Predict [0] / [1] for raw documents with the fine-tuned model.

  The model expects the pretrained tokenizer's inputs (not the
  TextVectorization indexes) and returns two logits per document; the
  second logit belongs to the class encoded as 1 in the training targets.
  '''
  tokens = bert_tokenizer(documents, padding=True, max_length=BEST_MAX_SEQ_LEN,
                          truncation=True, return_tensors="np").data
  logits = trained_model.predict(tokens)["logits"]
  return [[int(label)] for label in np.argmax(logits, axis=1)]


for i in range(0, N_RUNS):
  # Compute regular predictions
  model = return_best_model()
  y_pred_test_best = predict_binary_labels(model, X_test)
  y_pred_test_best_f1 = f1_score(y_true_test, np.ravel(y_pred_test_best), average='macro')

  f1_scores_test_set.append(y_pred_test_best_f1)

  # Compute offensiveness metric from the model's predictions on the training set
  y_pred_train = predict_binary_labels(model, X_train)
  normalized_word_frequency_in_offensive_instances_train = compute_offensiveness_metric(X_train_vect, y_pred_train, voc)
  word_list_train_keys = [
    word
    for word, ratio in normalized_word_frequency_in_offensive_instances_train.items()
    if ratio > OFFENSIVENESS_CUTOFF
  ]

  # Map the selected words back to their vectorizer indexes
  word_list_tokens_train = [word_to_index[word] for word in word_list_train_keys]
  y_pred_word_list = classify_based_on_word_list(X_test_vect, word_list_tokens_train)
  y_pred_word_list_f1 = f1_score(y_true_test, np.ravel(y_pred_word_list), average='macro')

  f1_scores_word_list_test_set.append(y_pred_word_list_f1)