In [None]:
!pip install nltk numpy openpyxl Pillow pyparsing scikit-learn scipy keras scipy tensorflow transformers tensorflow-text tf-models-official

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d8/b2/57495b5309f09fa501866e225c84532d1fd89536ea62406b2181933fb418/transformers-4.5.1-py3-none-any.whl (2.1MB)
[K     |████████████████████████████████| 2.1MB 11.3MB/s 
[?25hCollecting tensorflow-text
[?25l  Downloading https://files.pythonhosted.org/packages/b6/c0/c0fed4301f592c3b56638ae7292612c17d91a43891ba1aaf9636d535beae/tensorflow_text-2.4.3-cp37-cp37m-manylinux1_x86_64.whl (3.4MB)
[K     |████████████████████████████████| 3.4MB 38.3MB/s 
[?25hCollecting tf-models-official
[?25l  Downloading https://files.pythonhosted.org/packages/57/4a/23a08f8fd2747867ee223612e219eeb0d11c36116601d99b55ef3c72e707/tf_models_official-2.4.0-py2.py3-none-any.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 30.5MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895

In [None]:
import nltk
import re
from openpyxl import load_workbook
from sklearn import preprocessing
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
import numpy as np
from keras.preprocessing.text import Tokenizer 
from keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import KFold
from transformers import BertTokenizer, TFBertForSequenceClassification
import os
import shutil
import matplotlib.pyplot as plt
# Raise TensorFlow's logger threshold to ERROR so lower-severity log records are not shown.
tf.get_logger().setLevel("ERROR")

In [None]:
# Every annotation class a message can carry (lower-case; two names contain a "/" themselves).
ALL_CLASSES = [
    "content discussion",
    "greeting",
    "logistics",
    "assignment instructions",
    "instruction question",
    "assignment question",
    "general comment",
    "response",
    "incomplete/typo",
    "feedback",
    "emoticon/non-verbal",
    "discussion wrap-up",
    "outside material",
    "opening statement",
    "general question",
    "content question",
    "general discussion",
]

# Workbook holding the annotated messages, resolved relative to the working directory.
DATASET_PATH = "dataset.xlsx"

def read_worksheet(filename, sheet_name, all_classes, label_encoder, no_columns):
    """Read one worksheet and return its messages together with their encoded classes.

    The first row supplies the column titles. Each title is lower-cased and its
    spaces are replaced by underscores, and the resulting names must include
    ``message`` and ``codepreliminary`` within the first ``no_columns`` columns.
    Reading stops at the first row whose first cell is empty.

    Args:
        filename: Path of the workbook, handed to ``load_workbook``.
        sheet_name: Name of the worksheet to read.
        all_classes: Collection of known (lower-case) class names.
        label_encoder: Object whose ``transform([name])`` returns a one-element
            sequence holding the encoded label for ``name``.
        no_columns: Number of leading columns read from every row.

    Returns:
        ``(X, y)``: a list of message strings and a parallel list of encoded
        labels. A message whose label is several classes joined by "/" is
        added once per class. Both lists are empty for a sheet with no rows.

    Raises:
        KeyError: If the ``message`` or ``codepreliminary`` column is missing.
        Whatever ``label_encoder.transform`` raises for a class it does not know.
    """
    X = []
    y = []
    wb = load_workbook(filename, read_only=True)
    try:
        rows = iter(wb[sheet_name].rows)
        header = next(rows, None)
        if header is None:  # Sheet without any rows: nothing to read
            return X, y
        column_names = [header[i].value.lower().replace(" ", "_") for i in range(no_columns)]

        for row in rows:
            first_cell = row[0].value
            if first_cell is None:
                break
            if str(first_cell).strip() == "Course":  # Skip repeated title rows
                continue

            entry = {name: str(row[i].value) for i, name in enumerate(column_names)}
            raw_code = entry["codepreliminary"].lower().strip()
            if raw_code in all_classes:
                # Checked first because some class names contain "/" themselves
                codes = [raw_code]
            else:
                # Several classes listed for one message; tolerate spaces around "/"
                codes = [part.strip() for part in raw_code.split("/")]

            # If there are 2 classes listed in document add message twice (1 for each class)
            for code in codes:
                label = label_encoder.transform([code])[0]
                X.append(entry["message"])
                y.append(label)
    finally:
        # Release the workbook even when a row cannot be parsed or encoded
        wb.close()
    return X, y

# Fit the label encoder on the known class names; it is passed to read_worksheet,
# which uses its transform() to turn each class name into an encoded label.
le = preprocessing.LabelEncoder()
le.fit(ALL_CLASSES)

LabelEncoder()

In [None]:
# Read both annotated sheets (10 leading columns from the first, 11 from the second).
X, y = read_worksheet(DATASET_PATH, "Discussion only data", ALL_CLASSES, le, no_columns=10)
X2, y2 = read_worksheet(DATASET_PATH, "CREW data", ALL_CLASSES, le, no_columns=11)

# Append the CREW rows in place so X / y hold the combined dataset.
X += X2
y += y2

In [None]:
# Sizes shared by the encoding and training cells.
max_length = 256  # max_length handed to the tokenizer in encode_data
batch_size = 6    # batch size applied to the train and test datasets

# Tokenizer for the same "bert-base-uncased" checkpoint that build_model loads.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




In [None]:
def map_data_to_dict(input_ids, attention_masks, token_type_ids, classes):
  """Regroup four positional values into a ``(features, classes)`` pair.

  ``features`` is a dict keyed by "input_ids", "token_type_ids" and
  "attention_mask"; ``classes`` is passed through unchanged.
  """
  features = dict(
      input_ids=input_ids,
      token_type_ids=token_type_ids,
      attention_mask=attention_masks,
  )
  return features, classes

In [None]:
def encode_data(data, classes):
  """Tokenize messages and pair them with their labels as a ``tf.data.Dataset``.

  Uses the module-level ``tokenizer`` and ``max_length``. Every message is
  padded to ``max_length`` tokens and truncated if it is longer.

  Args:
    data: Sequence of message strings.
    classes: Sequence of labels, one per message, in the same order as ``data``.

  Returns:
    A dataset whose elements are ``(features, label)`` pairs as produced by
    ``map_data_to_dict``; each label is wrapped in a one-element list.

  Raises:
    ValueError: If ``data`` and ``classes`` differ in length.
  """
  if len(data) != len(classes):
    raise ValueError(
        "data and classes must have the same length, got {} and {}".format(len(data), len(classes))
    )

  input_ids_list = []
  token_type_ids_list = []
  attention_mask_list = []
  class_list = []
  for msg, label in zip(data, classes):
    # padding="max_length" replaces the deprecated pad_to_max_length=True. With an
    # explicit padding strategy, truncation must be requested explicitly as well.
    encoded = tokenizer.encode_plus(
        msg,
        add_special_tokens=True,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
    )
    input_ids_list.append(encoded["input_ids"])
    token_type_ids_list.append(encoded["token_type_ids"])
    attention_mask_list.append(encoded["attention_mask"])
    class_list.append([label])

  # Tuple order here must match the positional parameters of map_data_to_dict
  slices = (input_ids_list, attention_mask_list, token_type_ids_list, class_list)
  return tf.data.Dataset.from_tensor_slices(slices).map(map_data_to_dict)

In [None]:
# Hold out 20% of the messages for evaluation; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Only the training data is shuffled (buffer of 5000); both sets are batched.
train_encoded = encode_data(X_train, y_train)
train_dataset = train_encoded.shuffle(5000).batch(batch_size)

test_encoded = encode_data(X_test, y_test)
test_dataset = test_encoded.batch(batch_size)



In [None]:
def build_model(learning_rate=3e-5, no_epochs=5):
  """Load "bert-base-uncased" as a sequence classifier and compile it.

  The classifier gets one label per entry of ``ALL_CLASSES``. It is compiled
  with Adam, sparse categorical cross-entropy on logits, and a sparse
  categorical accuracy metric named "accuracy".

  Args:
    learning_rate: Learning rate passed to the Adam optimizer.
    no_epochs: Accepted but not read anywhere in this function.

  Returns:
    The compiled ``TFBertForSequenceClassification`` instance.
  """
  classifier = TFBertForSequenceClassification.from_pretrained(
      "bert-base-uncased", num_labels=len(ALL_CLASSES)
  )
  classifier.compile(
      optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=1e-08),
      loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
      metrics=[tf.keras.metrics.SparseCategoricalAccuracy("accuracy")],
  )
  return classifier

In [None]:
def get_results(test_data, model, print_output=True):
  """Score ``model`` on ``test_data`` and return accuracy, precision, recall and F1.

  Labels are read by unbatching ``test_data`` and taking index 1 of every
  element; predictions are the argmax over ``model.predict(test_data).logits``.
  These are two separate passes over the dataset, so it has to yield its
  elements in the same order both times (for example, not reshuffled).

  Precision, recall and F1 are computed with ``average="weighted"`` and
  ``zero_division=1``.
  NOTE(review): zero_division=1 replaces an undefined score with 1 rather than 0 -
  confirm that this is the intended treatment for classes that are never predicted.

  Args:
    test_data: Batched dataset of ``(features, label)`` elements.
    model: Model whose ``predict`` output exposes ``.logits``.
    print_output: When true, print the four scores.

  Returns:
    Tuple ``(accuracy, precision, recall, f1)``.
  """
  y_true = np.asarray([labels.numpy() for _, labels in test_data.unbatch()])
  y_pred = np.argmax(model.predict(test_data).logits, axis=1)

  averaging = dict(zero_division=1, average="weighted")
  accuracy = accuracy_score(y_true, y_pred)
  precision = precision_score(y_true, y_pred, **averaging)
  recall = recall_score(y_true, y_pred, **averaging)
  f1 = f1_score(y_true, y_pred, **averaging)

  if print_output:
    report = (
        ("Accuracy score: {}", accuracy),
        ("Precision score: {}", precision),
        ("Recall score: {}", recall),
        ("F1 score: {}", f1),
    )
    for template, value in report:
      print(template.format(value))

  return accuracy, precision, recall, f1

In [None]:
# Number of training epochs. Assigned here because this cell runs before the
# cross-validation cell, which also assigns no_epochs; without it a fresh
# kernel raises NameError on the fit call below.
no_epochs = 5

# Fine-tune on the hold-out split, keeping the history for plot_history, then report the scores.
model = build_model()
history = model.fit(train_dataset, epochs=no_epochs, validation_data=test_dataset)
get_results(test_dataset, model)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
no_folds = 5
no_epochs = 5
# random_state makes the shuffled fold assignment identical on every run
kfold = KFold(no_folds, shuffle=True, random_state=42)

# Convert to numpy arrays so indexing with list of indexes works
X = np.array(X)
y = np.array(y)

test_accuracy_log = []  # Not written to in this cell

# One score per fold; averaged after the loop
accuracy = []
precision = []
recall = []
f1 = []
for train, test in kfold.split(X, y):
  # A new model is built for every fold: reset Keras' global state first so
  # state left behind by earlier models does not keep accumulating.
  tf.keras.backend.clear_session()

  X_train = X[train]
  y_train = y[train]
  X_test = X[test]
  y_test = y[test]

  train_dataset = encode_data(X_train, y_train).shuffle(5000).batch(batch_size)
  test_dataset = encode_data(X_test, y_test).batch(batch_size)

  model = build_model()
  model.fit(train_dataset, epochs=no_epochs, validation_data=test_dataset, verbose=1)

  acc, pre, rec, f = get_results(test_dataset, model, print_output=False)
  accuracy.append(acc)
  precision.append(pre)
  recall.append(rec)
  f1.append(f)

# Report the mean of each score across the folds
print("Accuracy score: {}".format(np.mean(accuracy)))
print("Precision score: {}".format(np.mean(precision)))
print("Recall score: {}".format(np.mean(recall)))
print("F1 score: {}".format(np.mean(f1)))

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy score: 0.7236193293885602
Precision score: 0.7652244631480469
Recall score: 0.7236193293885602
F1 score: 0.7058959170966468


In [None]:
plt.style.use('ggplot')

def plot_history(history):
    """Plot training vs. validation curves from a Keras ``History`` object.

    Reads the 'accuracy', 'val_accuracy', 'loss' and 'val_loss' series from
    ``history.history`` and draws them on one figure with two side-by-side
    panels: accuracy on the left, loss on the right. The x axis counts
    epochs starting at 1.
    """
    # Look up all four series before any drawing happens
    recorded = history.history
    series = {
        key: recorded[key]
        for key in ('accuracy', 'val_accuracy', 'loss', 'val_loss')
    }
    epochs = range(1, len(series['accuracy']) + 1)

    panels = (
        ('accuracy', 'val_accuracy', 'Training and validation accuracy'),
        ('loss', 'val_loss', 'Training and validation loss'),
    )

    plt.figure(figsize=(12, 5))
    for position, (train_key, val_key, title) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(epochs, series[train_key], 'b', label='Training')
        plt.plot(epochs, series[val_key], 'r', label='Validation')
        plt.title(title)
        plt.legend()

plot_history(history)