## BERT Classifications
- Credits: https://www.youtube.com/watch?v=pjtnkCGElcE&ab_channel=JamesBriggs

### Dependencies and Libraries

In [None]:
import numpy as np
import pandas as pd

# from google.colab import drive
# drive.mount('/content/drive')
# os.chdir('drive/MyDrive/School Work/CS4248/News Labelling Project')

import tensorflow as tf
# import os
# from tensorflow.python.client import device_lib
# os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

# if tf.test.gpu_device_name():
#     print('GPU found')
# else:
#     print("No GPU found")
# print(device_lib.list_local_devices())

### Reading the data into pandas DataFrames and inspecting it

In [None]:
# Locations of the training and test CSV files
test_path = './raw_data/balancedtest.csv'
train_path = './raw_data/fulltrain.csv'

# Load the training set without treating the first row as a header
df = pd.read_csv(train_path, header=None)
print(type(df))

# Column 0 = labels, column 1 = text; report the (rows, columns) shape
print(f'Total rows, Total Columns: {df.shape}')

# Show a few random rows as the cell output
df.sample(5)

In [None]:
# Count how many training rows carry each label.
# Labels are 1-based, so label k is named by classes[k - 1].
classes = ['Satire', 'Hoax', 'Propaganda', 'Reliable News']
label_numbers = [1, 2, 3, 4]

for label, class_name in zip(label_numbers, classes):
    count = (df[0] == label).sum()
    print(f'{class_name}: {count}')

# Same information straight from pandas, as a cross-check
print(df[0].value_counts())

### Reading in testing set

In [None]:
# Load the test set; like the training CSV it is read without a header row
test_df = pd.read_csv(test_path, header=None)

# Column 0 = labels, column 1 = text.
# Report the (rows, columns) shape, not the DataFrame contents.
print('Total rows, Total Columns: ' + str(test_df.shape))
test_df.sample(5)  # Random sample values to see

In [None]:
# Count how many test rows carry each label (labels are 1-based)
classes = ['Satire', 'Hoax', 'Propaganda', 'Reliable News']
label_numbers = [1, 2, 3, 4]

test_labels = test_df[0]
for label in label_numbers:
    n_rows = (test_labels == label).sum()
    print(f'{classes[label - 1]}: {n_rows}')

# pandas' own per-label tally, as a cross-check
print(test_labels.value_counts())

### Preprocessing Functions

In [None]:
# Fixed token length of every encoded sample. bert-base-cased provides only
# 512 position embeddings, so longer sequences cannot be embedded correctly;
# texts beyond this length are truncated by the tokenizer call.
seq_len = 512
num_train_samples = len(df)
num_test_samples = len(test_df)

# Pre-allocated token-id and attention-mask matrices, one row per training
# sample. int32 matches the dtype declared on the model's Input layers and
# uses half the memory of NumPy's default float64.
Xids = np.zeros((num_train_samples, seq_len), dtype=np.int32)
Xmask = np.zeros((num_train_samples, seq_len), dtype=np.int32)

Xids.shape

In [None]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

# Encoding options shared by every row: pad or truncate to exactly seq_len
# tokens, add the special tokens, and return TensorFlow tensors.
encode_options = dict(
    max_length=seq_len,
    truncation=True,
    padding='max_length',
    add_special_tokens=True,
    return_tensors='tf',
)

# Encode each text in column 1 and store its ids / mask in the matching row
for row, text in enumerate(df[1]):
    encoded = tokenizer.encode_plus(text, **encode_options)
    Xids[row, :] = encoded['input_ids']
    Xmask[row, :] = encoded['attention_mask']

In [None]:
# Display the token-id matrix filled in by the tokenisation loop
Xids

In [None]:
# Display the attention-mask matrix filled in by the tokenisation loop
Xmask

In [None]:
# Label column of the training set as a NumPy array (labels are 1-based)
arr = df[0].values
arr

In [None]:
# One row per training sample, one column per class. arr.max() is used as the
# class count, which assumes labels run from 1 up to the number of classes.
labels = np.zeros((num_train_samples, arr.max()))
labels.shape

In [None]:
# One-hot encode: for sample i, set column (label - 1) of row i to 1
row_idx = np.arange(num_train_samples)
col_idx = arr - 1
labels[row_idx, col_idx] = 1
labels

In [None]:
# Build a tf.data pipeline whose elements are (ids, mask, one-hot label) triples
dataset = tf.data.Dataset.from_tensor_slices((Xids, Xmask, labels))

# Peek at a single element (notebook cell output)
dataset.take(1)

In [None]:
def map_func(input_ids, masks, labels):
    """Repackage a flat (ids, mask, label) element as a (features, label) pair.

    The feature dict is keyed by 'input_ids' and 'attention_mask', the same
    names given to the model's Input layers.
    """
    features = {'input_ids': input_ids, 'attention_mask': masks}
    return features, labels

In [None]:
# Convert every element to the ({'input_ids', 'attention_mask'}, label) form
dataset = dataset.map(map_func)
dataset.take(1)

In [None]:
batch_size = 16  # Increase if more VRAM is available

# Shuffle once and keep that order for every pass over the data. With the
# default reshuffle_each_iteration=True the order changes on each epoch, so
# the take()/skip() split applied afterwards would move samples between the
# training and validation subsets from one epoch to the next.
# Incomplete final batches are dropped.
dataset = dataset.shuffle(10000, reshuffle_each_iteration=False).batch(batch_size, drop_remainder=True)

dataset.take(1)

In [None]:
# Fraction of the batches that goes to training
split = 0.9
print(num_train_samples)

# Number of batches assigned to the training subset
batches_total = num_train_samples / batch_size
size = int(batches_total * split)
size

In [None]:
# The first `size` batches train the model; everything after them validates it
train_ds, val_ds = dataset.take(size), dataset.skip(size)

# Drop the name of the combined dataset now that both subsets exist
del dataset

In [None]:
from transformers import TFAutoModel

# Load the pre-trained TensorFlow model for the same checkpoint as the tokenizer
bert = TFAutoModel.from_pretrained('bert-base-cased')
bert.summary()

In [None]:
# Input layers; their names are the keys of the feature dict built by map_func
input_ids = tf.keras.layers.Input(
    name='input_ids', shape=(seq_len,), dtype='int32')
mask = tf.keras.layers.Input(
    name='attention_mask', shape=(seq_len,), dtype='int32')

# Run BERT's main layer and keep element 1 of its output
# (presumably the pooled sequence representation - confirm against the
# transformers documentation)
bert_outputs = bert.bert(input_ids, attention_mask=mask)
embeddings = bert_outputs[1]

# Largest label value, used as the number of output units
num_outputs = arr.max()
print(num_outputs)

# Classification head: one hidden ReLU layer followed by a softmax output
x = tf.keras.layers.Dense(1024, activation='relu')(embeddings)
y = tf.keras.layers.Dense(num_outputs, activation='softmax', name='outputs')(x)

In [None]:
# Assemble the full model from the two inputs to the softmax output
model = tf.keras.Model(inputs=[input_ids, mask], outputs=y)

# Freeze layer 2 so its weights are not updated during training; set this to
# True to fine-tune it instead.
# NOTE(review): index 2 is presumably the BERT main layer (after the two
# Input layers) - confirm with model.summary().
model.layers[2].trainable = False
model.summary()

In [None]:
# Training configuration: Adam optimizer, categorical cross-entropy loss on
# the one-hot labels, and categorical accuracy reported as 'accuracy'.
# NOTE(review): the `decay` argument may be rejected by newer Keras optimizer
# implementations - confirm against the installed TensorFlow version.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5, decay=1e-6)
loss = tf.keras.losses.CategoricalCrossentropy()
acc = tf.keras.metrics.CategoricalAccuracy('accuracy')

In [None]:
# Attach the optimizer, loss and metric defined above to the model
model.compile(optimizer=optimizer, loss=loss, metrics=[acc])

In [None]:
# Train for 3 epochs, validating on val_ds after each one.
# NOTE: despite its name, trained_model holds the value returned by fit()
# (per the Keras docs a History object); the trained network is still `model`.
trained_model = model.fit(train_ds, validation_data=val_ds, epochs=3)

# Final validation metrics. The loss value is stored under its own name so the
# `loss` object passed to model.compile() is not overwritten by a float, which
# would break re-running the compile cell.
val_loss, accuracy = model.evaluate(val_ds)
print('Loss: ' + str(val_loss) + '    ' + 'Accuracy: ' + str(accuracy))

In [None]:
# Persist the trained model under the path 'BERT_Model'
model.save('BERT_Model')