## BERT Classifications

### Dependencies and Libraries

In [1]:
import numpy as np
import pandas as pd

# from google.colab import drive
# drive.mount('/content/drive')
# os.chdir('drive/MyDrive/School Work/CS4248/News Labelling Project')

### Reading in data into pd dataframes, data viewing

In [2]:
# Read CSV file in
train_path = './raw_data/fulltrain.csv'
test_path = './raw_data/balancedtest.csv'
df = pd.read_csv(train_path, header=None)

print(type(df))

# Samples, number of columns, 0 = labels, column 1 = text
print('Total rows, Total Columns: ' + str(df.shape))
df.sample(5) # Random sample values to see

<class 'pandas.core.frame.DataFrame'>
Total rows, Total Columns: (48854, 2)


Unnamed: 0,0,1
9556,1,Texans team sources said placekicker Kris Brow...
24938,3,Radioactive Water From Fukushima Is Systematic...
14017,1,"Sometimes, dreams do come true. Not this time,..."
43149,4,WASHINGTON - On a typical weekday during baseb...
16020,2,Juan Williams Gets Hit With Devastating Realit...


In [3]:
# Get number of labels for each task
classes = ['Satire', 'Hoax', 'Propaganda', 'Reliable News']
label_numbers = [1,2,3,4]

for label in label_numbers:
    print(classes[label-1] + ': ' + str((df[0] == label).sum()))
print(df[0].value_counts())

Satire: 14047
Hoax: 6942
Propaganda: 17870
Reliable News: 9995
3    17870
1    14047
4     9995
2     6942
Name: 0, dtype: int64


### Reading in testing set

In [4]:
test_df = pd.read_csv(test_path, header=None)

# Samples, number of columns, 0 = labels, column 1 = text
print('Total rows, Total Columns: ' + str(test_df))
test_df.sample(5) # Random sample values to see

Total rows, Total Columns:       0                                                  1
0     1  When so many actors seem content to churn out ...
1     1   In what football insiders are calling an unex...
2     1  In a freak accident following Game 3 of the N....
3     1  North Koreas official news agency announced to...
4     1  The former Alaska Governor Sarah Palin would b...
...  ..                                                ...
2995  4  The Air Force mistakenly gave rival companies ...
2996  4  The United Nations climate chief on Friday cha...
2997  4  River Plate midfielder Diego Buonanotte has un...
2998  4  Lawmakers were on the brink Tuesday of exempti...
2999  4  The Pentagon, which is processing bids on a ne...

[3000 rows x 2 columns]


Unnamed: 0,0,1
1740,3,It's not just good for your physical health: ...
1663,3,While there has been a lot of research both i...
1622,3,What if a hurricane and disastrous man-made f...
1892,3,Those looking to cash in on the multi-billion...
1332,2,"[Watch] Judge Jeanine - Obamas Bloody Hands, I..."


In [5]:
# Get number of labels for each task
classes = ['Satire', 'Hoax', 'Propaganda', 'Reliable News']
label_numbers = [1,2,3,4]

for label in label_numbers:
    print(classes[label-1] + ': ' + str((test_df[0] == label).sum()))
print(test_df[0].value_counts())

Satire: 750
Hoax: 750
Propaganda: 750
Reliable News: 750
1    750
2    750
3    750
4    750
Name: 0, dtype: int64


### Preprocessing Functions

In [6]:
seq_len = 1000
num_train_samples = len(df)
num_test_samples = len(test_df)

Xids = np.zeros((num_train_samples, seq_len))
Xmask = np.zeros((num_train_samples, seq_len))

Xids.shape

(48854, 1000)

In [9]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

for i, text in enumerate(df[1]):
    tokens = tokenizer.encode_plus(text, max_length=seq_len, truncation=True, 
                                    padding='max_length', add_special_tokens=True, return_tensors='tf')
    Xids[i, :] = tokens['input_ids']
    Xmask[i, :] = tokens['attention_mask']

Downloading: 100%|██████████| 208k/208k [00:00<00:00, 287kB/s]  
Downloading: 100%|██████████| 29.0/29.0 [00:00<00:00, 29.0kB/s]
Downloading: 100%|██████████| 570/570 [00:00<00:00, 572kB/s]


In [10]:
Xids

array([[ 101.,  138., 1376., ...,    0.,    0.,    0.],
       [ 101., 1109., 5094., ...,    0.,    0.,    0.],
       [ 101., 2711., 3711., ..., 1105., 1103.,  102.],
       ...,
       [ 101., 1697., 7085., ...,    0.,    0.,    0.],
       [ 101., 1109., 2073., ...,    0.,    0.,    0.],
       [ 101., 1109., 3424., ...,    0.,    0.,    0.]])

In [11]:
Xmask

array([[1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 1., 1., 1.],
       ...,
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.]])

In [13]:
arr = df[0].values
arr

array([1, 1, 1, ..., 4, 4, 4], dtype=int64)

In [14]:
labels = np.zeros((num_train_samples, arr.max()))
labels.shape

(48854, 4)

In [19]:
labels[np.arange(num_train_samples), arr-1] = 1
labels

array([0., 1., 0., 0.])

In [20]:
import tensorflow as tf

In [21]:
dataset = tf.data.Dataset.from_tensor_slices((Xids, Xmask, labels))

dataset.take(1)

<TakeDataset element_spec=(TensorSpec(shape=(1000,), dtype=tf.float64, name=None), TensorSpec(shape=(1000,), dtype=tf.float64, name=None), TensorSpec(shape=(4,), dtype=tf.float64, name=None))>

In [23]:
def map_func(input_ids, masks, labels):
    return {'input_ids': input_ids, 'attention_mask': masks}, labels

In [24]:
dataset = dataset.map(map_func)
dataset.take(1)

<TakeDataset element_spec=({'input_ids': TensorSpec(shape=(1000,), dtype=tf.float64, name=None), 'attention_mask': TensorSpec(shape=(1000,), dtype=tf.float64, name=None)}, TensorSpec(shape=(4,), dtype=tf.float64, name=None))>

In [25]:
batch_size = 8 # Increase if got more VRAM

dataset=dataset.shuffle(10000).batch(batch_size, drop_remainder=True)

dataset.take(1)

<TakeDataset element_spec=({'input_ids': TensorSpec(shape=(8, 1000), dtype=tf.float64, name=None), 'attention_mask': TensorSpec(shape=(8, 1000), dtype=tf.float64, name=None)}, TensorSpec(shape=(8, 4), dtype=tf.float64, name=None))>

In [27]:
split = 0.9 # train test split
print(num_train_samples)
size = int((num_train_samples/batch_size) * split) 
size

48854


5496

In [28]:
train_ds = dataset.take(size)
val_ds = dataset.skip(size)

del dataset

In [None]:
import os
from tensorflow.python.client import device_lib
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

if tf.test.gpu_device_name():
    print('GPU found')
else:
    print("No GPU found")
print(device_lib.list_local_devices())

In [29]:
from transformers import TFAutoModel

bert = TFAutoModel.from_pretrained('bert-base-cased')
bert.summary()

Downloading: 100%|██████████| 502M/502M [00:05<00:00, 91.8MB/s] 
Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "tf_bert_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  108310272 
                                                                 
Total params: 108,310,272
Trainable params: 108,310,272
Non-trainable params: 0
_________________________________________________________________


In [35]:
# Input layers
input_ids = tf.keras.layers.Input(shape=(seq_len,), name='input_ids', dtype='int32')
mask = tf.keras.layers.Input(shape=(seq_len,), name='attention_mask', dtype='int32')

embeddings = bert.bert(input_ids, attention_mask=mask)[1]

print(arr.max())

x = tf.keras.layers.Dense(1024, activation='relu')(embeddings)
y = tf.keras.layers.Dense(arr.max(), activation='softmax', name='outputs')(x)

4


In [36]:
model = tf.keras.Model(inputs=[input_ids, mask], outputs=y)

# Enable to allow training of pre-trained BERT model
model.layers[2].trainable = False
model.summary()

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 1000)]       0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 1000)]       0           []                               
                                                                                                  
 bert (TFBertMainLayer)         TFBaseModelOutputWi  108310272   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 1000                                         

In [38]:
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5, decay=1e-6)
loss = tf.keras.losses.CategoricalCrossentropy()
acc = tf.keras.metrics.CategoricalAccuracy('accuracy')

In [39]:
model.compile(optimizer=optimizer, loss=loss, metrics=[acc])

In [40]:
trained_model = model.fit(train_ds, validation_data=val_ds, epochs=3)

loss, accuracy = model.evaluate(val_ds)
print('Loss: ' + str(loss) + '    ' + 'Accuracy: ' + str(accuracy))

Epoch 1/3


ResourceExhaustedError: Graph execution error:

Detected at node 'model_3/bert/encoder/layer_._7/attention/self/dropout_22/dropout/random_uniform/RandomUniform' defined at (most recent call last):
    File "C:\Users\dojh1\AppData\Local\Programs\Python\Python39\lib\runpy.py", line 197, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "C:\Users\dojh1\AppData\Local\Programs\Python\Python39\lib\runpy.py", line 87, in _run_code
      exec(code, run_globals)
    File "C:\Users\dojh1\AppData\Local\Programs\Python\Python39\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
      app.launch_new_instance()
    File "C:\Users\dojh1\AppData\Local\Programs\Python\Python39\lib\site-packages\traitlets\config\application.py", line 846, in launch_instance
      app.start()
    File "C:\Users\dojh1\AppData\Local\Programs\Python\Python39\lib\site-packages\ipykernel\kernelapp.py", line 677, in start
      self.io_loop.start()
    File "C:\Users\dojh1\AppData\Local\Programs\Python\Python39\lib\site-packages\tornado\platform\asyncio.py", line 199, in start
      self.asyncio_loop.run_forever()
    File "C:\Users\dojh1\AppData\Local\Programs\Python\Python39\lib\asyncio\base_events.py", line 596, in run_forever
      self._run_once()
    File "C:\Users\dojh1\AppData\Local\Programs\Python\Python39\lib\asyncio\base_events.py", line 1890, in _run_once
      handle._run()
    File "C:\Users\dojh1\AppData\Local\Programs\Python\Python39\lib\asyncio\events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "C:\Users\dojh1\AppData\Local\Programs\Python\Python39\lib\site-packages\ipykernel\kernelbase.py", line 471, in dispatch_queue
      await self.process_one()
    File "C:\Users\dojh1\AppData\Local\Programs\Python\Python39\lib\site-packages\ipykernel\kernelbase.py", line 460, in process_one
      await dispatch(*args)
    File "C:\Users\dojh1\AppData\Local\Programs\Python\Python39\lib\site-packages\ipykernel\kernelbase.py", line 367, in dispatch_shell
      await result
    File "C:\Users\dojh1\AppData\Local\Programs\Python\Python39\lib\site-packages\ipykernel\kernelbase.py", line 662, in execute_request
      reply_content = await reply_content
    File "C:\Users\dojh1\AppData\Local\Programs\Python\Python39\lib\site-packages\ipykernel\ipkernel.py", line 360, in do_execute
      res = shell.run_cell(code, store_history=store_history, silent=silent)
    File "C:\Users\dojh1\AppData\Local\Programs\Python\Python39\lib\site-packages\ipykernel\zmqshell.py", line 532, in run_cell
      return super().run_cell(*args, **kwargs)
    File "C:\Users\dojh1\AppData\Local\Programs\Python\Python39\lib\site-packages\IPython\core\interactiveshell.py", line 2768, in run_cell
      result = self._run_cell(
    File "C:\Users\dojh1\AppData\Local\Programs\Python\Python39\lib\site-packages\IPython\core\interactiveshell.py", line 2814, in _run_cell
      return runner(coro)
    File "C:\Users\dojh1\AppData\Local\Programs\Python\Python39\lib\site-packages\IPython\core\async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "C:\Users\dojh1\AppData\Local\Programs\Python\Python39\lib\site-packages\IPython\core\interactiveshell.py", line 3012, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "C:\Users\dojh1\AppData\Local\Programs\Python\Python39\lib\site-packages\IPython\core\interactiveshell.py", line 3191, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "C:\Users\dojh1\AppData\Local\Programs\Python\Python39\lib\site-packages\IPython\core\interactiveshell.py", line 3251, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "C:\Users\dojh1\AppData\Local\Temp\ipykernel_19900\2375825585.py", line 1, in <module>
      trained_model = model.fit(train_ds, validation_data=val_ds, epochs=3)
    File "C:\Users\dojh1\AppData\Roaming\Python\Python39\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "C:\Users\dojh1\AppData\Roaming\Python\Python39\site-packages\keras\engine\training.py", line 1384, in fit
      tmp_logs = self.train_function(iterator)
    File "C:\Users\dojh1\AppData\Roaming\Python\Python39\site-packages\keras\engine\training.py", line 1021, in train_function
      return step_function(self, iterator)
    File "C:\Users\dojh1\AppData\Roaming\Python\Python39\site-packages\keras\engine\training.py", line 1010, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\dojh1\AppData\Roaming\Python\Python39\site-packages\keras\engine\training.py", line 1000, in run_step
      outputs = model.train_step(data)
    File "C:\Users\dojh1\AppData\Roaming\Python\Python39\site-packages\keras\engine\training.py", line 859, in train_step
      y_pred = self(x, training=True)
    File "C:\Users\dojh1\AppData\Roaming\Python\Python39\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "C:\Users\dojh1\AppData\Roaming\Python\Python39\site-packages\keras\engine\base_layer.py", line 1096, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "C:\Users\dojh1\AppData\Roaming\Python\Python39\site-packages\keras\utils\traceback_utils.py", line 92, in error_handler
      return fn(*args, **kwargs)
    File "C:\Users\dojh1\AppData\Roaming\Python\Python39\site-packages\keras\engine\functional.py", line 451, in call
      return self._run_internal_graph(
    File "C:\Users\dojh1\AppData\Roaming\Python\Python39\site-packages\keras\engine\functional.py", line 589, in _run_internal_graph
      outputs = node.layer(*args, **kwargs)
    File "C:\Users\dojh1\AppData\Roaming\Python\Python39\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "C:\Users\dojh1\AppData\Roaming\Python\Python39\site-packages\keras\engine\base_layer.py", line 1096, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "C:\Users\dojh1\AppData\Roaming\Python\Python39\site-packages\keras\utils\traceback_utils.py", line 92, in error_handler
      return fn(*args, **kwargs)
    File "C:\Users\dojh1\AppData\Local\Programs\Python\Python39\lib\site-packages\transformers\models\bert\modeling_tf_bert.py", line 871, in call
      encoder_outputs = self.encoder(
    File "C:\Users\dojh1\AppData\Roaming\Python\Python39\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "C:\Users\dojh1\AppData\Roaming\Python\Python39\site-packages\keras\engine\base_layer.py", line 1096, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "C:\Users\dojh1\AppData\Roaming\Python\Python39\site-packages\keras\utils\traceback_utils.py", line 92, in error_handler
      return fn(*args, **kwargs)
    File "C:\Users\dojh1\AppData\Local\Programs\Python\Python39\lib\site-packages\transformers\models\bert\modeling_tf_bert.py", line 535, in call
      for i, layer_module in enumerate(self.layer):
    File "C:\Users\dojh1\AppData\Local\Programs\Python\Python39\lib\site-packages\transformers\models\bert\modeling_tf_bert.py", line 541, in call
      layer_outputs = layer_module(
    File "C:\Users\dojh1\AppData\Roaming\Python\Python39\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "C:\Users\dojh1\AppData\Roaming\Python\Python39\site-packages\keras\engine\base_layer.py", line 1096, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "C:\Users\dojh1\AppData\Roaming\Python\Python39\site-packages\keras\utils\traceback_utils.py", line 92, in error_handler
      return fn(*args, **kwargs)
    File "C:\Users\dojh1\AppData\Local\Programs\Python\Python39\lib\site-packages\transformers\models\bert\modeling_tf_bert.py", line 451, in call
      self_attention_outputs = self.attention(
    File "C:\Users\dojh1\AppData\Roaming\Python\Python39\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "C:\Users\dojh1\AppData\Roaming\Python\Python39\site-packages\keras\engine\base_layer.py", line 1096, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "C:\Users\dojh1\AppData\Roaming\Python\Python39\site-packages\keras\utils\traceback_utils.py", line 92, in error_handler
      return fn(*args, **kwargs)
    File "C:\Users\dojh1\AppData\Local\Programs\Python\Python39\lib\site-packages\transformers\models\bert\modeling_tf_bert.py", line 367, in call
      self_outputs = self.self_attention(
    File "C:\Users\dojh1\AppData\Roaming\Python\Python39\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "C:\Users\dojh1\AppData\Roaming\Python\Python39\site-packages\keras\engine\base_layer.py", line 1096, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "C:\Users\dojh1\AppData\Roaming\Python\Python39\site-packages\keras\utils\traceback_utils.py", line 92, in error_handler
      return fn(*args, **kwargs)
    File "C:\Users\dojh1\AppData\Local\Programs\Python\Python39\lib\site-packages\transformers\models\bert\modeling_tf_bert.py", line 310, in call
      attention_probs = self.dropout(inputs=attention_probs, training=training)
    File "C:\Users\dojh1\AppData\Roaming\Python\Python39\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "C:\Users\dojh1\AppData\Roaming\Python\Python39\site-packages\keras\engine\base_layer.py", line 1096, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "C:\Users\dojh1\AppData\Roaming\Python\Python39\site-packages\keras\utils\traceback_utils.py", line 92, in error_handler
      return fn(*args, **kwargs)
    File "C:\Users\dojh1\AppData\Roaming\Python\Python39\site-packages\keras\layers\core\dropout.py", line 111, in call
      output = control_flow_util.smart_cond(training, dropped_inputs,
    File "C:\Users\dojh1\AppData\Roaming\Python\Python39\site-packages\keras\utils\control_flow_util.py", line 105, in smart_cond
      return tf.__internal__.smart_cond.smart_cond(
    File "C:\Users\dojh1\AppData\Roaming\Python\Python39\site-packages\keras\layers\core\dropout.py", line 108, in dropped_inputs
      return self._random_generator.dropout(
    File "C:\Users\dojh1\AppData\Roaming\Python\Python39\site-packages\keras\backend.py", line 1940, in dropout
      return tf.nn.dropout(inputs, rate=rate, noise_shape=noise_shape,
Node: 'model_3/bert/encoder/layer_._7/attention/self/dropout_22/dropout/random_uniform/RandomUniform'
OOM when allocating tensor with shape[8,12,1000,1000] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[{{node model_3/bert/encoder/layer_._7/attention/self/dropout_22/dropout/random_uniform/RandomUniform}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
 [Op:__inference_train_function_169423]

In [None]:
model.save('BERT_Model')