<a href="https://colab.research.google.com/github/CodeMonkey01/DataMiningI/blob/main/ANN/Option_A/ANN-FINAL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ANN with BERT
In this notebook we tried to solve the classification problem with an ANN based on pretrained BERT layers.

This notebook shows the training of the final model.

In [1]:
try:
    from google.colab import drive
    drive.mount('/content/drive/')
    %cd '/content/drive/MyDrive/'

    gpu_info = !nvidia-smi
    gpu_info = '\n'.join(gpu_info)
    if gpu_info.find('failed') >= 0:
      print('Not connected to a GPU')
    else:
      print(gpu_info)
except ImportError as e:
    pass

Mounted at /content/drive/
/content/drive/MyDrive
Fri May 27 05:49:14 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P8    11W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-------------------------------------

In [None]:
!pip install tensorflow_text
!pip install tensorflow_hub
!pip install transformers

In [3]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf

In [4]:
# Load the raw dataset from Google Drive and display its summary statistics.
DATASET_PATH = '/content/drive/MyDrive/Data Mining/dataset.txt'
df = pd.read_csv(DATASET_PATH)
df.describe()

Unnamed: 0,text,humor
count,200000,200000
unique,200000,2
top,"Joe biden rules out 2020 bid: 'guys, i'm not r...",False
freq,1,100000


# Preprocess data

In [5]:
# Encode the `humor` flag as an integer target: 1 where the value equals True, 0 otherwise.
df['class'] = df['humor'].eq(True).astype('int64')

In [6]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

MAX_LEN = 128


def _to_wordpiece_text(raw_text):
    """Return `raw_text` as one space-separated string of tokenizer tokens.

    The text is encoded with special tokens added and truncated to `MAX_LEN`
    ids; the ids are then mapped back to their token strings and joined.
    """
    token_ids = tokenizer.encode(raw_text, add_special_tokens=True, max_length=MAX_LEN, truncation=True)
    return " ".join(tokenizer.convert_ids_to_tokens(token_ids))


# NOTE(review): this column becomes the model input `X`, which is passed through the
# TF Hub `bert_preprocess` layer inside the model, i.e. the text is tokenized a second
# time. Confirm that re-tokenizing strings like '[CLS]' / '##en' is intended.
df['bert_preprocessed'] = df['text'].apply(_to_wordpiece_text)

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [7]:
# Preview the first rows, including the added `class` and `bert_preprocessed` columns.
df.head()

Unnamed: 0,text,humor,class,bert_preprocessed
0,"Joe biden rules out 2020 bid: 'guys, i'm not r...",False,0,[CLS] Joe bid ##en rules out 2020 bid : ' guys...
1,Watch: darvish gave hitter whiplash with slow ...,False,0,[CLS] Watch : da ##r ##vis ##h gave hitter whi...
2,What do you call a turtle without its shell? d...,True,1,[CLS] What do you call a turtle without its sh...
3,5 reasons the 2016 election feels so personal,False,0,[CLS] 5 reasons the 2016 election feels so per...
4,"Pasco police shot mexican migrant from behind,...",False,0,[CLS] Pa ##sco police shot me ##xi ##can migra...


In [8]:
# Model input (pre-tokenized text) and integer target.
X, Y = df['bert_preprocessed'], df['class']

In [9]:
# TF Hub handles of the cased English BERT preprocessing model and encoder.
BERT_PREPROCESS_URL = "https://tfhub.dev/tensorflow/bert_en_cased_preprocess/3"
BERT_ENCODER_URL = "https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/4"

# Both layers are created once here and reused by every model that `build_model` constructs.
bert_preprocess = hub.KerasLayer(BERT_PREPROCESS_URL)
bert_encoder = hub.KerasLayer(BERT_ENCODER_URL)

# Keras model

In [10]:
def build_model() -> tf.keras.Model:
    """Assemble the classifier: string input -> BERT layers -> dropout -> sigmoid unit.

    Uses the module-level `bert_preprocess` and `bert_encoder` layers.

    Returns:
        An uncompiled `tf.keras.Model` with one string input named 'text' and a
        single sigmoid output named 'output'.
    """
    # BERT part: string tensor -> preprocessing -> encoder outputs
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
    encoder_outputs = bert_encoder(bert_preprocess(text_input))

    # Classification head on top of the pooled encoder output
    pooled = encoder_outputs['pooled_output']
    regularized = tf.keras.layers.Dropout(0.1, name="dropout")(pooled)
    probability = tf.keras.layers.Dense(1, activation='sigmoid', name="output")(regularized)

    return tf.keras.Model(inputs=[text_input], outputs=[probability])

# Train model

Here we train the model with the previously identified Adam learning rate, Adam epsilon and batch size.

To get as close as possible to the maximum accuracy we use the `EarlyStopping` callback from Keras. It allows the training to run for up to 50 epochs but stops it as soon as the validation loss has not decreased for two consecutive epochs.

Besides `EarlyStopping` we also use 4-fold cross-validation, so that every row ends up in the test set exactly once and in the training set of the remaining folds.

In [None]:
from sklearn.model_selection import StratifiedKFold
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import train_test_split

# --- Training configuration ---
ADAM_LEARNING_RATE = 0.0007  # learning rate passed to the Adam optimizer
ADAM_EPSILON = 1e-06         # epsilon passed to the Adam optimizer
EPOCH = 50                   # upper bound on the number of training epochs per fold
BATCH_SIZE = 64
PATIENCE = 2                 # early-stopping patience, in epochs

# Folder on Google Drive where the per-fold checkpoints are written.
base_path = "/content/drive/MyDrive/Data Mining/bert/"

# Metrics tracked during training and returned by `model.evaluate`.
METRICS = [
    tf.keras.metrics.BinaryAccuracy(name='accuracy'),
    tf.keras.metrics.Precision(name='precision'),
    tf.keras.metrics.Recall(name='recall'),
]

def train_evaluate(model, run, x_train, y_train, x_test, y_test, seed=42):
    """Train `model` on one cross-validation fold and evaluate it on the fold's test split.

    A stratified 20% of the training data is held out as validation set, which drives
    early stopping and checkpointing. The weights with the lowest validation loss are
    saved below `base_path`; when early stopping ends the training, those best weights
    are restored into `model` before the final evaluation.

    Args:
        model: Uncompiled Keras model; it is compiled in place.
        run: Fold number, used to build the checkpoint file name.
        x_train: Training inputs of this fold.
        y_train: Training labels of this fold.
        x_test: Held-out inputs of this fold.
        y_test: Held-out labels of this fold.
        seed: Random state of the train/validation split, so that reruns use the
            same split.

    Returns:
        The result of `model.evaluate(x_test, y_test)`: the loss followed by the
        values of the metrics in `METRICS`.
    """
    # NOTE: '%.2f' renders an integer run as e.g. 'model_run_1.00'; kept unchanged so
    # that existing checkpoint file names stay valid.
    STAMP = 'model_run_%.2f'%(run)
    bst_model_path = base_path + STAMP + '.h5'

    # Without restore_best_weights the model would be evaluated with the weights of the
    # last epoch, i.e. PATIENCE epochs after the best validation loss was reached.
    early_stopping = EarlyStopping(monitor='val_loss', patience=PATIENCE, restore_best_weights=True)
    model_checkpoint = ModelCheckpoint(bst_model_path, monitor='val_loss',
                                       save_best_only=True, save_weights_only=True)

    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=ADAM_LEARNING_RATE, epsilon=ADAM_EPSILON),
                  loss=tf.keras.losses.BinaryCrossentropy(),
                  metrics=METRICS)

    # Stratify so the validation set keeps the class balance, and seed the split so the
    # run is reproducible.
    X_tra, X_val, y_tra, y_val = train_test_split(
        x_train, y_train, test_size=0.2, stratify=y_train, random_state=seed)

    model.fit(X_tra, y_tra, validation_data=(X_val, y_val), epochs=EPOCH, batch_size=BATCH_SIZE,
              shuffle=True, callbacks=[early_stopping, model_checkpoint])

    return model.evaluate(x_test, y_test)

# 4-fold stratified cross-validation: every fold builds a new model, trains it on three
# quarters of the data and scores it on the held-out quarter.
kFold = StratifiedKFold(n_splits=4)
accuracy_list = []
precision_list = []
recall_list = []
for i, (train, test) in enumerate(kFold.split(X, Y), start=1):
    model = build_model()
    # `split` yields positional indices, so rows are selected with `.iloc`; plain
    # `X[train]` is a label lookup and only works while the index is the default range.
    _, accuracy, precision, recall = train_evaluate(
        model, i, X.iloc[train], Y.iloc[train], X.iloc[test], Y.iloc[test])
    accuracy_list.append(accuracy)
    precision_list.append(precision)
    recall_list.append(recall)

# Per-fold scores
print(accuracy_list)
print(precision_list)
print(recall_list)

# Scores averaged over all folds
print(f"Accuracy: {sum(accuracy_list) / len(accuracy_list)}")
print(f"Precision: {sum(precision_list) / len(precision_list)}")
print(f"Recall: {sum(recall_list) / len(recall_list)}")

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
 349/1875 [====>.........................] - ETA: 17:52 - loss: 0.2741 - accuracy: 0.8913 - precision: 0.8909 - recall: 0.8889

# Evaluate trained model
The training took around 12 hours. Each fold produced its own model; the models can be downloaded from the following Google Drive links:

- Fold 1:
- Fold 2:
- Fold 3:
- Fold 4:

In [None]:
# model.save_weights("/content/drive/MyDrive/Data Mining/bert_seperator/weights.md5")

In [None]:
# `X_test` / `y_test` are not defined by any earlier cell. Use the held-out split of the
# last cross-validation fold: after the training loop `model` is the model of that fold
# and `test` still holds the positional indices of its test rows.
X_test, y_test = X.iloc[test], Y.iloc[test]

# Predicted probabilities as a flat 1-D array.
y_predicted = model.predict(X_test).flatten()

In [None]:
import numpy as np

# Turn the predicted values into hard labels: 1 where the value exceeds 0.5, else 0.
y_predicted = (y_predicted > 0.5).astype(int)
y_predicted

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# Confusion matrix of the true labels against the thresholded predictions.
cm = confusion_matrix(y_test, y_predicted)
cm 

In [None]:
from matplotlib import pyplot as plt
import seaborn as sn
# Draw the confusion matrix as a heatmap with integer annotations in each cell.
ax = sn.heatmap(cm, annot=True, fmt='d')
ax.set_xlabel('Predicted')
ax.set_ylabel('Truth')

In [None]:
# Text report from scikit-learn's `classification_report` for the same labels and predictions.
print(classification_report(y_test, y_predicted))

# Own test

In [None]:
# A few hand-written sentences used as a manual sanity check of the model.
jokes = [
    'What’s the best thing about Switzerland? I don’t know, but the flag is a big plus.',
    'I study Business Informatics at the University of Mannheim!',
    'I invented a new word! Plagiarism!',
    'Did you hear about the mathematician who’s afraid of negative numbers? He’ll stop at nothing to avoid them.',
    'My name is Elias.'
]

# Apply the same tokenizer round trip that was used for the training data.
jokes_processed = []
for sentence in jokes:
    token_ids = tokenizer.encode(sentence, add_special_tokens=True, max_length=MAX_LEN, truncation=True)
    jokes_processed.append(" ".join(tokenizer.convert_ids_to_tokens(token_ids)))

model.predict(jokes_processed)