<a href="https://colab.research.google.com/github/CodeMonkey01/DataMiningI/blob/main/ANN_with_BERT_with_Stemming_and_stop_word_removal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ANN with BERT
In this notebook I tried to solve the classification model with an ANN based on pretrained BERT layers.

## TODO:
- [x] Stemming
- [x] Remove Stop Words

## Results
accuracy: 0.8263 - precision: 0.8082 - recall: 0.8557

Training time: 5 * 19 Minutes

## Model
Link to the model: https://drive.google.com/file/d/1-VYFlAvXt6R42VO_EuK4qjVCJUZ_NE4i/view?usp=sharing (google drive)

In [None]:
try:
    from google.colab import drive
    drive.mount('/content/drive/')
    %cd '/content/drive/MyDrive/'

    gpu_info = !nvidia-smi
    gpu_info = '\n'.join(gpu_info)
    if gpu_info.find('failed') >= 0:
      print('Not connected to a GPU')
    else:
      print(gpu_info)
except ImportError as e:
    pass

Mounted at /content/drive/
/content/drive/MyDrive
Sat May 14 11:33:38 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   40C    P0    27W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-------------------------------------

In [None]:
!pip install tensorflow_text
!pip install tensorflow_hub

Collecting tensorflow_text
  Downloading tensorflow_text-2.8.2-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 6.0 MB/s 
Collecting tf-estimator-nightly==2.8.0.dev2021122109
  Downloading tf_estimator_nightly-2.8.0.dev2021122109-py2.py3-none-any.whl (462 kB)
[K     |████████████████████████████████| 462 kB 26.3 MB/s 
Installing collected packages: tf-estimator-nightly, tensorflow-text
Successfully installed tensorflow-text-2.8.2 tf-estimator-nightly-2.8.0.dev2021122109


In [63]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf

In [64]:
df = pd.read_csv('/content/drive/MyDrive/Data Mining/dataset.txt')
df.describe()

Unnamed: 0,text,humor
count,200000,200000
unique,200000,2
top,"Joe biden rules out 2020 bid: 'guys, i'm not r...",False
freq,1,100000


# Check for imbalance
The dataset is equally balanced. Therefore, we do not need to rebalance the dataset.

In [65]:
df["humor"].value_counts()

False    100000
True     100000
Name: humor, dtype: int64

# Preprocessing

In [66]:
# Transform class from Boolean to integer value
df['class']=df['humor'].apply(lambda x: 1 if x==True else 0)

In [67]:
# Remove stop words
#from gensim.parsing.preprocessing import remove_stopwords

#df['text']=df['text'].apply(lambda x: remove_stopwords(x))

In [79]:
# Stemming
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import re

token_pattern = re.compile(r"(?u)\b\w\w+\b")

ps = PorterStemmer()

nltk.download('punkt')
nltk.download('stopwords')

my_stopwords = set(stopwords.words('english'))

df['stemmed_stop_removed']=df['text'].apply(lambda x: ' '.join([ps.stem(y) for y in token_pattern.findall(x) if y not in my_stopwords]))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [80]:
df.head(100)

Unnamed: 0,text,humor,class,stemmed_stop_removed
0,"Joe biden rules out 2020 bid: 'guys, i'm not r...",False,0,joe biden rule 2020 bid guy run
1,Watch: darvish gave hitter whiplash with slow ...,False,0,watch darvish gave hitter whiplash slow pitch
2,What do you call a turtle without its shell? d...,True,1,what call turtl without shell dead
3,5 reasons the 2016 election feels so personal,False,0,reason 2016 elect feel person
4,"Pasco police shot mexican migrant from behind,...",False,0,pasco polic shot mexican migrant behind new au...
...,...,...,...,...
95,Starting a cover band called a book so no one ...,True,1,start cover band call book one judg us
96,Veterinarian accused of shooting neighbors' do...,False,0,veterinarian accus shoot neighbor dog head
97,Christina aguilera's alleged new house comes w...,False,0,christina aguilera alleg new hous come famou n...
98,I met a horse who keeps talking about the apoc...,True,1,met hors keep talk apocalyps told end neigh


In [81]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df['stemmed_stop_removed'],df['class'], stratify=df['class'])

# BERT

In [82]:
bert_preprocess = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
bert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4")

In [84]:
def get_sentence_embeding(sentences):
    preprocessed_text = bert_preprocess(sentences)
    return bert_encoder(preprocessed_text)['pooled_output']

## Test embedding
Test word embedding from pretrained BERT model with a real sentence from dataset.

In [None]:
test_sentence = df["stemmed_stop_removed"][100]
print("Test sentence:")
print(test_sentence)
print("Test sentence (word embedding):")
print(get_sentence_embeding([test_sentence]))

# Build model

In [85]:
# Bert layers
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)

# Neural network layers
l = tf.keras.layers.Dropout(0.1, name="dropout")(outputs['pooled_output'])
l = tf.keras.layers.Dense(1, activation='sigmoid', name="output")(l)

# Use inputs and outputs to construct a final model
model = tf.keras.Model(inputs=[text_input], outputs = [l])

In [86]:
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 keras_layer_4 (KerasLayer)     {'input_mask': (Non  0           ['text[0][0]']                   
                                e, 128),                                                          
                                 'input_word_ids':                                                
                                (None, 128),                                                      
                                 'input_type_ids':                                                
                                (None, 128)}                                                

In [87]:
METRICS = [
      tf.keras.metrics.BinaryAccuracy(name='accuracy'),
      tf.keras.metrics.Precision(name='precision'),
      tf.keras.metrics.Recall(name='recall'),
]

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=METRICS)

In [88]:
model.fit(X_train, y_train, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f47c3eb3150>

In [89]:
model.evaluate(X_test, y_test)



[0.3929470181465149,
 0.8263000249862671,
 0.8081674575805664,
 0.8557199835777283]

In [90]:
#model.save("/content/drive/MyDrive/Data Mining/stop words")
model.save_weights("/content/drive/MyDrive/Data Mining/stemming and no stop words/model.h5")

In [None]:
y_predicted = model.predict(X_test)
y_predicted = y_predicted.flatten()

In [None]:
import numpy as np

y_predicted = np.where(y_predicted > 0.5, 1, 0)
y_predicted

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

cm = confusion_matrix(y_test, y_predicted)
cm 

In [None]:
from matplotlib import pyplot as plt
import seaborn as sn
sn.heatmap(cm, annot=True, fmt='d')
plt.xlabel('Predicted')
plt.ylabel('Truth')

In [None]:
print(classification_report(y_test, y_predicted))

# Own test

In [None]:
jokes = [
    'What’s the best thing about Switzerland? I don’t know, but the flag is a big plus.',
    'I study Business Informatics at the University of Mannheim!',
    'I invented a new word! Plagiarism!',
    'Did you hear about the mathematician who’s afraid of negative numbers? He’ll stop at nothing to avoid them.',
    'My name is Elias.'
]
model.predict(jokes)

array([[0.7106663 ],
       [0.7403219 ],
       [0.81238645],
       [0.7502034 ],
       [0.95888543]], dtype=float32)

array([[0.7106663 ],
       [0.7403219 ],
       [0.81238645],
       [0.7502034 ],
       [0.95888543]], dtype=float32)