In [None]:
# Install the third-party packages used by the cells of this notebook.
# NOTE(review): only spacy carries a version pin here; the other packages resolve to
# whatever pip selects at install time — consider pinning them for reproducibility.
!pip install texthero
!pip install tensorflow_addons
!pip install tensorflow_hub
!pip install tensorflow_text
!pip install spacy==3.3

# **BERT Text Classification - Binary Class**

> [**SMS Spam Collection Dataset**](https://www.kaggle.com/uciml/sms-spam-collection-dataset) - Collection of SMS messages tagged as spam or legitimate.

In [None]:
# Import Library.
import os, sys, warnings, logging

warnings.filterwarnings("ignore")
logging.basicConfig(level=logging.WARN)
logger = logging.getLogger(__name__)

import pandas as pd
import texthero as hero
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow_addons.metrics import CohenKappa
import tensorflow_hub as hub
import tensorflow_text as text

# Load the two TF-Hub layers used by the model: the text preprocessor and the encoder.
BERT_PREPROCESS_HANDLE = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
BERT_ENCODER_HANDLE = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"

bert_preprocess = hub.KerasLayer(BERT_PREPROCESS_HANDLE)
bert_encoder = hub.KerasLayer(BERT_ENCODER_HANDLE)

# Model Configuration.
BATCH_SIZE = 64
NO_EPOCHS = 10
NO_CLASSES = 2
VALIDATION_SPLIT = 0.2
VERBOSITY = 1
# Epochs without val_loss improvement before training is stopped.
EARLY_STOPPING_PATIENCE = 4
# Must be smaller than EARLY_STOPPING_PATIENCE: with equal values both callbacks fire
# on the same epoch, so the lowered learning rate would never actually be trained with.
REDUCE_LR_PATIENCE = 2
my_callbacks = [
    # Stop once val_loss stalls and roll back to the best weights seen.
    tf.keras.callbacks.EarlyStopping(
        monitor="val_loss", patience=EARLY_STOPPING_PATIENCE, restore_best_weights=True
    ),
    # Cut the learning rate tenfold on a shorter plateau, down to min_lr.
    tf.keras.callbacks.ReduceLROnPlateau(
        monitor="val_loss", factor=0.1, patience=REDUCE_LR_PATIENCE, min_lr=0.00001, verbose=1
    ),
    # Keep only the best checkpoint (by val_loss) on disk.
    tf.keras.callbacks.ModelCheckpoint(
        "spamClassifier.h5", monitor="val_loss", verbose=1, save_best_only=True
    ),
]

# Model Architecture/Pipeline.
def create_model():
    """Build and compile the binary (spam / not spam) text classifier.

    The network feeds a batch of strings through the module-level
    ``bert_preprocess`` and ``bert_encoder`` layers and puts a small dense
    head on the encoder's ``pooled_output``.

    Returns:
        A compiled ``tf.keras.Model`` with a single sigmoid output unit,
        trained with binary cross-entropy and tracked with accuracy and
        Cohen's kappa.
    """
    # String input -> BERT preprocessing -> BERT encoder.
    # NOTE(review): whether the encoder weights are updated depends on how the
    # module-level hub.KerasLayer was constructed — confirm before describing
    # this as fine-tuning.
    sentences = tf.keras.layers.Input(shape=(), dtype=tf.string)
    encoded = bert_encoder(bert_preprocess(sentences))

    # Classification head: two (Dropout -> BatchNorm -> Dense) stages, the last
    # one being the sigmoid output unit.
    head = encoded["pooled_output"]
    for units, activation in ((100, "relu"), (1, "sigmoid")):
        head = tf.keras.layers.Dropout(0.25)(head)
        head = tf.keras.layers.BatchNormalization()(head)
        head = tf.keras.layers.Dense(units=units, activation=activation)(head)

    # Assemble and compile.
    classifier = tf.keras.Model(inputs=[sentences], outputs=[head])
    kappa = CohenKappa(num_classes=NO_CLASSES)
    classifier.compile(
        optimizer="adam",
        loss="binary_crossentropy",
        metrics=["accuracy", kappa],
    )
    return classifier


if __name__ == "__main__":
    # Extract the Dataset.
    try:
        data = pd.read_csv("spam.csv", encoding="ISO-8859-1")
    except Exception:
        # Nothing below can run without the dataset; swallowing the error here
        # would only resurface as a confusing NameError on `data`.
        logger.exception("Unable to read training CSV 'spam.csv'.")
        raise

    # Text Cleaning and Preprocessing.
    # NOTE(review): hero.clean runs before hero.remove_urls; if clean already strips
    # punctuation, URLs may no longer match — confirm the intended order.
    data["sms"] = data["v2"].pipe(hero.clean).pipe(hero.remove_urls)
    # Binary target: 1 for "spam", 0 for everything else.
    data["class"] = (data["v1"] == "spam").astype(int)

    # Split Dataset into Training and Test Set.
    X_train, X_test, y_train, y_test = train_test_split(
        data["sms"], data["class"], test_size=0.2, random_state=1, stratify=data["class"],
    )

    # Call the Model Architecture.
    # (A functional model is already built; no explicit build() call is needed.)
    model = create_model()
    model.summary()

    # Fit the Model.
    # Validation data is carved out of the training set via `validation_split`.
    # Passing `validation_data` as well makes Keras ignore the split and lets the
    # test set drive early stopping / checkpoint selection.
    model.fit(
        X_train,
        y_train,
        validation_split=VALIDATION_SPLIT,
        batch_size=BATCH_SIZE,
        epochs=NO_EPOCHS,
        verbose=VERBOSITY,
        callbacks=my_callbacks,
    )

    # Report performance on the untouched test set.
    model.evaluate(X_test, y_test, batch_size=BATCH_SIZE, verbose=VERBOSITY)

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None,)]            0           []                               
                                                                                                  
 keras_layer_2 (KerasLayer)     {'input_type_ids':   0           ['input_1[0][0]']                
                                (None, 128),                                                      
                                 'input_word_ids':                                                
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128)}                                                      

In [None]:
# Load Model.
spam_model = tf.keras.models.load_model(
    "spamClassifier.h5", custom_objects={"KerasLayer": hub.KerasLayer}
)

# Sample SMS messages to score (variable name kept as `reviews`).
reviews = [
    "Reply to win £100 weekly! Where will the 2022 FIFA World Cup going to be held? Send STOP to 87239 to end service.",
    "Your account password has expired. Please reset your account password to continue the service.",
    "You are awarded a brand new iPhone 13! Please call 09061221061 on your Mobile. Delivery within 28days. T Cs Box177. M221BP. 2yr warranty. 150ppm. 16. p pÂ£3.99",
    "Your 500 free text messages are valid until 31 December 2021. Please call customer care for more details.",
    "Hey Rocky, I have 2 free tickets for tomorrow's cricket game. So are you going to come with me?",
    "Why don't you wait 'til at least Wednesday to see if you get your credit card?",
    "Your Bank account has been compromised. You must update it immediately or your account will get closed. Click HERE to update your account.",
    "Your salary of Rs. 30,000 has been credited to your bank account XXXXX4095.",
    "£50 Gift Card for Amazon! Complete our Quick Survey to see if you qualify £50 Gift Card for Amazon. Click HERE to get started.",
]

# The classifier was trained on text passed through hero.clean and hero.remove_urls,
# so the same pipeline is applied here; scoring raw text would feed the model a
# different input distribution than it was trained on.
reviews_clean = pd.Series(reviews).pipe(hero.clean).pipe(hero.remove_urls)

# One sigmoid score per message (the training cell encodes "spam" as 1).
spam_model.predict(reviews_clean.tolist())



array([[0.9417401 ],
       [0.7171347 ],
       [0.9891326 ],
       [0.88464123],
       [0.11771929],
       [0.02290148],
       [0.85963535],
       [0.64502424],
       [0.9430161 ]], dtype=float32)

# **Multi-Class Classification using BERT Model.**

> [**Kaggle Dataset**](https://www.kaggle.com/datasets/balatmak/newsgroup20bbcnews) - newsgroup20-bbc-news

In [None]:
# Import Library.
import os, sys, warnings, logging

warnings.filterwarnings("ignore")
logging.basicConfig(level=logging.WARN)
logger = logging.getLogger(__name__)

import pandas as pd
import numpy as np
import texthero as hero
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow_addons.metrics import CohenKappa
import tensorflow_hub as hub
import tensorflow_text as text

# Load the two TF-Hub layers used by the model: the text preprocessor and the encoder.
BERT_PREPROCESS_HANDLE = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
BERT_ENCODER_HANDLE = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"

bert_preprocess = hub.KerasLayer(BERT_PREPROCESS_HANDLE)
bert_encoder = hub.KerasLayer(BERT_ENCODER_HANDLE)

# Model Configuration.
EPOCHS = 20
NO_CLASSES = 5
BATCH_SIZE = 8
VALIDATION_SPLIT = 0.2
VERBOSITY = 1
# Epochs without val_loss improvement before training is stopped.
EARLY_STOPPING_PATIENCE = 5
# Must be smaller than EARLY_STOPPING_PATIENCE: with equal values both callbacks fire
# on the same epoch, so the lowered learning rate would never actually be trained with.
REDUCE_LR_PATIENCE = 2
my_callbacks = [
    # Stop once val_loss stalls.
    tf.keras.callbacks.EarlyStopping(
        monitor="val_loss", patience=EARLY_STOPPING_PATIENCE, verbose=1
    ),
    # Cut the learning rate tenfold on a shorter plateau, down to min_lr.
    tf.keras.callbacks.ReduceLROnPlateau(
        monitor="val_loss", factor=0.1, patience=REDUCE_LR_PATIENCE, min_lr=0.00001, verbose=1
    ),
    # Keep only the best checkpoint (by val_loss) on disk.
    tf.keras.callbacks.ModelCheckpoint(
        "news_category.h5", monitor="val_loss", verbose=1, save_best_only=True
    ),
]

# Model Architecture/Pipeline.
def create_model():
    """Build and compile the multi-class news-category classifier.

    A batch of strings goes through the module-level ``bert_preprocess`` and
    ``bert_encoder`` layers; three L2-regularised dense blocks with dropout sit
    on the encoder's ``pooled_output`` and end in a softmax over ``NO_CLASSES``.

    Returns:
        A compiled ``tf.keras.Model`` trained with categorical cross-entropy and
        tracked with AUC and Cohen's kappa.
    """
    # String input -> BERT preprocessing -> BERT encoder.
    article = tf.keras.layers.Input(shape=(), dtype=tf.string, name="text")
    encoded = bert_encoder(bert_preprocess(article))

    # Dense head: (Dense + Dropout) x 3, then the softmax output layer.
    hidden = encoded["pooled_output"]
    for width in (256, 128, 128):
        hidden = tf.keras.layers.Dense(width, activation="relu", kernel_regularizer="l2")(hidden)
        hidden = tf.keras.layers.Dropout(0.4)(hidden)
    class_probs = tf.keras.layers.Dense(NO_CLASSES, activation="softmax")(hidden)

    # Assemble and compile.
    classifier = tf.keras.Model(inputs=[article], outputs=[class_probs])
    kappa = CohenKappa(num_classes=NO_CLASSES)
    classifier.compile(
        optimizer="adam",
        loss="categorical_crossentropy",
        metrics=["AUC", kappa],
    )
    return classifier


if __name__ == "__main__":
    # Extract the Dataset.
    try:
        data = pd.read_csv(
            "https://storage.googleapis.com/dataset-uploader/bbc/bbc-text.csv"
        )
    except Exception as e:
        logger.exception(
            "Unable to download training CSV, check your internet connection. Error: %s", e
        )
        # Nothing below can run without the dataset; swallowing the error here
        # would only resurface as a confusing NameError on `data`.
        raise

    print(data.head())

    print("\nShape of the dataset is", data.shape, "\n")

    print("Class Frequency: \n", data["category"].value_counts(), "\n")

    # Text Preprocessing and Encode Categorical Column.
    # NOTE(review): hero.clean runs before hero.remove_urls; if clean already strips
    # punctuation, URLs may no longer match — confirm the intended order.
    data["text"] = data["text"].pipe(hero.clean).pipe(hero.remove_urls)
    data["category"] = data["category"].map(
        {"sport": 0, "business": 1, "politics": 2, "tech": 3, "entertainment": 4}
    )
    # One-hot targets, as required by the categorical cross-entropy loss.
    y = tf.keras.utils.to_categorical(data["category"].values, num_classes=NO_CLASSES)

    # Split Dataset into Training and Test Set.
    X_train, X_test, y_train, y_test = train_test_split(
        data["text"], y, test_size=0.2, random_state=1, stratify=y
    )

    # Call the Model Architecture.
    # (A functional model is already built; no explicit build() call is needed.)
    model = create_model()
    model.summary()

    # Fit the Model.
    # `EPOCHS` is the epoch constant defined in this section's configuration.
    # Validation data is carved out of the training set via `validation_split`.
    # Passing `validation_data` as well makes Keras ignore the split and lets the
    # test set drive early stopping / checkpoint selection.
    model.fit(
        X_train,
        y_train,
        validation_split=VALIDATION_SPLIT,
        batch_size=BATCH_SIZE,
        epochs=EPOCHS,
        verbose=VERBOSITY,
        callbacks=my_callbacks,
    )

    # Report performance on the untouched test set.
    model.evaluate(X_test, y_test, batch_size=BATCH_SIZE, verbose=VERBOSITY)

        category                                               text
0           tech  tv future in the hands of viewers with home th...
1       business  worldcom boss  left books alone  former worldc...
2          sport  tigers wary of farrell  gamble  leicester say ...
3          sport  yeading face newcastle in fa cup premiership s...
4  entertainment  ocean s twelve raids box office ocean s twelve...

Shape of the dataset is (2225, 2) 

Class Frequency: 
 sport            511
business         510
politics         417
tech             401
entertainment    386
Name: category, dtype: int64 

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                      

In [None]:
# Restore the checkpointed news classifier; hub.KerasLayer is passed as a custom
# object so the saved hub layers can be deserialised.
news_category = tf.keras.models.load_model(
    filepath="news_category.h5",
    custom_objects=dict(KerasLayer=hub.KerasLayer),
)


def predict_class(reviews):
    """Predict Class of Input Text.

    Args:
        reviews: Iterable of raw text strings to classify.

    Returns:
        A list with one predicted class index (plain ``int``) per input text,
        taken as the argmax of the model's output scores. Indices follow the
        category mapping used when training (sport=0, business=1, politics=2,
        tech=3, entertainment=4).
    """
    import numpy as np

    # The model was trained on text passed through hero.clean and hero.remove_urls;
    # apply the same pipeline so inference sees the same kind of input.
    cleaned = pd.Series(list(reviews)).pipe(hero.clean).pipe(hero.remove_urls)
    scores = news_category.predict(cleaned.tolist())
    # Vectorised argmax over the class axis instead of a per-row Python loop.
    return np.argmax(scores, axis=-1).tolist()


# Predict Result.
# Sample news headlines to classify (the list is named `reviews` in this notebook).
reviews = [
    "iPhone Users Alert! THIS Apple iPhone could cost less than Rs 20,000.",
    "Russia-Ukraine conflict: After a 'genius' remark, Donald Trump says Putin playing Biden like a drum.",
    "Czech Republic join Poland, Sweden in refusing to play Russia in 2022 World Cup playoffs.",
    "Housebuilders must ‘go further’ in remediation pledge.",
    "CODA lands top SAG award on road to the Oscars and more.",
]

# Last expression of the cell: displays the value returned by predict_class
# for the headlines above.
predict_class(reviews)



[1, 2, 1, 2, 4]

# **Multi-label Classification using BERT Model.**

In [None]:
# Import Library.
import pandas as pd
import numpy as np
import ast
import texthero as hero
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow_addons.metrics import CohenKappa
import tensorflow_hub as hub
import tensorflow_text as text

# Load the two TF-Hub layers used by the model: the text preprocessor and the encoder.
BERT_PREPROCESS_HANDLE = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
BERT_ENCODER_HANDLE = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"

bert_preprocess = hub.KerasLayer(BERT_PREPROCESS_HANDLE)
bert_encoder = hub.KerasLayer(BERT_ENCODER_HANDLE)

# Read the tagged-questions CSV from the working directory and preview the first rows.
data = pd.read_csv(filepath_or_buffer="so_dataset_2_tags.csv")
data.head()

Unnamed: 0,title,tags,mysql,python,php
0,Flask-SQLAlchemy - When are the tables/databas...,"['python', 'mysql']",1,1.0,0.0
1,Combining two PHP variables for MySQL query,"['php', 'mysql']",1,0.0,1.0
2,'Counting' the number of records that match a ...,"['php', 'mysql']",1,0.0,1.0
3,Insert new row in a table and auto id number. ...,"['php', 'mysql']",1,0.0,1.0
4,Create Multiple MySQL tables using PHP,"['php', 'mysql']",1,0.0,1.0


In [None]:
# Making the Tags in the form: ['item1', 'item2', ..., 'itemN']
# Only string values are parsed, so re-running this cell on already-parsed lists
# does not raise (ast.literal_eval rejects non-string, non-AST input).
data["tags"] = data["tags"].apply(
    lambda tags: ast.literal_eval(tags) if isinstance(tags, str) else tags
)

# Convert "mysql" column to float datatype.
data["mysql"] = data["mysql"].astype(float)

# Text Cleaning and Preprocessing.
# NOTE(review): hero.clean runs before hero.remove_urls; if clean already strips
# punctuation, URLs may no longer match — confirm the intended order.
data["title"] = data["title"].pipe(hero.clean).pipe(hero.remove_urls)

# Split Dataset into Dependent and Independent Features.
X = data["title"]
# One indicator column per label; a row may have several labels set.
y = data[["mysql", "python", "php"]]

# Split Dataset into Training and Test Set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=1, stratify=y
)

# Model Configuration.
BATCH_SIZE = 64
NO_EPOCHS = 20
# One output unit per label column of the target frame.
NO_CLASSES = y.shape[1]
VALIDATION_SPLIT = 0.2
VERBOSITY = 1
# Epochs without val_loss improvement before training is stopped.
EARLY_STOPPING_PATIENCE = 5
# Must be smaller than EARLY_STOPPING_PATIENCE: with equal values both callbacks fire
# on the same epoch, so the lowered learning rate would never actually be trained with.
REDUCE_LR_PATIENCE = 2
my_callbacks = [
    # Stop once val_loss stalls.
    tf.keras.callbacks.EarlyStopping(
        monitor="val_loss", patience=EARLY_STOPPING_PATIENCE, verbose=1
    ),
    # Cut the learning rate tenfold on a shorter plateau, down to min_lr.
    tf.keras.callbacks.ReduceLROnPlateau(
        monitor="val_loss", factor=0.1, patience=REDUCE_LR_PATIENCE, min_lr=0.00001, verbose=1
    ),
    # Keep only the best checkpoint (by val_loss) on disk.
    tf.keras.callbacks.ModelCheckpoint(
        "tagsClassify.h5", monitor="val_loss", verbose=1, save_best_only=True
    ),
]

# Model Architecture/Pipeline.
def create_model():
    """Build and compile the multi-label tag classifier.

    A batch of strings goes through the module-level ``bert_preprocess`` and
    ``bert_encoder`` layers; a small dense head on the encoder's
    ``pooled_output`` ends in ``NO_CLASSES`` independent sigmoid units.

    Returns:
        A compiled ``tf.keras.Model`` trained with binary cross-entropy and
        tracked with accuracy and Cohen's kappa.
    """
    # String input -> BERT preprocessing -> BERT encoder.
    question = tf.keras.layers.Input(shape=(), dtype=tf.string)
    encoded = bert_encoder(bert_preprocess(question))

    # Classification head: two (Dropout -> BatchNorm -> Dense) stages, the last
    # one producing one sigmoid score per label.
    head = encoded["pooled_output"]
    for units, activation in ((100, "relu"), (NO_CLASSES, "sigmoid")):
        head = tf.keras.layers.Dropout(0.25)(head)
        head = tf.keras.layers.BatchNormalization()(head)
        head = tf.keras.layers.Dense(units=units, activation=activation)(head)

    # Assemble and compile.
    # NOTE(review): CohenKappa is configured with num_classes=NO_CLASSES although the
    # outputs are independent sigmoids — confirm the metric handles multi-label
    # targets the way this notebook intends.
    classifier = tf.keras.Model(inputs=[question], outputs=[head])
    kappa = CohenKappa(num_classes=NO_CLASSES)
    classifier.compile(
        optimizer="adam",
        loss="binary_crossentropy",
        metrics=["accuracy", kappa],
    )
    return classifier


# Call the Model Architecture.
# (A functional model is already built; no explicit build() call is needed.)
model = create_model()
model.summary()

# Fit the Model.
# Validation data is carved out of the training set via `validation_split`.
# Passing `validation_data` as well makes Keras ignore the split and lets the
# test set drive early stopping / checkpoint selection.
model.fit(
    X_train,
    y_train,
    validation_split=VALIDATION_SPLIT,
    batch_size=BATCH_SIZE,
    epochs=NO_EPOCHS,
    verbose=VERBOSITY,
    callbacks=my_callbacks,
)

# Report performance on the untouched test set.
model.evaluate(X_test, y_test, batch_size=BATCH_SIZE, verbose=VERBOSITY)

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None,)]            0           []                               
                                                                                                  
 keras_layer_6 (KerasLayer)     {'input_type_ids':   0           ['input_2[0][0]']                
                                (None, 128),                                                      
                                 'input_word_ids':                                                
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128)}                                                    

<keras.callbacks.History at 0x7f97e40a4610>

In [None]:
# Load Model.
tagsModel = tf.keras.models.load_model(
    "tagsClassify.h5", custom_objects={"KerasLayer": hub.KerasLayer}
)

# Sample question titles to tag (variable name kept as `reviews`).
reviews = [
    "How do I merge two dictionaries in a single expression (taking union of dictionaries)?",
    "How to connect Python in MongoDB?",
    "What are the steps to create a new database using MySQL and PHP?",
]

# The classifier was trained on titles passed through hero.clean and hero.remove_urls,
# so the same pipeline is applied here; scoring raw text would feed the model a
# different input distribution than it was trained on.
reviews_clean = pd.Series(reviews).pipe(hero.clean).pipe(hero.remove_urls)

# One row per title; columns follow the training target order: mysql, python, php.
tagsModel.predict(reviews_clean.tolist())



array([[0.5547276 , 0.59171814, 0.5183102 ],
       [0.45760947, 0.5581999 , 0.51863074],
       [0.49568376, 0.5485536 , 0.5535065 ]], dtype=float32)