In [10]:
from flask import Flask, render_template, request
from pickle import load
import pandas as pd # read the csv
import re # regex to detect username, url, html entity 
import nltk # to use word tokenize (split the sentence into words)
from nltk.corpus import stopwords # to remove the stopwords
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model


from keras.utils import to_categorical

In [4]:
from tensorflow.keras.metrics import Precision, Recall
import tensorflow as tf

class F1Score(tf.keras.metrics.Metric):
    """Streaming F1 score computed from an internal Precision and Recall metric.

    Every batch is forwarded to both sub-metrics; ``result()`` combines their
    current values as the harmonic mean ``2 * P * R / (P + R)``.

    NOTE(review): ``Precision()`` and ``Recall()`` are created with their
    default arguments, so whatever thresholding/averaging those defaults imply
    applies here too -- confirm this is what is wanted for a 3-class model.
    """

    def __init__(self, name="f1_score", **kwargs):
        """Create the metric and the two sub-metrics it delegates to.

        Args:
            name: Metric name reported by Keras.
            **kwargs: Forwarded unchanged to ``tf.keras.metrics.Metric``.
        """
        super().__init__(name=name, **kwargs)
        self.precision = tf.keras.metrics.Precision()
        self.recall = tf.keras.metrics.Recall()

    def update_state(self, y_true, y_pred, sample_weight=None):
        """Accumulate one batch into both the precision and recall metrics."""
        self.precision.update_state(y_true, y_pred, sample_weight)
        self.recall.update_state(y_true, y_pred, sample_weight)

    def result(self):
        """Return the F1 score for everything accumulated so far."""
        precision = self.precision.result()
        recall = self.recall.result()
        # epsilon keeps the division defined when precision + recall == 0.
        return 2 * ((precision * recall) / (precision + recall + tf.keras.backend.epsilon()))

    def reset_state(self):
        """Clear the accumulated statistics of both sub-metrics.

        ``reset_state`` is the hook current Keras versions call between
        epochs. Older releases only expose ``reset_states`` on the built-in
        metrics, so fall back to it when ``reset_state`` is missing.
        """
        for metric in (self.precision, self.recall):
            reset = getattr(metric, "reset_state", None)
            if reset is None:
                reset = metric.reset_states
            reset()

    def reset_states(self):
        """Backward-compatible alias for :meth:`reset_state`."""
        self.reset_state()

In [13]:
# Load the trained Keras model from the local saved_model directory.
# NOTE(review): the F1Score class defined in this notebook is not passed to
# load_model (e.g. via custom_objects) -- presumably the model was compiled
# with it; confirm whether it needs to be registered here.
model = load_model("./saved_model/my_model.keras")

# Load the fitted tokenizer that was pickled next to the model.
# SECURITY: pickle.load can execute arbitrary code while deserializing; only
# load tokenizer.pkl from a trusted source.
with open("./saved_model/tokenizer.pkl", "rb") as file:
    tokenizer = load(file)

In [30]:
# Note: every cleaning helper below takes a single text string per call.
# English stopwords plus "rt", the retweet marker, which is noise in the dataset.
stop_words = {*stopwords.words('english'), "rt"}

# Strip HTML entities such as "&amp;" or "&#128514;" from one text.
def remove_entity(raw_text):
    """Return ``raw_text`` with every ``&...;`` HTML entity deleted.

    An entity is an ampersand followed by one or more characters that are
    neither whitespace nor ``;``, terminated by ``;``.
    """
    entity_pattern = re.compile(r"&[^\s;]+;")
    return entity_pattern.sub("", raw_text)

# Replace @mentions with a generic placeholder token.
def change_user(raw_text):
    """Return ``raw_text`` with each ``@mention`` replaced by the word ``user``.

    A mention is ``@`` followed by every character up to the next space, so
    punctuation attached to the handle is replaced along with it.
    """
    mention_pattern = re.compile(r"@([^ ]+)")
    return mention_pattern.sub("user", raw_text)

# Delete URLs (http/https links, www.* hosts and bare domain/path forms) from one text.
def remove_url(raw_text):
    """Return ``raw_text`` with every URL matched by the pattern below removed.

    The match is case-insensitive. Only the URL is removed; the whitespace
    around it is left in place.
    """
    url_pattern = re.compile(
        r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
    )
    return url_pattern.sub('', raw_text)

# Drop quote marks, exclamation marks, backticks and doubled dots from one text.
def remove_noise_symbols(raw_text):
    """Return ``raw_text`` with the noise symbols removed.

    The symbols are removed one after another in the order listed. ".." has
    to stay last, because deleting the single characters can bring two dots
    next to each other.
    """
    text = raw_text
    for symbol in ('"', "'", "!", "`", ".."):
        text = text.replace(symbol, '')
    return text

# Tokenize one text and drop every token found in the stop_words set.
def remove_stopwords(raw_text):
    """Return ``raw_text`` re-joined with single spaces after stopword removal.

    Tokens come from ``nltk.word_tokenize``; the stopword lookup is done on
    the lower-cased token, while kept tokens retain their original casing.
    """
    kept_tokens = []
    for token in nltk.word_tokenize(raw_text):
        if token.lower() in stop_words:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

## Clean raw texts with the helpers above and turn them into padded model inputs.
def preprocess(datas, maxlen=26):
    """Clean, tokenize and pad a batch of texts.

    Args:
        datas: An iterable of text strings. A single bare string is also
            accepted and treated as a one-element batch; without that it
            would be iterated character by character.
        maxlen: Length every sequence is padded/truncated to. Defaults to 26
            (presumably the sequence length used when the model was trained
            -- confirm before changing).

    Returns:
        The result of ``pad_sequences`` on the tokenized texts, one row per
        input text.
    """
    if isinstance(datas, str):
        datas = [datas]

    # Order matters: mentions -> HTML entities -> URLs -> noise symbols -> stopwords.
    cleaning_steps = (
        change_user,
        remove_entity,
        remove_url,
        remove_noise_symbols,
        remove_stopwords,
    )

    clean = []
    for text in datas:
        for step in cleaning_steps:
            text = step(text)
        clean.append(text)

    # Map words to the integer ids of the fitted tokenizer, then pad to a fixed length.
    sequences = tokenizer.texts_to_sequences(clean)
    return pad_sequences(sequences, maxlen=maxlen)

In [17]:
# @app.route('/predict', methods=['POST'])
def predict(message):
    """Classify a message and render the result into ``index.html``.

    Args:
        message: A list of text strings, or a single string (wrapped into a
            one-element list). Only the first text's prediction is reported.

    Returns:
        The rendered ``index.html`` template with ``prediction_text`` set to
        the predicted label, or to a generic error message if anything in the
        pipeline raises.

    NOTE(review): the route decorator is commented out here; confirm whether
    ``render_template`` can be called from the notebook without a Flask
    application context.
    """
    try:
        print(f"Received message: '{message}'")  # Debugging: Log the received message

        # preprocess iterates over its argument, so a bare string must be wrapped.
        texts = [message] if isinstance(message, str) else message
        processed_features = preprocess(texts)
        prediction = model.predict(processed_features)
        output = {0: "Hate speech Detected", 1: "Offensive language", 2: "No hate speech detected"}
        print(prediction[0])

        # prediction[0] is an array of per-class scores, which cannot be used
        # as a dict key; the label is the index of the highest score.
        predicted_class = int(prediction[0].argmax())
        return render_template('index.html', prediction_text=f'Prediction: {output[predicted_class]}')
    except Exception as e:
        # Best-effort: report the failure in the page rather than crashing.
        print(f"Error during prediction: {e}")
        return render_template('index.html', prediction_text="Error: Unable to process the request.")

In [25]:
# Build the model input for a sample sentence. predict() returns rendered HTML,
# not token sequences, so the cleaned and tokenized input comes from preprocess().
tokenized = preprocess(["hello himanshu my name is deepak"])
# Re-padding is kept so `x` always has the fixed input length of 26.
x = pad_sequences(tokenized, maxlen=26)
x

Received message: '['hello himanshu my name is deepak']'


array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0, 1198,  172]], dtype=int32)

In [29]:
import numpy as np

# Score the sample input, then take the index of the largest value in the
# (flattened) output as the predicted class id.
class_scores = model.predict(x)
y_pred = np.argmax(class_scores)
y_pred

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step


2