In [14]:
import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle
import numpy as np

In [15]:
model=load_model('model.keras')

In [16]:
import json
with open('config.json', 'r') as f:
    config = json.load(f)

max_len = config['max_len']
class_labels = config['class_labels']

print(max_len, class_labels)

81 {'0': 'hate', '1': 'offensive', '2': 'neither'}


In [17]:
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

In [18]:
from symspellpy.symspellpy import SymSpell, Verbosity

from symspellpy.symspellpy import SymSpell, Verbosity

# Initialize SymSpell object
sym_spell = SymSpell(max_dictionary_edit_distance=2)

# Load the frequency dictionary
dictionary_path = r"C:\Users\91781\OneDrive\Desktop\MLProjects\datasets\fdsymspelly.txt"
sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)

def correct_text(text):
    corrected_words = []
    for word in text.split():
        # Skip emoji tokens
        if word.startswith("EMOJI_"):
            corrected_words.append(word)
        else:
            suggestions = sym_spell.lookup_compound(word, max_edit_distance=2)
            if suggestions:
                corrected_words.append(suggestions[0].term)
            else:
                corrected_words.append(word)
    return " ".join(corrected_words)



In [19]:
import re
import emoji
import html
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

def clean_text_data(text):
    # Lowercase
    text = text.lower()

    # Convert HTML entities
    text = html.unescape(text)

    # Replace emojis with EMOJI_<name> tokens
    text = emoji.demojize(text)  # ":face_with_tears_of_joy:" etc.
    text = re.sub(r":([a-zA-Z0-9_]+):", r" EMOJI_\1 ", text)  # Preserve as single token

    # Remove HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()

    # Remove emails and URLs
    text = re.sub(r"(https?://\S+|www\.\S+|\S+@\S+\.\S+)", " ", text)

    # Remove mentions and RT
    text = re.sub(r"(@\w+|rt)", " ", text)

    # Remove hashtags (optional)
    text = re.sub(r"#", "", text)

    # Correct spellings
    text = correct_text(text)

    # Remove punctuation except underscore (EMOJI tokens use underscore)
    text = re.sub(r"[^a-zA-Z0-9_\s]", " ", text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = text.split()
    words = [w for w in words if w not in stop_words]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(w) for w in words]

    # Rejoin
    return " ".join(words)



In [20]:
tweets = [
    # Hate
    "I can't stand those people, they ruin everything!",
    "These losers should just disappear from the internet forever, total trash",
    "People like them make me sick, they should be erased from society.",
    "you're fucking gay, blacklisted hoe"" Holding out for #TehGodClan anyway http://t.co/xUCcwoetmn",
    "I hate those idiots 😡 they ruin everything!"
    "People like him should just disappear from this world 💀",
    "I can't stand those losers 😤, they make me sick!",
    "These trash humans have no place here 👎",
    "I despise everyone who supports that garbage 🤮",
    
    # Offensive
    "fuck no that bitch dont even suck dick 😆😆😆 the Kermit videos bout to fuck IG up",
    "got ya bitch tip toeing on my hardwood floors 😆 http://t.co/cOU2WQ5L4q",
    "fuck that guy, he can rot in hell 😒",
    
    # Neither
    "Going for a walk in the park today, feeling so relaxed and happy",
    "I just watched a movie and it was amazing, highly recommend it to everyone"
]


In [21]:
preprocessed_tweets = [clean_text_data(tweet) for tweet in tweets]
preprocessed_tweets

['stand people ruin everything',
 'loser disappear internet forever total trash',
 'people like make sick erased society',
 'fucking gay blacklisted hoe holding ocean anyway',
 'hate idiot EMOJI_enraged_face ruin everything people like disappear world EMOJI_skull',
 'stand loser EMOJI_face_with_steam_from_nose make sick',
 'trash human place EMOJI_thumbs_down',
 'despise everyone supp garbage EMOJI_face_vomiting',
 'fuck bitch done even suck dick EMOJI_grinning_squinting_face EMOJI_grinning_squinting_face EMOJI_grinning_squinting_face kermit video bout fuck',
 'got bitch tip toeing hardwood floor EMOJI_grinning_squinting_face',
 'fuck guy rot hell EMOJI_unamused_face',
 'going walk park today feeling relaxed happy',
 'watched movie amazing highly recommend everyone']

In [22]:
sequences = tokenizer.texts_to_sequences(preprocessed_tweets)
sequences

[[350, 44, 2844, 312],
 [1307, 6925, 995, 744, 1463, 14],
 [44, 4, 24, 416, 10564, 3301],
 [15, 167, 7579, 2, 1667, 2043, 723],
 [38, 1301, 489, 2844, 312, 44, 4, 6925, 227, 163],
 [350, 1307, 680, 24, 416],
 [14, 938, 380, 1241],
 [6054, 301, 458, 1126],
 [8, 1, 43, 49, 188, 58, 1825, 1825, 1825, 4609, 369, 61, 8],
 [10, 1, 884, 4610, 7566, 1051, 1825],
 [8, 105, 5119, 183, 100],
 [91, 356, 736, 124, 300, 6785, 149],
 [1326, 422, 1319, 2671, 4326, 301]]

In [23]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
input_sequences=np.array(pad_sequences(sequences,maxlen=max_len,padding='pre'))
input_sequences

array([[    0,     0,     0, ...,    44,  2844,   312],
       [    0,     0,     0, ...,   744,  1463,    14],
       [    0,     0,     0, ...,   416, 10564,  3301],
       ...,
       [    0,     0,     0, ...,  5119,   183,   100],
       [    0,     0,     0, ...,   300,  6785,   149],
       [    0,     0,     0, ...,  2671,  4326,   301]])

In [24]:
pred=model.predict(input_sequences)
pred_classes=np.argmax(pred,axis=1)
pred_labels = [class_labels[str(i)] for i in pred_classes]

pred

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 751ms/step


array([[3.8190141e-02, 6.5621728e-01, 3.0559263e-01],
       [2.4761833e-02, 8.7388575e-02, 8.8784957e-01],
       [9.3405806e-02, 3.2226607e-01, 5.8432811e-01],
       [3.8746339e-01, 6.0858566e-01, 3.9508999e-03],
       [8.1408903e-02, 7.1803296e-01, 2.0055816e-01],
       [5.9920214e-02, 6.4313376e-01, 2.9694602e-01],
       [1.5731899e-01, 2.1676639e-01, 6.2591463e-01],
       [7.2147250e-02, 7.6640093e-01, 1.6145182e-01],
       [1.6826520e-02, 9.8317242e-01, 1.0813905e-06],
       [8.3327359e-03, 9.9148059e-01, 1.8666331e-04],
       [7.1245737e-02, 9.1828722e-01, 1.0467026e-02],
       [7.1846410e-03, 2.0224769e-01, 7.9056770e-01],
       [2.3608976e-03, 2.7406814e-02, 9.7023231e-01]], dtype=float32)

In [25]:
for tweet, label in zip(tweets, pred_labels):
    print(f"Tweet: {tweet}")
    print(f"Predicted Label: {label}")
    print("-" * 50)


Tweet: I can't stand those people, they ruin everything!
Predicted Label: offensive
--------------------------------------------------
Tweet: These losers should just disappear from the internet forever, total trash
Predicted Label: neither
--------------------------------------------------
Tweet: People like them make me sick, they should be erased from society.
Predicted Label: neither
--------------------------------------------------
Tweet: you're fucking gay, blacklisted hoe Holding out for #TehGodClan anyway http://t.co/xUCcwoetmn
Predicted Label: offensive
--------------------------------------------------
Tweet: I hate those idiots 😡 they ruin everything!People like him should just disappear from this world 💀
Predicted Label: offensive
--------------------------------------------------
Tweet: I can't stand those losers 😤, they make me sick!
Predicted Label: offensive
--------------------------------------------------
Tweet: These trash humans have no place here 👎
Predicted Labe