In [3]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import load_model
import pickle

In [4]:
# Read the test comments and their reference labels from sibling CSV files.
df_test, df_labels = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/test.csv", "../data/test_labels.csv")
)

In [5]:
# Restore the tokenizer object that was pickled to disk.
# NOTE: unpickling executes code embedded in the file, so only load a
# tokenizer.pkl that comes from a trusted source.
tokenizer_path = "../models/tokenizer.pkl"
with open(tokenizer_path, "rb") as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

In [6]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Replace missing comments with empty strings so every row is a string
# before it reaches the tokenizer.
test_texts = df_test['comment_text'].fillna("")
X_test_seq = tokenizer.texts_to_sequences(test_texts)

# Bring every sequence to a fixed length of 150 tokens
# (presumably the input length the model was trained with -- confirm).
X_test = pad_sequences(X_test_seq, maxlen=150)

In [7]:
# Restore the saved Keras model from its .h5 file.
checkpoint_path = "../models/best_model.h5"
model = load_model(checkpoint_path)



In [8]:
# Score the full padded test matrix with the loaded model.
y_pred = model.predict(x=X_test)

[1m4787/4787[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 12ms/step


In [9]:
# Turn scores into hard 0/1 decisions: strictly above 0.5 counts as positive.
y_pred_binary = np.greater(y_pred, 0.5).astype(int)

In [10]:
# Label columns, in the order used for the ground-truth matrix below.
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Keep only rows whose 'toxic' label is not -1 (presumably -1 marks rows that
# were never scored -- confirm against the dataset description).
is_scored = (df_labels['toxic'] != -1).to_numpy()

# Predictions are matched to labels purely by row position, so both must have
# the same number of rows or the metrics would be silently misaligned.
if len(is_scored) != len(y_pred_binary):
    raise ValueError(
        f"Row count mismatch: {len(is_scored)} label rows vs "
        f"{len(y_pred_binary)} prediction rows"
    )

# Kept for backward compatibility: index labels of the rows being evaluated.
valid_rows = df_labels.index[is_scored]

# Select with the positional boolean mask instead of index labels: a NumPy
# array can only be indexed by pandas labels when the index is the default
# 0..n-1 range, whereas the mask is correct for any index.
y_true = df_labels.loc[is_scored, label_cols].values
y_eval = y_pred_binary[is_scored]

In [11]:
from sklearn.metrics import classification_report

# Per-label precision / recall / F1 for the evaluated rows.
# zero_division=0 reports ill-defined scores (no predicted or no true
# positives) as 0.0 -- the same value the default produces -- but does so
# explicitly, without emitting an UndefinedMetricWarning for each case.
print(classification_report(
    y_true,
    y_eval,
    target_names=[
        "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"
    ],
    zero_division=0,
))


               precision    recall  f1-score   support

        toxic       0.58      0.72      0.64      6090
 severe_toxic       0.36      0.11      0.17       367
      obscene       0.61      0.71      0.66      3691
       threat       0.00      0.00      0.00       211
       insult       0.59      0.50      0.54      3427
identity_hate       0.89      0.01      0.02       712

    micro avg       0.59      0.61      0.60     14498
    macro avg       0.51      0.34      0.34     14498
 weighted avg       0.59      0.61      0.57     14498
  samples avg       0.06      0.06      0.06     14498



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [14]:
# Hand-written probe comments for a quick sanity check of the loaded model.
comments = [
    "You are a disgusting person and a disgrace.",
    "I love your work. Thank you!",
    "I'm going to find you and hurt you.",
]

# Same preprocessing as the test set: tokenize, then pad to 150 tokens.
seq = tokenizer.texts_to_sequences(comments)
padded = pad_sequences(seq, maxlen=150)
preds = model.predict(padded)

# One score per label, in this order, for each comment.
label_names = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
for row, comment in enumerate(comments):
    print(f"\nComment: {comment}")
    for label, prob in zip(label_names, preds[row]):
        print(f"{label:15}: {prob:.2f}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step

Comment: You are a disgusting person and a disgrace.
toxic          : 0.66
severe_toxic   : 0.10
obscene        : 0.44
threat         : 0.04
insult         : 0.42
identity_hate  : 0.06

Comment: I love your work. Thank you!
toxic          : 0.10
severe_toxic   : 0.00
obscene        : 0.03
threat         : 0.00
insult         : 0.03
identity_hate  : 0.00

Comment: I'm going to find you and hurt you.
toxic          : 0.07
severe_toxic   : 0.00
obscene        : 0.01
threat         : 0.00
insult         : 0.01
identity_hate  : 0.00


In [15]:
# Write the loaded model back out under its final file name.
final_model_path = "../models/final_model.h5"
model.save(final_model_path)

# Pickle the tokenizer alongside the model file.
import pickle
final_tokenizer_path = "../models/final_tokenizer.pkl"
with open(final_tokenizer_path, "wb") as tokenizer_out:
    pickle.dump(tokenizer, tokenizer_out)


