In [None]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

from src.utils import ROOT_DIR
import sys
sys.path.append(ROOT_DIR)

from src.dataset import CustomDataset
from src.fcm import FCM
from src.fcm import train_epoch, eval_epoch, acc, f1
import torch

import matplotlib.pyplot as plt
from PIL import Image
import requests
import torch
from src.dataset import CustomDataset
from torch.utils.data import DataLoader
from torchvision import transforms
import pandas as pd
import os

In [None]:
# Load the full dataset (one row per sample, identified by the "index" column).
DATASET_PATH = os.path.join(ROOT_DIR, 'data', 'MMHS150K', 'MMHS150K_with_img_text.csv')
df = pd.read_csv(DATASET_PATH)

# Load the saved predictions and raw outputs. Both JSON files are read as
# dictionaries whose keys are the sample index written as a string.
PREDICTIONS_PATH = os.path.join(ROOT_DIR,'data', 'results','FCM', 'fcm_predictions.json')
OUTPUTS_PATH = os.path.join(ROOT_DIR, 'data','results','FCM', 'fcm_outputs.json')

import json
with open(PREDICTIONS_PATH, 'r') as f:
    predictions = json.load(f)
with open(OUTPUTS_PATH, 'r') as f:
    outputs = json.load(f)

# Keep only the rows that have a saved prediction.
pred_keys = [int(k) for k in predictions.keys()]

# .copy() makes df_test an independent frame: adding columns to a boolean-mask
# slice of df would otherwise raise SettingWithCopyWarning and leave it unclear
# whether df itself is being modified.
df_test = df[df["index"].isin(pred_keys)].copy()
df_test["pred"] = [predictions[str(k)] for k in df_test["index"]]
df_test["output"] = [outputs[str(k)] for k in df_test["index"]]

In [None]:
# Score the saved predictions ("pred") against the reference labels ("binary_hate").
y_true = df_test["binary_hate"]
y_pred = df_test["pred"]

acc_score = acc(y_true, y_pred)
f1_score = f1(y_true, y_pred)

print(f"Accuracy: {acc_score}")
print(f"F1 score: {f1_score}")

In [None]:
# ROC curve of the raw model outputs against the "binary_hate" labels.
from sklearn.metrics import roc_curve, auc
import numpy as np

fpr, tpr, _ = roc_curve(df_test["binary_hate"], df_test["output"])
roc_auc = auc(fpr, tpr)

lw = 2
roc_fig, roc_ax = plt.subplots()

# Model curve, annotated with its area under the curve.
roc_ax.plot(
    fpr,
    tpr,
    color='darkorange',
    lw=lw,
    label='ROC curve FCM (area = %0.2f)' % roc_auc,
)
# Diagonal reference line for a random classifier.
roc_ax.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--', label='Random')

roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC curve')
roc_ax.legend(loc="lower right")
plt.show()

In [None]:
# Side-by-side histograms: raw outputs on the left, saved predictions on the right.
fig, ax = plt.subplots(1, 2, figsize=(15, 5))

# (column in df_test, panel title, x-axis label) for each panel, left to right.
panels = [
    ("output", "Distribution of the outputs", "Output"),
    ("pred", "Distribution of the predictions", "Prediction"),
]
for panel_ax, (column, panel_title, x_label) in zip(ax, panels):
    panel_ax.hist(df_test[column], bins=50)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel(x_label)
    panel_ax.set_ylabel("Count")

plt.show()

In [None]:
# Confusion matrix of the saved predictions, plus per-class recall.
from sklearn.metrics import confusion_matrix
import seaborn as sns

cm = confusion_matrix(df_test["binary_hate"], df_test["pred"])

# Rows of cm are the true labels and columns the predicted labels
# (matching the axis labels on the heatmap below).
true_neg, false_pos = cm[0, 0], cm[0, 1]
false_neg, true_pos = cm[1, 0], cm[1, 1]

sensitivity = true_pos / (true_pos + false_neg)
specificity = true_neg / (true_neg + false_pos)
print("True positive rate: ", sensitivity)
print("True negative rate: ", specificity)

sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted labels')
plt.ylabel('True labels')
plt.title('Confusion Matrix')
plt.show()

In [None]:
# Preview the first rows of the full dataset frame (df, not the df_test subset).
df.head()

In [None]:
import matplotlib.pyplot as plt

def plot_meme(df, img_dir, index):
    """Show one meme image, titled with its label and cleaned tweet text.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with "index", "tweet_text_clean" and "binary_hate" columns.
    img_dir : str
        Directory holding the images, each stored as "<index>.jpg".
    index : int
        Value of the "index" column identifying the sample to display.

    Raises
    ------
    IndexError
        If no row of ``df`` has the requested ``index``.
    """
    row = df[df["index"] == index]
    text = row["tweet_text_clean"].values[0]
    label = row["binary_hate"].values[0]

    # Open the file in a context manager so its handle is released right after
    # imshow has consumed the image, rather than lingering until garbage
    # collection (this function is called in loops over many images).
    with Image.open(os.path.join(img_dir, str(index) + ".jpg")) as image:
        plt.imshow(image)
    plt.title(f"Label: {label}, Text: {text}")
    plt.axis("off")
    plt.show()
    

In [None]:
IMG_DIR = os.path.join(ROOT_DIR, 'data', 'MMHS150K', "img_resized")
TOP_K = 10  # number of samples to show at each extreme of the output ranking

# Rank the test samples by model output: the highest outputs are treated as the
# most hateful, the lowest as the least hateful. df_test is only read here;
# sorting needs no prior re-assignment of its columns, so the frame is left
# unmodified and the cell can be re-run safely.
top_10_hateful = df_test.sort_values(by="output", ascending=False).head(TOP_K)
top_10_least_hateful = df_test.sort_values(by="output", ascending=True).head(TOP_K)

for index in top_10_hateful["index"]:
    plot_meme(df_test, IMG_DIR, index)

In [None]:
# Display the memes with the lowest model outputs, lowest first.
for meme_index in top_10_least_hateful["index"]:
    plot_meme(df=df_test, img_dir=IMG_DIR, index=meme_index)