In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import re

# First dataset: All memes

In [None]:
DATASET_PATH = 'MMHS150K/MMHS150K.csv'

# Load the full table from disk.
dataset = pd.read_csv(DATASET_PATH)

# Display names for the integer label ids parsed from the 'labels' column.
label_dict = {0: 'No hate speech', 1: 'Racist', 2: 'Sexist', 3: 'Homophobe', 4: 'Religion', 5: 'Other hate'}

# Is there text on the image? True wherever 'img_text' is not missing.
# Series.notna() is the vectorized form of negating isna() one element at a time.
dataset['text_in_image'] = dataset['img_text'].notna()

display(dataset)

In [None]:
# Count how many times each label id occurs over the whole dataset.
# Each 'labels' cell is a string whose first and last characters are dropped
# (presumably the enclosing brackets) and whose content is a comma-separated
# list of integer ids. The column is parsed with vectorized string operations
# instead of a slow Python-level iterrows() loop.
label_ids = (
    dataset['labels']
    .str[1:-1]        # drop the first and last character
    .str.split(',')   # one list of id strings per row
    .explode()        # one id string per entry
    .str.strip()      # tolerate spaces around the ids
    .astype(int)
)

# np.bincount tallies every id; minlength keeps one slot for each of the six
# classes even if one never occurs. The cast keeps the float dtype that the
# previous np.zeros(6) accumulator had.
label_count = np.bincount(label_ids.to_numpy(), minlength=6).astype(float)

In [None]:
# Pie chart of the per-label counts stored in label_count
class_names = label_dict.values()

fig, ax = plt.subplots()
ax.set_title('Number of each label in the dataset')
ax.pie(label_count, labels=class_names, autopct='%1.1f%%')
ax.axis('equal')  # same scale on both axes so the pie is drawn as a circle
plt.show()

Because the classes are very unbalanced, we will work with a binary label: Hate or No Hate.

We still have 3 labels per tweet, however. Do we use a majority vote to decide the label of the tweet, or do we treat the task as a multi-label classification problem?

In [None]:
# Left panel: count of each 'hate_speech' value; right panel: count of each 'binary_hate' value
fig, ax = plt.subplots(1, 2, figsize=(12, 6))

sns.countplot(x='hate_speech', data=dataset, ax=ax[0])
ax[0].set_title('Number of each hate_speech value')
# Sort the unique values numerically BEFORE formatting them. Sorting the
# formatted strings is lexicographic and would misorder negative or
# multi-digit values relative to the bars, which are expected to appear in
# ascending numeric order (NOTE(review): confirm against seaborn's ordering).
hate_speech_values = sorted(dataset['hate_speech'].unique())
ax[0].set_xticklabels([f'{value:.2f}' for value in hate_speech_values])

sns.countplot(x='binary_hate', data=dataset, ax=ax[1])
ax[1].set_title('Number of each binary_hate value')
plt.show()

If we compute the average of the 3 labels, we get a value between 0 and 1. However, as seen in the previous plots, the average takes essentially only 4 values (0, 0.33, 0.67 and 1), and these are not balanced at all. We therefore decide to use a majority vote to determine the label of each tweet.

Let's check the distribution in the split datasets.

In [None]:
# Report the total number of rows, then the number of rows in each split
split_column = dataset['split']

print('Size of the dataset:', len(dataset))
print('Size of the training set:', int((split_column == 'train').sum()))
print('Size of the validation set:', int((split_column == 'val').sum()))
print('Size of the test set:', int((split_column == 'test').sum()))

In [None]:
# Slice the dataset by its 'split' column; the three frames are reused by later cells
split_column = dataset['split']
train = dataset[split_column == 'train']
test = dataset[split_column == 'test']
val = dataset[split_column == 'val']

# One binary_hate count plot per split, side by side
fig, ax = plt.subplots(1, 3, figsize=(18, 6))
panels = (('Train', train), ('Test', test), ('Validation', val))
for axis, (panel_title, subset) in zip(ax, panels):
    sns.countplot(x='binary_hate', data=subset, ax=axis)
    axis.set_title(panel_title)
plt.show()

We will have to rebalance the training set, or work with only a part of it.

Let's look at the presence of text in the images.

In [None]:
# A row counts as "text on the image" when its img_text value is not NaN
total_tweets = len(dataset)
with_text = dataset['img_text'].count()  # count() ignores missing values

print('Number of tweets with text on the image:', with_text)
print('Number of tweets without text on the image:', total_tweets - with_text)
print(f'Percentage of tweets with text on the image: {with_text / total_tweets * 100:.2f}%')

In [None]:
# 3x3 grid of text_in_image count plots:
# rows are the splits (train / test / validation),
# columns are the subsets (binary_hate == 0, binary_hate == 1, all rows).
fig, ax = plt.subplots(3, 3, figsize=(18, 12))

splits = (('Train', train), ('Test', test), ('Validation', val))
for row_axes, (split_name, split_df) in zip(ax, splits):
    hate_flag = split_df['binary_hate']
    groups = (
        ('No hate', split_df[hate_flag == 0]),
        ('Hate', split_df[hate_flag == 1]),
        ('All', split_df),
    )
    for axis, (group_name, group_df) in zip(row_axes, groups):
        sns.countplot(x='text_in_image', data=group_df, ax=axis)
        axis.set_title(f'{split_name} - {group_name}')

plt.tight_layout()
plt.show()

In [None]:
# Percentage of rows with a non-missing img_text, laid out like the grid of
# count plots: rows = train / test / validation, columns = no hate / hate / all.
percentage_img_text = np.zeros((3, 3))

for row_idx, split_df in enumerate((train, test, val)):
    hate_flag = split_df['binary_hate']
    groups = (split_df[hate_flag == 0], split_df[hate_flag == 1], split_df)
    for col_idx, group_df in enumerate(groups):
        # count() skips NaN, so this is the share of rows that have img_text
        percentage_img_text[row_idx, col_idx] = group_df['img_text'].count() / len(group_df) * 100

# Render the 3x3 matrix as an annotated heatmap
fig, ax = plt.subplots()
sns.heatmap(
    percentage_img_text,
    annot=True,
    xticklabels=['No hate', 'Hate', 'All'],
    yticklabels=['Train', 'Test', 'Validation'],
    ax=ax,
)
ax.set_title('Percentage of tweets with text on the image')
plt.show()

# Second dataset: keeping only 'real' memes

We will keep only the memes that have text in them.

In [None]:
# Load the second dataset from its own CSV file (per the section text, the
# tweets that have text on the image).
# Note: this rebinds DATASET_PATH, which previously pointed at the full CSV.
DATASET_PATH = 'MMHS150K/MMHS150K_text_in_image.csv'
dataset2 = pd.read_csv(DATASET_PATH)

In [None]:
# Count how many times each label id occurs in the second dataset.
# Each 'labels' cell is a string whose first and last characters are dropped
# (presumably the enclosing brackets) and whose content is a comma-separated
# list of integer ids. The column is parsed with vectorized string operations
# instead of a slow Python-level iterrows() loop.
label_ids = (
    dataset2['labels']
    .str[1:-1]        # drop the first and last character
    .str.split(',')   # one list of id strings per row
    .explode()        # one id string per entry
    .str.strip()      # tolerate spaces around the ids
    .astype(int)
)

# np.bincount tallies every id; minlength keeps one slot for each of the six
# classes even if one never occurs. The cast keeps the float dtype that the
# previous np.zeros(6) accumulator had.
label_count = np.bincount(label_ids.to_numpy(), minlength=6).astype(float)

# Pie chart of the per-label counts
fig, ax = plt.subplots()
ax.pie(label_count, labels=label_dict.values(), autopct='%1.1f%%')
ax.axis('equal')  # same scale on both axes so the pie is drawn as a circle
ax.set_title('Number of each label in the dataset')
plt.show()

In [None]:
# Count of each binary_hate value in the second dataset
fig, ax = plt.subplots()
sns.countplot(data=dataset2, x='binary_hate', ax=ax)
ax.set_title('Number of each binary_hate value for the new dataset')
plt.show()

In [None]:
# Overlay, on a single axes, the character-length histogram of
# tweet_text_clean for each binary_hate value
fig, ax = plt.subplots()

classes = ((0, 'blue', 'No hate'), (1, 'red', 'Hate'))
for flag_value, colour, legend_name in classes:
    tweet_lengths = dataset2.loc[dataset2['binary_hate'] == flag_value, 'tweet_text_clean'].str.len()
    sns.histplot(tweet_lengths, ax=ax, color=colour, label=legend_name)

ax.set_title('Distribution of length of tweet_text for each label binary_hate')
ax.legend()
plt.show()

In [None]:
# Number of whitespace-separated words in img_text for every row.
# The vectorized .str accessor propagates missing values instead of raising
# AttributeError the way x.split() does on NaN; such rows are counted as 0 words.
dataset2['nb_words_img_text'] = (
    dataset2['img_text'].str.split().str.len().fillna(0).astype(int)
)

# Compute the cut-off once and reuse it for both the report and the filter.
nb_words_p95 = dataset2['nb_words_img_text'].quantile(0.95)
print("95th percentile of number of words in img_text:", nb_words_p95)

# Keep only the rows strictly below the 95th percentile, so the longest
# texts do not dominate the box plot.
dataset3 = dataset2[dataset2['nb_words_img_text'] < nb_words_p95]

# Box plot of number of words in img_text for each label binary_hate
fig, ax = plt.subplots()
sns.boxplot(x='binary_hate', y='nb_words_img_text', data=dataset3, ax=ax)
ax.set_title('Number of words in img_text for each label binary_hate')
plt.show()

In [None]:
# Show the filtered frame (rows of dataset2 below the 95th percentile of nb_words_img_text)
display(dataset3)