In [None]:
from google.colab import drive

# Attach Google Drive; the model and dataset paths used below live under it
MOUNT_POINT = '/content/drive'
drive.mount(MOUNT_POINT)

Mounted at /content/drive


In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset, random_split
from torch.optim import Adam
from torch.nn import BCEWithLogitsLoss
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import classification_report
import numpy as np
from tqdm import tqdm
from tensorflow.keras.preprocessing.text import Tokenizer
import re
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd

In [None]:
from tensorflow.keras.models import load_model

# Restore the three saved Keras text classifiers from Drive, in the order
# stacked Bi-LSTM, stacked Bi-GRU, parallel Bi-LSTM/Bi-GRU (per the file names).
model1, model2, model3 = (
    load_model(saved_model_path)
    for saved_model_path in (
        "/content/drive/My Drive/stacked-bi-lstm.h5",
        "/content/drive/My Drive/stacked-bi-gru.h5",
        "/content/drive/My Drive/parallel-bi-lstm-bi-gru.h5",
    )
)


In [None]:
import json
from tensorflow.keras.models import model_from_json

# Read the serialized architecture (JSON text) from Drive
with open("/content/drive/My Drive/clip_model_nn.json", "r") as architecture_file:
    loaded_model_json = architecture_file.read()

# Rebuild the network from its JSON description, then restore the weights
# that were saved separately in the matching .h5 file.
loaded_model = model_from_json(loaded_model_json)
loaded_model.load_weights("/content/drive/My Drive/clip_model_nn.h5")


In [None]:
def preprocess_text(text):
    """Normalize a caption to lowercase ASCII letters and spaces.

    Every character outside A-Z / a-z is replaced by a single space (runs are
    not collapsed), and the result is lowercased.

    Args:
        text: The raw string to clean.

    Returns:
        str: The cleaned string, same length as the input.
    """
    letters_only = re.sub(r'[^A-Za-z]', ' ', text)
    return letters_only.lower()
# Keras word-index tokenizer used to encode text for model1/model2/model3.
# NOTE(review): no fit_on_texts() call or saved-vocabulary load is visible in
# this notebook. Unless that happens elsewhere, the vocabulary is empty and
# texts_to_sequences() presumably yields empty sequences, i.e. the Keras models
# would only ever see padding. Confirm the training-time tokenizer is restored.
tokenizer = Tokenizer()

In [None]:
import pandas as pd
import json
import os

# Load dataset from JSONL file
def load_dataset(file_path):
    """Read a JSON Lines file into a list of decoded records.

    Args:
        file_path: Path to a file holding one JSON document per line.

    Returns:
        list: The decoded value of every non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode as UTF-8 explicitly so non-ASCII captions do not depend on the
    # platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        # Blank lines (e.g. a trailing newline at end of file) carry no record
        # and would make json.loads raise, so they are skipped.
        return [json.loads(line) for line in file if line.strip()]

# Where the training annotations and the image root live on Drive
train_file = '/content/drive/My Drive/Hateful-memes/data/train.jsonl'
image_directory = '/content/drive/My Drive/Hateful-memes/data/'

train_dataset = load_dataset(train_file)

# Use the whole training split
chosen_examples = train_dataset

# One flat record per example: a positional id, the caption, the image path
# resolved against the image root, and the label.
data = [
    {
        'id': idx,
        'img_text': example['text'],
        'img_path': os.path.join(image_directory, example['img']),
        'label': example['label'],
    }
    for idx, example in enumerate(chosen_examples)
]

df = pd.DataFrame(data)

# Preview the first rows
print(df.head())

   id                                           img_text  \
0   0   its their character not their color that matters   
1   1  don't be afraid to love again everyone is not ...   
2   2                           putting bows on your pet   
3   3  i love everything and everybody! except for sq...   
4   4  everybody loves chocolate chip cookies, even h...   

                                            img_path  label  
0  /content/drive/My Drive/Hateful-memes/data/img...      0  
1  /content/drive/My Drive/Hateful-memes/data/img...      0  
2  /content/drive/My Drive/Hateful-memes/data/img...      0  
3  /content/drive/My Drive/Hateful-memes/data/img...      0  
4  /content/drive/My Drive/Hateful-memes/data/img...      0  


In [None]:
# Load the BERT model
btokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# from_pretrained builds a fresh 6-way classification head on top of the base
# encoder; load_state_dict below replaces all weights with the checkpoint's.
bert_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=6)  # Change the number of labels accordingly
# map_location="cpu" lets a checkpoint that was saved from GPU tensors load on
# a CPU-only runtime as well; the model is moved to the chosen device later.
bert_model.load_state_dict(torch.load("/content/drive/MyDrive/bert_hs.pth", map_location="cpu"))
# Inference mode (disables dropout); as the last expression it also displays
# the model summary.
bert_model.eval()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
btokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Prefer the GPU when CUDA is usable, otherwise fall back to the CPU
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

# Move the classifier's weights to the selected device
bert_model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
# Ensemble weights: BERT takes 0.7 of the vote and the remaining 0.3 is
# shared equally between the three Keras models.
bert_weightage, remaining_weightage = 0.7, 0.3
individual_weightage = remaining_weightage / 3


In [None]:
import random

# Evaluate the weighted ensemble on a reproducible 20% sample of the captions,
# scoring the "identity_hate" output against the binary label of each row.
IDENTITY_HATE_INDEX = 5   # position of "identity_hate" in the 6 model outputs
DECISION_THRESHOLD = 0.5  # score at or above which a caption counts as hateful

# Randomly select 20% of the entries from the DataFrame
sample_size = int(len(df) * 0.2)
sample_df = df.sample(n=sample_size, random_state=42)

# Initialize variables for accuracy calculation
correct_predictions = 0
total_samples = len(sample_df)

# Loop-invariant setup: switch BERT to inference mode once, and send inputs to
# whichever device the model's weights currently live on.
bert_model.eval()
bert_device = next(bert_model.parameters()).device

# NOTE(review): `tokenizer` must carry the vocabulary the Keras models were
# trained with; no fit/load step is visible in this notebook - confirm.

# Iterate over the sampled DataFrame
for index, row in sample_df.iterrows():
    text = row['img_text']
    label = row['label']

    # Preprocess the text
    preprocessed_text = preprocess_text(text)

    # Request the attention mask together with the ids: the input is padded to
    # max_length, and without the mask BERT would attend to the [PAD] positions.
    encoded = btokenizer(
        preprocessed_text,
        add_special_tokens=True,
        max_length=128,
        truncation=True,
        padding='max_length',
        return_tensors='pt',
    )
    input_ids = encoded['input_ids'].to(bert_device)
    attention_mask = encoded['attention_mask'].to(bert_device)

    # The three Keras models consume the same padded sequence; build it once.
    padded_sequence = pad_sequences(
        tokenizer.texts_to_sequences([preprocessed_text]),
        maxlen=128,
        padding='post',
        truncating='post',
    )

    # Make predictions using the ensemble of models
    with torch.no_grad():
        output = bert_model(input_ids, attention_mask=attention_mask)
        bert_predictions = torch.sigmoid(output.logits).cpu().numpy()

    # verbose=0 silences the per-call progress bar that otherwise floods the
    # cell output with thousands of lines.
    model1_predictions = model1.predict(padded_sequence, verbose=0)
    model2_predictions = model2.predict(padded_sequence, verbose=0)
    model3_predictions = model3.predict(padded_sequence, verbose=0)

    # Combine predictions from all models using ensemble weights
    ensemble_predictions = (
        bert_weightage * bert_predictions +
        individual_weightage * model1_predictions +
        individual_weightage * model2_predictions +
        individual_weightage * model3_predictions
    )

    # Get the "identity_hate" prediction
    identity_hate_prediction = ensemble_predictions[0][IDENTITY_HATE_INDEX]

    # Compare with the label in the DataFrame
    if identity_hate_prediction >= DECISION_THRESHOLD and label == 1:
        correct_predictions += 1
    elif identity_hate_prediction < DECISION_THRESHOLD and label == 0:
        correct_predictions += 1

# Calculate accuracy (an empty sample reports 0 instead of dividing by zero)
accuracy = correct_predictions / total_samples * 100 if total_samples else 0.0

print(f"Accuracy: {accuracy:.2f}%")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Accuracy: 65.65%


In [None]:
import random

# Same evaluation as the accuracy-only run, but additionally prints the
# combined scores, the caption, the "identity_hate" score and the label for
# every sampled row so individual predictions can be inspected.
IDENTITY_HATE_INDEX = 5   # position of "identity_hate" in the 6 model outputs
DECISION_THRESHOLD = 0.5  # score at or above which a caption counts as hateful

# Randomly select 20% of the entries from the DataFrame
sample_size = int(len(df) * 0.2)
sample_df = df.sample(n=sample_size, random_state=42)

# Initialize variables for accuracy calculation
correct_predictions = 0
total_samples = len(sample_df)

# Loop-invariant setup: switch BERT to inference mode once, and send inputs to
# whichever device the model's weights currently live on.
bert_model.eval()
bert_device = next(bert_model.parameters()).device

# NOTE(review): `tokenizer` must carry the vocabulary the Keras models were
# trained with; no fit/load step is visible in this notebook - confirm.

# Iterate over the sampled DataFrame
for index, row in sample_df.iterrows():
    text = row['img_text']
    label = row['label']

    # Preprocess the text
    preprocessed_text = preprocess_text(text)

    # Request the attention mask together with the ids: the input is padded to
    # max_length, and without the mask BERT would attend to the [PAD] positions.
    encoded = btokenizer(
        preprocessed_text,
        add_special_tokens=True,
        max_length=128,
        truncation=True,
        padding='max_length',
        return_tensors='pt',
    )
    input_ids = encoded['input_ids'].to(bert_device)
    attention_mask = encoded['attention_mask'].to(bert_device)

    # The three Keras models consume the same padded sequence; build it once.
    padded_sequence = pad_sequences(
        tokenizer.texts_to_sequences([preprocessed_text]),
        maxlen=128,
        padding='post',
        truncating='post',
    )

    # Make predictions using the ensemble of models
    with torch.no_grad():
        output = bert_model(input_ids, attention_mask=attention_mask)
        bert_predictions = torch.sigmoid(output.logits).cpu().numpy()

    # verbose=0 silences the per-call progress bar so the per-sample prints
    # below are not buried in progress output.
    model1_predictions = model1.predict(padded_sequence, verbose=0)
    model2_predictions = model2.predict(padded_sequence, verbose=0)
    model3_predictions = model3.predict(padded_sequence, verbose=0)

    # Combine predictions from all models using ensemble weights
    ensemble_predictions = (
        bert_weightage * bert_predictions +
        individual_weightage * model1_predictions +
        individual_weightage * model2_predictions +
        individual_weightage * model3_predictions
    )

    # Get the "identity_hate" prediction
    identity_hate_prediction = ensemble_predictions[0][IDENTITY_HATE_INDEX]

    # Per-sample trace: all six combined scores, then caption / score / label
    print(ensemble_predictions)
    print(text,identity_hate_prediction,label)

    # Compare with the label in the DataFrame
    if identity_hate_prediction >= DECISION_THRESHOLD and label == 1:
        correct_predictions += 1
    elif identity_hate_prediction < DECISION_THRESHOLD and label == 0:
        correct_predictions += 1

# Calculate accuracy (an empty sample reports 0 instead of dividing by zero)
accuracy = correct_predictions / total_samples * 100 if total_samples else 0.0

print(f"Accuracy: {accuracy:.2f}%")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[[0.48257184 0.00326586 0.00945078 0.01303552 0.01588709 0.29928184]]
i'm not racist, racisim is a crime. and crime is for black people 0.29928184 1
[[0.6568234  0.00855101 0.5750095  0.00065807 0.42091084 0.02725489]]
why are there so many female archeologist? because bitches love digging up the past 0.02725489 0
[[0.06536586 0.00028    0.00644115 0.00026953 0.00237836 0.0008503 ]]
i'm starting to understand both the appeal of and the demand for sex robots 0.00085029955 0
[[0.63384545 0.00820547 0.2828629  0.00152879 0.09388088 0.13302241]]
how do you play taliban bingo? b-52..f-16...b-1..a-10 0.13302241 1
[[0.01849454 0.00023068 0.00534264 0.0003662  0.00224072 0.00036242]]
the stocks were a public torture device that immobilized the feet of the punished person 0.00036242444 0
[[0.01259931 0.00021119 0.00529661 0.00017731 0.00216381 0.00027729]]
and that was the last nativity play my son was invited to take part in 0.00