## Setup

In [1]:
!pip install transformers

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [1]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

In [2]:
# Run everything on the CPU; the bare `device` expression echoes the choice
# as the cell output.
device = torch.device("cpu")
device

device(type='cpu')

In [3]:
# Name passed to AutoTokenizer.from_pretrained.
tokenizer_name = "bert-base-uncased"
# Name passed to AutoModelForSequenceClassification.from_pretrained.
model_name = "chreh/bert-discrimination-classifier"

In [4]:
class HateDetector(torch.nn.Module):
    """Two-label text classifier built on a pretrained sequence-classification model.

    The wrapped model is loaded from ``model_name`` and moved to ``device`` as
    float32; its logits are turned into probabilities by a softmax over the
    last dimension.
    """

    def __init__(self):
        super().__init__()
        self.text_model = AutoModelForSequenceClassification.from_pretrained(
            model_name, num_labels=2
        ).to(device, dtype=torch.float32)
        self.output_func = torch.nn.Softmax(dim=-1)
        # Start in evaluation mode so dropout is disabled and the same input
        # always yields the same scores. Call .train() explicitly before any
        # fine-tuning.
        self.eval()

    def forward(self, tokens: torch.Tensor, token_attention_mask: torch.Tensor):
        """Return label probabilities for tokenized text.

        Args:
            tokens: Token ids, passed to the wrapped model as its first argument.
            token_attention_mask: Mask forwarded to the wrapped model as
                ``attention_mask``.

        Returns:
            The wrapped model's logits after a softmax over the last dimension.
        """
        logits = self.text_model(tokens, attention_mask=token_attention_mask).logits
        return self.output_func(logits)

In [5]:
# Load the tokenizer named by `tokenizer_name` and build the classifier
# (HateDetector loads its wrapped model from `model_name` in __init__).
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
model = HateDetector()

In [6]:
# Load the saved state dict from ./out.pt. map_location remaps the stored
# tensors onto `device`, so a checkpoint saved from another device (e.g. a
# GPU) still loads here.
# NOTE(review): torch.load unpickles the file - only load checkpoints you trust.
model.load_state_dict(torch.load("./out.pt", map_location=device))

<All keys matched successfully>

## Inference Utils

In [7]:
import numpy as np

# Display label for each class index of the score vector:
# 0 - Hate / Offensive, 1 - Neither
MEANINGS = ["Offensive Language", "Neither"]


def get_scores(prompt) -> np.ndarray:
    """Score a single prompt with the global ``model``.

    Args:
        prompt: Text to classify; it is passed straight to ``tokenizer``.

    Returns:
        NumPy array holding the first row of the model output.

    Note:
        The model is switched to evaluation mode and left that way.
    """
    # truncation=True caps the input at the tokenizer's maximum length so a
    # very long prompt cannot overflow the model's position embeddings.
    tokenized_inputs = tokenizer(prompt, return_tensors="pt", truncation=True)

    # Dropout must be off for reproducible scores; the model may have been
    # left in training mode.
    model.eval()
    with torch.no_grad():
        scores = model(
            tokenized_inputs.input_ids.to(device),
            tokenized_inputs.attention_mask.to(device),
        )
    return scores[0].detach().cpu().numpy()


def get_raw_score_meanings(scores: np.ndarray) -> str:
    """Render each score next to its label as a whole-number percentage.

    Labels come from ``MEANINGS`` and are paired with ``scores`` by position;
    pairing stops at the shorter of the two. Percentages are truncated by
    ``int`` rather than rounded.
    """
    parts = []
    for meaning, score in zip(MEANINGS, scores):
        parts.append(meaning + f": {int(score*100)}%")
    return ", ".join(parts).strip(", ")


def get_processed_score_meanings(scores: np.ndarray) -> str:
    """Turn the score vector into a one-sentence verdict.

    Only ``scores[0]`` is consulted: a value strictly below 0.5 yields the
    "not hateful" sentence, anything else yields the "hateful" sentence.
    """
    below_threshold = scores[0] < 0.5
    return (
        "This is not a hateful post."
        if below_threshold
        else "This is a hateful post."
    )


def pipeline(prompt):
    """Classify ``prompt`` and print a short report.

    The report is the "Raw scores:" header, the per-label percentages, a
    blank line, and the one-sentence verdict. Nothing is returned.
    """
    scores = get_scores(prompt)
    print("Raw scores:", get_raw_score_meanings(scores), sep="\n")
    print("", get_processed_score_meanings(scores), sep="\n")

## Inference

In [8]:
# Sanity check: list the indices of any NaN entries in the embedding LayerNorm
# weights (NaNs are a sign of failed endian-conversion). An empty result means
# no NaNs were found in that tensor.
torch.isnan(model.text_model.bert.embeddings.LayerNorm.weight).nonzero()

tensor([], size=(0, 1), dtype=torch.int64)

In [9]:
# Example prompt for the first demo run.
prompt = "I love my school."

In [10]:
# Score the example prompt and print the raw scores plus the verdict.
pipeline(prompt)

Raw scores:
Offensive Language: 12%, Neither: 87%

This is not a hateful post.


In [11]:
# Second demo run, this time on a prompt containing offensive language.
pipeline("Them bad bitches be looking realy tasty today")

Raw scores:
Offensive Language: 99%, Neither: 0%

This is a hateful post.
