## Setup

In [1]:
!pip install transformers

Defaulting to user installation because normal site-packages is not writeable
Collecting transformers
  Downloading transformers-4.38.2-py3-none-any.whl (8.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.5/8.5 MB[0m [31m27.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting tqdm>=4.27
  Downloading tqdm-4.66.2-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.3/78.3 kB[0m [31m25.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.19.3
  Downloading huggingface_hub-0.21.3-py3-none-any.whl (346 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m346.2/346.2 kB[0m [31m53.7 MB/s[0m eta [36m0:00:00[0m
Collecting filelock
  Downloading filelock-3.13.1-py3-none-any.whl (11 kB)
Collecting safetensors>=0.4.1
  Downloading safetensors-0.4.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl (1.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m 

In [2]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

In [3]:
device = torch.device("cpu")
device

device(type='cpu')

In [4]:
tokenizer_name = "bert-base-uncased"
model_name = "chreh/bert-discrimination-classifier"

In [5]:
class HateDetector(torch.nn.Module):
    def __init__(self):
        super(HateDetector, self).__init__()
        self.text_model = (
            AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
            .to(device, dtype=torch.float32)
            .train()
        )
        self.output_func = torch.nn.Softmax(dim=-1)

    def forward(self, tokens: torch.Tensor, token_attention_mask: torch.Tensor):
        return self.output_func(
            self.text_model(tokens, attention_mask=token_attention_mask).logits
        )

In [6]:
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
model = HateDetector()

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/678 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [9]:
model.load_state_dict(torch.load("./shared/out.pt"))

<All keys matched successfully>

## Inference Utils

In [10]:
import numpy as np

# 0 - Hate / Offensive, 1 - Neither
MEANINGS = ["Offensive Language", "Neither"]


def get_scores(prompt) -> np.array:
    tokenized_inputs = tokenizer(prompt, return_tensors="pt")
    with torch.no_grad():
        scores = model(
            tokenized_inputs.input_ids.to(device),
            tokenized_inputs.attention_mask.to(device),
        )
    return scores[0].detach().cpu().numpy()


def get_raw_score_meanings(scores: np.ndarray) -> str:
    return ", ".join(
        map(lambda m, score: m + f": {int(score*100)}%", MEANINGS, scores)
    ).strip(", ")


def get_processed_score_meanings(scores: np.ndarray) -> str:
    if scores[0] < 0.5:
        return "This is not a hateful post."
    else:
        return "This is a hateful post."


def pipeline(prompt):
    scores = get_scores(prompt)
    print("Raw scores:")
    print(get_raw_score_meanings(scores))
    print()
    print(get_processed_score_meanings(scores))

## Inference

In [11]:
# check to make sure nothing is nan (a sign of failed endian-conversion)
torch.isnan(model.text_model.bert.embeddings.LayerNorm.weight).nonzero()

tensor([], size=(0, 1), dtype=torch.int64)

In [12]:
prompt = "I love my school."

In [13]:
pipeline(prompt)

Raw scores:
Offensive Language: 18%, Neither: 81%

This is not a hateful post.


In [16]:
pipeline("I truly hate Indian People")

Raw scores:
Offensive Language: 62%, Neither: 37%

This is a hateful post.
