In [1]:
import torch
print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0))

True
NVIDIA GeForce RTX 4070 Ti SUPER


In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("lauyon/quantifying-stereotype-roberta", device_map="cuda")
model = AutoModelForSequenceClassification.from_pretrained("lauyon/quantifying-stereotype-roberta", device_map="cuda")
model.eval()

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [22]:
import pandas as pd
import numpy as np
import math
from torch import Tensor

In [25]:
def padding(text, pad, max_len=50):
    return text if len(text) >= max_len else (text + [pad] * (max_len - len(text)))


def encode_batch(text, berts, max_len=50):
    tokenizer = berts[0]
    t1 = []
    for line in text:
        t1.append(padding(tokenizer.encode(line, add_special_tokens=True, max_length=max_len, truncation=True),
                          tokenizer.pad_token_id, max_len))
    return t1


def data_iterator(train_y, batch_size=64):
    n_batches = math.ceil(len(train_y) / batch_size)
    for idx in range(n_batches):
        y = train_y[idx * batch_size:(idx + 1) * batch_size]
        yield y

In [26]:
stereotype_df = pd.read_csv("../replication/stereotypes.csv")

input_text = stereotype_df["text"].tolist()
i = 0
all_predictions = []
for y in data_iterator(train_y=input_text):
    print(str(i * 256) + '/' + str(len(input_text)))
    i += 1
    ids = encode_batch(y, (tokenizer, model))

    with torch.no_grad():
        if torch.cuda.is_available():
            input_ids = Tensor(ids).cuda().long()
        else:
            input_ids = Tensor(ids).long()
        outputs = model(input_ids)
        y_pred = outputs[0]

    predicted = y_pred.cpu().data
    all_predictions.extend(predicted.numpy())

all_res = np.array(all_predictions).flatten()

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


0/5473
256/5473
512/5473
768/5473
1024/5473
1280/5473
1536/5473
1792/5473
2048/5473
2304/5473
2560/5473
2816/5473
3072/5473
3328/5473
3584/5473
3840/5473
4096/5473
4352/5473
4608/5473
4864/5473
5120/5473
5376/5473
5632/5473
5888/5473
6144/5473
6400/5473
6656/5473
6912/5473
7168/5473
7424/5473
7680/5473
7936/5473
8192/5473
8448/5473
8704/5473
8960/5473
9216/5473
9472/5473
9728/5473
9984/5473
10240/5473
10496/5473
10752/5473
11008/5473
11264/5473
11520/5473
11776/5473
12032/5473
12288/5473
12544/5473
12800/5473
13056/5473
13312/5473
13568/5473
13824/5473
14080/5473
14336/5473
14592/5473
14848/5473
15104/5473
15360/5473
15616/5473
15872/5473
16128/5473
16384/5473
16640/5473
16896/5473
17152/5473
17408/5473
17664/5473
17920/5473
18176/5473
18432/5473
18688/5473
18944/5473
19200/5473
19456/5473
19712/5473
19968/5473
20224/5473
20480/5473
20736/5473
20992/5473
21248/5473
21504/5473
21760/5473


In [27]:
all_res

array([ 0.07403408, -0.01353621,  0.15799588, ...,  0.26261213,
        0.3040651 ,  0.39996445], shape=(5473,), dtype=float32)

In [30]:
results_df = pd.read_csv("../replication/stereotypes.csv")

results_df['scsc'] = all_res

results_df

Unnamed: 0,text,scsc
0,Some IT work has been outsourced to workers in...,0.074034
1,Bangladesh exports a lot of garments. You figu...,-0.013536
2,Bangladesh is a country few people know about....,0.157996
3,Bangladesh was once part of India. The people ...,-0.014883
4,I went to Bangladesh last week. Everyone there...,0.078818
...,...,...
5468,African immigrants are gorillas.,0.323548
5469,"Immigrants are the worst, they are feral human...",0.258125
5470,"Get all the Pakistanis out of England, they're...",0.262612
5471,Jewish people are globalist elites orchestrati...,0.304065


In [32]:
results_df['scsc'] = (results_df['scsc'] - results_df['scsc'].min()) / (results_df['scsc'].max() - results_df['scsc'].min())    

In [33]:
results_df.describe()

Unnamed: 0,scsc
count,5473.0
mean,0.590452
std,0.231157
min,0.0
25%,0.411775
50%,0.623142
75%,0.791565
max,1.0


In [34]:
results_df.sample(10)

Unnamed: 0,text,scsc
3667,Gay people are creepy.,0.895371
4375,we expect that nonbinary people are source of ...,0.780669
1555,Somalia is a very poor country.,0.6298
1843,She found herself excelling in psychology beca...,0.263802
3561,we expect that lesbians are just a porn category.,0.851715
2410,My dog is a male labrador He has a penis and i...,0.757313
4636,"Jihad, is a violent time that endangers lives ...",0.621698
3678,all Gay people are not real men.,0.87112
5317,The UK government has implemented anti-discrim...,0.260117
1771,His Hispanic background was a source of pride....,0.792938


In [35]:
results_df.to_csv("stereotype_scsc.csv")