In [1]:
import torch
print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0))

True
NVIDIA GeForce RTX 4070 Ti SUPER


In [2]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("lauyon/quantifying-stereotype-roberta", device_map="cuda")
model = AutoModelForSequenceClassification.from_pretrained("lauyon/quantifying-stereotype-roberta", device_map="cuda")
model.eval()

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [3]:
import pandas as pd
import numpy as np
import math
from torch import Tensor

In [4]:
def padding(text, pad, max_len=50):
    return text if len(text) >= max_len else (text + [pad] * (max_len - len(text)))


def encode_batch(text, berts, max_len=50):
    tokenizer = berts[0]
    t1 = []
    for line in text:
        t1.append(padding(tokenizer.encode(line, add_special_tokens=True, max_length=max_len, truncation=True),
                          tokenizer.pad_token_id, max_len))
    return t1


def data_iterator(train_y, batch_size=64):
    n_batches = math.ceil(len(train_y) / batch_size)
    for idx in range(n_batches):
        y = train_y[idx * batch_size:(idx + 1) * batch_size]
        yield y

In [5]:
stereotype_df = pd.read_csv("../stereotype_scored.csv")

input_text = stereotype_df["sentence"].tolist()
i = 0
all_predictions = []
for y in data_iterator(train_y=input_text):
    print(str(i * 256) + '/' + str(len(input_text)))
    i += 1
    ids = encode_batch(y, (tokenizer, model))

    with torch.no_grad():
        if torch.cuda.is_available():
            input_ids = Tensor(ids).cuda().long()
        else:
            input_ids = Tensor(ids).long()
        outputs = model(input_ids)
        y_pred = outputs[0]

    predicted = y_pred.cpu().data
    all_predictions.extend(predicted.numpy())

all_res = np.array(all_predictions).flatten()

0/3953


We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


256/3953
512/3953
768/3953
1024/3953
1280/3953
1536/3953
1792/3953
2048/3953
2304/3953
2560/3953
2816/3953
3072/3953
3328/3953
3584/3953
3840/3953
4096/3953
4352/3953
4608/3953
4864/3953
5120/3953
5376/3953
5632/3953
5888/3953
6144/3953
6400/3953
6656/3953
6912/3953
7168/3953
7424/3953
7680/3953
7936/3953
8192/3953
8448/3953
8704/3953
8960/3953
9216/3953
9472/3953
9728/3953
9984/3953
10240/3953
10496/3953
10752/3953
11008/3953
11264/3953
11520/3953
11776/3953
12032/3953
12288/3953
12544/3953
12800/3953
13056/3953
13312/3953
13568/3953
13824/3953
14080/3953
14336/3953
14592/3953
14848/3953
15104/3953
15360/3953
15616/3953


In [6]:
all_res

array([ 0.14493437,  0.05288467, -0.02863716, ...,  0.2626121 ,
        0.30406514,  0.39996472], shape=(3953,), dtype=float32)

In [7]:
results_df = pd.read_csv("../stereotype_scored.csv")

results_df['roberta_score'] = all_res

results_df

Unnamed: 0,generalisation_category_label,connotation,gram_form,generalisation_situation,situation_evaluation,sentence,scsc_score,stereotype,roberta_score
0,generic_generic target,negative,noun,enduring characteristics_abstract,negative,Bangladesh people are lous and quarell they ar...,0.859226,True,0.144934
1,individual_specific target,neutral,noun,enduring characteristics_abstract,negative,My friend from bangladesh is very poor.,0.597453,False,0.052885
2,generic_generic target,neutral,noun,other_not-applicable,not-applicable,Most people in bangladesh are indian,0.747598,True,-0.028637
3,not-applicable_not-applicable,not-applicable,not-applicable,not-applicable_not-applicable,not-applicable,I think bangladesh is a poor country,0.470234,False,0.000372
4,subset_specific target,negative,pronoun,enduring characteristics_abstract,negative,They are from bangladesh so they must be stoners.,1.000000,True,0.161610
...,...,...,...,...,...,...,...,...,...
3948,subset_generic target,neutral,noun,enduring characteristics_abstract,negative,African immigrants are gorillas.,0.690731,True,0.323548
3949,generic_generic target,negative,noun,enduring characteristics_concrete,negative,"Immigrants are the worst, they are feral human...",0.790333,True,0.258124
3950,subset_generic target,neutral,noun,enduring characteristics_concrete,negative,"Get all the Pakistanis out of England, they're...",0.687516,True,0.262612
3951,generic_generic target,neutral,noun,enduring characteristics_concrete,negative,Jewish people are globalist elites orchestrati...,0.744423,True,0.304065


In [8]:
results_df['roberta_score'] = (results_df['roberta_score'] - results_df['roberta_score'].min()) / (results_df['roberta_score'].max() - results_df['roberta_score'].min())    

In [9]:
results_df.describe()

Unnamed: 0,scsc_score,roberta_score
count,3953.0,3953.0
mean,0.592901,0.656349
std,0.193902,0.211089
min,0.0,0.0
25%,0.452005,0.517786
50%,0.680905,0.700605
75%,0.747638,0.832161
max,1.0,1.0


In [11]:
score_df = results_df.drop(columns=[x for x in results_df.columns if x not in ['roberta_score', 'sentence', 'scsc_score']])

In [16]:
score_df.drop(columns=['sentence']).corr(method='pearson')

Unnamed: 0,scsc_score,roberta_score
scsc_score,1.0,0.705033
roberta_score,0.705033,1.0


In [17]:
results_df.to_csv("../stereotype_final.csv")