In [1]:
%pip install torch
%pip install transformers

import torch
from transformers import RobertaTokenizer, RobertaModel, RobertaForMaskedLM
import logging
import pandas as pd
import numpy as np
logging.basicConfig(level=logging.INFO)


properties_df = pd.read_csv('../../data/experiment2/roberta_large_property_probabilities_blank.csv')
properties_df = properties_df.fillna("")
properties_df

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


Unnamed: 0,model,property,category,question,answer,probability
0,roberta_large,purifies water,purpose,An animal that,,
1,roberta_large,enables decomposition,purpose,An animal that,,
2,roberta_large,makes honey,purpose,An animal that,,
3,roberta_large,aerates soil,purpose,An animal that,,
4,roberta_large,pointy ears,biological,An animal that,,
5,roberta_large,long legs,biological,An animal that,,
6,roberta_large,hair,biological,An animal that,,
7,roberta_large,warm blooded,biological,An animal that,,
8,roberta_large,jump,behavior,An animal that,,
9,roberta_large,swim,behavior,An animal that,,


In [2]:

tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
model = RobertaForMaskedLM.from_pretrained('roberta-large')
model.eval()

def predict_masked_sent_roberta(text, top_k=5):

    '''
    input: text, top_k = how many top probable words
    output: tuple ([str(word)], [float(weight)])
    '''
    # Tokenize input
    tokenized_text = tokenizer.tokenize(tokenizer.cls_token + text + tokenizer.sep_token)
    print(tokenized_text)
    masked_index = tokenized_text.index("<mask>")
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    tokens_tensor = torch.tensor([indexed_tokens])

    # Predict all tokens
    with torch.no_grad():
        outputs = model(tokens_tensor)
        predictions = outputs[0]

    probs = torch.nn.functional.softmax(predictions[0, masked_index], dim=-1)
    top_k_weights, top_k_indices = torch.topk(probs, top_k, sorted=True)

    token = []
    weight = []
    for i, pred_idx in enumerate(top_k_indices):
        predicted_token = tokenizer.convert_ids_to_tokens([pred_idx])[0]
        token_weight = top_k_weights[i].item()
        
        # this is specific to roberta so hard-coded, for bert just token.append(predicted_token)
        token.append(predicted_token[1:])
        weight.append(token_weight)

    return token, weight

Downloading: 100%|██████████| 878k/878k [00:00<00:00, 3.78MB/s]
Downloading: 100%|██████████| 446k/446k [00:00<00:00, 2.52MB/s]
Downloading: 100%|██████████| 482/482 [00:00<00:00, 168kB/s]
Downloading: 100%|██████████| 1.33G/1.33G [00:43<00:00, 33.0MB/s]


In [3]:
result_rows = []

for i, row in properties_df.iterrows():
    prompt = f"{row['question']} {row['property']} is a <mask>."
    answers, probabilities = predict_masked_sent_roberta(prompt, top_k=5)
 

    # Loop through each answer and create a new row with its corresponding probability
    for j in range(len(answers)):
        result_row = {
            'model': row['model'],
            'property': row['property'],
            'category': row['category'],
            'question': row['question'],
            'answer': answers[j],
            'probability': probabilities[j]
        }
       # Append the result row to the list of result rows
        result_rows.append(result_row)

result_df = pd.DataFrame(result_rows)

print(result_df)

# prompt = "...<mask>..."
# predict_masked_sent_roberta(prompt)

['<s>', 'An', 'Ġanimal', 'Ġthat', 'Ġ', 'Ġpur', 'ifies', 'Ġwater', 'Ġis', 'Ġa', '<mask>', '.', '</s>']
['<s>', 'An', 'Ġanimal', 'Ġthat', 'Ġ', 'Ġenables', 'Ġdecom', 'position', 'Ġis', 'Ġa', '<mask>', '.', '</s>']
['<s>', 'An', 'Ġanimal', 'Ġthat', 'Ġ', 'Ġmakes', 'Ġhoney', 'Ġis', 'Ġa', '<mask>', '.', '</s>']
['<s>', 'An', 'Ġanimal', 'Ġthat', 'Ġ', 'Ġaer', 'ates', 'Ġsoil', 'Ġis', 'Ġa', '<mask>', '.', '</s>']
['<s>', 'An', 'Ġanimal', 'Ġthat', 'Ġ', 'Ġpoint', 'y', 'Ġears', 'Ġis', 'Ġa', '<mask>', '.', '</s>']
['<s>', 'An', 'Ġanimal', 'Ġthat', 'Ġ', 'Ġlong', 'Ġlegs', 'Ġis', 'Ġa', '<mask>', '.', '</s>']
['<s>', 'An', 'Ġanimal', 'Ġthat', 'Ġ', 'Ġhair', 'Ġis', 'Ġa', '<mask>', '.', '</s>']
['<s>', 'An', 'Ġanimal', 'Ġthat', 'Ġ', 'Ġwarm', 'Ġblood', 'ed', 'Ġis', 'Ġa', '<mask>', '.', '</s>']
['<s>', 'An', 'Ġanimal', 'Ġthat', 'Ġ', 'Ġjump', 'Ġis', 'Ġa', '<mask>', '.', '</s>']
['<s>', 'An', 'Ġanimal', 'Ġthat', 'Ġ', 'Ġswim', 'Ġis', 'Ġa', '<mask>', '.', '</s>']
['<s>', 'An', 'Ġanimal', 'Ġthat', 'Ġ', 'Ġchew', 'Ġ

In [4]:
result_df.to_csv('../../data/experiment2/roberta_large_property_probabilities.csv', index=False)