## Run pre-trained model on NIST

In [None]:
from __future__ import annotations

from pprint import pprint

import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Pick the compute device. The notebook was written for the second GPU
# ('cuda:1'); keep that whenever it exists, otherwise fall back to the first
# GPU or the CPU so that the later `model.to(device)` does not fail on hosts
# with fewer devices.
if torch.cuda.is_available():
    device = torch.device('cuda:1' if torch.cuda.device_count() > 1 else 'cuda:0')
else:
    device = torch.device('cpu')

In [None]:
# NOTE(review): the tokenizer is loaded from a different checkpoint than the
# model weights -- presumably the base model the classifier was fine-tuned
# from; confirm the vocabularies match.
tokenizer = AutoTokenizer.from_pretrained('cross-encoder/ms-marco-MiniLM-L-12-v2')

# Load the sequence-classification weights, move them to the selected device
# and switch to evaluation mode.
model = (
    AutoModelForSequenceClassification
    .from_pretrained('climabench/miniLM-cdp-all')
    .to(device)
    .eval()
)

# Inference only: turn off autograd globally for the rest of the notebook.
torch.set_grad_enabled(False)

In [None]:
# Distinct values of the 'Question Name' column, in order of first appearance.
# Only that column is parsed from the CSV.
all_qs = pd.read_csv(
    'CDP/Cities/Cities Responses/combined.csv',
    usecols=['Question Name'],
    low_memory=False,
)['Question Name'].unique().tolist()

num_qs = len(all_qs)
print(num_qs)

# Reverse lookup: question text -> its position in `all_qs`.
q_to_index = dict(zip(all_qs, range(num_qs)))

In [None]:
def process_output(response: str, outputs, true_q: str | None = None,
                   questions: list[str] | None = None, top_k: int = 5) -> None:
    """Print a response together with its highest-scoring candidate questions.

    Args:
        response: The passage/response text that was scored.
        outputs: Tensor of raw logits, one per candidate question. A 0-dim
            tensor (a single candidate after ``squeeze()``) is accepted too.
        true_q: Ground-truth question; printed only when provided.
        questions: Candidate questions aligned index-for-index with
            ``outputs``. Defaults to the module-level ``all_qs``.
        top_k: Maximum number of candidates to print (default 5).
    """
    if questions is None:
        questions = all_qs

    # Callers squeeze the logits, so a single candidate arrives as a 0-dim
    # tensor; flatten so sorting and indexing work uniformly.
    flat = outputs.reshape(-1)
    top_ids = flat.argsort(descending=True)[:top_k]
    top_scores = torch.sigmoid(flat[top_ids]).tolist()

    # Keep (score, question) pairs instead of a dict keyed by the rounded
    # score: two candidates with the same rounded probability (e.g. both
    # saturating at 1.0) would otherwise overwrite each other and vanish.
    ranked = [
        (round(score, 4), questions[idx])
        for score, idx in zip(top_scores, top_ids.tolist())
    ]

    print(f'Response:\t{response}')
    if true_q is not None:
        print(f'True Q:\t\t{true_q}')
    pprint(ranked, width=500)

In [None]:
# The file is read without a header row; each value in its first column is
# treated as one passage to score.
passages = pd.read_csv('nist/AlamedaCA_carp_final_091119.csv', header=None).iloc[:, 0].to_list()

In [None]:
# Score every passage against the full list of candidate questions: each
# passage is paired with all questions in a single batch.
tensor_keys = ('input_ids', 'attention_mask', 'token_type_ids')
for passage in passages:
    repeated_passage = [passage] * num_qs
    batch = tokenizer(
        all_qs,
        repeated_passage,
        padding='longest',
        truncation=True,
        max_length=512,
        return_token_type_ids=True,
        return_tensors='pt',
    )
    # Move the encoded inputs onto the same device as the model.
    for key in tensor_keys:
        batch[key] = batch[key].to(device, non_blocking=True)
    logits = model(**batch).logits
    # squeeze() drops size-1 dims -- presumably the model emits one logit
    # per (question, passage) pair; TODO confirm.
    outputs = logits.cpu().squeeze()
    print('============================')
    process_output(passage, outputs)