# Goals

To use previously fine-tuned pretrained RoBERTa models to infer the polarity and aspect of ICLR sentences and then to aggregate these up as sentence frequencies describing the full review. 

# Libraries

In [1]:
import pandas as pd
import os
import collections
import json
import random
import glob
import csv
from tqdm import tqdm
import numpy as np
import pickle
import multiprocessing as mp
import stanza
from transformers import RobertaTokenizer, RobertaForSequenceClassification, RobertaConfig
import torch
from torch.utils.data import DataLoader
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
torch.cuda.is_available()

True

# Globals

In [3]:
SENTENCIZE_PIPELINE = stanza.Pipeline("en", processors="tokenize")
Sentence = collections.namedtuple("Sentence", "interval text")
TOKENIZER = RobertaTokenizer.from_pretrained("roberta-base")
SEED = 1
random.seed(SEED)

2024-05-11 21:45:48 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json: 379kB [00:00, 48.1MB/s]                    
2024-05-11 21:45:49 INFO: Downloaded file to /home/jupyter/stanza_resources/resources.json
2024-05-11 21:45:49 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| mwt       | combined |

2024-05-11 21:45:49 INFO: Using device: cuda
2024-05-11 21:45:49 INFO: Loading: tokenize
2024-05-11 21:45:49 INFO: Loading: mwt
2024-05-11 21:45:49 INFO: Done loading processors!


In [4]:
DIR = os.path.dirname(os.getcwd()) + "/"
INT = DIR + "00_rawData/"
OUT = DIR + "03_labeledData/"

In [5]:
OUT

'/home/jupyter/sandbox/valsInICLR/03_labeledData/'

# Preprocessing and Prediction Functions

# Read in Data

In [6]:
iclr = pd.read_feather(INT+"iclr_2018_2024.feather")
iclr['year'].value_counts().sort_index()

year
2018     2784
2019     4332
2020     6721
2021    10022
2022    10210
2023    14359
2024    28028
Name: count, dtype: int64

In [7]:
iclr.columns

Index(['year', 'forum', 'reviewer', 'reviewer_id', 'review', 'rating',
       'decision', 'len'],
      dtype='object')

# Process Raw Data
Tokenize reviews into sentences, rendor as encoded tensors, and save as torch dataset.

In [8]:
def _chop_into_sentences(text):
    doc = SENTENCIZE_PIPELINE(text)
    sentences = []
    for sentence in doc.sentences:
        try:
            sentence_dict = sentence.to_dict()
            start = sentence_dict[0]["start_char"]
            end = sentence_dict[-1]["end_char"]
            sentences.append(Sentence((start, end), sentence.text))
        except Exception as e:
            # Uncomment to debug issues/errors:
            # print(f"Error tokenizing: {sentence.text}. Error: {e}")
            # print(sentence.text)
            pass
    return sentences


def tokenize_iclr_reviews(input_path):
    """
    Uses spacy's sentence pipeline to tokenize reviews
    into sentences. Returns df where row is review sentence
    """
    output_path = input_path.replace(".feather", "_sents.jsonl")
    reviews = pd.read_feather(input_path)
    with open(output_path, 'w') as out_f:
        for _, review in tqdm(reviews.iterrows(), total=reviews.shape[0], desc="Tokenizing Reviews"):
            review_sentences = _chop_into_sentences(review['review'])
            for i, sentence in enumerate(review_sentences):
                new_line = {
                    "ms_id": review['forum'],
                    "Reviewer_ID": review['reviewer_id'], 
                    "identifier": f"{review['reviewer_id']}|||{i}",
                    "sentence": f"{sentence.text}",
                }
                out_f.write(json.dumps(new_line)+"\n")

                    
def _process_batch(df):
    """
    Helper; creates encodings for a batched df.
    """
    df['text'] = df['sentence']
    text_list = list(df['text'])

    encodings = {'input_ids': [], 'attention_mask': [], 'identifier': list(df['identifier'])}
    for text in text_list:
        encoded = TOKENIZER(
            text, 
            truncation=True, 
            padding='max_length', 
            max_length=128
        )
        encodings['input_ids'].append(encoded['input_ids'])
        encodings['attention_mask'].append(encoded['attention_mask'])

    return encodings


def _worker(input_path, output_path, TOKENIZER, chunksize):
    """
    A bit convoluted. But it helps speed up encoding by 
    parallel processing using all CPUs.
    """
    reader = pd.read_json(input_path, orient="records", lines=True, chunksize=chunksize)
    all_encodings = {'input_ids': [], 'attention_mask': [], 'identifier': []}
    progress = tqdm(total=2225000)  # hard coded to be n sentences in ICLR
    with mp.Pool(mp.cpu_count()) as pool:  # backend CPU parallelization
        for encodings in pool.imap(_process_batch, [df for df in reader]):
            all_encodings['input_ids'].extend(encodings['input_ids'])
            all_encodings['attention_mask'].extend(encodings['attention_mask'])
            all_encodings['identifier'].extend(encodings['identifier'])
            progress.update(chunksize)
    progress.close()
    pd.DataFrame(all_encodings).to_feather(output_path)

        
def encode_iclr_sents(input_path, tokenizer, chunksize=5000):
    """
    `main` logic.
    """
    output_path = input_path.replace("sents.jsonl", "encodings.feather")
    _worker(input_path, output_path, TOKENIZER, chunksize)

In [9]:
# tokenize_iclr_reviews(input_path=INT+"iclr_2018_2024.feather")

Tokenizing Reviews: 100%|██████████| 76456/76456 [47:29<00:00, 26.83it/s]  


In [9]:
encode_iclr_sents(input_path=INT+"iclr_2018_2024_sents.jsonl", tokenizer=TOKENIZER)

 93%|█████████▎| 2225000/2401686 [01:42<00:08, 21754.55it/s]


# Predict Sentence Labels
For each task ("variable"), call in respective fine-tuned RoBERTa model and predict and write sentence labels to file.

In [10]:
def _load_model(model_dir):
    """
    Gets fine-tuned RoBERTa model
    """
    print("Loading trained model...")
     # Load the model configuration and create an instance
    config = RobertaConfig.from_pretrained(model_dir)
    model = RobertaForSequenceClassification.from_pretrained(model_dir, config=config)
    return model 


def _write_predictions_to_file(model, loader, variable, output_path):
    """
    Predicts labels in batches and writes them to csv batch-wise
    """
    with open(output_path, "w") as f, torch.cuda.amp.autocast():
        writer = csv.writer(f)
        writer.writerow(["identifier", f"{variable}_hat"])
        for i, batch in enumerate(tqdm(loader)):
            inputs = {k: v.to(model.device) for k, v in batch.items() if k in ['input_ids', 'attention_mask']}
            outputs = model(**inputs)
            predictions = outputs.logits.argmax(axis=-1).detach().cpu().numpy()
            identifiers = batch['identifier']
            for identifier, prediction in zip(identifiers, predictions):
                writer.writerow([identifier, prediction])
   
    
def predict_iclr_sentence_labels(input_path, model_dir, variable):
    """
    `main` fucntion
    """
    output_path = input_path.replace("_encodings.feather", f"_{variable}_predictions.csv")
        
    # use GPU
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = _load_model(model_dir).to(DEVICE)
    model.eval()

    # load torch dataset
    dataset = pd.read_feather(input_path)
    dataset.to_dict(orient='list')
    dataset = Dataset.from_dict(dataset)
    dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'identifier'])
    
    # predict + write
    with torch.no_grad():
        loader = DataLoader(dataset, batch_size=64, num_workers=8, pin_memory=True)
        print(f"Predicting {variable}...")
        _write_predictions_to_file(model, loader, variable, output_path)

In [11]:
# {'task': model_prefix}
# we'll use the task to get the right model dir
# we'll use the prefix to get the best preforming model
model_map = {
    "value": 8000, # prefix ~ n examples trained on
    "polarity": 16000,
}
torch.set_grad_enabled(False)
for variable, prefix in model_map.items():
    model_dir = glob.glob(f"/home/jupyter/01_predictions/judgments/{variable}/n_{prefix}*")[0]
    predict_iclr_sentence_labels(INT+"iclr_2018_2024_encodings.feather", model_dir, variable)

Loading trained model...
Predicting value...


100%|██████████| 34713/34713 [21:20<00:00, 27.10it/s]


Loading trained model...
Predicting polarity...


100%|██████████| 34713/34713 [21:22<00:00, 27.07it/s]


# Aggregate
Take disparate csvs with sentence-level predictions, combine them on the sentence level (aspect + polarity), then aggregate up to review level as sentence frequencies.

In [17]:
def _combine_labels():
    label_map = {
        "polarity": {
            0: "None",
            1: "(+)",
            2: "(–)",
        },
        "value": {
            0: "None",
            1: "Clarity", # "clr",
            2: "Consistency", # "mng",
            3: "Novelty", # "org",
            4: "Thoroughness", # "subs",
            5: "Accuracy", # "snd",
            6: "Replicability", # "rep"
        }
    }
    
    predictions_dct = {}
    for variable in "polarity value".split():
        predictions_dct[variable] = pd.read_csv(INT+f"iclr_2018_2024_{variable}_predictions.csv")
        predictions_dct[variable][f"{variable}_hat"] = predictions_dct[variable][f"{variable}_hat"].apply(lambda prediction: label_map[variable][prediction])
        predictions_dct[variable].drop_duplicates(subset="identifier", inplace=True)
        
        
    # merge two variables into one df on identifier
    predicitions_df = predictions_dct['value'].merge(predictions_dct['polarity'],                                                  
                                                     how="left", 
                                                     left_on="identifier", 
                                                     right_on="identifier", 
                                                     validate="one_to_one")

    # impose no aspect when sentence is non-evaluative
    predicitions_df['value_hat'] = np.where(predicitions_df['polarity_hat']=="None", "None", predicitions_df['value_hat'])
    predicitions_df['polarity_hat'] = np.where(predicitions_df['value_hat']=="None", "None", predicitions_df['polarity_hat'])

    # combine labels: "Value (polarity)"
    predicitions_df['value_judgment'] = predicitions_df['value_hat'] + " " + predicitions_df['polarity_hat']

    # get review id, the aggregate key
    predicitions_df['reviewer_id'] = predicitions_df['identifier'].apply(lambda identifier: identifier.split("|||")[0])
    return predicitions_df


def aggregate():
    """
    Finally aggregates the review sentence-level predictions
    into a review-wise frequency array of value judgments. 
    """
   
    df = _combine_labels()
    output_path = OUT+"iclr_2018_2024_labeled.jsonl"
    with open(output_path, "w") as f:
        for review_id, review_df in df.groupby("reviewer_id"):
            review_dct = review_df['value_judgment'].value_counts().to_dict()
            review_dct["reviewer_id"] = review_id
            f.write(json.dumps(review_dct)+"\n")

    df = pd.read_json(output_path, orient="records", lines=True)
    df['ms_id'] = df['reviewer_id'].apply(lambda x: x.split("&&")[0])
    # na does not mean missing but no sents!
    df = df.fillna(0)
    cols = [col for col in df.columns if "(" in col]
    print(df['ms_id'].nunique(),"n manuscripts", sep="\t")
    print(df['reviewer_id'].nunique(), "n reviews", sep="\t")
    print(pd.DataFrame(df[cols].describe()).T.round(2).sort_index())
    print()
    df.to_json(output_path, orient="records", lines=True)

In [18]:
aggregate()

20827	n manuscripts
76453	n reviews
                     count  mean   std  min  25%  50%  75%   max
Accuracy (+)       76453.0  0.77  1.02  0.0  0.0  0.0  1.0   9.0
Accuracy (–)       76453.0  2.50  2.50  0.0  1.0  2.0  4.0  34.0
Clarity (+)        76453.0  0.64  0.86  0.0  0.0  0.0  1.0  12.0
Clarity (–)        76453.0  2.14  2.89  0.0  0.0  1.0  3.0  48.0
Consistency (+)    76453.0  0.17  0.44  0.0  0.0  0.0  0.0   5.0
Consistency (–)    76453.0  0.88  1.29  0.0  0.0  0.0  1.0  19.0
Novelty (+)        76453.0  1.57  1.60  0.0  0.0  1.0  2.0  14.0
Novelty (–)        76453.0  1.26  1.55  0.0  0.0  1.0  2.0  17.0
Replicability (+)  76453.0  0.08  0.31  0.0  0.0  0.0  0.0   7.0
Replicability (–)  76453.0  0.40  0.82  0.0  0.0  0.0  1.0  15.0
Thoroughness (+)   76453.0  0.68  0.92  0.0  0.0  0.0  1.0  11.0
Thoroughness (–)   76453.0  3.50  3.06  0.0  1.0  3.0  5.0  51.0



In [21]:
iclr_labeled = pd.read_json(OUT+"iclr_2018_2024_labeled.jsonl", orient="records", lines=True)

# Merge Labels with Text Data

In [22]:
iclr = iclr.drop_duplicates('reviewer_id')

In [23]:
iclr_labeled = iclr_labeled.merge(iclr, how="left", on=["reviewer_id"], validate="one_to_one")

In [25]:
labels = [col for col in iclr_labeled if "(" in col or "None" in col]
old_cols = list(iclr.columns)
iclr_labeled = iclr_labeled[old_cols+labels]

In [27]:
print(iclr_labeled.sort_values(by="Clarity (–)", ascending=False)["reviewer_id"].iloc[0])
print(iclr_labeled.sort_values(by="Clarity (–)", ascending=False)["Clarity (–)"].iloc[0])
print(iclr_labeled.sort_values(by="Clarity (–)", ascending=False)['review'].iloc[0])

B1e7hs05Km&&2
48
Update after feedback: I would like to thank the authors for huge work done on improving the paper. I appreciate the tight time constrains given during the discussion phase and big steps towards more clear paper, but at the current stage I keep my opinion that the paper is not ready for publication. Also variability of concerns raised by other reviewers does not motivate acceptance.

I would like to encourage the authors to make careful revision and I would be happy to see this work published. It looks very promising. 

Just an example of still unclear parts of the paper: the text between eq. (3) and (4). This describes the proposed method, together with theoretical discussions this is the main part of the paper. As a reader I would appreciate this part being written detailed, step by step.

The paper proposes the Bayesian version of DQN (by replacing the last layer with Bayesian linear regression) for efficient exploration. 

The paper looks very promising because of 

In [28]:
iclr_labeled.to_feather(OUT+"iclr_labeled.feather")