In [3]:
import json
import time
import pickle

import os
import torch
import pandas as pd
from torch import nn

# from torchvision.datasets import CIFAR10
from torch.utils.data import DataLoader
from torchvision import transforms
from collections import OrderedDict
from accelerate import Accelerator

import seaborn as sns
import matplotlib.pylab as plt
import numpy as np

import importlib
import nltk

nltk.download('punkt')

# !nvidia-smi

ModuleNotFoundError: No module named 'torchvision'

# Installation

In [None]:
os.environ['CUDA_VISIBLE_DEVICES']='0'

In [2]:
# Install necessary packages

! pip install datasets
! pip install sentencepiece
! pip install rouge_score
! pip install wandb
! pip install bert-score
! pip install evaluate
! pip install transformers -U
! pip install bert-score
! pip install accelerate[torch]
! pip install bertviz

Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[K     |████████████████████████████████| 7.0 MB 5.6 MB/s eta 0:00:01
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.27.2
    Uninstalling transformers-4.27.2:
      Successfully uninstalled transformers-4.27.2
Successfully installed transformers-4.28.1
zsh:1: no matches found: accelerate[torch]
Collecting bertviz
  Downloading bertviz-1.4.0-py3-none-any.whl (157 kB)
[K     |████████████████████████████████| 157 kB 6.5 MB/s eta 0:00:01
Collecting boto3
  Downloading boto3-1.26.129-py3-none-any.whl (135 kB)
[K     |████████████████████████████████| 135 kB 13.3 MB/s eta 0:00:01
Collecting jmespath<2.0.0,>=0.7.1
  Downloading jmespath-1.0.1-py3-none-any.whl (20 kB)
Collecting s3transfer<0.7.0,>=0.6.0
  Downloading s3transfer-0.6.1-py3-none-any.whl (79 kB)
[K     |████████████████████████████████| 79 kB 14.0 MB/s eta 0:00:

# Configuration

In [4]:
from transformers import utils

utils.logging.set_verbosity_error()

In [23]:
# device = torch.device('cuda:0')
device = torch.device('cpu')

In [27]:
torch.device(device)

device(type='cpu')

In [28]:
workdir = './'

input_filename = 'all_preprocessed_data_with_citations_aspect_scores.csv'
# 'preprocessed_data_no_nans_1k.csv' 

# Full DataSet Size is 15000 (review, paper), but due to GPU cost + time constraints, we use a sub-sample of 5000 reviews
sample_size = 1000 

seed = 282

# Specify Extraction Method used to Generate Training Text from Papers
# Either: (intro, ce_extract, hybrid)
extraction_method = 'hybrid'

# Maximum Token Length of Paper Extracts Used To Train Model
max_paper_extract_length = 1024 # 1024 is the max input size of a BART model
max_review_length = 1024
min_text_length = 100

# Pre-Trained Hugging Face Seq2Seq Transformers Model
pre_trained_model_checkpoint = "facebook/bart-large-cnn"
ner_model_checkpoint = workdir + 'seqlab_final'

# Summarization Task Configuration
summarization_params = {
    "summarization": {
        "early_stopping": True,
        "length_penalty": 2.0, # BART (favor longer sequences)
        "max_length": max_review_length,
        "min_length": min_text_length,
        "no_repeat_ngram_size": 3, # BART default
        "num_beams": 4 # BART default
    }
}

# Dataset Preparation

In [29]:
from datasets import concatenate_datasets, DatasetDict, Dataset, load_dataset
from sklearn.model_selection import train_test_split

# Load input data (post-extraction and pre-processing to downsample paper text)
all_input_df = pd.read_csv(workdir + input_filename)

In [30]:
from transformers import AutoTokenizer

# Load Tokenizer used with the corresponding pre-trained models
ner_tokenizer = AutoTokenizer.from_pretrained(ner_model_checkpoint)

bart_tokenizer = AutoTokenizer.from_pretrained(pre_trained_model_checkpoint, add_prefix_space=True)

In [31]:
def normalize_mse_columns(df):
    for col in ['rating', 'logCitNum', 'confidence']:
        df[col] = (df[col] - df[col].mean()) / df[col].std()
        
    return df

In [32]:
with open(workdir + 'seqlab_final/config.json', 'r') as infile:
    config_js = json.load(infile)

all_input_df = all_input_df[~all_input_df['ce_extract'].isna()]
all_input_df['tk_labels'] = all_input_df['tk_labels'].apply(eval)
all_input_df['tokens'] = all_input_df['tokens'].apply(eval)
all_input_df['tk_label_nums'] = all_input_df['tk_labels'].apply(lambda r: [config_js['label2id'][lbl] for lbl in r])
all_input_df = all_input_df # [not currently used] filter_on_token_length(df_exp, tokenizer, extraction_method)

all_input_df = normalize_mse_columns(all_input_df)

if sample_size:
    sample_df = all_input_df.sample(sample_size, random_state=seed)
else:
    sample_df = all_input_df

all_input_df.shape

(22882, 38)

In [33]:
use_no_nan = True

In [34]:
input_df_no_nan = all_input_df.copy(deep=True)

input_df_no_nan = input_df_no_nan[input_df_no_nan['decision_label'].notna()]
input_df_no_nan = input_df_no_nan[input_df_no_nan['confidence'].notna()]
input_df_no_nan = input_df_no_nan[input_df_no_nan['logCitNum'].notna()]
input_df_no_nan = input_df_no_nan[input_df_no_nan['rating'].notna()]
input_df_no_nan = input_df_no_nan[~input_df_no_nan['ce_extract'].isna()]

if use_no_nan:
    if sample_size:
        sample_df = input_df_no_nan.sample(sample_size, random_state=seed)
    else:
        sample_df = input_df_no_nan
    

In [35]:
sample_df.shape

(1000, 38)

In [36]:
sample_df[['decision_label', 'confidence', 'logCitNum', 'rating']].describe()

Unnamed: 0,decision_label,confidence,logCitNum,rating
count,1000.0,1000.0,1000.0,1000.0
mean,0.411,0.234833,-0.006162,0.310872
std,0.492261,0.906558,1.180253,0.785742
min,0.0,-2.872757,-2.204442,-2.095345
25%,0.0,-0.64669,-0.808242,-0.015642
50%,0.0,0.466344,0.102059,0.504284
75%,1.0,0.466344,0.802824,1.02421
max,1.0,1.579377,2.938793,2.583987


In [37]:
input_df_no_nan.shape

(5882, 38)

In [38]:
seed = 282

In [39]:
input_dataset = Dataset.from_pandas(sample_df)
input_dataset_dict = input_dataset.train_test_split(test_size=0.20, seed=seed)

## Filter on Token Length & Input

In [40]:
# Filter on token length [not currently used because it decreases the available 
# dataset size for the CE_extraction method]
def filter_on_token_length(df, tokenizer, text_col, text_min=min_text_length, 
                           text_max=max_paper_extract_length, review_min=min_text_length, 
                           review_max=max_review_length):
    
    def test_length_constraints(txt, tokenizer, mn_length, mx_length):
        tokenized_txt = tokenizer(txt, max_length=None, truncation=False)
        num_tokens = len(tokenized_txt['input_ids'])

        return (num_tokens >= mn_length) and (num_tokens <= mx_length)
      
    return df[
              (df[text_col].apply(lambda s: test_length_constraints(s, tokenizer, text_min, text_max))) &
              (df['review'].apply(lambda s: test_length_constraints(s, tokenizer, review_min, review_max)))
           ]

In [41]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            new_labels.append(label)

    return new_labels

In [42]:
def tokenize_and_align_labels(example, extraction_method='hybrid', device=device):
#     print(device)

    ## Tokenize the paper extract
    if extraction_method == 'intro':
        input_text = example['intro']
    elif extraction_method == 'ce_extract':
        input_text = example['ce_extract']
    elif extraction_method == 'hybrid':
        input_text = example['abstract'] + example['ce_extract']

    model_inputs = bart_tokenizer(
        input_text,
        max_length=max_paper_extract_length,
        padding='max_length',
        truncation=True,
    )

    ## Get Review Tokens and Output
    review_tokenized_outputs = bart_tokenizer(
        example["tokens"], truncation=True, is_split_into_words=True,
        padding='max_length',
        max_length=max_review_length
    )
    all_ner_labels = example["tk_label_nums"]
    word_ids = review_tokenized_outputs.word_ids(0)
    ner_labels = align_labels_with_tokens(all_ner_labels, word_ids)

    # print(len(review_tokenized_outputs['input_ids']))
    # print(len(ner_labels))
    # print(torch.Tensor(review_tokenized_outputs['input_ids']).shape)
    # print(torch.Tensor(ner_labels).shape)
    # print(len(model_inputs['input_ids']))

    model_inputs['labels'] = torch.tensor(review_tokenized_outputs["input_ids"]).to(device)
    model_inputs["ner_labels"] = torch.IntTensor(ner_labels).to(device)
    model_inputs["input_ids"] = torch.tensor(model_inputs["input_ids"]).to(device)
    model_inputs['attention_mask'] = torch.tensor(model_inputs['attention_mask']).to(device)

    # print(type(model_inputs['input_ids']))

    return model_inputs

In [43]:
tokenized_dataset_for_reviews = input_dataset_dict.map(lambda s: 
   tokenize_and_align_labels(s, extraction_method=extraction_method, device=device))

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [45]:
tokenized_dataset_for_reviews

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'labels', 'tokens', 'tk_labels', 'title', 'decision', 'abstract', 'intro', 'conference', 'review', 'decision_label', 'intro_len', 'abstract_len', 'review_len', 'ce_extract', 'ce_extract_len', 'hybrid', 'hybrid_len', 'pid', 'rid', 'uid', 'confidence_str', 'rating_str', 'paper_len', 'conclusion', 'conclusion_len', 'url', 'emails', 'source', 'authors', 'confidence', 'rating', 'decision_binary', 'review_no_whitespace', 'CitNum', 'logCitNum', 'tk_label_nums', '__index_level_0__', 'input_ids', 'attention_mask', 'ner_labels'],
        num_rows: 800
    })
    test: Dataset({
        features: ['id', 'text', 'labels', 'tokens', 'tk_labels', 'title', 'decision', 'abstract', 'intro', 'conference', 'review', 'decision_label', 'intro_len', 'abstract_len', 'review_len', 'ce_extract', 'ce_extract_len', 'hybrid', 'hybrid_len', 'pid', 'rid', 'uid', 'confidence_str', 'rating_str', 'paper_len', 'conclusion', 'conclusion_len', 'url', 'em

# Load Pre-Trained Model

In [46]:
from dataclasses import dataclass
from typing import Optional, Tuple
from transformers.modeling_outputs import Seq2SeqLMOutput
import torch

@dataclass
class ExtendedSeq2SeqLMOutput(Seq2SeqLMOutput):
    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    decision_pred: Optional[torch.FloatTensor] = None # new 
    confidence_pred: Optional[torch.FloatTensor] = None # new 
    logCitNum_pred: Optional[torch.FloatTensor] = None # new 
    rating_pred: Optional[torch.FloatTensor] = None # new 
    aspect_pred: Optional[torch.FloatTensor] = None # new
    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
    cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
    encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None


In [48]:
train_citation=False
train_decision=False
train_confidence=False
train_rating=False
train_review=True
train_aspect_score=True

citation_loss_scale = 4
decision_loss_scale = 2
rating_loss_scale = 2
confidence_loss_scale = 2
review_loss_scale = 2
aspect_score_loss_scale = 1

model_output_name = 'review_ner'

In [64]:
from transformers import BartForConditionalGeneration
from transformers import BartConfig
import torch
from torch import nn
from transformers.modeling_outputs import Seq2SeqLMOutput
import torch.nn.functional as F
import time

# extend BartForConditionalGeneration with additional layer to decode one additional output field to BartForConditionalGeneration
# how to subclass model and config: https://discuss.huggingface.co/t/subclassing-a-pretrained-model-for-a-new-objective/10521

def shift_tokens_right(input_ids, pad_token_id, decoder_start_token_id):
    """
    Shift input ids one token to the right.
    """
    shifted_input_ids = input_ids.new_zeros(input_ids.shape)
    shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
    shifted_input_ids[:, 0] = decoder_start_token_id

    if pad_token_id is None:
        raise ValueError("self.model.config.pad_token_id has to be defined.")
    # replace possible -100 values in labels by `pad_token_id`
    shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)

    return shifted_input_ids

class ExtendedBart(BartForConditionalGeneration):
    def __init__(self, config): 
        super(ExtendedBart, self).__init__(config) 
        
#         self.device = device
        self.conv1 = nn.Conv2d(1, 1, (1, 1024))
    
        self.conv1 = nn.Conv2d(1, 1, (1, 1024))
        self.decision_fc1 = nn.Linear(1024, 4096)
        self.decision_fc2 = nn.Linear(4096, 1024)
        self.decision_fc3 = nn.Linear(1024, 1)
        
        self.confidence_fc1 = nn.Linear(1024, 4096)
        self.confidence_fc2 = nn.Linear(4096, 1024)
        self.confidence_fc3 = nn.Linear(1024, 1)

        self.logCitNum_fc1 = nn.Linear(1024, 4096)
        self.logCitNum_fc2 = nn.Linear(4096, 1024)
        self.logCitNum_fc3 = nn.Linear(1024, 1)

        self.rating_fc1 = nn.Linear(1024, 4096)
        self.rating_fc2 = nn.Linear(4096, 1024)
        self.rating_fc3 = nn.Linear(1024, 1)

        self.ner_fc1 = nn.Linear(1024, 4096)
        self.ner_fc2 = nn.Linear(4096, 1024)
        self.ner_fc3 = nn.Linear(1024, 16)

    def forward(
        self,
        input_ids = None,
        attention_mask = None,
        decoder_input_ids = None,
        decoder_attention_mask = None,
        head_mask = None,
        decoder_head_mask = None,
        cross_attn_head_mask = None,
        encoder_outputs = None,
        past_key_values = None,
        inputs_embeds = None,
        decoder_inputs_embeds = None,
        labels = None,
        use_cache = None,
        output_attentions = None,
        output_hidden_states = None,
        return_dict = None,
        decision_label=None, # new
        confidence=None, # new
        logCitNum=None, # new
        rating=None, # new
        ner_labels=None, # new
    ):
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if labels is not None:
            use_cache = False
            if decoder_input_ids is None and decoder_inputs_embeds is None:
                decoder_input_ids = shift_tokens_right(
                    labels, self.config.pad_token_id, self.config.decoder_start_token_id
                )

        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            encoder_outputs=encoder_outputs,
            decoder_attention_mask=decoder_attention_mask,
            head_mask=head_mask,
            decoder_head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            decoder_inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        lm_logits = self.lm_head(outputs[0])
        lm_logits = lm_logits + self.final_logits_bias.to(lm_logits.device)
        
        masked_lm_loss = 0
        decision_pred = None
        confidence_pred = None
        logCitNum_pred = None
        rating_pred = None
        ner_pred = None
        
        hidden_dim = 1024
        seq_len = 1024
            
        if labels is not None:
            batch_size = labels.shape[0]
            print(batch_size)
            
            if train_review:
                loss_fct = nn.CrossEntropyLoss()
                masked_lm_loss += review_loss_scale * loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))    
            
                print(f'Review Loss: {masked_lm_loss}')
                
            metadata_pred = outputs.last_hidden_state.view(batch_size, -1, hidden_dim) # something like (1,837, 1024)
            seq_length_padding = seq_len-metadata_pred.shape[1]
            metadata_pred = F.pad(metadata_pred, (0,0,0, seq_length_padding), "constant", 0)  # zero padding first dimension to 1024
            metadata_pred = metadata_pred.view(batch_size, 1, seq_len,hidden_dim)
            metadata_pred = self.conv1(metadata_pred)
            metadata_pred = metadata_pred.view(batch_size, -1)

            if train_decision:
                # calculate decision probability
                # pad logit to 1024 length
                decision_pred = self.decision_fc0(metadata_pred)
                decision_pred = self.decision_fc1(decision_pred)
                decision_pred = self.decision_fc2(decision_pred)
                decision_pred = self.decision_fc3(decision_pred)
                decision_pred = decision_pred.view(batch_size)
                decision_pred = decision_pred.type(torch.FloatTensor)
                decision_label = decision_label.type(torch.FloatTensor)
                decision_label_loss_fn = nn.BCEWithLogitsLoss()
                decision_label_loss = decision_label_loss_fn(decision_pred, decision_label)
                masked_lm_loss += decision_loss_scale * decision_label_loss

                print(f'Decision Loss: {decision_label_loss}')
                

            if train_confidence: 
                # calculate confidence probability via MSE loss
                confidence_pred = self.confidence_fc0(metadata_pred)
                confidence_pred = self.confidence_fc1(confidence_pred)
                confidence_pred = self.confidence_fc2(confidence_pred)
                confidence_pred = self.confidence_fc3(confidence_pred)
                confidence_pred = confidence_pred.view(batch_size)
                confidence_pred = confidence_pred.type(torch.FloatTensor)
                confidence_label = confidence.type(torch.FloatTensor)
                confidence_label_loss_fn = nn.MSELoss()
                confidence_loss = confidence_label_loss_fn(confidence_pred, confidence_label)
                masked_lm_loss += confidence_loss_scale * confidence_loss
                print(f'Confidence Loss: {confidence_loss}')


            if train_citation:
                # calculate logCitNum probability via MSE loss
                logCitNum_pred = self.logCitNum_fc0(metadata_pred)
                logCitNum_pred = self.logCitNum_fc1(logCitNum_pred)
                logCitNum_pred = self.logCitNum_fc2(logCitNum_pred)
                logCitNum_pred = self.logCitNum_fc3(logCitNum_pred)
                logCitNum_pred = logCitNum_pred.view(batch_size)

                logCitNum_pred = logCitNum_pred.type(torch.FloatTensor)
                logCitNum_label = logCitNum.type(torch.FloatTensor)
                logCitNum_label_loss_fn = nn.MSELoss()
                logCitNum_loss = logCitNum_label_loss_fn(logCitNum_pred, logCitNum_label)
                masked_lm_loss += citation_loss_scale * logCitNum_loss
                print(f'Citation Loss: {logCitNum_loss}')


            if train_rating:
                # calculate rating probability via MSE loss
                rating_pred = self.rating_fc0(metadata_pred)
                rating_pred = self.rating_fc1(rating_pred)
                rating_pred = self.rating_fc2(rating_pred)
                rating_pred = self.rating_fc3(rating_pred)
                rating_pred = rating_pred.view(batch_size)
                rating_pred = rating_pred.type(torch.FloatTensor)
                rating_label = rating.type(torch.FloatTensor)
                rating_label_loss_fn = nn.MSELoss()
                rating_loss = rating_label_loss_fn(rating_pred, rating_label)
                masked_lm_loss += rating_loss_scale * rating_loss
                print(f'Rating Loss: {rating_loss}')


            # calculate per-token aspect score predictions dim: (1024 x 16)
            if train_aspect_score:
    #             print('insight into ner')
                ner_pred = outputs.last_hidden_state.view(batch_size, -1, hidden_dim) # something like (1,837, 1024)
                seq_length_padding = seq_len-ner_pred.shape[1]
                ner_pred = F.pad(ner_pred, (0,0,0, seq_length_padding), "constant", 0)  # zero padding first dimension to 1024
                ner_pred = ner_pred.view(batch_size,seq_len,hidden_dim)

                ner_pred = self.ner_fc1(ner_pred)
                ner_pred = self.ner_fc2(ner_pred)
                ner_pred = self.ner_fc3(ner_pred)

                ner_pred = ner_pred.view(batch_size, 16, seq_len) # outputs (batch_size, # of classes, seq_len)

                ner_labels = ner_labels.view(batch_size, seq_len) # targets: (batch_size, seq_len)

                ner_loss_fn = nn.CrossEntropyLoss(ignore_index=-100)

                ner_pred = ner_pred.to(self.device)
                ner_labels = ner_labels.to(self.device)

                ner_loss = ner_loss_fn(ner_pred, ner_labels)

                masked_lm_loss += aspect_score_loss_scale*ner_loss
                print(f'NER Loss: {ner_loss}')

        if not return_dict:
            output = (lm_logits,) + outputs[1:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return ExtendedSeq2SeqLMOutput(
            loss=masked_lm_loss,
            logits=lm_logits,
            decision_pred=decision_pred, # update the eval
            confidence_pred=confidence_pred, # new 
            logCitNum_pred=logCitNum_pred,  # new 
            rating_pred=rating_pred, # new 
            aspect_pred=ner_pred,  # new 
            past_key_values=outputs.past_key_values,
            decoder_hidden_states=outputs.decoder_hidden_states,
            decoder_attentions=outputs.decoder_attentions,
            cross_attentions=outputs.cross_attentions,
            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
            encoder_hidden_states=outputs.encoder_hidden_states,
            encoder_attentions=outputs.encoder_attentions,
        )

## Load Model

In [65]:
local_checkpoint = True

In [66]:
pre_trained_model_checkpoint = 'outputmodel/review_ner'

pre_trained_model = pre_trained_model_checkpoint if local_checkpoint else "facebook/bart-large-cnn"
print(pre_trained_model)

outputmodel/review_ner


In [67]:
model_output_name

'review_ner'

In [68]:
# extended_cfg = ExtendedBartConfig()
bart_config = BartConfig()
model = ExtendedBart(config=bart_config).from_pretrained(pre_trained_model,
                                                         max_length=max_review_length, 
                                                         min_length=min_text_length,
                                                         task_specific_params=summarization_params)
model = model.to(device)

# Evaluate Model

In [55]:
import numpy as np
import nltk
from nltk import sent_tokenize
import bert_score
import evaluate

rouge_score = evaluate.load("rouge")

# Compute Evaulation Metric on Seq2Seq Prediction
def compute_metrics(eval_pred):
    predictions, labels = eval_pred # can I unpack inputs as third variable

#     print("evaluation")
#     print(predictions.shape)
#     print(labels.shape)

    # Decode generated summaries into text
    decoded_preds = bart_tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them
    labels = np.where(labels != -100, labels, bart_tokenizer.pad_token_id)
    # Decode reference summaries into text
    decoded_labels = bart_tokenizer.batch_decode(labels, skip_special_tokens=True)
    # ROUGE expects a newline after each sentence
    decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels]
    # Compute ROUGE scores
    rouge_results = rouge_score.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    # Compute BERT Scores
    # https://github.com/Tiiiger/bert_score/blob/master/example/Demo.ipynb
    # https://arxiv.org/pdf/1904.09675.pdf
    unscaled_bert_scores = bert_score.score(decoded_preds, decoded_labels, lang='en', 
                                   verbose=False, rescale_with_baseline=False)
    bs_prec, bs_recall, bs_f1 = (unscaled_bert_scores[0].mean(), 
                                 unscaled_bert_scores[1].mean(), 
                                 unscaled_bert_scores[2].mean())

    # Extract the median scores
    results = {k: round(v, 4) for k, v in rouge_results.items()}
    
    results.update({'Raw_BertScore_F1_mean': bs_f1, 
                    'Raw_BertScore_Recall_mean': bs_recall,
                    'Raw_BertScore_Precision_mean': bs_prec,
                    })
    
    scaled_bert_scores = bert_score.score(decoded_preds, decoded_labels, lang='en', 
                                   verbose=False, rescale_with_baseline=True)
    bs_prec, bs_recall, bs_f1 = (scaled_bert_scores[0].mean(), 
                                 scaled_bert_scores[1].mean(), 
                                 scaled_bert_scores[2].mean())

    results.update({'Scaled_BertScore_F1_mean': bs_f1, 
                    'Scaled_BertScore_Recall_mean': bs_recall,
                    'Scaled_BertScore_Precision_mean': bs_prec,
                    })

    return results

## Train Model

In [56]:
## Hyperparameters
# Selected based on Hugging Face Guide & Confirmed with validation on small samples
num_train_epochs = 5
learning_rate = 5.0e-5
weight_decay = 0.001

In [57]:
# net = nn.DataParallel(model.cuda(), device_ids=[0,1])

In [59]:
from transformers import Seq2SeqTrainingArguments

# Max Batch Size supported by GPU
batch_size = 8

# Show the training loss with every epoch
logging_steps = len(tokenized_dataset_for_reviews["train"]) // batch_size 

model_name = 'outputmodel/review_generation/' + extraction_method + '/' + model_output_name

os.environ["WANDB_DISABLED"] = "true" # disable wandb
args = Seq2SeqTrainingArguments(
    output_dir= workdir + model_name,
    evaluation_strategy="epoch",
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=weight_decay,
    save_strategy='epoch',
    # include_inputs_for_metrics=True, # added for eval to access decision_labels
    label_names=['decision_label', 'confidence', 'logCitNum', 'rating', 'ner_labels'], # needed to feed decision_label into training loop
    save_total_limit=2,
    num_train_epochs=num_train_epochs,
    predict_with_generate=True, 
    generation_max_length=max_review_length, 
    logging_steps=logging_steps,
    report_to=None 
)

In [60]:
!nvidia-smi

zsh:1: command not found: nvidia-smi


In [61]:
# del model
# del net
# del cuda_model
# del trainer
# 
import gc
gc.collect()

torch.cuda.empty_cache()

In [62]:
tokenized_dataset_for_reviews

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'labels', 'tokens', 'tk_labels', 'title', 'decision', 'abstract', 'intro', 'conference', 'review', 'decision_label', 'intro_len', 'abstract_len', 'review_len', 'ce_extract', 'ce_extract_len', 'hybrid', 'hybrid_len', 'pid', 'rid', 'uid', 'confidence_str', 'rating_str', 'paper_len', 'conclusion', 'conclusion_len', 'url', 'emails', 'source', 'authors', 'confidence', 'rating', 'decision_binary', 'review_no_whitespace', 'CitNum', 'logCitNum', 'tk_label_nums', '__index_level_0__', 'input_ids', 'attention_mask', 'ner_labels'],
        num_rows: 800
    })
    test: Dataset({
        features: ['id', 'text', 'labels', 'tokens', 'tk_labels', 'title', 'decision', 'abstract', 'intro', 'conference', 'review', 'decision_label', 'intro_len', 'abstract_len', 'review_len', 'ce_extract', 'ce_extract_len', 'hybrid', 'hybrid_len', 'pid', 'rid', 'uid', 'confidence_str', 'rating_str', 'paper_len', 'conclusion', 'conclusion_len', 'url', 'em

In [None]:
from transformers import Seq2SeqTrainer
from transformers import DataCollatorForSeq2Seq

# Set Up Model Training
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_dataset_for_reviews["train"],
    eval_dataset=tokenized_dataset_for_reviews["test"],
    data_collator=DataCollatorForSeq2Seq(bart_tokenizer, model=model, max_length=max_review_length),
    tokenizer=bart_tokenizer,
    compute_metrics=compute_metrics,
)

# Train Model on Tokenized DataSet
trainer.train()

# Save Model
# save_path = 'outputmodel/review_generation/hybrid/' + model_output_name
# trainer.save_model(workdir + save_path)

8
Review Loss: 3.6264796257019043
NER Loss: 1.1326136589050293
8
Review Loss: 3.9311184883117676
NER Loss: 1.4954568147659302
8
Review Loss: 3.177537202835083
NER Loss: 1.116485834121704
8
Review Loss: 3.500415325164795
NER Loss: 0.7443220019340515
8
Review Loss: 4.109851837158203
NER Loss: 1.1411259174346924
8
Review Loss: 4.200204372406006
NER Loss: 1.7574920654296875
8
Review Loss: 2.9196300506591797
NER Loss: 1.6662766933441162
8
Review Loss: 3.2835657596588135
NER Loss: 1.189716100692749
8
Review Loss: 2.9775075912475586
NER Loss: 1.2557439804077148
8
Review Loss: 3.8763813972473145
NER Loss: 1.1853622198104858
8
Review Loss: 3.3122775554656982
NER Loss: 1.5808745622634888
8
Review Loss: 3.5920569896698
NER Loss: 1.3825093507766724
8
Review Loss: 1.9725956916809082
NER Loss: 1.4021357297897339
8
Review Loss: 3.3412394523620605
NER Loss: 1.2507399320602417
8
Review Loss: 3.603506565093994
NER Loss: 1.101052165031433
8
Review Loss: 3.5141305923461914
NER Loss: 0.9877867102622986
8
R

# Evaluate Model

In [1]:
model_output_name = 'review_ner'

In [43]:
# Save Model
save_path = workdir + 'outputmodel/' + model_output_name 
# trainer.save_model(workdir + save_path)
save_path

'./outputmodel/review_generation/hybrid/citations_and_metadata'

In [44]:
from transformers import AutoTokenizer
from transformers import AutoModel

tokenizer = AutoTokenizer.from_pretrained(save_path)
model = ExtendedBart.from_pretrained(save_path)
model = model.to(device)

In [45]:
def compute_ner_accuracy(ner_pred, ner_target):
    total = 0
    correct = 0
    for i in range(len(ner_target)):
        if ner_target[i] == -100:
            continue
        total += 1
        correct += (ner_target[i] == ner_pred[i])
    
    return correct, total, correct / float(total)

def compute_all_ner_accuracy(ner_preds, ner_targets):
    eval_total, eval_correct = 0, 0

    for i in range(len(ner_preds)):
        correct, total, ratio = compute_ner_accuracy(ner_preds[i], ner_targets[i])

        eval_total += total
        eval_correct += correct
    
    # print(eval_correct)
    # print(eval_total)
    return (eval_correct / float(eval_total))

In [46]:
# del input_ids
# del outputs
# del attention_mask
# del labels
# del decision_label
# del ner_labels
# del ner_pred
# del target
# del logCitNum
# del confidence
# del rating
# del logCitNum_pred
# del confidence_pred
# del rating_pred

# del decision_preds
# del decision_targets

# del confidence_preds
# del confidence_targets

# del logCitNum_preds
# del logCitNum_targets

# del rating_preds
# del rating_targets

# del ner_preds
# del ner_targets

In [None]:
from sklearn.metrics import roc_auc_score, mean_squared_error

decision_preds = []
decision_targets = []

confidence_preds = []
confidence_targets = []

logCitNum_preds = []
logCitNum_targets = []

rating_preds = []
rating_targets = []

ner_preds = []
ner_targets = []

m = nn.Sigmoid()
for data in tokenized_dataset_for_reviews['test']:
    input_ids = torch.IntTensor([data["input_ids"]]).to(device)
    attention_mask = torch.IntTensor([data["attention_mask"]]).to(device)
    labels = torch.LongTensor([data["labels"]]).to(device)

    decision_label, confidence, logCitNum, rating, ner_labels = None, None, None, None, None

    if train_citation: 
        logCitNum = torch.FloatTensor([data["logCitNum"]]).to(device)
    
    if train_decision:
        decision_label = torch.FloatTensor([data["decision_label"]]).to(device)

    if train_confidence:
        confidence = torch.FloatTensor([data["confidence"]]).to(device)

    if train_rating:
        rating = torch.FloatTensor([data["rating"]]).to(device)

    if train_aspect_score:
        ner_labels = torch.LongTensor([data["ner_labels"]]).to(device)

    outputs = model.forward(input_ids=input_ids, 
                          attention_mask=attention_mask, 
                          labels=labels, 
                          decision_label=decision_label, # new
                          confidence=confidence, # new
                          logCitNum=logCitNum, # new
                          rating=rating, # new
                          ner_labels=ner_labels # new, unused?
                          )

    if train_citation: 
        logCitNum_pred = outputs["logCitNum_pred"].to(device)
        pred = logCitNum_pred.cpu().detach().numpy()[0]
        logCitNum_preds.append(pred)
        target = logCitNum.cpu().detach().numpy()[0]
        logCitNum_targets.append(target)

    if train_decision:
        decision_pred = outputs["decision_pred"].to(device)
        decision_pred = torch.sigmoid(decision_pred)
        pred = decision_pred.cpu().detach().numpy()[0]
        decision_preds.append(pred)
        target = decision_label.cpu().detach().numpy()[0]
        decision_targets.append(target)

    if train_confidence:
        confidence_pred = outputs["confidence_pred"].to(device)
        pred = confidence_pred.cpu().detach().numpy()[0]
        confidence_preds.append(pred)
        target = confidence.cpu().detach().numpy()[0]
        confidence_targets.append(target)

    if train_rating:
        rating_pred = outputs["rating_pred"].to(device)
        pred = rating_pred.cpu().detach().numpy()[0]
        rating_preds.append(pred)
        target = rating.cpu().detach().numpy()[0]
        rating_targets.append(target)
    
    if train_aspect_score:
        ner_pred = outputs['aspect_pred'].to(device)
        ner_pred = ner_pred.cpu().detach()
        ner_pred_label = ner_pred.argmax(axis=1)
        ner_preds.extend(ner_pred_label)

        ner_target = ner_labels.cpu().detach().numpy()
        ner_targets.extend(ner_target)
        

In [None]:
train_citation=True
train_decision=True
train_confidence=False
train_rating=True
train_review=True
train_aspect_score=True

In [None]:
np.sum(np.array(decision_targets) != np.round(np.array(decision_preds)))

In [None]:
np.mean(rating_targets), np.std(confidence_targets)

In [None]:
np.mean(logCitNum_targets), np.std(logCitNum_targets)

In [50]:
if train_citation: 
    print("LogCitation MSE:" + str(mean_squared_error(logCitNum_preds, logCitNum_targets))) # new

if train_decision:
    print("Decision AUC:" + str(roc_auc_score(decision_targets, decision_preds))) # new

if train_confidence:
    print("Confidence MSE:" + str(mean_squared_error(confidence_preds, confidence_targets))) # new 

if train_rating:
    print("Rating MSE:" + str(mean_squared_error(rating_preds, rating_targets))) # new
    
if train_aspect_score:
    print("NER F1 Score:" + str(compute_all_ner_accuracy(ner_preds, ner_targets))) # new

LogCitation MSE:1.4867797
Decision AUC:0.5291560102301791
Confidence MSE:0.6943817
Rating MSE:0.896362


# Section Models

In [None]:
section = 'clarity'
section_model_output_name = model_output_name + '_section'

section_token_col = "section" + "_tokens"
section_token_labels_col = "section" + '_tk_labels' 
section_token_label_nums_col = "section" + '_tk_label_nums'

section_review_length = 256 if section == 'summary' else 128
section_inp_filename = 'all_pp_df_with_as_breakdowns.csv'

sample_size = 200

In [None]:
def preprocess_section_df(df, section):
    def concat_columns(row, column_sfx):
        return row[f'{section}_positive_{column_sfx}'] + row[f'{section}_negative_{column_sfx}']
    
    if section == 'summary':
        filt_df = df[df[f'{section}_length'] > 0]
        filt_df[f'{section}_tokens'] = filt_df[f'{section}_tokens'].apply(eval)
        filt_df[section_token_col] = filt_df[f'{section}_tokens']
    else:
        filt_df = df[(df[f'{section}_positive_length'] > 0) |
                     (df[f'{section}_negative_length'] > 0)
                    ]
    
        filt_df[f'{section}_positive_tokens'] = filt_df[f'{section}_positive_tokens'].apply(eval)
        filt_df[f'{section}_negative_tokens'] = filt_df[f'{section}_negative_tokens'].apply(eval)
        filt_df[section_token_col] = filt_df.apply(lambda row: concat_columns(row, 'tokens'), axis=1)
#     filt_df[section_token_labels_col] = filt_df.apply(lambda row: concat_columns(row, 'tk_labels'), axis=1)

    return filt_df

In [None]:
all_input_df = pd.read_csv(workdir + section_inp_filename)
all_input_df = preprocess_section_df(all_input_df, section)
all_input_df = all_input_df[~all_input_df['ce_extract'].isna()]

if sample_size:
    sample_df = all_input_df.sample(sample_size, random_state=seed)

In [None]:
input_dataset = Dataset.from_pandas(sample_df if sample_size else all_input_df)
input_dataset_dict = input_dataset.train_test_split(test_size=0.20)

In [None]:
def preprocess_function_for_section_review_task(paper_json, extraction_method='hybrid', device=device):

    if extraction_method == 'intro':
        input_text = paper_json['intro']
    elif extraction_method == 'ce_extract':
        input_text = paper_json['ce_extract']
    elif extraction_method == 'hybrid':
        input_text = paper_json['abstract'] + paper_json['ce_extract']

    model_inputs = bart_tokenizer(
        input_text,
        max_length=max_paper_extract_length,
        truncation=True,
    )

    labels = bart_tokenizer(
        paper_json[section_token_col], is_split_into_words=True,
        max_length=section_review_length, truncation=True, padding='max_length'
    )
    model_inputs["labels"] = torch.tensor(labels["input_ids"]).to(device)
    model_inputs['input_ids'] = torch.tensor(model_inputs['input_ids']).to(device)
    # print(len(model_inputs['labels']))
    # print(len(paper_json['text']))
    return model_inputs


tokenized_section_dataset_for_reviews = input_dataset_dict.map(lambda s: 
   preprocess_function_for_section_review_task(s, extraction_method=extraction_method, device=device))

In [None]:
!nvidia-smi

In [None]:
del model
# del net
# del cuda_model
del trainer

import gc
gc.collect()

torch.cuda.empty_cache()

In [None]:
# extended_cfg = ExtendedBartConfig()
bart_config = BartConfig()
model = ExtendedBart(config=bart_config).from_pretrained("facebook/bart-large-cnn",  #pre_trained_model_checkpoint
                                                         max_length=max_review_length, 
                                                         min_length=min_text_length,
                                                         task_specific_params=summarization_params)
model = model.to(device)

In [None]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_section_dataset_for_reviews["train"],
    eval_dataset=tokenized_section_dataset_for_reviews["test"],
    data_collator=DataCollatorForSeq2Seq(bart_tokenizer, model=model, max_length=max_review_length),
    tokenizer=bart_tokenizer,
    compute_metrics=compute_metrics,
)

# New code - wrap collator in a dictionary
# old_collator = trainer.data_collator
# trainer.data_collator = lambda data: dict(old_collator(data))
# End new code

# trainer.train()

# Train Model on Tokenized DataSet
trainer.train()

In [None]:
save_path = f'outputmodel/review_generation/hybrid/{section}_200_section_model'

In [None]:
# Save Model
trainer.save_model(workdir + save_path)

## Generate Output

In [None]:
model_name

In [None]:
sample_outputs_df = pd.read_csv(workdir + 'all_pp_df_with_as_breakdowns.csv')

sample_outputs_df = sample_outputs_df[
    ((sample_outputs_df['clarity_positive_length'] > 0) | (sample_outputs_df['clarity_negative_length'] > 0)) & 
    (sample_outputs_df['summary_length'] > 0) & 
    ((sample_outputs_df['substance_positive_length'] > 0) | (sample_outputs_df['substance_negative_length'] > 0)) &
    ((sample_outputs_df['soundness_positive_length'] > 0) | (sample_outputs_df['soundness_negative_length'] > 0)) &
    ((sample_outputs_df['originality_positive_length'] > 0) | (sample_outputs_df['originality_negative_length'] > 0))

]

seed = 282

ds_df = sample_outputs_df.sample(5, random_state=seed)

In [None]:
save_path

In [None]:
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM
import torch

entries = []
for path in model_paths:
      print(path)

# section = 'summary'
  
    tokenizer = AutoTokenizer.from_pretrained(workdir + path)
    model = AutoModelForSeq2SeqLM.from_pretrained(workdir + path)

    for i, row in ds_df.iterrows():
        print(i)
        id = row['id']
        extraction_txt = row[extraction_method]

        inputs = tokenizer(extraction_txt, return_tensors="pt", truncation=True).input_ids
        outputs = model.generate(inputs)

        decoded_output = ""
        for output in outputs:
            decoded_output += tokenizer.decode(output, skip_special_tokens=True)

        entry = {
            'id': id,
            'extraction_method': extraction_method,
            'extraction_text': extraction_txt,
            'model_name': ''.join(path.split('/')[3:]),
            'title': row['title'],
            'abstract': row['abstract'],
            'review': row['review'],
            'model_review_output': decoded_output,
            'model_output_length': len(decoded_output.split(' '))
        }

        entries.append(entry)
        print(len(entries))
  
    # df_map[section] = pd.DataFrame(entries)

    del model
    del tokenizer
    gc.collect()

    torch.cuda.empty_cache()

output_df = pd.DataFrame(entries)

In [None]:
output_df

In [None]:
output_df.to_csv('./sample_section_outputs_v4.csv', index=False)

## Generate Text for One Sample

In [None]:
txt = tokenized_dataset_for_reviews['train'][0][extraction_method]

from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained(workdir + model_name)
inputs = bart_tokenizer(txt, return_tensors="pt", truncation=True).input_ids

model = AutoModelForSeq2SeqLM.from_pretrained(workdir + local_checkpoint)
outputs = model.generate(inputs)

decoded_output = ""
for output in outputs:
    decoded_output += tokenizer.decode(output, skip_special_tokens=True)