In [1]:
import os

from main import load_datasets

  from .autonotebook import tqdm as notebook_tqdm


## Load Data

In [2]:
# Data root; every corpus directory below is resolved relative to it.
root = '../data_2023_06_02'

# One preprocessed directory per corpus, built in a single pass.
fever_dir, pubhealth_dir, climate_dir = (
    os.path.join(root, subdir)
    for subdir in (
        'preprocessed/FEVER',
        'preprocessed/PUBHEALTH',
        'preprocessed/CLIMATE-FEVER',
    )
)

In [3]:
# All entries are commented out, so this dict is empty, and it is not passed
# to load_datasets in this cell.
# NOTE(review): presumably these were split options meant to be forwarded to
# load_datasets — confirm, or drop the dict.
climate_params = {
    # 'dev_size': 150,
    # 'test_size': 150,
    # 'random_state': 88
}

# load_datasets (imported from main) returns one dataset object per corpus plus
# ds_test, which is the object passed to evaluate_model later on.
ds_fever, ds_pubhealth, ds_climate, ds_test = load_datasets(fever_dir, pubhealth_dir, climate_dir)

## Evaluate Model

In [4]:
# Checkpoint to evaluate. Exactly one `model_checkpoint` line should be active;
# the commented lines are the alternative fine-tuned checkpoints, grouped by
# base model.

# BERT
model_checkpoint = "../models/BERT_FEVER/checkpoint-4546"
# model_checkpoint = "../models/BERT_PUBHEALTH/checkpoint-262"
# model_checkpoint = "../models/BERT_CLIMATE/checkpoint-62"
# model_checkpoint = "../models/BERT_CLIMATE_V2/best_model"
# model_checkpoint = "../models/BERT_CLIMATE_V2_best_accuracy/best_model"
# model_checkpoint = "../models/BERT_CLIMATE_V2_best_loss/best_model"

# RoBERTa
# model_checkpoint = "../models/RoBERTa_FEVER/checkpoint-2273"
# model_checkpoint = "../models/RoBERTa_PUBHEALTH/checkpoint-262"
# model_checkpoint = "../models/RoBERTa_CLIMATE/checkpoint-93"


# SciBERT
# model_checkpoint = "../models/SciBERT_FEVER/checkpoint-4546"
# model_checkpoint = "../models/SciBERT_PUBHEALTH/checkpoint-131"
# model_checkpoint = "../models/SciBERT_CLIMATE/checkpoint-31"

# BioBERT
# model_checkpoint = "../models/BioBERT_FEVER/best_model"
# model_checkpoint = "../models/BioBERT_PUBHEALTH/best_model"
# model_checkpoint = "../models/BioBERT_CLIMATE/best_model"

# ALBERT
# model_checkpoint = "../models/ALBERT_FEVER/best_model"
# model_checkpoint = "../models/ALBERT_PUBHEALTH/best_model"
# model_checkpoint = "../models/ALBERT_CLIMATE/best_model"

## Run predictions

In [22]:
import evaluate
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding

def _evaluate(model, ds, device, metric="accuracy"):
    """Run ``model`` over ``ds`` and score its argmax predictions.

    Args:
        model (pytorch model): model to evaluate
        ds (torch.DataLoader): dataset to evaluate on loaded into pytorch DataLoader obj;
            every batch is a dict of tensors that includes a "labels" entry
        device (torch.device): GPU / CPU
        metric (string): evaluation metrics to use (name passed to
            ``evaluate.load``). Defaults to accuracy.

    Returns:
        tuple: ``(metric_val, predictions)`` where ``metric_val`` is whatever
        the loaded metric's ``compute()`` returns and ``predictions`` is a
        numpy array of predicted class indices in dataset order.
    """
    # Separate name so the `metric` argument is not rebound to a different type.
    scorer = evaluate.load(metric)
    model.eval()

    predictions = []
    for batch in ds:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            logits = model(**batch).logits

        preds = torch.argmax(logits, dim=-1)
        scorer.add_batch(predictions=preds, references=batch["labels"])
        # extend() appends in place; `list + list` would copy the whole
        # accumulated list on every batch.
        predictions.extend(preds.tolist())

    metric_val = scorer.compute()
    return metric_val, np.array(predictions)

def _get_misclassified_samples(ds, predictions):
    """Tabulate samples next to their predictions and flag the wrong ones.

    Args:
        ds: samples convertible by ``pd.DataFrame``; must provide a ``label`` column
        predictions: predicted labels, one per sample, in the same order as ``ds``

    Returns:
        pandas DataFrame of ``ds`` with two added columns: ``pred`` and the
        boolean ``misclassified`` (True where ``label`` differs from ``pred``).

    Also prints, per true label, the percentage of samples that were and were
    not misclassified.
    """
    samples = pd.DataFrame(ds)
    samples['pred'] = predictions
    samples['misclassified'] = samples['label'].ne(samples['pred'])

    # Share of misclassified (True) vs. correct (False) within each true label, in percent.
    per_label_share = samples.groupby('label')['misclassified'].value_counts(normalize=True)
    print(per_label_share * 100)

    return samples

def evaluate_model(model_checkpoint, ds_test, metric="accuracy", num_labels=3, batch_size=8):
    """Evaluate a saved model on every test split and collect per-sample results.

    Args:
        model_checkpoint (string): path to best model; the tokenizer is loaded
            from the same path,
        ds_test (DatasetDict): huggingface dataset for fever_test, pubhealth_test, climate_test;
            every split needs ``claim``, ``evidence`` and ``label`` columns,
        metric (string): evaluation metrics to use. Defaults to accuracy.
        num_labels (int): number of target classes of the classifier. Defaults to 3.
        batch_size (int): number of samples per evaluation batch. Defaults to 8.

    Returns:
        dict: split name -> pandas DataFrame holding that split's samples plus
        the ``pred`` and ``misclassified`` columns.
    """

    #===================================================
    # Load Model
    #===================================================
    model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)

    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    print(f"Model loaded into {device}")
    model.to(device)

    #===================================================
    # Tokenize dataset
    #===================================================
    print("Tokenizing dataset")
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

    def preprocess_function(samples):
        # Truncate only the evidence so the claim is always kept whole.
        # No padding here: padding inside a batched `map` pads each map chunk
        # to that chunk's longest sequence, so rows coming from different
        # chunks can have different lengths and would not stack for an
        # arbitrary batch size. Padding is done per batch by the collator.
        return tokenizer(samples['claim'], samples['evidence'],
                         truncation='only_second',
                         max_length=512)

    encoded_ds = ds_test.map(preprocess_function, batched=True)

    # format tokens to fit huggingface language model formats
    encoded_ds = encoded_ds.remove_columns(["claim", "evidence"])
    encoded_ds = encoded_ds.rename_column("label", "labels")
    encoded_ds.set_format("torch")

    # Pads every batch to the longest sequence in that batch.
    collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)

    #===================================================
    # Evaluate
    #===================================================
    error_samples = {}
    for ds_name in encoded_ds.keys():
        print(f"Evaluating {ds_name}")
        eval_ds = DataLoader(encoded_ds[ds_name], batch_size=batch_size, collate_fn=collate_fn)
        r, predictions = _evaluate(model, eval_ds, device, metric)
        print(f"Overall Accuracy for {ds_name} :: {r}")

        # The helper builds a fresh DataFrame from the untokenized split, so
        # the claim/evidence text stays available for inspection.
        error_samples[ds_name] = _get_misclassified_samples(ds_test[ds_name], predictions)

    return error_samples

In [25]:
# Evaluate the selected checkpoint on every split of ds_test and keep the
# per-split DataFrames (with predictions) for the error analysis further down.
# NOTE(review): the recorded output shows label 1 misclassified 100% of the
# time on all three splits — worth checking whether the checkpoint's label
# mapping matches the datasets'.
error_samples = evaluate_model(model_checkpoint, ds_test)

Model loaded into cuda
Tokenizing dataset


                                                                 

Evaluating fever
Overall Accuracy for fever :: {'accuracy': 0.6144614461446145}
label  misclassified
0      False             84.398440
       True              15.601560
1      True             100.000000
2      False             99.939994
       True               0.060006
Name: misclassified, dtype: float64
Evaluating pubhealth
Overall Accuracy for pubhealth :: {'accuracy': 0.3556201550387597}
label  misclassified
0      False             58.263773
       True              41.736227
1      True             100.000000
2      True              60.000000
       False             40.000000
Name: misclassified, dtype: float64
Evaluating climate
Overall Accuracy for climate :: {'accuracy': 0.57}
label  misclassified
0      False             91.578947
       True               8.421053
1      True             100.000000
2      True              60.869565
       False             39.130435
Name: misclassified, dtype: float64


### Label distribution statistics for each dataset

In [None]:
# ds_fever, ds_pubhealth, ds_climate, ds_test

In [283]:
# Label distribution of the FEVER test split: percentages first, then raw counts.
df = pd.DataFrame(ds_fever['fever_test'])
print(
    df['label'].value_counts(normalize=True) * 100,
    df['label'].value_counts(),
    sep='\n',
)

2    33.333333
0    33.333333
1    33.333333
Name: label, dtype: float64
2    3333
0    3333
1    3333
Name: label, dtype: int64


In [287]:
# Label distribution of the PUBHEALTH validation split: percentages first, then raw counts.
df = pd.DataFrame(ds_pubhealth['validation'])
print(
    df['label'].value_counts(normalize=True) * 100,
    df['label'].value_counts(),
    sep='\n',
)

0    59.904762
1    36.190476
2     3.904762
Name: label, dtype: float64
0    629
1    380
2     41
Name: label, dtype: int64


In [289]:
# Label distribution of the climate train split: percentages first, then raw counts.
df = pd.DataFrame(ds_climate['train'])
print(
    df['label'].value_counts(normalize=True) * 100,
    df['label'].value_counts(),
    sep='\n',
)

0    47.298675
2    34.352701
1    18.348624
Name: label, dtype: float64
0    464
2    337
1    180
Name: label, dtype: int64


In [291]:
# Label distribution of the climate validation split: percentages first, then raw counts.
df = pd.DataFrame(ds_climate['validation'])
print(
    df['label'].value_counts(normalize=True) * 100,
    df['label'].value_counts(),
    sep='\n',
)

0    47.5
2    34.0
1    18.5
Name: label, dtype: float64
0    95
2    68
1    37
Name: label, dtype: int64


In [21]:
# Label distribution of the climate test split: percentages first, then raw counts.
df = pd.DataFrame(ds_test['climate'])
print(
    df['label'].value_counts(normalize=True) * 100,
    df['label'].value_counts(),
    sep='\n',
)

0    47.647059
2    34.117647
1    18.235294
Name: label, dtype: float64
0    81
2    58
1    31
Name: label, dtype: int64


### Error Analysis on CLIMATE-FEVER

In [24]:
# Splits for which per-sample predictions were collected by evaluate_model.
error_samples.keys()

dict_keys(['fever', 'pubhealth', 'climate'])

In [38]:
# Keep only the climate test samples whose prediction differs from the label.
_df = error_samples['climate'].loc[lambda frame: frame['misclassified'].eq(True)]

In [39]:
# Evidence text of the second misclassified sample whose true label is 1.
_df.loc[_df['label'] == 1, 'evidence'].iloc[1]

"The effects of global warming include its effects on human health. There are, however, some positive possible aspects to climate change as well. This could negatively affect the affordability of food and the subsequent health of the population. Floods have short and long term negative implications to peoples' health and well being. These melting glaciers have many social and ecological consequences that directly or indirectly impact the health and well-being of humans."

In [40]:
# Claim text of the same sample (second misclassified sample with true label 1).
_df.loc[_df['label'] == 1, 'claim'].iloc[1]

'receding polar ice caps have little if any negative impact on human health and welfare, and likely a positive benefit'