In [1]:
import logging
import torch
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, Trainer
from datasets import Dataset, ClassLabel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from train_test_split import select_eval_with_cluster
from preprocessing import preprocess
from evaluation import evaluate
from bert import tokenize, get_BERT, prepare_dataset, compute_metrics

[nltk_data] Downloading package stopwords to /home/jonhue/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/jonhue/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jonhue/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# Surface INFO-level log records in the notebook output; the "INFO:root:"
# metric summaries shown in the evaluation output are emitted at that level.
logging.basicConfig(level=logging.INFO)

In [4]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'
device

'cuda'

In [5]:
# Evaluation configuration.
# MODEL is the checkpoint path handed to get_BERT; the evaluation log reports
# the loaded model as a DistilBertForSequenceClassification.
MODEL = 'models/baseline'
# Checkpoint name loaded through AutoTokenizer (alternative kept in the trailing comment).
# NOTE(review): this is a BERT tokenizer used with a DistilBERT model; the log
# shows its token_type_ids being ignored -- confirm the vocabularies match.
TOKENIZER = 'bert-base-uncased' # 'cardiffnlp/twitter-roberta-base-sentiment-latest'
# Forwarded to prepare_dataset(..., preprocessing=...); None presumably
# disables preprocessing -- TODO confirm against prepare_dataset.
PREPROCESSING = None

In [6]:
# Load the cluster assignment table, keeping only the 'index' and 'cluster'
# columns of the CSV.
# NOTE(review): 'index' is displayed as float (e.g. 922648.0) -- confirm whether
# the file contains missing values or the column should be cast to int.
df_cluster_map = pd.read_csv('clustering+bert/eval.csv')[['index', 'cluster']]
df_cluster_map

Unnamed: 0,index,cluster
0,922648.0,0
1,944379.0,4
2,2182552.0,4
3,786886.0,4
4,1130778.0,3
...,...,...
1249995,1478680.0,2
1249996,1972646.0,4
1249997,1710597.0,5
1249998,1835784.0,4


In [7]:
# Distinct cluster ids present in the mapping. The resulting array is not
# sorted (the displayed result is [0, 4, 3, 5, 2, 1, 6]), so the evaluation
# loop visits clusters in this order.
CLUSTERS = df_cluster_map['cluster'].unique()
CLUSTERS

array([0, 4, 3, 5, 2, 1, 6])

In [8]:
# Load the classifier once and reuse it for every cluster evaluation.
# get_BERT comes from the project's `bert` module; presumably it restores the
# checkpoint at MODEL and moves it to `device` -- TODO confirm.
model = get_BERT(MODEL, device)

In [None]:
def evaluate_cluster(cluster: int) -> dict:
  """Evaluate the notebook-level `model` on the evaluation data of one cluster.

  Relies on the notebook globals `df_cluster_map`, `PREPROCESSING`,
  `TOKENIZER` and `model` being defined by earlier cells.

  Args:
    cluster: Cluster id passed, together with `df_cluster_map`, to
      `select_eval_with_cluster` (presumably selecting the evaluation rows
      assigned to that cluster -- confirm against that helper).

  Returns:
    The metrics dictionary returned by `Trainer.evaluate()`. This is a
    mapping of metric names to values, not a single float.
  """
  print(f'====== EVALUATING CLUSTER {cluster} ======')
  df_eval = select_eval_with_cluster(df_cluster_map, cluster)
  dataset_eval = prepare_dataset(df_eval, preprocessing=PREPROCESSING)

  # NOTE: the tokenizer is re-created on every call, which repeats the
  # config-loading log lines once per cluster.
  tokenizer = AutoTokenizer.from_pretrained(TOKENIZER)
  eval_tokenized = tokenize(dataset_eval, tokenizer)

  # No TrainingArguments are supplied, so Trainer runs with its defaults
  # (the evaluation log reports a batch size of 8).
  trainer = Trainer(model, eval_dataset=eval_tokenized, tokenizer=tokenizer, compute_metrics=compute_metrics)
  # Named distinctly from the notebook-level `metrics` dict to avoid confusion.
  cluster_metrics = trainer.evaluate()
  print(cluster_metrics)
  return cluster_metrics

In [None]:
# Per-cluster evaluation results, keyed by cluster id. Filled incrementally so
# that results gathered so far remain available if a later cluster fails.
metrics = {}

for cluster in CLUSTERS:
  metrics.update({cluster: evaluate_cluster(cluster)})



Casting the dataset: 100%|██████████| 4/4 [00:00<00:00, 29.67ba/s]
100%|██████████| 40/40 [00:03<00:00, 10.60ba/s]
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: cluster, token_type_ids, __index_level_0__, text, index. If cluster, token_type_ids, __index_level_0__, text, index are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 40000
  Batch size = 8


INFO:root:---
* accuracy: 0.8526
* precision: 0.8396406931008823
* recall: 0.8458449346755194
* f1: 0.8427313950386769
---
INFO:root:---
* bce: 0.33770535099356785
* auc: 0.9299977179490486
---




Casting the dataset: 100%|██████████| 4/4 [00:00<00:00, 29.43ba/s]
loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /home/jonhue/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.20.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size":

In [None]:
# Display the collected per-cluster metrics. This requires the evaluation loop
# cell to have been executed in the current kernel session; the saved
# NameError output indicates it was run without that cell having completed.
metrics

NameError: name 'metrics' is not defined