In [1]:
import logging

import numpy as np
import pandas as pd
import torch
from datasets import Dataset, ClassLabel
from transformers import AutoTokenizer, Trainer
from transformers.trainer_utils import PredictionOutput

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from train_test_split import select_eval_with_cluster, select_eval
from preprocessing import preprocess
from evaluation import evaluate
from bert import tokenize, get_BERT, prepare_dataset, compute_metrics

[nltk_data] Downloading package stopwords to /home/jonhue/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/jonhue/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jonhue/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# Configure the root logger so INFO-and-above records are emitted.
LOG_LEVEL = logging.INFO
logging.basicConfig(level=LOG_LEVEL)

In [4]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'
device

'cuda'

In [5]:
# Checkpoint currently used for every cluster.
BASELINE_MODEL = 'models/roberta-baseline-full'

# Maps cluster id -> model path handed to get_BERT.
MODEL = {
  **dict.fromkeys(range(7), BASELINE_MODEL),
  # Cluster-specific alternatives noted for clusters 0, 3, 4 and 5:
  # 'models/cluster-0', 'models/cluster-3', 'models/cluster-4', 'models/cluster-5'
}

# Identifier passed to AutoTokenizer.from_pretrained.
TOKENIZER = 'cardiffnlp/twitter-roberta-base-sentiment-latest'

# Forwarded as the `preprocessing` argument of prepare_dataset.
PREPROCESSING = None

In [6]:
# Keep only the columns needed to look up the cluster of each evaluation row.
cluster_map_columns = ['index', 'cluster']
df_cluster_map = pd.read_csv('clustering+bert/eval.csv').loc[:, cluster_map_columns]
df_cluster_map

Unnamed: 0,index,cluster
0,922648.0,0
1,944379.0,4
2,2182552.0,4
3,786886.0,4
4,1130778.0,3
...,...,...
1249995,1478680.0,2
1249996,1972646.0,4
1249997,1710597.0,5
1249998,1835784.0,4


In [None]:
# Distinct cluster ids present in the cluster map, in ascending order.
distinct_clusters = df_cluster_map['cluster'].unique()
CLUSTERS = np.sort(distinct_clusters)
CLUSTERS

array([0, 1, 2, 3, 4, 5, 6])

In [None]:
# Attach each evaluation row's cluster by joining on the shared 'index' column.
df_eval = select_eval().merge(df_cluster_map, on='index')

In [None]:
# Order rows by cluster so that each cluster's rows are contiguous.
df_eval = df_eval.sort_values('cluster')

In [None]:
def evaluate_cluster(cluster: int) -> 'PredictionOutput':
  """Run the model configured for `cluster` on that cluster's evaluation rows.

  The rows of `df_eval` whose 'cluster' column equals `cluster` are passed
  through `prepare_dataset` (with `PREPROCESSING`), tokenized with the
  tokenizer named by `TOKENIZER`, and scored by the model loaded from
  `MODEL[cluster]` on `device`.

  Args:
    cluster: Cluster id; must be a key of `MODEL`.

  Returns:
    The object returned by `Trainer.predict` for the cluster's tokenized rows
    (a `PredictionOutput`, whose first field holds the predictions).

  Raises:
    KeyError: If no model is configured for `cluster`.
  """
  # Resolve the checkpoint first so an unconfigured cluster fails before any
  # tokenization or model loading is attempted.
  model_path = MODEL[cluster]

  dataset_eval = prepare_dataset(df_eval[df_eval['cluster'] == cluster], preprocessing=PREPROCESSING)

  tokenizer = AutoTokenizer.from_pretrained(TOKENIZER)
  eval_tokenized = tokenize(dataset_eval, tokenizer)

  model = get_BERT(model_path, device)
  trainer = Trainer(model, tokenizer=tokenizer)
  return trainer.predict(eval_tokenized)

In [None]:
# Score every cluster with its configured model and gather the matching labels
# in the same pass, so predictions and labels line up by construction instead
# of relying on df_eval's row order and on the cluster ids being exactly 0..6.
# NOTE(review): assumes prepare_dataset keeps row order and count — the
# assertion below catches a count mismatch.
preds = []
labels = []
for cluster in map(int, CLUSTERS):
  preds.append(evaluate_cluster(cluster))
  labels.append(df_eval.loc[df_eval['cluster'] == cluster, 'label'])

all_predictions = np.concatenate([p.predictions for p in preds])
all_labels = pd.concat(labels)
assert len(all_predictions) == len(all_labels), 'predictions and labels are misaligned'

compute_metrics((all_predictions, all_labels))