In [1]:
import logging
import torch
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, Trainer
from datasets import Dataset, ClassLabel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from loading import load_train
from preprocessing import preprocess
from evaluation import evaluate
from bert import tokenize, get_BERT

[nltk_data] Downloading package stopwords to /Users/jonas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/jonas/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/jonas/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# Configure the root logger so INFO-level (and more severe) messages are emitted.
logging.basicConfig(level=logging.INFO)

In [4]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'
device

'cpu'

In [5]:
# Fraction of the rows read from train.csv that is sampled as the evaluation set.
EVAL_FRAC = 0.2

# Model variant name handed to the project helper get_BERT.
MODEL = 'base-small'
# Tokenizer identifier handed to AutoTokenizer.from_pretrained.
TOKENIZER = 'bert-base-uncased'
# Flags forwarded to preprocess(); NOTE(review): None presumably disables the
# optional preprocessing steps — confirm in preprocessing.preprocess.
PREPROCESSING = None

In [6]:
# Fixed seed so the held-out sample (and every metric computed from it) is
# identical after Restart Kernel -> Run All.
EVAL_SEED = 42

# Read the clustered training data and hold out EVAL_FRAC of its rows for evaluation.
df_train = pd.read_csv('clustering+bert/train.csv')
df_eval = df_train.sample(frac=EVAL_FRAC, random_state=EVAL_SEED)
df_eval

Unnamed: 0.1,Unnamed: 0,text,label,cluster
146157,146157,"<user> just a bitta maroon 5 , the wanted and ...",0.0,6
97382,97382,hate when people say friends ? if u aint blood...,1.0,4
150133,150133,s / o to <user> for being my sissy and always ...,1.0,6
93584,93584,<user> yeahhh . jokes .. i love that team but ...,1.0,4
95594,95594,blue at the mizzen ( aubrey-maturin the brand ...,0.0,1
...,...,...,...,...
64136,64136,i should never have to be up this early for wo...,0.0,4
9745,9745,: o how can u say someone resembles a failed a...,0.0,6
19093,19093,<user> should fwm\n,1.0,4
184613,184613,<user> i can't even move its sooo bad and mum ...,0.0,4


In [15]:
# Run the project's preprocess helper on the evaluation sample, pointing it at the
# 'text' column and passing the PREPROCESSING flags from the config cell.
# NOTE(review): the return value is discarded, so this presumably mutates df_eval
# in place — confirm in preprocessing.preprocess.
preprocess(df_eval, flags=PREPROCESSING, x_col='text')

In [14]:
# Distinct cluster ids present in the evaluation sample, in order of first appearance.
CLUSTERS = pd.unique(df_eval['cluster'])
CLUSTERS

array([6, 4, 1, 2, 0, 3, 5])

In [8]:
# Obtain the model from the project helper get_BERT for the configured MODEL
# variant and the selected device. evaluate_cluster reads this global `model`
# when it constructs its Trainer.
model = get_BERT(MODEL, device)

In [9]:
def compute_metrics(eval_pred):
  """Score one evaluation pass.

  Args:
    eval_pred: A pair ``(predictions, labels)``; the predictions are reduced
      to one class index per row by taking the argmax over axis 1.

  Returns:
    Whatever the project's ``evaluate(labels, predicted_classes)`` returns.
  """
  raw_scores, gold_labels = eval_pred
  predicted_classes = np.argmax(raw_scores, axis=1)
  return evaluate(gold_labels, predicted_classes)

In [None]:
def evaluate_cluster(cluster: int, df_eval: pd.DataFrame) -> float:
  """Evaluate the global ``model`` on the rows of ``df_eval`` that belong to one cluster.

  Args:
    cluster: Value of the ``cluster`` column selecting the rows to evaluate.
    df_eval: Evaluation frame. Must contain a ``cluster`` and a ``label``
      column, plus whatever ``tokenize`` reads (presumably ``text`` — confirm
      in ``bert.tokenize``).

  Returns:
    The accuracy that ``Trainer.evaluate`` reports for the selected rows.

  Raises:
    ValueError: If no row of ``df_eval`` belongs to ``cluster``.
    KeyError: If the evaluation metrics contain no ``eval_accuracy`` entry,
      i.e. ``compute_metrics`` did not report an ``accuracy`` metric.
  """
  df_cluster = df_eval[df_eval['cluster'] == cluster]
  # Fail early with a clear message: on an empty dataset the Trainer never
  # calls compute_metrics, and the failure would surface much later and obscurely.
  if df_cluster.empty:
    raise ValueError(f'no evaluation rows found for cluster {cluster!r}')

  dataset_eval = Dataset.from_pandas(df_cluster)

  # Re-type the label column as a two-class ClassLabel feature.
  new_features = dataset_eval.features.copy()
  new_features['label'] = ClassLabel(names=['0', '1'])
  dataset_eval = dataset_eval.cast(new_features)

  tokenizer = AutoTokenizer.from_pretrained(TOKENIZER)
  eval_tokenized = tokenize(dataset_eval, tokenizer)

  trainer = Trainer(model, eval_dataset=eval_tokenized, tokenizer=tokenizer, compute_metrics=compute_metrics)
  metrics = trainer.evaluate()
  # Trainer.evaluate prefixes every metric key with its metric_key_prefix
  # (default 'eval'), so an 'accuracy' entry produced by compute_metrics is
  # returned under 'eval_accuracy'; indexing with 'accuracy' raises KeyError.
  # NOTE(review): assumes the project's evaluate() returns a dict with an
  # 'accuracy' key — confirm in evaluation.evaluate.
  return metrics['eval_accuracy']