In [1]:
import logging
import torch
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, Trainer
from datasets import Dataset, ClassLabel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from train_test_split import select_train_with_cluster, select_eval_with_cluster
from preprocessing import preprocess
from evaluation import evaluate
from bert import tokenize, get_BERT, prepare_dataset, compute_metrics

[nltk_data] Downloading package stopwords to /home/jonhue/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/jonhue/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jonhue/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# Configure the root logger to emit INFO-level (and higher) records; the
# "INFO:root:" lines in the metrics output further down rely on this.
logging.basicConfig(level=logging.INFO)

In [4]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cuda'

In [None]:
# Checkpoint id handed to get_BERT when the model is loaded.
MODEL = 'cardiffnlp/twitter-roberta-base-sentiment-latest'
# Checkpoint id handed to AutoTokenizer.from_pretrained; currently the same checkpoint as MODEL.
TOKENIZER = 'cardiffnlp/twitter-roberta-base-sentiment-latest'

In [None]:
# Load the (index, cluster) assignments of both splits and stack them into a
# single lookup frame. pd.concat keeps each file's own row labels as they are.
cluster_columns = ['index', 'cluster']
df_cluster_map_train = pd.read_csv('clustering+bert/train.csv').loc[:, cluster_columns]
df_cluster_map_eval = pd.read_csv('clustering+bert/eval.csv').loc[:, cluster_columns]
df_cluster_map = pd.concat((df_cluster_map_train, df_cluster_map_eval))
df_cluster_map

Unnamed: 0,index,cluster
0,157049.0,1
1,2366208.0,2
2,1948945.0,0
3,1684769.0,5
4,2262152.0,1
...,...,...
1249995,1478680.0,2
1249996,1972646.0,4
1249997,1710597.0,5
1249998,1835784.0,4


In [None]:
# Cluster id handed to select_train_with_cluster / select_eval_with_cluster below.
CLUSTER = 5

In [None]:
# Build the train / eval frames by handing the combined cluster map and CLUSTER
# to the project helpers imported from train_test_split.
# NOTE(review): size=None presumably means "keep every matching row" — confirm in train_test_split.py.
df_train = select_train_with_cluster(df_cluster_map, CLUSTER, size=None)
df_eval = select_eval_with_cluster(df_cluster_map, CLUSTER, size=None)
# Display both shapes as a quick sanity check.
df_train.shape, df_eval.shape

((329028, 4), (329491, 4))

In [None]:
# Convert both frames with the project's prepare_dataset helper (imported from bert).
# NOTE(review): preprocessing=None presumably skips text preprocessing so the raw
# text reaches the tokenizer — confirm in bert.py.
dataset_train = prepare_dataset(df_train, preprocessing=None)
dataset_eval = prepare_dataset(df_eval, preprocessing=None)

Casting the dataset:   0%|          | 0/33 [00:00<?, ?ba/s]

Casting the dataset:   0%|          | 0/33 [00:00<?, ?ba/s]

In [None]:
import shutil

from transformers import AutoTokenizer

# Best-effort removal of a local ./cardiffnlp directory before loading.
# NOTE(review): presumably this keeps a stale local folder from shadowing the
# 'cardiffnlp/...' id passed to from_pretrained — confirm.
try:
    shutil.rmtree('cardiffnlp')
except FileNotFoundError:
    # Nothing to clean up.
    pass
except OSError as err:
    # Still best-effort (no crash), but a failed cleanup is no longer silent.
    logging.warning("Could not remove local 'cardiffnlp' directory: %s", err)

# Load the tokenizer for TOKENIZER and build the model for MODEL on `device`
# via the project's get_BERT helper.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER)
model = get_BERT(MODEL, device)

Downloading:   0%|          | 0.00/929 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/478M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpo

In [None]:
# Tokenize both datasets with the shared tokenizer via the project's tokenize helper.
# NOTE(review): the recorded run logged "Asking to truncate to max_length but no
# maximum length is provided ... Default to no truncation", so no truncation was
# applied here — check tokenize() in bert.py if a length cap was intended.
train_tokenized = tokenize(dataset_train, tokenizer)
eval_tokenized = tokenize(dataset_eval, tokenizer)



  0%|          | 0/330 [00:00<?, ?ba/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


  0%|          | 0/330 [00:00<?, ?ba/s]

In [None]:
def get_embedding(input_ids):
    """Return the mean word-embedding vector for one sequence of token ids.

    Indexes the word-embedding weight matrix of the global ``model``
    (``model.roberta.embeddings.word_embeddings``) with ``input_ids`` and
    averages the selected rows along the first axis, so every id in
    ``input_ids`` contributes equally to the result.
    """
    embedding_table = model.roberta.embeddings.word_embeddings.weight
    token_vectors = embedding_table[input_ids]
    return token_vectors.mean(dim=0)

In [None]:
# `tqdm` is used below but is not imported in any cell above, so this cell
# raised NameError on a fresh kernel; import it here.
from tqdm.auto import tqdm


def embed_dataset(tokenized):
    """Stack the mean word embedding of every example into one 2-D NumPy array.

    ``tokenized`` must support ``tokenized['input_ids']`` and yield one token-id
    sequence per example. Each sequence is passed to ``get_embedding``; the
    resulting vectors are moved to the CPU, converted to NumPy and stacked
    row-wise, one row per example.
    """
    # Pure feature extraction: no autograd graph is needed for these lookups.
    with torch.no_grad():
        rows = [
            get_embedding(input_ids).detach().cpu().numpy()
            for input_ids in tqdm(tokenized['input_ids'])
        ]
    return np.vstack(rows)


train_embeddings = embed_dataset(train_tokenized)
eval_embeddings = embed_dataset(eval_tokenized)
train_embeddings.shape, eval_embeddings.shape

100%|██████████| 329028/329028 [00:33<00:00, 9951.26it/s]
100%|██████████| 329491/329491 [00:32<00:00, 10045.68it/s]


((329028, 768), (329491, 768))

In [None]:
from xgboost import XGBClassifier

# The notebook already falls back to 'cpu' when no GPU is visible (see `device`),
# so pick the matching XGBoost backend instead of hard-coding the GPU one.
# On a CUDA machine the parameters are exactly the original ones.
use_gpu = device == 'cuda'
xgb_model = XGBClassifier(
    max_depth=6,
    n_estimators=10000,
    tree_method='gpu_hist' if use_gpu else 'hist',
    predictor='gpu_predictor' if use_gpu else 'cpu_predictor',
)

In [None]:
# Fit the classifier on the mean-embedding features with df_train['label'] as target.
# NOTE(review): assumes the rows of train_embeddings are still in df_train order —
# confirm prepare_dataset / tokenize do not reorder or drop examples.
xgb_model = xgb_model.fit(train_embeddings, df_train['label'])

In [None]:
# Per-class probabilities for the eval features (not hard labels, despite the name).
y_predict = xgb_model.predict_proba(eval_embeddings)

In [None]:
# Sanity check: the recorded run shows one row per eval example and two columns.
y_predict.shape

(329491, 2)

In [None]:
# Score the eval predictions with the project's compute_metrics helper, which is
# called with a single (predictions, labels) tuple.
# NOTE(review): y_predict holds probabilities from predict_proba. If compute_metrics
# (bert.py) expects raw logits and applies a softmax itself, 'bce' and the
# 'confidence' figures would be distorted (softmax over [p, 1-p] cannot exceed ~0.73,
# and the recorded mean confidence is 0.67) — confirm, and pass log-probabilities
# instead if that is the case. Accuracy and AUC would be unaffected either way.
compute_metrics((y_predict, df_eval['label']))

INFO:root:---
* accuracy: 0.769201586689773
* precision: 0.775155607131554
* recall: 0.8147183075166043
* f1: 0.79444471474832
---
INFO:root:---
* bce: 0.5315112144547388
* auc: 0.8493862052350207
---


{'accuracy': 0.769201586689773,
 'auc': 0.8493862052350207,
 'bce': 0.5315112144547388,
 'confidence': 0.6717468,
 'confidence_std': 0.06616979,
 'correct_confidence': 0.68343085,
 'correct_confidence_std': 0.060098097,
 'f1': 0.79444471474832,
 'incorrect_confidence': 0.6328067,
 'incorrect_confidence_std': 0.07044319,
 'precision': 0.775155607131554,
 'recall': 0.8147183075166043}