In [1]:
import logging
import torch
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, Trainer
from datasets import Dataset, ClassLabel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from train_test_split import select_eval_with_cluster, select_eval
from preprocessing import preprocess
from evaluation import evaluate
from bert import tokenize, get_BERT, prepare_dataset, compute_metrics

[nltk_data] Downloading package stopwords to /home/jonhue/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/jonhue/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jonhue/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# Configure the root logger to emit records at INFO level and above.
logging.basicConfig(level=logging.INFO)

In [4]:
# Use the GPU when PyTorch reports CUDA as available; otherwise run on the CPU.
if torch.cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'
device

'cuda'

In [5]:
# Checkpoint path per cluster id; looked up as MODEL[cluster] in evaluate_cluster.
# Every cluster currently maps to the same baseline checkpoint. The trailing
# comments on some entries name alternative paths — presumably cluster-specific
# checkpoints that can be swapped in (TODO confirm).
MODEL = {
  0: 'models/roberta-baseline-full', # 'models/cluster-0',
  1: 'models/roberta-baseline-full',
  2: 'models/roberta-baseline-full',
  3: 'models/roberta-baseline-full', # 'models/cluster-3',
  4: 'models/roberta-baseline-full', # 'models/cluster-4',
  5: 'models/roberta-baseline-full', # 'models/cluster-5',
  6: 'models/roberta-baseline-full',
}
# Identifier handed to AutoTokenizer.from_pretrained in evaluate_cluster.
TOKENIZER = 'cardiffnlp/twitter-roberta-base-sentiment-latest'
# Forwarded as `flags` to preprocess().
# NOTE(review): None presumably means "no preprocessing" — confirm against preprocess().
PREPROCESSING = None

In [6]:
# Load the cluster assignments and keep only the join key and the cluster id.
df_cluster_map = pd.read_csv('clustering+bert/eval.csv').loc[:, ['index', 'cluster']]
df_cluster_map

Unnamed: 0,index,cluster
0,922648.0,0
1,944379.0,4
2,2182552.0,4
3,786886.0,4
4,1130778.0,3
...,...,...
1249995,1478680.0,2
1249996,1972646.0,4
1249997,1710597.0,5
1249998,1835784.0,4


In [None]:
# Distinct cluster ids in ascending order (np.unique returns its result sorted).
CLUSTERS = np.unique(df_cluster_map['cluster'])
CLUSTERS

array([0, 1, 2, 3, 4, 5, 6])

In [None]:
# NOTE(review): `load_test` is not imported by any cell visible in this notebook.
# Add the import (its module is not visible here); otherwise a
# Restart-and-Run-All fails with NameError.
df = load_test(x_col='text')
# Copy the row labels into an integer `index` column; it is the key used to
# join the cluster map onto these rows.
df['index'] = df.index.astype(np.int_)
df

Unnamed: 0,text,index
1,sea doo pro sea scooter ( sports with the port...,1
2,<user> shucks well i work all week so now i ca...,2
3,i cant stay away from bug thats my baby\n,3
4,<user> no ma'am ! ! ! lol im perfectly fine an...,4
5,"whenever i fall asleep watching the tv , i alw...",5
...,...,...
9996,had a nice time w / my friend lastnite\n,9996
9997,<user> no it's not ! please stop !\n,9997
9998,not without my daughter ( dvd two-time oscar (...,9998
9999,<user> have fun in class sweetcheeks\n,9999


In [None]:
# Attach each row's cluster id by joining on `index` (pandas' default inner
# join, so rows without a matching key on both sides are dropped).
df_eval = df.merge(df_cluster_map, on='index')
df_eval

Unnamed: 0,text,index,cluster
0,sea doo pro sea scooter ( sports with the port...,1,1
1,<user> shucks well i work all week so now i ca...,2,5
2,i cant stay away from bug thats my baby\n,3,0
3,<user> no ma'am ! ! ! lol im perfectly fine an...,4,5
4,"whenever i fall asleep watching the tv , i alw...",5,0
...,...,...,...
9995,had a nice time w / my friend lastnite\n,9996,0
9996,<user> no it's not ! please stop !\n,9997,3
9997,not without my daughter ( dvd two-time oscar (...,9998,1
9998,<user> have fun in class sweetcheeks\n,9999,0


In [None]:
# Group rows by cluster id so that per-cluster predictions, concatenated in
# ascending cluster order, line up row-for-row with this frame.
df_eval = df_eval.sort_values('cluster', ascending=True)
df_eval

Unnamed: 0,text,index,cluster
4283,"waking up at 2 am with food poisoning , not my...",4284,0
4237,<user> it's still sunny outside hope it's bett...,4238,0
6470,hopefully radar goes right\n,6471,0
1518,"im going for a shower , try not to miss me wil...",1519,0
1517,<user> you guys need to come to new york it wo...,1518,0
...,...,...,...
7136,"13x15 custom picture frame / poster frame 1 "" ...",7137,6
7855,15x22 custom picture frame / poster frame 1.22...,7856,6
42,05x11 custom picture frame / poster frame 1.37...,43,6
3702,hot : 22x33 custom picture frame / poster fram...,3703,6


In [None]:
# Sanity check: list the distinct cluster ids in the order they appear in df_eval.
pd.unique(df_eval['cluster'])

array([0, 1, 2, 3, 4, 5, 6])

In [None]:
def evaluate_cluster(cluster: int) -> tuple:
  """Run the checkpoint configured for `cluster` on that cluster's rows of `df_eval`.

  Args:
    cluster: Cluster id. Selects the rows of the notebook-level `df_eval`
      whose `cluster` column equals it, and the checkpoint `MODEL[cluster]`.

  Returns:
    The object returned by `Trainer.predict` for the tokenized rows (in
    Hugging Face Transformers a `PredictionOutput` named tuple whose first
    field holds the raw model outputs) — not a float. Row order presumably
    follows `df_eval`; verify that `tokenize` does not reorder rows.

  Raises:
    KeyError: If `cluster` has no entry in `MODEL`.
  """
  # Work on an explicit copy: `preprocess` is called only for its effect on
  # the frame (its return value is discarded), and editing a filtered
  # selection of `df_eval` in place triggers pandas' SettingWithCopyWarning.
  # The local name also no longer shadows the notebook-level `df`.
  cluster_rows = df_eval[df_eval['cluster'] == cluster].copy()
  preprocess(cluster_rows, flags=PREPROCESSING, x_col='text')
  dataset_eval = Dataset.from_pandas(cluster_rows)

  tokenizer = AutoTokenizer.from_pretrained(TOKENIZER)
  eval_tokenized = tokenize(dataset_eval, tokenizer)

  model = get_BERT(MODEL[cluster], device)
  # The Trainer is built only to call `predict`; no training is started here.
  trainer = Trainer(model, tokenizer=tokenizer)
  return trainer.predict(eval_tokenized)

In [None]:
# Predict cluster by cluster, following the ascending order of CLUSTERS.
preds = [evaluate_cluster(cluster_id) for cluster_id in CLUSTERS]

# Stack the raw model outputs (first field of each result) into one array.
# Its row order matches df_eval because df_eval was sorted by cluster.
pred = np.concatenate([output.predictions for output in preds])

In [None]:
# Inspect the concatenated per-cluster model outputs before they are reduced to labels.
pred

array([[ 3.9073832 , -2.5139768 ],
       [-0.6839598 ,  0.77314305],
       [-1.3159345 ,  1.240455  ],
       ...,
       [ 4.130578  , -2.6494563 ],
       [ 4.1334667 , -2.6509888 ],
       [ 4.129686  , -2.649041  ]], dtype=float32)

In [None]:
# Reduce each row of model outputs to the column index of its largest value.
# Not re-runnable on its own: `pred` is rebound to the reduced array.
pred = pred.argmax(axis=1)

In [None]:
# Relabel class index 0 as -1 in place; all other values are left unchanged.
np.putmask(pred, pred == 0, -1)

In [None]:
# Attach the labels positionally. df_eval is still in cluster-sorted order,
# which is the order in which the per-cluster predictions were concatenated.
df_eval = df_eval.assign(pred=pred)
df_eval

Unnamed: 0,text,index,cluster,pred
4283,"waking up at 2 am with food poisoning , not my...",4284,0,-1
4237,<user> it's still sunny outside hope it's bett...,4238,0,1
6470,hopefully radar goes right\n,6471,0,1
1518,"im going for a shower , try not to miss me wil...",1519,0,1
1517,<user> you guys need to come to new york it wo...,1518,0,1
...,...,...,...,...
7136,"13x15 custom picture frame / poster frame 1 "" ...",7137,6,-1
7855,15x22 custom picture frame / poster frame 1.22...,7856,6,-1
42,05x11 custom picture frame / poster frame 1.37...,43,6,-1
3702,hot : 22x33 custom picture frame / poster fram...,3703,6,-1


In [None]:
# Put the rows back in ascending `index` order now that predictions are attached.
df_eval = df_eval.sort_values('index', ascending=True)
df_eval

Unnamed: 0,text,index,cluster,pred
0,sea doo pro sea scooter ( sports with the port...,1,1,-1
1,<user> shucks well i work all week so now i ca...,2,5,-1
2,i cant stay away from bug thats my baby\n,3,0,1
3,<user> no ma'am ! ! ! lol im perfectly fine an...,4,5,1
4,"whenever i fall asleep watching the tv , i alw...",5,0,-1
...,...,...,...,...
9995,had a nice time w / my friend lastnite\n,9996,0,1
9996,<user> no it's not ! please stop !\n,9997,3,-1
9997,not without my daughter ( dvd two-time oscar (...,9998,1,-1
9998,<user> have fun in class sweetcheeks\n,9999,0,1


In [None]:
# NOTE(review): `prepare_submission` is not imported by any cell visible in
# this notebook. Add the import (its module is not visible here); otherwise a
# Restart-and-Run-All fails with NameError.
# Passes the `pred` column, in `index` order, as a NumPy array.
prepare_submission(df_eval['pred'].to_numpy())