In [1]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import BertTokenizer,BertForSequenceClassification, BertConfig
from transformers.pipelines import pipeline
import os
import pandas as pd
import numpy as np
import torch
from tqdm import tqdm
from torch import nn

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Run on the first CUDA GPU when PyTorch can see one; otherwise use the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

In [3]:
# Root folder of this use case; the dataset CSVs are read from <base_dir>/dataset.
base_dir = "use_case/hf-bert-20-newsgroups-macroclass-drift-recreation-vldb"

# Read the four CSV splits in a fixed order and bind each one to its own name.
# NOTE(review): the file names suggest labels 0-4 for train/test/new_unseen and
# label 5 for the drifted split -- confirm against the dataset preparation step.
df_train, df_test, df_new_unseen, df_drifted = (
    pd.read_csv(os.path.join(base_dir, "dataset", csv_name))
    for csv_name in (
        "df_train_0-4.csv",
        "df_test_0-4.csv",
        "df_new_unseen_0-4.csv",
        "df_drifted_5.csv",
    )
)

In [4]:
# Folder holding the saved checkpoint, plus the file names used with it.
OUTPUT_DIR = "use_case/hf-bert-20-newsgroups-macroclass-drift-recreation-vldb/saved_model/best_model"
CONFIG_NAME = "config.json"
WEIGHTS_NAME = "pytorch_model.bin"  # defined for reference; not used in this cell
BERT_MODEL = 'bert-base-uncased'  # identifier the tokenizer is loaded from

# Load the stored config with output_hidden_states=True so that each forward
# pass also returns the per-layer hidden states.
config = BertConfig.from_pretrained(
    os.path.join(OUTPUT_DIR, CONFIG_NAME),
    output_hidden_states=True,
)

# Load the classifier from the checkpoint folder and move it to the chosen device.
model = BertForSequenceClassification.from_pretrained(OUTPUT_DIR, config=config).to(device)

# The tokenizer is loaded from BERT_MODEL rather than from OUTPUT_DIR.
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL, do_lower_case=True)

In [5]:
# Keyword arguments for a tokenizer call: pad to the maximum length and truncate.
tokenizer_kwargs = dict(padding="max_length", truncation=True)

In [6]:
# Macro-class names indexed by label id (position in the list == id).
train_id2label = [
    "Technology",
    "Sale-Ads",
    "Politics",
    "Religion",
    "Science",
    "Recreation",
]

In [7]:
def extract_embedding_and_predict(model, tokenizer, df, batch_size=32):
    """Classify every text in ``df`` and collect the [CLS] embedding of each one.

    The texts are processed in consecutive batches of ``batch_size`` rows (the
    final batch may be shorter). Each batch is tokenized with dynamic padding
    and truncation, run through ``model`` under ``torch.no_grad()``, and two
    things are kept per text: the arg-max class over the logits and the last
    hidden-state vector at sequence position 0.

    Relies on the notebook-level names ``device`` (target device for the
    tokenized batch) and ``train_id2label`` (label id -> label name).

    Args:
        model: Classifier called as ``model(**tokens)`` whose output exposes
            ``"logits"`` and ``"hidden_states"`` (i.e. it was configured with
            ``output_hidden_states=True``).
        tokenizer: Callable accepting a list of strings together with
            ``padding``, ``truncation`` and ``return_tensors`` keywords.
        df: DataFrame with the columns ``text``, ``macro_label_id``,
            ``micro_label_id`` and ``micro_label_name``.
        batch_size: Number of texts per forward pass. Defaults to 32.

    Returns:
        An 8-tuple ``(X, E, Y_original_macro, Y_original_micro,
        Y_original_macro_names, Y_original_micro_names, Y_predicted,
        Y_predicted_names)`` where ``X`` is the list of texts, ``E`` is a
        float64 array with one embedding row per text, and the remaining
        items are lists aligned with ``X``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    X = df["text"].tolist()  # input texts
    Y_original_macro = df["macro_label_id"].tolist()  # ground-truth macro ids
    Y_original_macro_names = [train_id2label[label_id] for label_id in Y_original_macro]
    Y_original_micro = df["micro_label_id"].tolist()  # ground-truth micro ids
    Y_original_micro_names = df["micro_label_name"].tolist()  # ground-truth micro names

    Y_predicted = []  # predicted label ids
    Y_predicted_names = []  # predicted label names
    cls_embeddings = []  # one (batch, hidden) array per batch, joined once at the end

    # A strided range covers the full batches and the shorter trailing batch
    # with the same code path.
    for start in tqdm(range(0, len(X), batch_size)):
        input_texts = X[start:start + batch_size]

        tokenized_texts = tokenizer(input_texts, padding=True, truncation=True, return_tensors="pt")

        with torch.no_grad():
            outputs = model(**tokenized_texts.to(device))

        # Softmax is order-preserving, so it is not needed to pick the top class.
        batch_labels = torch.argmax(outputs["logits"], dim=-1).tolist()
        Y_predicted.extend(batch_labels)
        Y_predicted_names.extend(train_id2label[label_id] for label_id in batch_labels)

        # Last entry of hidden_states is the final layer; index 0 along the
        # sequence axis is the first token ([CLS] for BERT tokenizers).
        last_hidden_state = outputs["hidden_states"][-1]
        cls_embeddings.append(last_hidden_state[:, 0, :].detach().cpu().numpy())

    if cls_embeddings:
        # Single concatenation instead of re-stacking the growing array per batch.
        # Cast to float64 to keep the dtype the previous vstack-based code produced.
        E = np.concatenate(cls_embeddings, axis=0).astype(np.float64)
    else:
        # Empty input: keep a well-formed (0, hidden) array; 768 is the fallback width.
        hidden_size = getattr(getattr(model, "config", None), "hidden_size", 768)
        E = np.empty((0, hidden_size))

    return X, E, Y_original_macro, Y_original_micro, Y_original_macro_names, Y_original_micro_names, Y_predicted, Y_predicted_names

In [8]:
# Texts, embeddings, ground-truth labels and predictions for the test split.
(
    X_test,
    E_test,
    Y_original_test_macro,
    Y_original_test_micro,
    Y_original_names_test_macro,
    Y_original_names_test_micro,
    Y_predicted_test,
    Y_predicted_names_test,
) = extract_embedding_and_predict(model, tokenizer, df_test)





100%|██████████| 105/105 [00:50<00:00,  2.08it/s]


In [9]:
# Texts, embeddings, ground-truth labels and predictions for the training split.
(
    X_train,
    E_train,
    Y_original_train_macro,
    Y_original_train_micro,
    Y_original_names_train_macro,
    Y_original_names_train_micro,
    Y_predicted_train,
    Y_predicted_names_train,
) = extract_embedding_and_predict(model, tokenizer, df_train)

100%|██████████| 158/158 [01:14<00:00,  2.11it/s]


In [10]:
# Texts, embeddings, ground-truth labels and predictions for the drifted split.
(
    X_drift,
    E_drift,
    Y_original_drift_macro,
    Y_original_drift_micro,
    Y_original_names_drift_macro,
    Y_original_names_drift_micro,
    Y_predicted_drift,
    Y_predicted_names_drift,
) = extract_embedding_and_predict(model, tokenizer, df_drifted)

100%|██████████| 114/114 [00:46<00:00,  2.45it/s]


In [None]:
# Texts, embeddings, ground-truth labels and predictions for the new-unseen split.
(
    X_new_unseen,
    E_new_unseen,
    Y_original_new_unseen_macro,
    Y_original_new_unseen_micro,
    Y_original_names_new_unseen_macro,
    Y_original_names_new_unseen_micro,
    Y_predicted_new_unseen,
    Y_predicted_names_new_unseen,
) = extract_embedding_and_predict(model, tokenizer, df_new_unseen)

 57%|█████▋    | 98/173 [00:48<00:34,  2.15it/s]

In [None]:
import h5py

In [None]:
def save_embedding(output_path, X, E, Y_original_macro, Y_original_micro, Y_original_names_macro, Y_original_names_micro, Y_predicted, Y_predicted_names):
    """Write texts, embeddings, labels and predictions to one gzip-compressed HDF5 file.

    Any existing file at ``output_path`` is overwritten (mode ``"w"``). One
    dataset is created per item listed below; text data is stored as
    variable-length UTF-8 strings, everything else with the dtype NumPy infers.

    Args:
        output_path: Destination ``.hdf5`` path.
        X: Input texts.
        E: Embedding matrix, one row per text.
        Y_original_macro: Ground-truth macro label ids.
        Y_original_micro: Ground-truth micro label ids.
        Y_original_names_macro: Ground-truth macro label names.
        Y_original_names_micro: Ground-truth micro label names.
        Y_predicted: Predicted label ids.
        Y_predicted_names: Predicted label names.

    Returns:
        None.
    """
    datasets = {
        "X": X,
        "E": E,
        # Un-suffixed keys duplicate the macro labels, as in the original layout.
        # NOTE(review): presumably kept for readers that expect these names -- confirm.
        "Y_original": Y_original_macro,
        "Y_original_names": Y_original_names_macro,
        "Y_original_macro": Y_original_macro,
        "Y_original_names_macro": Y_original_names_macro,
        # Previously these two keys were filled with the macro values by mistake.
        "Y_original_micro": Y_original_micro,
        "Y_original_names_micro": Y_original_names_micro,
        "Y_predicted": Y_predicted,
        "Y_predicted_names": Y_predicted_names,
    }

    # The context manager closes the file even if one of the writes raises.
    with h5py.File(output_path, "w") as fp:
        for name, values in datasets.items():
            data = np.asarray(values)
            if data.dtype.kind == "U":
                # h5py cannot store NumPy fixed-width unicode arrays, so write
                # text as variable-length UTF-8 strings instead.
                fp.create_dataset(
                    name,
                    data=data.astype(object),
                    dtype=h5py.string_dtype(),
                    compression="gzip",
                )
            else:
                fp.create_dataset(name, data=data, compression="gzip")

In [None]:
# Folder that receives the HDF5 embedding files. Create it up front:
# opening an HDF5 file for writing does not create missing parent directories.
embedding_dir = os.path.join(base_dir, "saved_embedding")
os.makedirs(embedding_dir, exist_ok=True)

In [None]:
# Store the training-split results.
save_embedding(
    output_path=os.path.join(embedding_dir, "train_embedding_0-4.hdf5"),
    X=X_train,
    E=E_train,
    Y_original_macro=Y_original_train_macro,
    Y_original_micro=Y_original_train_micro,
    Y_original_names_macro=Y_original_names_train_macro,
    Y_original_names_micro=Y_original_names_train_micro,
    Y_predicted=Y_predicted_train,
    Y_predicted_names=Y_predicted_names_train,
)

In [None]:
# Store the test-split results.
save_embedding(
    output_path=os.path.join(embedding_dir, "test_embedding_0-4.hdf5"),
    X=X_test,
    E=E_test,
    Y_original_macro=Y_original_test_macro,
    Y_original_micro=Y_original_test_micro,
    Y_original_names_macro=Y_original_names_test_macro,
    Y_original_names_micro=Y_original_names_test_micro,
    Y_predicted=Y_predicted_test,
    Y_predicted_names=Y_predicted_names_test,
)

In [None]:
# Store the new-unseen-split results.
save_embedding(
    output_path=os.path.join(embedding_dir, "new_unseen_embedding_0-4.hdf5"),
    X=X_new_unseen,
    E=E_new_unseen,
    Y_original_macro=Y_original_new_unseen_macro,
    Y_original_micro=Y_original_new_unseen_micro,
    Y_original_names_macro=Y_original_names_new_unseen_macro,
    Y_original_names_micro=Y_original_names_new_unseen_micro,
    Y_predicted=Y_predicted_new_unseen,
    Y_predicted_names=Y_predicted_names_new_unseen,
)

In [None]:
# Store the drifted-split results.
save_embedding(
    output_path=os.path.join(embedding_dir, "drifted_embedding_5.hdf5"),
    X=X_drift,
    E=E_drift,
    Y_original_macro=Y_original_drift_macro,
    Y_original_micro=Y_original_drift_micro,
    Y_original_names_macro=Y_original_names_drift_macro,
    Y_original_names_micro=Y_original_names_drift_micro,
    Y_predicted=Y_predicted_drift,
    Y_predicted_names=Y_predicted_names_drift,
)