In [1]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import RobertaTokenizer,RobertaForSequenceClassification, RobertaConfig
from transformers.pipelines import pipeline
import os
import pandas as pd
import numpy as np
import torch
from tqdm import tqdm
from torch import nn

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Run on the first CUDA GPU when one is visible to torch, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

In [3]:
# Folder that holds the CSV splits read below.
data_dir = "static/data/roberta"

# Read the four CSV files in a fixed order and bind each one to its own frame.
df_train, df_test, df_new_unseen, df_drifted = (
    pd.read_csv(os.path.join(data_dir, csv_name))
    for csv_name in (
        "df_train_0_1_2.csv",
        "df_test_0_1_2.csv",
        "df_new_unseen_0_1_2.csv",
        "df_drifted_3.csv",
    )
)

In [4]:
MODEL_DIR = "static/saved_models/roberta/best_model"  # directory passed to from_pretrained below
CONFIG_NAME = "config.json"  # config file name inside MODEL_DIR
WEIGHTS_NAME = "pytorch_model.bin"  # defined here but not referenced in the visible cells
BERT_MODEL = 'roberta-base'  # identifier the tokenizer is loaded from

# output_hidden_states=True makes the model return per-layer hidden states,
# which the embedding extraction reads later on.
config = RobertaConfig.from_pretrained(
    os.path.join(MODEL_DIR, CONFIG_NAME),
    output_hidden_states=True,
)

# Load the classifier from MODEL_DIR with that config, then move it to `device`.
model = RobertaForSequenceClassification.from_pretrained(
    os.path.join(MODEL_DIR),
    config=config,
)
model = model.to(device)

# NOTE(review): the tokenizer comes from BERT_MODEL rather than MODEL_DIR, and
# do_lower_case=True is forwarded as-is -- confirm both match how the model was trained.
tokenizer = RobertaTokenizer.from_pretrained(
    BERT_MODEL,
    do_lower_case=True,
)

In [5]:
# Keyword arguments for a tokenizer call (not referenced by the visible cells).
tokenizer_kwargs = dict(padding="max_length", truncation=True)

In [23]:
# Label names indexed by integer label id (position in the list == id).
train_id2label = [
    "World",
    "Sports",
    "Business",
    "Sci/Tech",
]

In [24]:
def extract_embedding_and_predict(model, tokenizer, df, batch_size=256):
    """Classify every text in ``df`` and collect its first-token embedding.

    The texts are processed in consecutive batches. Each batch is tokenized
    (padded to the longest text in the batch, truncated, returned as PyTorch
    tensors), moved to the module-level ``device`` and fed to ``model`` with
    gradients disabled. The predicted label is the argmax of the softmax over
    the logits; the embedding is the last hidden state at token position 0.

    Args:
        model: Sequence classifier whose output exposes ``"logits"`` and
            ``"hidden_states"`` (so it must be configured with
            ``output_hidden_states=True``).
        tokenizer: Callable tokenizer whose result supports ``.to(device)``
            and can be unpacked as keyword arguments for ``model``.
        df: DataFrame with a ``"text"`` column and an integer ``"label"``
            column; labels are used as indices into ``train_id2label``.
        batch_size: Number of texts per forward pass (default 256).

    Returns:
        Tuple ``(X, E, Y_original, Y_original_names, Y_predicted,
        Y_predicted_names)`` where ``X`` is the list of texts, ``E`` is a
        float64 array with one embedding row per text, and the ``Y_*`` lists
        hold the ground-truth / predicted label ids and their names. All six
        are aligned row by row with ``df``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    X = df["text"].tolist()  # List of input texts
    Y_original = df["label"].tolist()  # List of original labels (GT)
    Y_original_names = [train_id2label[l] for l in Y_original]  # Names of the GT labels
    Y_predicted = []  # Predicted label ids, filled batch by batch
    embedding_chunks = []  # One [batch, hidden] array per batch, joined once at the end

    # A single stepped loop also covers the final partial batch. Handling the
    # tail separately with `iloc[-remainder:]` is wrong when the remainder is 0:
    # `-0` is `0`, so the slice would select (and re-process) the whole frame.
    for start in tqdm(range(0, len(X), batch_size)):
        input_texts = X[start:start + batch_size]

        tokenized_texts = tokenizer(input_texts, padding=True, truncation=True, return_tensors="pt")

        with torch.no_grad():
            outputs = model(**tokenized_texts.to(device))

        batch_probabilities = nn.functional.softmax(outputs["logits"], dim=-1)
        Y_predicted.extend(torch.argmax(batch_probabilities, dim=1).tolist())

        # Last entry of hidden_states is the final layer (index 12 for a
        # 12-layer model); position 0 on the token axis is the CLS slot.
        last_layer_hidden_states = outputs["hidden_states"][-1]
        embedding_CLS = last_layer_hidden_states[:, 0, :]  # [batch, hidden]
        embedding_chunks.append(embedding_CLS.detach().cpu().numpy())

    if embedding_chunks:
        # float64 keeps the dtype callers got from the previous vstack-based version.
        E = np.concatenate(embedding_chunks, axis=0).astype(np.float64)
    else:
        # No rows at all: return an empty matrix with the 768-wide layout
        # this notebook has always used for its embeddings.
        E = np.empty((0, 768))

    Y_predicted_names = [train_id2label[l] for l in Y_predicted]

    return X, E, Y_original, Y_original_names, Y_predicted, Y_predicted_names

In [25]:
# Texts, embeddings, ground-truth labels and predictions for df_test.
(
    X_test,
    E_test,
    Y_original_test,
    Y_original_names_test,
    Y_predicted_test,
    Y_predicted_names_test,
) = extract_embedding_and_predict(model, tokenizer, df_test)

100%|██████████| 22/22 [00:14<00:00,  1.50it/s]


In [26]:
# Texts, embeddings, ground-truth labels and predictions for df_train.
(
    X_train,
    E_train,
    Y_original_train,
    Y_original_names_train,
    Y_predicted_train,
    Y_predicted_names_train,
) = extract_embedding_and_predict(model, tokenizer, df_train)

100%|██████████| 232/232 [02:57<00:00,  1.30it/s]


In [27]:
# Texts, embeddings, ground-truth labels and predictions for df_drifted.
(
    X_drift,
    E_drift,
    Y_original_drift,
    Y_original_names_drift,
    Y_predicted_drift,
    Y_predicted_names_drift,
) = extract_embedding_and_predict(model, tokenizer, df_drifted)

100%|██████████| 124/124 [01:52<00:00,  1.11it/s]


In [28]:
# Texts, embeddings, ground-truth labels and predictions for df_new_unseen.
(
    X_new_unseen,
    E_new_unseen,
    Y_original_new_unseen,
    Y_original_names_new_unseen,
    Y_predicted_new_unseen,
    Y_predicted_names_new_unseen,
) = extract_embedding_and_predict(model, tokenizer, df_new_unseen)

100%|██████████| 119/119 [01:25<00:00,  1.39it/s]


In [29]:
import h5py

In [30]:
def save_embedding(output_path, X, E, Y_original, Y_original_names, Y_predicted, Y_predicted_names):
    """Write texts, embeddings and labels to an HDF5 file.

    The file at ``output_path`` is opened in ``"w"`` mode, so an existing file
    is replaced. Six gzip-compressed datasets are created, named after the
    corresponding arguments: ``X``, ``E``, ``Y_original``,
    ``Y_original_names``, ``Y_predicted`` and ``Y_predicted_names``.

    Args:
        output_path: Destination path of the HDF5 file.
        X: Input texts.
        E: Embedding matrix.
        Y_original: Ground-truth label ids.
        Y_original_names: Ground-truth label names.
        Y_predicted: Predicted label ids.
        Y_predicted_names: Predicted label names.

    Returns:
        None.
    """
    # NOTE(review): X and the *_names arguments are lists of Python str; how
    # h5py stores them depends on the installed version -- confirm they round-trip.
    datasets = {
        "X": X,
        "E": E,
        "Y_original": Y_original,
        "Y_original_names": Y_original_names,
        "Y_predicted": Y_predicted,
        "Y_predicted_names": Y_predicted_names,
    }

    # The context manager closes the file even if one of the writes raises,
    # so a failed save does not leave an open handle behind.
    with h5py.File(output_path, "w") as fp:
        for dataset_name, values in datasets.items():
            fp.create_dataset(dataset_name, data=values, compression="gzip")

In [31]:
# Folder the HDF5 embedding files are written to.
embedding_dir = os.path.join(
    "static",
    "saved_embeddings",
    "roberta",
)

In [32]:
# Persist the outputs computed for the training split.
save_embedding(
    output_path=os.path.join(embedding_dir, "train_embedding_0_1_2.hdf5"),
    X=X_train,
    E=E_train,
    Y_original=Y_original_train,
    Y_original_names=Y_original_names_train,
    Y_predicted=Y_predicted_train,
    Y_predicted_names=Y_predicted_names_train,
)

In [33]:
# Persist the outputs computed for the test split.
save_embedding(
    output_path=os.path.join(embedding_dir, "test_embedding_0_1_2.hdf5"),
    X=X_test,
    E=E_test,
    Y_original=Y_original_test,
    Y_original_names=Y_original_names_test,
    Y_predicted=Y_predicted_test,
    Y_predicted_names=Y_predicted_names_test,
)

In [34]:
# Persist the outputs computed for the drifted split.
save_embedding(
    output_path=os.path.join(embedding_dir, "drifted_embedding_3.hdf5"),
    X=X_drift,
    E=E_drift,
    Y_original=Y_original_drift,
    Y_original_names=Y_original_names_drift,
    Y_predicted=Y_predicted_drift,
    Y_predicted_names=Y_predicted_names_drift,
)

In [35]:
# Persist the outputs computed for the new-unseen split.
save_embedding(
    output_path=os.path.join(embedding_dir, "new_unseen_embedding_0_1_2.hdf5"),
    X=X_new_unseen,
    E=E_new_unseen,
    Y_original=Y_original_new_unseen,
    Y_original_names=Y_original_names_new_unseen,
    Y_predicted=Y_predicted_new_unseen,
    Y_predicted_names=Y_predicted_names_new_unseen,
)