In [6]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import DistilBertConfig, DistilBertModel, DistilBertTokenizer, DistilBertForSequenceClassification
from transformers.pipelines import pipeline
import os
import pandas as pd
import numpy as np
import torch
from tqdm import tqdm
from torch import nn

In [2]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"

In [3]:
base_dir = "use_case/hf-distillbert-ag_news-0-1-2-split66"

df_train = pd.read_csv(os.path.join(base_dir,"dataset","df_train_0_1_2_split_66.csv"))
df_test = pd.read_csv(os.path.join(base_dir,"dataset","df_test_0_1_2_split_66.csv"))
df_new_unseen = pd.read_csv(os.path.join(base_dir,"dataset","df_new_unseen_0_1_2_split_66.csv"))
df_drifted = pd.read_csv(os.path.join(base_dir,"dataset","df_drifted_3_split_66.csv"))

In [7]:
OUTPUT_DIR = "use_case/hf-distillbert-ag_news-0-1-2-split66/saved_model/best_model"
CONFIG_NAME = "config.json"
WEIGHTS_NAME = "pytorch_model.bin"
BERT_MODEL = "distilbert-base-uncased" 

config = DistilBertConfig.from_pretrained(os.path.join(OUTPUT_DIR, CONFIG_NAME), output_hidden_states=True)
model = DistilBertForSequenceClassification.from_pretrained(os.path.join(OUTPUT_DIR), config=config)
model = model.to(device)
tokenizer = DistilBertTokenizer.from_pretrained(BERT_MODEL, do_lower_case=True)

In [None]:
#model_name_or_path = "use_case/hf-bert-ag_news-0-1-2-split66/saved_model/best_model"

#model_2 = AutoModelForSequenceClassification.from_pretrained(model_name_or_path, output_hidden_states=True)
#tokenizer_2 = AutoTokenizer.from_pretrained(model_name_or_path, do_lower_case=True)
#pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, return_all_scores=True)

In [8]:
tokenizer_kwargs = {"padding":"max_length", "truncation":True}

In [None]:
#batch_test_preds = pipe("money", **tokenizer_kwargs)

In [None]:
#batch_test_preds

In [9]:
inputs = tokenizer(df_test["text"].iloc[:2].tolist(), padding=True, truncation=True, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs.to(device))
    
print(outputs["logits"])

tensor([[-1.1092, -4.9994,  5.0207],
        [-2.0539,  5.4114, -4.5275]], device='cuda:0')


In [None]:
#outputs["logits"]

In [14]:
len(outputs["hidden_states"][6])

2

In [None]:
#outputs["hidden_states"][12]

In [None]:
#hs_np = outputs["hidden_states"][12].detach().cpu().numpy()

In [None]:
#hs_first_tk = hs_np[:, 0, :]

In [None]:
#hs_first_tk[1][:10]

In [None]:
#hs_first_tk.shape

In [None]:
#arr = np.empty((0,768))

In [None]:
#new_arr = np.vstack([arr, hs_first_tk])

In [None]:
#new_arr = np.vstack([new_arr, hs_first_tk])

In [None]:
#probabilities = nn.functional.softmax(outputs["logits"], dim=-1)
#labels = torch.argmax(probabilities, dim=1).tolist()
#print(probabilities.tolist())
#print(labels)

In [15]:
train_id2label = ["World", "Sports", "Business", "Sci/Tech"]

In [18]:
def extract_embedding_and_predict(model, tokenizer, df):
    
    X = df["text"].tolist() # List of input texts
    Y_original = df["label"].tolist() # List of original labels (GT)
    Y_original_names = [train_id2label[l] for l in Y_original]  # List of original labels' names (GT)
    E = np.empty((0,768)) # Initialize empty array of embeddings
    Y_predicted = [] # Initialize empty list of predicted labels (IDs)
    Y_predicted_names = [] # Initialize empty list of predicted labels (Names)
    
    
    BATCH_SIZE = 256
    n_batch = len(df)//BATCH_SIZE
    remainer = len(df)%BATCH_SIZE
    
    for i in tqdm(range(n_batch)):
        input_texts = df["text"].iloc[i*BATCH_SIZE:i*BATCH_SIZE+BATCH_SIZE].tolist()
        
        tokenized_texts = tokenizer(input_texts, padding=True, truncation=True, return_tensors="pt")
        
        with torch.no_grad():
            outputs = model(**tokenized_texts.to(device))
            
        batch_probabilities = nn.functional.softmax(outputs["logits"], dim=-1)
        batch_labels = torch.argmax(batch_probabilities, dim=1).tolist()

        batch_probabilities_list = batch_probabilities.tolist()            
        batch_labels_name = [train_id2label[l] for l in batch_labels] 

        Y_predicted.extend(batch_labels)
        Y_predicted_names.extend(batch_labels_name)

        last_layer_hidden_states_arr = outputs["hidden_states"][6].detach().cpu().numpy()                   
        embedding_CLS_arr = last_layer_hidden_states_arr[:, 0, :] # [BATCH_SIZE, 0 = CLS, 768]
        E = np.vstack([E, embedding_CLS_arr])
            
    input_texts = df["text"].iloc[-remainer:].tolist()
        
    tokenized_texts = tokenizer(input_texts, padding=True, truncation=True, return_tensors="pt")

    with torch.no_grad():
        outputs = model(**tokenized_texts.to(device))

    batch_probabilities = nn.functional.softmax(outputs["logits"], dim=-1)
    batch_labels = torch.argmax(batch_probabilities, dim=1).tolist()

    batch_probabilities_list = batch_probabilities.tolist()            
    batch_labels_name = [train_id2label[l] for l in batch_labels] 

    Y_predicted.extend(batch_labels)
    Y_predicted_names.extend(batch_labels_name)

    last_layer_hidden_states_arr = outputs["hidden_states"][6].detach().cpu().numpy()                   
    embedding_CLS_arr = last_layer_hidden_states_arr[:, 0, :] # [BATCH_SIZE, 0 = CLS, 768]
    E = np.vstack([E, embedding_CLS_arr])
        
    return X, E, Y_original, Y_original_names, Y_predicted, Y_predicted_names

In [19]:
X_test, E_test, Y_original_test, Y_original_names_test, Y_predicted_test, Y_predicted_names_test = extract_embedding_and_predict(model, tokenizer, df_test)

100%|██████████| 22/22 [00:11<00:00,  1.87it/s]


In [20]:
X_train, E_train, Y_original_train, Y_original_names_train, Y_predicted_train, Y_predicted_names_train = extract_embedding_and_predict(model, tokenizer, df_train)

100%|██████████| 232/232 [02:28<00:00,  1.56it/s]


In [21]:
X_drift, E_drift, Y_original_drift, Y_original_names_drift, Y_predicted_drift, Y_predicted_names_drift = extract_embedding_and_predict(model, tokenizer, df_drifted)

100%|██████████| 124/124 [01:27<00:00,  1.41it/s]


In [22]:
X_new_unseen, E_new_unseen, Y_original_new_unseen, Y_original_names_new_unseen, Y_predicted_new_unseen, Y_predicted_names_new_unseen = extract_embedding_and_predict(model, tokenizer, df_new_unseen)

100%|██████████| 119/119 [01:11<00:00,  1.67it/s]


In [23]:
import h5py

In [24]:
def save_embedding(output_path, X, E, Y_original, Y_original_names, Y_predicted, Y_predicted_names):

    fp = h5py.File(output_path, "w")

    fp.create_dataset("X", data=X, compression="gzip")
    fp.create_dataset("E", data=E, compression="gzip")
    fp.create_dataset("Y_original", data=Y_original, compression="gzip")
    fp.create_dataset("Y_original_names", data=Y_original_names, compression="gzip")
    fp.create_dataset("Y_predicted", data=Y_predicted, compression="gzip")
    fp.create_dataset("Y_predicted_names", data=Y_predicted_names, compression="gzip")
    fp.close()
    return

In [25]:
embedding_dir = os.path.join(base_dir, "saved_embedding")

In [26]:
save_embedding(os.path.join(embedding_dir, "train_embedding_0_1_2.hdf5"), 
                X_train, 
                E_train, 
                Y_original_train, 
                Y_original_names_train, 
                Y_predicted_train, 
                Y_predicted_names_train)

In [27]:
save_embedding(os.path.join(embedding_dir, "test_embedding_0_1_2.hdf5"), 
                X_test, 
                E_test, 
                Y_original_test, 
                Y_original_names_test, 
                Y_predicted_test, 
                Y_predicted_names_test)

In [28]:
save_embedding(os.path.join(embedding_dir, "drifted_embedding_3.hdf5"), 
                X_drift, 
                E_drift, 
                Y_original_drift, 
                Y_original_names_drift, 
                Y_predicted_drift, 
                Y_predicted_names_drift)

In [29]:
save_embedding(os.path.join(embedding_dir, "new_unseen_embedding_0_1_2.hdf5"), 
                X_new_unseen, 
                E_new_unseen, 
                Y_original_new_unseen, 
                Y_original_names_new_unseen, 
                Y_predicted_new_unseen, 
                Y_predicted_names_new_unseen)