In [2]:
from transformers import AutoTokenizer, AutoModel
import pickle
from torch.utils.data import DataLoader, Dataset
import torch
import numpy as np
import pandas as pd
import glob
from tqdm import tqdm
import gc

In [3]:
class StringDataset(Dataset):
    """Map-style dataset that hands back raw items by position.

    Wraps an indexable, sized collection (here: a list of texts) so that it
    can be passed to a ``DataLoader``. Items are returned untouched; nothing
    is tokenised or transformed at this stage.
    """

    def __init__(self, strings):
        # The collection is stored by reference, not copied.
        self.strings = strings

    def __getitem__(self, idx):
        """Return the item stored at position ``idx``."""
        return self.strings[idx]

    def __len__(self):
        """Return the number of items in the wrapped collection."""
        return len(self.strings)

In [4]:
class Pooling:
    """Reduce the per-token hidden states of one layer to one vector per sequence.

    Supported ``pooling_type`` values:
      * ``"cls"``  - take the vector at sequence position 0;
      * ``"mean"`` - average over the positions the attention mask marks with 1;
      * ``"max"``  - element-wise maximum over the positions the mask marks with 1.
    """

    def __init__(self, pooling_type):
        self.pooling_type = pooling_type

    def __call__(self, hidden_states, layer_number, attention_mask=None):
        """Pool ``hidden_states[layer_number]`` along the sequence axis (dim 1).

        Args:
            hidden_states: indexable collection of tensors; the selected tensor
                is indexed as (batch, position, feature).
            layer_number: index into ``hidden_states`` (negative indices allowed).
            attention_mask: tensor broadcastable to (batch, position) after
                ``unsqueeze(-1)``; required for "mean" and "max", ignored for "cls".

        Returns:
            Tensor with the sequence axis removed.

        Raises:
            ValueError: if the pooling type is unknown, or if "mean"/"max"
                pooling is requested without an attention mask.
        """
        if self.pooling_type not in ("cls", "mean", "max"):
            raise ValueError("Wrong pooling method provided in the Pooler initialization")

        token_embeddings = hidden_states[layer_number]
        if self.pooling_type == "cls":
            return token_embeddings[:, 0, :]

        if attention_mask is None:
            raise ValueError(f"attention_mask is required for '{self.pooling_type}' pooling")

        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()

        if self.pooling_type == "mean":
            # clamp() guards against division by zero for a fully masked sequence.
            return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(
                input_mask_expanded.sum(1), min=1e-9
            )

        # "max": work on a copy. Writing the sentinel into `token_embeddings`
        # itself would overwrite the caller's hidden states, which are reused
        # for subsequent pooling calls on the same model output.
        masked_embeddings = token_embeddings.clone()
        masked_embeddings[input_mask_expanded == 0] = -1e9  # masked positions can never win the max
        return torch.max(masked_embeddings, 1)[0]

In [5]:
class EmbeddingsRetriever:
    """Run a batch of texts through a model once, then pool its hidden states in several ways.

    Usage: call ``tokenize_and_produce_model_output`` on a batch of strings,
    then call any of the ``get_embedding_*`` methods (also collected in
    ``self.functions``) to derive embeddings from the cached output of that
    forward pass.

    Args:
        embedding_model: callable invoked as ``embedding_model(**encoded_input)``
            whose result exposes a ``"hidden_states"`` entry.
        tokenizer: callable producing a mapping with an ``"attention_mask"`` entry.
        max_length: length every input is padded/truncated to (default 128).
    """

    _POOLINGS = ("cls", "mean", "max")
    _LAST_FOUR_LAYERS = (-4, -3, -2, -1)

    def __init__(self, embedding_model, tokenizer, max_length=128):
        self.embedding_model = embedding_model
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Filled in by tokenize_and_produce_model_output(); None means "no batch processed yet".
        self.attention_mask = None
        self.model_output = None
        self.functions = [
            self.get_embedding_layer_output,
            self.get_embedding_last_hidden_layer,
            self.get_embedding_sum_all_layers,
            self.get_embedding_second_last_layer,
            self.get_embedding_sum_last_four_layers,
            self.get_embedding_concat_last_four_layers,
        ]

    def tokenize_and_produce_model_output(self, data):
        """Tokenize ``data`` and cache the attention mask and the model output for it."""
        encoded_input = self.tokenizer(
            data,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        self.attention_mask = encoded_input["attention_mask"]
        with torch.no_grad():
            self.model_output = self.embedding_model(**encoded_input)

    def _hidden_states(self):
        """Return the cached hidden states, failing clearly if no batch was processed yet."""
        if self.model_output is None:
            raise RuntimeError(
                "No model output available: call tokenize_and_produce_model_output() first"
            )
        return self.model_output["hidden_states"]

    def _sum_layers(self, layers, pooling):
        """Add up the pooled numpy embeddings of the given layer indices."""
        return sum(self.get_embedding_from_layer(layer, pooling).numpy() for layer in layers)

    def get_embedding_from_layer(self, layer_number, pooling="cls"):
        """Pool the hidden states of one layer of the cached model output.

        Args:
            layer_number: index into the cached hidden states (negative allowed).
            pooling: one of "cls", "mean" or "max".

        Returns:
            The pooled tensor produced by ``Pooling``.

        Raises:
            RuntimeError: if no batch has been processed yet.
            ValueError: if ``pooling`` is not a supported method.
            AssertionError: if mean/max pooling is requested without a cached mask.
        """
        hidden_states = self._hidden_states()
        if pooling not in self._POOLINGS:
            raise ValueError("Wrong pooling method provided in the function call")
        if pooling != "cls":
            # Identity check: comparing a tensor to None with `!=` is not a reliable None test.
            assert self.attention_mask is not None, (
                f"Please provide attention mask if you are using {pooling} pooling"
            )
        return Pooling(pooling)(hidden_states, layer_number, self.attention_mask)

    def get_embedding_layer_output(self, pooling="cls"):
        """Pooled hidden states at index 0 (presumably the embedding-layer output), as numpy."""
        return self.get_embedding_from_layer(0, pooling).numpy()

    def get_embedding_last_hidden_layer(self, pooling="cls"):
        """Pooled hidden states of the last entry, as numpy."""
        return self.get_embedding_from_layer(-1, pooling).numpy()

    def get_embedding_sum_all_layers(self, pooling="cls"):
        """Sum of the pooled embeddings over every cached hidden state, as numpy."""
        # Use the real number of hidden states rather than a fixed 13, so the
        # method also works for models with a different depth.
        return self._sum_layers(range(len(self._hidden_states())), pooling)

    def get_embedding_second_last_layer(self, pooling="cls"):
        """Pooled hidden states of the second-to-last entry, as numpy."""
        return self.get_embedding_from_layer(-2, pooling).numpy()

    def get_embedding_sum_last_four_layers(self, pooling="cls"):
        """Sum of the pooled embeddings of the last four entries, as numpy."""
        return self._sum_layers(self._LAST_FOUR_LAYERS, pooling)

    def get_embedding_concat_last_four_layers(self, pooling="cls"):
        """Pooled embeddings of the last four entries, concatenated on the feature axis, as numpy."""
        pooled = [self.get_embedding_from_layer(layer, pooling) for layer in self._LAST_FOUR_LAYERS]
        return torch.cat(pooled, dim=-1).numpy()

In [6]:
# Read the tab-separated corpus (no header row) and keep only its first
# column as a plain Python list; the frame itself is dropped right away.
corpus_frame = pd.read_csv(
    "/kaggle/input/corpus-trump/corpus_trump.tsv",
    sep="\t",
    header=None,
)
texts = corpus_frame.iloc[:, 0].tolist()
del corpus_frame

In [7]:
# Split the corpus in two so each part is embedded and saved separately:
# the first 20000 texts, then everything after them.
texts_part_1, texts_part_2 = texts[:20000], texts[20000:]

In [8]:
# Serve the first part of the corpus in batches of 64 texts.
dataset_first = StringDataset(strings=texts_part_1)
dataloader_first = DataLoader(dataset=dataset_first, batch_size=64)

In [9]:
# Serve the second part of the corpus in batches of 64 texts.
dataset_second = StringDataset(strings=texts_part_2)
dataloader_second = DataLoader(dataset=dataset_second, batch_size=64)

In [10]:
# Drop the notebook-level names for the text lists, then trigger a collection.
# The two datasets still hold their own references to the split lists.
del texts_part_1, texts_part_2, texts
gc.collect()

60

In [11]:
# Checkpoint used for embedding; replace with another model name if needed.
model_name = "sentence-transformers/all-MiniLM-L12-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# output_hidden_states=True is requested because EmbeddingsRetriever reads
# the "hidden_states" entry of the model output.
model = AutoModel.from_pretrained(model_name, output_hidden_states=True)
embedder = EmbeddingsRetriever(embedding_model=model, tokenizer=tokenizer)

In [None]:
# One empty list per (embedding function, pooling) pair,
# keyed as "<function name>_<pooling>".
poolings = ["mean", "cls", "max"]
embeddings_dict = {
    f"{fn.__name__}_{mode}": []
    for fn in embedder.functions
    for mode in poolings
}

In [None]:
# One forward pass per batch; every (embedding function, pooling) pair is
# then derived from the output cached on `embedder` by that pass.
for batch in tqdm(dataloader_first):
    embedder.tokenize_and_produce_model_output(batch)
    for function in embedder.functions:
        for pooling in poolings:
            # extend() iterates over the first axis of the returned array,
            # so each text contributes one entry to the list.
            embeddings_dict[function.__name__ + "_" + pooling].extend(function(pooling=pooling))
    # A single collection per batch is enough: the temporary arrays are
    # released by reference counting as soon as extend() returns, so running
    # the collector after every one of the 18 pooling calls only cost time.
    gc.collect()
  

In [None]:
# Sanity check: number of embeddings collected under one of the keys.
layer_output_mean_vectors = embeddings_dict["get_embedding_layer_output_mean"]
len(layer_output_mean_vectors)

In [None]:
# NOTE(review): a dict, not an array, is handed to np.save; presumably it is
# stored as a pickled object and needs np.load(..., allow_pickle=True) to be
# read back. Confirm against the code that loads this file.
np.save(file="embeddings_trump_part1.npy", arr=embeddings_dict)

In [12]:
# Start again from empty lists for the second part of the corpus:
# one list per (embedding function, pooling) pair, keyed "<function name>_<pooling>".
poolings = ["mean", "cls", "max"]
embeddings_dict = {
    f"{fn.__name__}_{mode}": []
    for fn in embedder.functions
    for mode in poolings
}

In [13]:
# One forward pass per batch; every (embedding function, pooling) pair is
# then derived from the output cached on `embedder` by that pass.
for batch in tqdm(dataloader_second):
    embedder.tokenize_and_produce_model_output(batch)
    for function in embedder.functions:
        for pooling in poolings:
            # extend() iterates over the first axis of the returned array,
            # so each text contributes one entry to the list.
            embeddings_dict[function.__name__ + "_" + pooling].extend(function(pooling=pooling))
    # A single collection per batch is enough: the temporary arrays are
    # released by reference counting as soon as extend() returns, so running
    # the collector after every one of the 18 pooling calls only cost time.
    gc.collect()
  

100%|██████████| 379/379 [1:01:39<00:00,  9.76s/it]


In [14]:
# Sanity check: number of embeddings collected under one of the keys.
layer_output_mean_vectors = embeddings_dict["get_embedding_layer_output_mean"]
len(layer_output_mean_vectors)

24252

In [15]:
# NOTE(review): a dict, not an array, is handed to np.save; presumably it is
# stored as a pickled object and needs np.load(..., allow_pickle=True) to be
# read back. Confirm against the code that loads this file.
np.save(file="embeddings_trump_part2.npy", arr=embeddings_dict)