# 0 Preparations

Import required packages

In [2]:
import pandas as pd
import numpy as np
import string
import warnings
import torch

from sqlalchemy import create_engine
from sqlalchemy import text

from tqdm import tqdm

from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from transformers import DataCollatorWithPadding
from transformers import AutoTokenizer
from transformers import BertModel  
from transformers import RobertaModel  
from transformers import DistilBertModel  



# 1 Connect to SQL DB and download data

Define a helper function that loads a large amount of data from the SQL database in chunks

In [3]:
def batch_load_sql(query: str, engine, chunksize: int = 200_000) -> pd.DataFrame:
    """Run ``query`` and return the full result as one DataFrame, read in chunks.

    The result set is streamed from the server (``stream_results=True``) and
    fetched ``chunksize`` rows at a time, then concatenated at the end.

    Args:
        query: SQL text to execute; it is wrapped in ``sqlalchemy.text``.
        engine: SQLAlchemy engine used to open the connection.
        chunksize: Number of rows fetched per chunk.

    Returns:
        A DataFrame holding every row of the result with a fresh 0..n-1 index.
        An empty DataFrame is returned when no chunk was produced at all.
    """
    # The context manager closes the connection even when reading fails midway;
    # a bare conn.close() after the loop would be skipped on an exception.
    with engine.connect() as base_conn:
        conn = base_conn.execution_options(stream_results=True)
        chunks = list(pd.read_sql(text(query), conn, chunksize=chunksize))

    # pd.concat raises ValueError on an empty list, so handle that case explicitly.
    if not chunks:
        return pd.DataFrame()
    return pd.concat(chunks, ignore_index=True)

Create the connection and download the data from the SQL database

In [4]:
# NOTE(review): "###" looks like a redacted placeholder for the database URL —
# supply the real connection string (ideally from an environment variable) before running.
engine = create_engine("###")

# Download the whole public.post_text_df table into memory via the chunked loader.
post_text_df = batch_load_sql('SELECT * FROM public.post_text_df', engine=engine)


# 2 Get embeddings

In [8]:
def get_model(model_name):
    """Load the tokenizer and the pretrained encoder for a supported model family.

    Args:
        model_name: One of 'bert', 'roberta' or 'distilbert'.

    Returns:
        A ``(tokenizer, model)`` tuple, both loaded from the cased/base
        checkpoint that belongs to ``model_name``.

    Raises:
        AssertionError: If ``model_name`` is not one of the supported names.
    """
    supported_names = ('bert', 'roberta', 'distilbert')
    assert model_name in supported_names

    # Pick the checkpoint and the matching model class for the requested family.
    if model_name == 'bert':
        checkpoint, model_cls = 'bert-base-cased', BertModel
    elif model_name == 'roberta':
        checkpoint, model_cls = 'roberta-base', RobertaModel
    else:
        checkpoint, model_cls = 'distilbert-base-cased', DistilBertModel

    # The tokenizer is loaded first, then the model, from the same checkpoint.
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    pretrained_model = model_cls.from_pretrained(checkpoint)
    return tokenizer, pretrained_model

In [9]:
# Load the tokenizer and encoder registered under 'distilbert' in get_model.
tokenizer, model = get_model('distilbert')

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
# Prefer the first CUDA GPU and fall back to the CPU when CUDA is unavailable.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

print(device)
# torch.cuda.get_device_name() raises on machines without CUDA, which would
# defeat the CPU fallback above — only query the GPU name when a GPU was selected.
if device.type == 'cuda':
    print(torch.cuda.get_device_name(device))

cuda:0
Tesla T4


In [12]:
# Move the model onto the selected device.
model = model.to(device)

In [13]:
class Posts(Dataset):
    """Dataset over post texts that are tokenized once, at construction time.

    Each item is a dict with the ``input_ids`` and ``attention_mask`` of one
    text. No padding option is passed to the tokenizer here.
    """

    def __init__(self, texts, tokenizer):
        """Encode ``texts`` with ``tokenizer`` and keep both on the instance.

        Args:
            texts: Texts to encode; handed to ``tokenizer.batch_encode_plus``.
            tokenizer: Tokenizer object exposing ``batch_encode_plus``.
        """
        super().__init__()

        encode_options = dict(
            add_special_tokens=True,
            return_token_type_ids=False,
            truncation=True,
        )
        self.texts = tokenizer.batch_encode_plus(texts, **encode_options)
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of encoded texts."""
        return len(self.texts['input_ids'])

    def __getitem__(self, idx):
        """Return the ``input_ids`` and ``attention_mask`` stored for item ``idx``."""
        return {field: self.texts[field][idx] for field in ('input_ids', 'attention_mask')}

In [14]:
# Tokenize every post text once, up front; the dataset keeps the encodings in memory.
posts = Posts(post_text_df['text'].values.tolist(), tokenizer)

In [15]:
# Collator that pads the variable-length encodings when batches are assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [16]:
# shuffle=False keeps batches in dataset order, so embedding rows line up with the rows of post_text_df.
loader = DataLoader(posts, batch_size=32, collate_fn=data_collator, pin_memory=True, shuffle=False)

In [17]:
@torch.inference_mode()
def get_embeddings_labels(model, loader):
    """Compute one embedding per sample: the hidden state at sequence position 0.

    Despite the name, only embeddings are returned; no labels are produced.
    The model is switched to eval mode as a side effect.

    Args:
        model: Module whose output can be indexed with ``'last_hidden_state'``.
        loader: Iterable of batches that contain ``'attention_mask'`` and
            ``'input_ids'`` tensors.

    Returns:
        A CPU tensor with the per-batch embeddings concatenated along dim 0,
        in the order the loader yielded them.
    """
    model.eval()

    # Send inputs to wherever the model's weights live instead of relying on a
    # notebook-global `device`, so the function also works after a kernel
    # restart or with a model placed on a different device.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    total_embeddings = []

    for batch in tqdm(loader):
        # Only the two tensors the model consumes are moved to the device.
        batch = {key: batch[key].to(target_device) for key in ('attention_mask', 'input_ids')}

        # Keep the hidden state of the first token of every sequence.
        embeddings = model(**batch)['last_hidden_state'][:, 0, :]

        # Move results off the device right away so they do not accumulate there.
        total_embeddings.append(embeddings.cpu())

    return torch.cat(total_embeddings, dim=0)

In [18]:
# Run the model over every batch and collect the embeddings on the CPU.
embeddings = get_embeddings_labels(model, loader)

  0%|          | 0/220 [00:00<?, ?it/s]You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 220/220 [01:45<00:00,  2.08it/s]


In [26]:
# Display the embeddings as float32. The result is not assigned, so `embeddings` itself is left unchanged.
embeddings.to(dtype=torch.float32)

tensor([[ 3.6315e-01,  4.8938e-02, -2.6408e-01,  ..., -1.4159e-01,
          1.5918e-02,  9.1769e-05],
        [ 2.3642e-01, -1.5950e-01, -3.2780e-01,  ..., -2.8994e-01,
          1.1937e-01, -1.6234e-03],
        [ 3.7519e-01, -1.1394e-01, -2.4055e-01,  ..., -3.3892e-01,
          5.8694e-02, -2.1266e-02],
        ...,
        [ 3.4038e-01,  6.6492e-02, -1.6318e-01,  ..., -8.6563e-02,
          2.0340e-01,  3.2091e-02],
        [ 4.3209e-01,  1.1092e-02, -1.1731e-01,  ...,  7.5401e-02,
          1.0274e-01,  1.5274e-02],
        [ 3.0428e-01, -7.6216e-02, -6.7759e-02,  ..., -5.4349e-02,
          2.4438e-01, -1.4148e-02]])

In [28]:
# Wrap the embeddings in a DataFrame: one row per post, one column per embedding dimension.
test = pd.DataFrame(embeddings)

In [31]:
# Check the frame's shape, dtypes and memory footprint.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7023 entries, 0 to 7022
Columns: 768 entries, 0 to 767
dtypes: float32(768)
memory usage: 20.6 MB
