# BERT embeddings

Processing the post texts with a BERT model.

In [1]:
import os
import sys

import pathlib

# Make the parent of the current working directory importable
# (presumably where the local `database` module lives — confirm).
notebook_path = pathlib.Path(os.getcwd())
parent_dir = str(notebook_path.parent)
# Guard the append so that re-running this cell does not keep adding
# duplicate entries to sys.path.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [2]:
import pandas as pd
from database import engine

import datasets

from transformers import (
    BertModel, 
    AutoTokenizer, 
    DataCollatorWithPadding
) 

import torch
from torch.utils.data import DataLoader

from tqdm import tqdm

# Pretrained checkpoint shared by the model and its tokenizer. Keeping the
# name in one place ensures both are always loaded from the same checkpoint.
MODEL_NAME = 'bert-base-cased'

model = BertModel.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


## Data

Loading the data, tokenizing it, and building the data loader.

In [3]:
# Read the whole post table (indexed by post_id) and wrap the resulting
# frame in a `datasets.Dataset` in a single expression.
post_data = datasets.Dataset.from_pandas(
    pd.read_sql(
        "SELECT * FROM public.post_text_df;",
        con=engine,
        index_col="post_id",
    )
)

def tokenization(batch):
    """Tokenize one batch of posts with the module-level ``tokenizer``.

    Args:
        batch: Mapping whose ``"text"`` entry holds the raw strings of the
            batch.

    Returns:
        Whatever the tokenizer returns for the batch. Special tokens are
        added, token type ids are not requested, and truncation is enabled.
        No padding is requested here.
    """
    # Call the tokenizer directly: `batch_encode_plus` is documented as
    # deprecated in favour of `__call__`, which takes the same arguments.
    return tokenizer(
        batch["text"],
        add_special_tokens=True,
        return_token_type_ids=False,
        truncation=True,
    )

# Run the tokenizer over the dataset batch by batch, then expose only the
# two tokenizer outputs, formatted as torch tensors.
post_data = post_data.map(tokenization, batched=True)
post_data.set_format(type="torch", columns=["input_ids", "attention_mask"])

# Batches of 32 posts, in dataset order (no shuffling); each batch is
# assembled by DataCollatorWithPadding built from the same tokenizer.
loader = DataLoader(
    post_data,
    batch_size=32,
    shuffle=False,
    collate_fn=DataCollatorWithPadding(tokenizer=tokenizer),
    # Pinned host memory only helps host-to-GPU copies; without CUDA it
    # just triggers a warning, so enable it only when a GPU is present.
    pin_memory=torch.cuda.is_available(),
)

Map:   0%|          | 0/7023 [00:00<?, ? examples/s]

Let's check what a batch produced by our loader looks like:

In [5]:
# Pull the first batch from the loader and display it for inspection.
sample_batch = next(iter(loader))
sample_batch

{'input_ids': tensor([[  101,  1993,  4190,  ...,     0,     0,     0],
        [  101, 15386,  1116,  ...,  1300,  1107,   102],
        [  101,  3141,   186,  ..., 14099,  8478,   102],
        ...,
        [  101, 16972, 20647,  ...,     0,     0,     0],
        [  101,   137,   188,  ...,     0,     0,     0],
        [  101,   144, 22731,  ...,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

## Encoding