In [33]:
from transformers import AutoTokenizer, BertModel
import torch
import tqdm

# Load the stock BERT encoder and its tokenizer for a quick smoke test.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

# Prefer the second GPU, but only when it actually exists: "cuda:1" is an
# invalid device ordinal on single-GPU hosts even though CUDA is available.
if torch.cuda.device_count() > 1:
    device = "cuda:1"
elif torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

inputs = tokenizer(["Hello, my dog is cute"], return_tensors="pt").to(device)
model = model.to(device)

# Inference only: skip autograd bookkeeping for the forward pass.
with torch.no_grad():
    outputs = model(**inputs)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [36]:
%pip install lsg-converter

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting lsg-converter
  Downloading lsg_converter-0.0.5-py3-none-any.whl (118 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.3/118.3 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: lsg-converter
Successfully installed lsg-converter-0.0.5
Note: you may need to restart the kernel to use updated packages.


In [38]:
from lsg_converter import LSGConverter

# Converter configured with max_sequence_length=4096
# (presumably the longest input the converted model accepts -- confirm in the lsg-converter docs).
converter = LSGConverter(max_sequence_length=4096)

# Convert the bert-base-uncased checkpoint with 7 global tokens.
# This rebinds the notebook-level `model` and `tokenizer`; the log printed by
# this cell reports the resulting model class as LSGBertForMaskedLM.
model, tokenizer = converter.convert_from_pretrained(
    "bert-base-uncased",
    num_global_tokens=7,
)

# Move the converted model onto the device chosen in the first cell.
model = model.to(device)


The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing LSGBertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing LSGBertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LSGBertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LSGBertForMaskedLM were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['bert.embeddings.gl

In [42]:
# Smoke-test the converted model on a single short sentence.
inputs = tokenizer(["Hello, my dog is cute"], return_tensors="pt").to(device)

# Inference only: no_grad avoids building an autograd graph on the device.
with torch.no_grad():
    outputs = model(**inputs)

In [51]:
from transformers import AutoTokenizer, BertModel
import torch
import pandas as pd

def make_embed(df):
    """Embed every row of ``df`` with the notebook-level ``model``.

    For each row the TITLE, BULLET_POINTS and DESCRIPTION fields are joined
    with single spaces, tokenized with the notebook-level ``tokenizer``, run
    through ``model`` on ``device``, and the position-0 slice of
    ``outputs.logits`` is kept as that row's vector.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns TITLE, BULLET_POINTS, DESCRIPTION and
        PRODUCT_ID. Missing values are replaced with the placeholder string
        before the text is assembled; ``df`` itself is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per input row and one column per vector component, indexed
        by PRODUCT_ID so each vector can be traced back to its product.

    Raises
    ------
    KeyError
        If one of the required columns is absent.
    """
    # The notebook does ``import tqdm``, which binds the *module*; calling it
    # raises TypeError. Import the callable progress-bar class locally.
    from tqdm.auto import tqdm

    text_columns = ("TITLE", "BULLET_POINTS", "DESCRIPTION")

    df = df.fillna("Not Avaliable")
    ls_embed = []
    ls_id = []

    # Inference only: without no_grad every forward pass keeps its autograd
    # graph alive until the result is detached.
    with torch.no_grad():
        for i in tqdm(range(df.shape[0])):
            rw = df.iloc[i]
            # str() keeps the join working when a column was parsed as a
            # non-string dtype (e.g. purely numeric titles).
            x = [" ".join(str(rw[col]) for col in text_columns)]

            # truncation=True caps the input at the tokenizer's
            # model_max_length, so over-long rows are clipped rather than
            # passed to the model at full length.
            encoded = tokenizer(x, return_tensors="pt", truncation=True).to(device)
            result = model(**encoded)

            # NOTE(review): the variable this replaced was called
            # `last_hidden_states`, but `.logits` is read here. If `model` is
            # the masked-LM model from the conversion cell these are
            # presumably vocabulary-sized scores, not hidden states -- confirm
            # this is the intended embedding.
            first_position = result.logits[:, 0, :]

            # Batch size is 1; drop the batch axis so each entry is a 1-D
            # vector. A list of (1, V) arrays is 3-D input, which
            # pandas.DataFrame rejects.
            ls_embed.append(first_position.detach().cpu().numpy()[0])
            ls_id.append(rw["PRODUCT_ID"])

    # Use the collected ids as the index (they were gathered but never used).
    nw_df = pd.DataFrame(ls_embed, index=pd.Index(ls_id, name="PRODUCT_ID"))

    return nw_df

In [49]:
# Load the two dataset splits from the local `dataset/` directory.
test_df = pd.read_csv(filepath_or_buffer="dataset/test.csv")
train_df = pd.read_csv(filepath_or_buffer="dataset/train.csv")

In [52]:
# Embed both splits with make_embed: the test split first, then the train split.
# make_embed runs the model once per row, so this cell is presumably slow on
# large CSVs. The two calls stay separate so that test_embed is still bound
# if the second call fails.
test_embed=make_embed(test_df)
train_embed=make_embed(train_df)