In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
%%capture
# Install/upgrade the libraries this notebook imports; %%capture silences pip's output.
# NOTE(review): every install uses -U without a version pin, so a re-run may resolve to
# different releases than the ones this notebook was executed with — consider pinning.
%pip install -U transformers 
%pip install -U datasets 
%pip install -U accelerate 
%pip install -U peft 
%pip install -U trl 
%pip install -U bitsandbytes 
%pip install -U wandb

In [17]:
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
import os, torch, wandb
from datasets import load_dataset, Dataset
from trl import SFTTrainer, setup_chat_format
import numpy as np
import pandas as pd
import transformers
from tqdm.auto import tqdm



In [18]:
# Quantisation recipe: 4-bit NF4 weights, double quantisation, float16 compute.
_quant_options = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_compute_dtype": torch.float16,
    "bnb_4bit_use_double_quant": True,
}
bnb_config = BitsAndBytesConfig(**_quant_options)

# Quantised BERT backbone with a single-output classification head
# (num_labels=1 — presumably a regression target; confirm against the training run).
_load_options = {
    "quantization_config": bnb_config,
    "attn_implementation": "eager",
    "num_labels": 1,
}
base_model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased", **_load_options
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
# Load the LoRA adapter weights from the Kaggle input dataset on top of `base_model`,
# then switch the wrapped model to evaluation mode via .eval().
model = PeftModel.from_pretrained(base_model, "/kaggle/input/bert-fine-tuned-lora/bert-ft-normal/bert-ft-normal").eval()

In [20]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Move the model to that device; as the cell's last expression, the returned module is displayed.
model.to(device)

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): BertForSequenceClassification(
      (bert): BertModel(
        (embeddings): BertEmbeddings(
          (word_embeddings): Embedding(30522, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (token_type_embeddings): Embedding(2, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): BertEncoder(
          (layer): ModuleList(
            (0-11): 12 x BertLayer(
              (attention): BertAttention(
                (self): BertSelfAttention(
                  (query): lora.Linear4bit(
                    (base_layer): Linear4bit(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.05, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (de

In [21]:
# Load the tokenizer from the same directory the LoRA adapter was loaded from.
tokenizer = AutoTokenizer.from_pretrained("/kaggle/input/bert-fine-tuned-lora/bert-ft-normal/bert-ft-normal")

In [22]:
from torch.utils.data import DataLoader, TensorDataset

In [23]:
def get_embeddings(texts, model, tokenizer, batch_size=32, max_length=256):
    """
    Generate embeddings for a collection of texts.

    Each batch is tokenized (padded to the longest item in the batch and
    truncated to ``max_length`` tokens), passed through ``model.base_model``
    with gradient tracking disabled, and the first-token ([CLS]) vector of
    the last hidden layer is collected.

    Args:
        texts: Strings to embed. Any sized or iterable collection is accepted
            (list, tuple, pandas Series, ...); it is materialised into a list
            so every batch handed to the tokenizer is a plain list. A single
            bare string is treated as one text.
        model: Object exposing ``.device`` and ``.base_model``. The latter is
            called with ``input_ids``, ``attention_mask`` and
            ``output_hidden_states=True`` and must return an object with a
            ``hidden_states`` sequence. The train/eval mode is not changed
            here, so put the model in eval mode beforehand if dropout should
            be inactive.
        tokenizer: Callable returning a mapping with ``input_ids`` and
            ``attention_mask`` tensors when called with ``return_tensors='pt'``.
        batch_size: Number of texts per forward pass; must be positive.
        max_length: Truncation length in tokens passed to the tokenizer.

    Returns:
        numpy.ndarray with one row per input text, holding the first-token
        vector of the last hidden layer. For empty input an array with zero
        rows is returned (its width is ``model.config.hidden_size`` when that
        attribute is available, otherwise 0).

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # A bare string would otherwise be sliced character-wise into "batches".
    if isinstance(texts, str):
        texts = [texts]
    # Slices of a list are lists, which is the batch type the tokenizer is given below.
    texts = list(texts)

    # torch.cat() rejects an empty list, so answer empty input explicitly.
    if not texts:
        hidden_size = getattr(getattr(model, "config", None), "hidden_size", 0) or 0
        return torch.empty((0, hidden_size)).numpy()

    embeddings_list = []

    # Process in batches
    for i in tqdm(range(0, len(texts), batch_size)):
        batch_texts = texts[i:i + batch_size]

        # Tokenize
        encoded = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors='pt'
        )

        # Move the inputs to whichever device the model lives on
        input_ids = encoded['input_ids'].to(model.device)
        attention_mask = encoded['attention_mask'].to(model.device)

        # Get embeddings (no gradient calculation needed)
        with torch.no_grad():
            outputs = model.base_model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                output_hidden_states=True
            )

            # First-token ([CLS]) embedding from the last hidden layer
            # Shape: (batch_size, hidden_size)
            cls_embeddings = outputs.hidden_states[-1][:, 0, :]

        # Keep results on the CPU so accelerator memory is not held across batches
        embeddings_list.append(cls_embeddings.cpu())

    # Concatenate all batches
    return torch.cat(embeddings_list, dim=0).numpy()

In [24]:
# Read final_train_dataset.csv from the Kaggle input dataset into `df`.
df = pd.read_csv('/kaggle/input/my-dataset/final_train_dataset.csv')

In [25]:
import ast

def _parse_bullet_points(raw):
    """Collapse one raw ``bullet_points`` cell into a single space-joined string.

    Strings are parsed with ``ast.literal_eval``; when the result is a list
    (or tuple) its items are joined with single spaces. Strings that are not
    a list/tuple literal — for example text that was already joined by an
    earlier run of this cell — are returned unchanged, which makes the cell
    safe to re-run. Non-string, non-list values (e.g. NaN) pass through as-is.
    """
    parsed = raw
    if isinstance(raw, str):
        try:
            parsed = ast.literal_eval(raw)
        except (ValueError, SyntaxError, TypeError):
            # Not a Python literal: keep the original text instead of crashing.
            return raw
    if isinstance(parsed, (list, tuple)):
        # str() guards against non-string items inside the parsed list.
        return " ".join(map(str, parsed))
    # A string that parsed to some other literal (e.g. "123") stays a string.
    return raw


# Parse the stringified lists and merge each one into a single string.
df["bullet_points"] = df["bullet_points"].apply(_parse_bullet_points)

In [26]:
# Assemble the model input text as
# "<item_name> <bullet_points> value: <value> unit: <unit>", with missing fields as "".
_train_name = df["item_name"].fillna("")
_train_bullets = df["bullet_points"].fillna("")
_train_value = df["value"].fillna("").astype(str)
_train_unit = df["unit"].fillna("")
df["text"] = (
    _train_name + " " + _train_bullets + " " + "value: " + _train_value + " " + "unit: " + _train_unit
)

In [27]:
# Generate embeddings for train set: one text per row of `df`,
# embedded with get_embeddings' default batch size.
train_texts = df['text'].tolist()
train_embeddings = get_embeddings(train_texts, model, tokenizer)

  0%|          | 0/1754 [00:00<?, ?it/s]

In [28]:
# Sanity check: display the shape of the embedding matrix.
train_embeddings.shape

(56102, 768)

In [29]:
# Sanity check: display the container type returned by get_embeddings.
type(train_embeddings)

numpy.ndarray

In [31]:
# Persist the train embeddings as a .npy file in the Kaggle working directory.
np.save('/kaggle/working/finetuned-bert-train.npy', train_embeddings)

In [32]:
# Read final_test_dataset.csv from the Kaggle input dataset into `dftest`.
dftest = pd.read_csv('/kaggle/input/my-dataset/final_test_dataset.csv')

In [33]:
import ast

def _parse_bullet_points(raw):
    """Collapse one raw ``bullet_points`` cell into a single space-joined string.

    Strings are parsed with ``ast.literal_eval``; when the result is a list
    (or tuple) its items are joined with single spaces. Strings that are not
    a list/tuple literal — for example text that was already joined by an
    earlier run of this cell — are returned unchanged, which makes the cell
    safe to re-run. Non-string, non-list values (e.g. NaN) pass through as-is.
    """
    parsed = raw
    if isinstance(raw, str):
        try:
            parsed = ast.literal_eval(raw)
        except (ValueError, SyntaxError, TypeError):
            # Not a Python literal: keep the original text instead of crashing.
            return raw
    if isinstance(parsed, (list, tuple)):
        # str() guards against non-string items inside the parsed list.
        return " ".join(map(str, parsed))
    # A string that parsed to some other literal (e.g. "123") stays a string.
    return raw


# Parse the stringified lists and merge each one into a single string.
dftest["bullet_points"] = dftest["bullet_points"].apply(_parse_bullet_points)

In [34]:
# Assemble the model input text as
# "<item_name> <bullet_points> value: <value> unit: <unit>", with missing fields as "".
_test_name = dftest["item_name"].fillna("")
_test_bullets = dftest["bullet_points"].fillna("")
_test_value = dftest["value"].fillna("").astype(str)
_test_unit = dftest["unit"].fillna("")
dftest["text"] = (
    _test_name + " " + _test_bullets + " " + "value: " + _test_value + " " + "unit: " + _test_unit
)

In [35]:
# Preview the first rows, including the newly built "text" column.
dftest.head()

Unnamed: 0,sample_id,catalog_content,image_link,item_name,brand_name,bullet_points,product_description,value,unit,text
0,100179,Item Name: Rani 14-Spice Eshamaya's Mango Chut...,https://m.media-amazon.com/images/I/71hoAn78AW...,Rani 14-Spice Eshamaya's Mango Chutney (Indian...,Rani,youll love 14spice eshamayas mango chutney chu...,mango chutney made diced green mangoes cooked ...,10.5,Ounce,Rani 14-Spice Eshamaya's Mango Chutney (Indian...
1,245611,Item Name: Natural MILK TEA Flavoring extract ...,https://m.media-amazon.com/images/I/61ex8NHCIj...,Natural MILK TEA Flavoring extract by HALO PAN...,Natural,authentic tasting asianinspired natural flavor...,check popular milk tea flavoring extract new l...,2.0,Fluid_Ounce,Natural MILK TEA Flavoring extract by HALO PAN...
2,146263,Item Name: Honey Filled Hard Candy - Bulk Pack...,https://m.media-amazon.com/images/I/61KCM61J8e...,Honey Filled Hard Candy - Bulk Pack 2 Pounds -...,Honey,honey filled hard candy 2pound bulk pack appro...,honey filled hard candy bulk pack pounds indiv...,32.0,Ounce,Honey Filled Hard Candy - Bulk Pack 2 Pounds -...
3,95658,Item Name: Vlasic Snack'mm's Kosher Dill 16 Ou...,https://m.media-amazon.com/images/I/51Ex6uOH7y...,Vlasic Snack'mm's Kosher Dill 16 Ounce (Pack o...,Vlasic,,,2.0,Count,Vlasic Snack'mm's Kosher Dill 16 Ounce (Pack o...
4,36806,"Item Name: McCormick Culinary Vanilla Extract,...",https://m.media-amazon.com/images/I/71QYlrOMoS...,"McCormick Culinary Vanilla Extract, 32 Fluid_O...",McCormick,premium ingredients mccormick culinary pure va...,,32.0,Fluid_Ounce,"McCormick Culinary Vanilla Extract, 32 Fluid_O..."


In [37]:
# Sanity check: display the Python type of the "text" entry labelled 0.
type(dftest['text'][0])

str

In [38]:
# Generate embeddings for test set: one text per row of `dftest`,
# embedded with get_embeddings' default batch size.
test_texts = dftest['text'].tolist()  # `dftest` is loaded and preprocessed in the cells above
test_embeddings = get_embeddings(test_texts, model, tokenizer)

  0%|          | 0/2344 [00:00<?, ?it/s]

In [39]:
# Persist the test embeddings as a .npy file in the Kaggle working directory.
np.save('/kaggle/working/finetuned-bert-test.npy', test_embeddings)