In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found anywhere beneath the Kaggle
# input directory, then echo the paths one per line in traversal order.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/harshada/final_train_dataset.csv


In [None]:
# Upgrade libraries to latest versions
!pip install --upgrade transformers huggingface_hub

# Restart kernel after installation (important!)
import os
os._exit(00)


Collecting transformers
  Downloading transformers-4.57.0-py3-none-any.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.4/41.4 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface_hub
  Downloading huggingface_hub-0.35.3-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers)
  Downloading tokenizers-0.22.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Downloading transformers-4.57.0-py3-none-any.whl (12.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m102.1 MB/s[0m eta [36m0:00:00[0m00:01[0m:01[0m
[?25hDownloading huggingface_hub-0.35.3-py3-none-any.whl (564 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m564.3/564.3 kB[0m [31m37.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.22.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [8]:
# Install tqdm (usually already installed in Kaggle)
!pip install tqdm

import torch
from transformers import AutoTokenizer, AutoModel
import pandas as pd
from tqdm import tqdm  # Import tqdm for progress bar
import numpy as np

# Read the training set from the attached Kaggle input dataset
df = pd.read_csv('/kaggle/input/harshada/final_train_dataset.csv')

# Build one text field per row: the four descriptive columns, with missing
# values replaced by empty strings, joined left-to-right by a ' [SEP] ' marker.
text_columns = ['item_name', 'brand_name', 'bullet_points', 'product_description']
combined_text = df[text_columns[0]].fillna('')
for column in text_columns[1:]:
    combined_text = combined_text + ' [SEP] ' + df[column].fillna('')
df['text'] = combined_text

# Fetch the tokenizer and the encoder from the same pretrained checkpoint
checkpoint = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)

# Run on the GPU when torch can see one, otherwise stay on the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model.to(device)
print(f"Using device: {device}")

def get_bert_embeddings(texts, batch_size=64):
    """Encode texts in batches and return one [CLS] embedding per text.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        texts: The strings to embed. Anything exposing ``.tolist()`` (for
            example a pandas Series or a NumPy array) or any other iterable
            of strings is accepted.
        batch_size: Number of texts encoded per forward pass. Must be positive.

    Returns:
        A NumPy array with one row per input text, in input order. Each row is
        the last-hidden-state vector at sequence position 0. Empty input gives
        an array of shape ``(0, model.config.hidden_size)``.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    # Materialise once into a plain list so slicing is purely positional and
    # the tokenizer always receives a list of strings.
    text_list = texts.tolist() if hasattr(texts, "tolist") else list(texts)

    # torch.cat() rejects an empty list, so return a correctly shaped empty
    # result instead of crashing when there is nothing to embed.
    if not text_list:
        return torch.empty((0, model.config.hidden_size), dtype=model.dtype).numpy()

    all_embeddings = []
    # tqdm reads the number of batches from len(range(...)); a hand-computed
    # total of n // batch_size + 1 is off by one when n divides evenly.
    for start in tqdm(range(0, len(text_list), batch_size), desc="Generating BERT Embeddings"):
        batch_texts = text_list[start:start + batch_size]
        inputs = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,  # inputs longer than max_length tokens are cut off
            max_length=128,
            return_tensors="pt"
        ).to(device)  # Move inputs to the same device as the model
        with torch.no_grad():  # inference only, no gradients needed
            outputs = model(**inputs)
        # Use the [CLS] token (sequence position 0) as the text embedding and
        # move it back to the CPU so results do not accumulate on the GPU.
        cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu()
        all_embeddings.append(cls_embeddings)
    return torch.cat(all_embeddings, dim=0).numpy()

# Generate embeddings with progress bar
print("Starting embedding generation...")
embeddings = get_bert_embeddings(df['text'])

# Persist the raw embedding matrix once, right after it is computed, so it
# survives even if the (much larger) CSV export below fails.
np.save('/kaggle/working/text_embeddings.npy', embeddings)

# Add embeddings to DataFrame: one emb_<i> column per embedding dimension
print("Saving results...")
embedding_cols = [f'emb_{i}' for i in range(embeddings.shape[1])]
# Reuse df's index so the column-wise concat lines rows up one-to-one instead
# of relying on both frames happening to share a default index.
embeddings_df = pd.DataFrame(embeddings, columns=embedding_cols, index=df.index)
df_out = pd.concat([df, embeddings_df], axis=1)

# Save to CSV
df_out.to_csv('/kaggle/working/products_with_embeddings.csv', index=False)
print("✅ Done! File saved to /kaggle/working/products_with_embeddings.csv")


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Using device: cuda
Starting embedding generation...


Generating BERT Embeddings: 100%|██████████| 877/877 [07:18<00:00,  2.00it/s]


Saving results...
✅ Done! File saved to /kaggle/working/products_with_embeddings.csv


In [None]:
# Write the combined frame (original columns plus embedding columns) to the
# working directory, without the index column.
output_csv_path = '/kaggle/working/products_with_embeddings.csv'
df_out.to_csv(output_csv_path, index=False)