In [1]:
from transformers import DistilBertTokenizer, DistilBertModel
import torch
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

# Load the tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

In [22]:
DATASET = ["meta_Beauty_and_Personal_Care", "meta_Books", "meta_Home_and_Kitchen"]
LABELS = ["personal_care", "book", "home"]
COLUMN_SELECTIONS = ["main_category", "title", "features"]

documents = []
for label in LABELS:
    # Read the extracted raw data
    df_temp = pd.read_parquet(f"../../data/raw/{label}.parquet")
    df_temp.title = df_temp.title.astype("str")
    df_temp.features = df_temp.features.astype("str")
    # Convert text to lower case
    df_temp = df_temp.drop("main_category", axis=1).apply(lambda x: x.str.lower())
    # Append non-empty string to documents
    documents += df_temp.title[df_temp.title != ""].to_list()[:10000]
    # documents += df_temp.features[df_temp.features != ""].to_list()

print(len(documents))

30000


In [3]:
# Setup backend device
if torch.cuda.is_available():
    # Check for CUDA (traditional GPUs)
    device = torch.device("cuda")
    print("PyTorch is using CUDA.")
elif torch.backends.mps.is_available():
    # Check for MPS (Apple Silicon GPUs)
    device = torch.device("mps")
    print("PyTorch is using MPS.")
else:
    device = torch.device("cpu")
    print("PyTorch is using CPU.")

PyTorch is using CUDA.


In [6]:
# Define batch size
batch_size = 200

# Send data to GPU
model.to(device)

# Function to get embeddings for a batch of texts
def get_embeddings(texts_batch):
    inputs = tokenizer(texts_batch, padding=True, truncation=True, return_tensors="pt").to(device)
    # inputs = {key: value.to(device) for key, value in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
        embeddings = outputs.last_hidden_state.mean(dim=1)  # Mean pooling
    return embeddings.cpu().numpy()

In [23]:
# Convert word to vector
from math import ceil

num_batch = ceil(len(documents) / batch_size)
for i in range(num_batch):
    print(f"Batch {i+1}")
    if i == 0:
        xtrain = get_embeddings(documents[i * batch_size:(i + 1) * batch_size])
    elif i == num_batch - 1:
        xtrain = np.vstack([xtrain, get_embeddings(documents[i * batch_size:len(documents)])])
    else:
        xtrain = np.vstack([xtrain, get_embeddings(documents[i * batch_size:(i + 1) * batch_size])])

In [26]:
# Dummy labels
ytrain = np.array([0] * 10000 + [1] * 10000 + [2] * 10000)

In [30]:
from sklearn.linear_model import LogisticRegression
logmod = LogisticRegression(solver="lbfgs", max_iter=1000).fit(xtrain, ytrain)

In [32]:
ypred = logmod.predict(xtrain)

In [33]:
(ypred == ytrain).sum() / len(ytrain)

0.9726