In [1]:
from transformers import DistilBertTokenizerFast, DistilBertModel
import torch
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from math import ceil
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
from numpy import load

In [2]:
DATASET = ["meta_Beauty_and_Personal_Care", "meta_Books", "meta_Home_and_Kitchen"]
LABELS = ["personal_care", "book", "home"]
COLUMN_SELECTIONS = ["main_category", "title", "features"]

label_to_id = {"personal_care": 0, "book": 1, "home": 2}

documents = []
ytrain = []
for label in LABELS:
    # Read the extracted raw data
    df_temp = pd.read_parquet(f"../../data/raw/{label}.parquet")
    df_temp.title = df_temp.title.astype("str")
    df_temp.features = df_temp.features.astype("str")
    # Convert text to lower case
    df_temp = df_temp.drop("main_category", axis=1).apply(lambda x: x.str.lower())
    # Append non-empty string to documents
    documents += df_temp.title[df_temp.title != ""].to_list()
    # documents += df_temp.features[df_temp.features != ""].to_list()
    ytrain += [label_to_id[label]] * len(df_temp.title[df_temp.title != ""])

print(len(documents))

2999754


In [3]:
# Load the tokenizer and model
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

# Setup backend device
if torch.cuda.is_available():
    # Check for CUDA (traditional GPUs)
    device = torch.device("cuda")
    print("PyTorch is using CUDA.")
elif torch.backends.mps.is_available():
    # Check for MPS (Apple Silicon GPUs)
    device = torch.device("mps")
    print("PyTorch is using MPS.")
else:
    device = torch.device("cpu")
    print("PyTorch is using CPU.")

PyTorch is using MPS.


In [14]:
# Define batch size
batch_size = 200

# Send data to GPU
model.to(device)

# Function to get embeddings for a batch of texts
def get_embeddings(texts_batch):
    inputs = tokenizer(texts_batch, padding=True, truncation=True, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        embeddings = outputs.last_hidden_state.mean(dim=1)  # Mean pooling
    return embeddings.cpu().numpy()

In [None]:
# Convert word to vector
from math import ceil

num_batch = ceil(len(documents) / batch_size)
for i in range(num_batch):
    if i % 4 == 0:
        print(f"Batch {i+1}") 
    if i == 0:
        xtrain = get_embeddings(documents[i * batch_size:(i + 1) * batch_size])
    elif i == num_batch - 1:
        xtrain = np.vstack([xtrain, get_embeddings(documents[i * batch_size:len(documents)])])
    else:
        xtrain = np.vstack([xtrain, get_embeddings(documents[i * batch_size:(i + 1) * batch_size])])

In [7]:
logmod = LogisticRegression(solver="lbfgs", max_iter=1000).fit(xtrain, ytrain)

ypred = logmod.predict(xtrain)

print(accuracy_score(ytrain, ypred))
print(confusion_matrix(ytrain, ypred))

In [14]:
from transformers import DistilBertTokenizer, DistilBertModel
import torch
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
from sklearn.ensemble import RandomForestClassifier

# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

# Move model to GPU
model.to(device)

# Extract the embedding layer
# embedding_layer = model.embeddings
embedding_layer = model.embeddings

# Example texts
texts = documents

# Tokenize texts
inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

# Create a TensorDataset
dataset = TensorDataset(inputs['input_ids'])

# Define batch size and DataLoader
batch_size = 200
dataloader = DataLoader(dataset, batch_size=batch_size)

# Function to get embeddings for a batch of input_ids
def get_embeddings(input_ids):
    input_ids = input_ids.to(device)
    with torch.no_grad():
        embeddings = embedding_layer(input_ids)
        embeddings = embeddings.mean(dim=1)  # Mean pooling
    return embeddings.cpu().numpy()

# Process texts in batches using DataLoader
all_embeddings = []
for batch in dataloader:
    input_ids = batch[0]
    batch_embeddings = get_embeddings(input_ids)
    all_embeddings.append(batch_embeddings)

# Concatenate all embeddings
all_embeddings_np = np.concatenate(all_embeddings, axis=0)

In [5]:
# # save numpy array as npz file
# from numpy import asarray
# from numpy import savez_compressed

# # save to npy file
# savez_compressed('data.npz', all_embeddings_np)
# savez_compressed('ytrain_label.npz', ytrain)

In [2]:
# load dict of arrays
dict_data = load('data.npz')
# extract the first array
X = dict_data['arr_0']

# load dict of arrays
dict_data = load('ytrain_label.npz')
# extract the first array
y = dict_data['arr_0']

In [3]:
from sklearn.model_selection import StratifiedShuffleSplit
splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.4, random_state=0)

for train_idx, test_idx in splitter.split(X, y):
    X_train = [X[i] for i in train_idx]
    y_train = [y[i] for i in train_idx]
    X_test = [X[i] for i in test_idx]
    y_test = [y[i] for i in test_idx]

In [5]:
y_train = np.array(y_train)

In [12]:
logmod = LogisticRegression(solver="lbfgs", max_iter=500).fit(X_train, y_train)

# logmod = SGDClassifier(loss="log_loss", n_jobs=-1, max_iter=1000)
# batch_size = 100000
# num_batch = ceil(len(all_embeddings_np) / batch_size)
# for i in range(num_batch):
#     print(f"Batch {i+1}") 
#     if i == num_batch - 1:
#         logmod.partial_fit(all_embeddings_np[i * batch_size : len(all_embeddings_np)], ytrain[i * batch_size : len(all_embeddings_np)], classes=[0, 1, 2])
#     else:
#         logmod.partial_fit(all_embeddings_np[i * batch_size : (i + 1) * batch_size], ytrain[i * batch_size : (i + 1) * batch_size], classes=[0, 1, 2])

In [None]:
ypred = logmod.predict(X_train)

In [None]:
print(accuracy_score(y_train, ypred))
print(confusion_matrix(y_train, ypred))

In [21]:
ypred = logmod.predict(X_test)
print(accuracy_score(y_test, ypred))
print(confusion_matrix(y_test, ypred))

0.9379540995847995
[[369205   8040  22734]
 [  3073 390595   6332]
 [ 21439  12831 365653]]


In [23]:
from sklearn.metrics import classification_report


print(classification_report(y_test, ypred))


              precision    recall  f1-score   support

           0       0.94      0.92      0.93    399979
           1       0.95      0.98      0.96    400000
           2       0.93      0.91      0.92    399923

    accuracy                           0.94   1199902
   macro avg       0.94      0.94      0.94   1199902
weighted avg       0.94      0.94      0.94   1199902



In [22]:
import joblib
joblib.dump(logmod, "../../src/models/lr_lable_bert_emb_full_60_0.joblib", compress=3)

['../../src/models/lr_lable_bert_emb_full_60_0.joblib']