In [3]:
import torch
import sklearn
import transformers
import numpy as np
from transformers import BertModel, BertTokenizerFast
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification


In [21]:
# Record the library versions this notebook was run with, for reproducibility.
library_versions = [
    f"transformers: {transformers.__version__}",
    f"torch: {torch.__version__}",
    f"sklearn: {sklearn.__version__}",
    f"numpy: {np.__version__}",
]
print(*library_versions)

transformers: 4.18.0 torch: 1.12.1 sklearn: 1.1.1 numpy: 1.21.5


In [None]:
# Load the tokenizer and the encoder from the same pretrained checkpoint.
# output_hidden_states=True asks the model to also return per-layer hidden states.
PRETRAINED_NAME = "bert-base-uncased"
tokenizer = BertTokenizerFast.from_pretrained(PRETRAINED_NAME)
model = BertModel.from_pretrained(PRETRAINED_NAME, output_hidden_states=True)

In [12]:
# Free-text product comments, one per example.
sentiment_data_textual = [
    "This microwave that I bought in your store is awesome!",
    "This device is really bad, I won't recommend to buy it",
    "My daughter is happy with this reeader. I recommend it!",
    "I can't say that I will buy this bulb again",
]

# Already-encoded numerical/categorical features, one row per comment.
# Column layout:
#   [0]    binary flag (True|False)
#   [1:5]  one-hot category (category_1 | category_2 | category_3 | category_4)
#   [5]    price
#   [6]    number of views
sentiment_data_tabular = [
    [1, 0, 0, 1, 0, 811.32, 340],
    [0, 1, 0, 0, 0, 1000.0, 20],
    [1, 0, 1, 0, 0, 200.74, 1200],
    [0, 0, 0, 0, 1, 1.1, 17],
]

# Target labels aligned with the rows above: 1 = positive sentiment, 0 = negative.
y = [1, 0, 1, 0]

In [7]:
# Tokenize all comments as a single batch: pad to a common length, truncate
# over-long inputs, and return PyTorch tensors.
tokens = tokenizer(
    sentiment_data_textual,
    return_tensors="pt",
    truncation=True,
    padding=True,
)
# len() here counts the fields of the tokenizer output, not the number of comments.
print(len(tokens))

3


In [8]:
# Switch BERT to evaluation mode before extracting embeddings. eval() acts in
# place, so no re-assignment is needed; the trailing semicolon keeps the cell
# from echoing the module repr.
model.eval();

In [10]:
# Run BERT over the tokenized comments without tracking gradients.
# The batch was tokenized with padding=True, so the attention mask must be
# passed along with the input ids; otherwise the model attends to the [PAD]
# tokens and the embeddings of the shorter comments get distorted.
# Unpacking `tokens` forwards every field the tokenizer produced.
with torch.no_grad():
    text_embeddings = model(**tokens)

In [14]:
# As a first attempt, use BERT's pooled output as the text embedding for the
# classification task. If it does not work well enough, other embeddings
# (e.g. derived from the hidden states) can be tried instead.
# getattr with a fallback keeps this cell re-runnable: on a second run
# `text_embeddings` is already the tensor and has no `pooler_output` attribute.
text_embeddings = getattr(text_embeddings, "pooler_output", text_embeddings)

In [15]:
# concatenating of the tabular and textual data
X = np.array(sentiment_data_tabular)
X = np.concatenate([X, text_embeddings], axis=1)

In [16]:
# Sanity check: (number of examples, tabular columns + embedding columns)
np.shape(X)

(4, 775)

In [17]:
# Fit a random forest on the combined tabular + text features.
# XGBoost or any other model could be used here instead.
# random_state is pinned so that the fitted forest (and therefore the
# predictions below) are the same on every run of the notebook.
clf = RandomForestClassifier(max_depth=10, random_state=42)
clf.fit(X, y)

In [19]:
# Predict the class of the first example with the classifier fitted above.
# X[:1] keeps the 2-D shape (one row) that predict() is given in this notebook.
first_example = X[:1]
prediction = clf.predict(first_example)
predicted_label = "Positive" if prediction[0] > 0.5 else "Negative"
print("Predicted class for the first example is:", predicted_label)

Predicted class for the first example is: Positive
