In [36]:
from huggingface_hub import login
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score
import numpy as np
import os
os.environ["WANDB_DISABLED"] = "true"

In [27]:
ds = load_dataset("rkf2778/amazon_reviews_mobile_electronics")

In [28]:
login(token="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")

In [29]:
def preprocess(example):
    if example["star_rating"] in [1, 2]:
        return {"text": example["review_body"], "label": 0}
    elif example["star_rating"] in [4, 5]:
        return {"text": example["review_body"], "label": 1}
    else:
        return None 

In [30]:
filtered_dataset = ds["train"].filter(lambda e: e["star_rating"] != 3)
processed_dataset = processed_dataset.filter(lambda x: isinstance(x["text"], str) and len(x["text"]) > 0)

Filter:   0%|          | 0/62366 [00:00<?, ? examples/s]

In [31]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [32]:
processed_dataset.column_names


['text', 'label']

In [33]:
def tokenize_function(batch):
    texts = batch["text"]
    if not isinstance(texts, list):
        texts = [texts]
    texts = [str(t) if t is not None else "" for t in texts]
    return tokenizer(texts, padding="max_length", truncation=True)


tokenized = processed_dataset.map(tokenize_function, batched=True)
tokenized.set_format("torch", columns=["input_ids", "attention_mask", "label"])


Map:   0%|          | 0/62365 [00:00<?, ? examples/s]

In [37]:
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [39]:
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    return {"accuracy": accuracy_score(p.label_ids, preds)}

In [45]:
args = TrainingArguments(
    output_dir="./bert-novex-sentiment",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    report_to="none", 
    push_to_hub=True,  
    hub_model_id="eidrieenbe/bert-novex-reviews"
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized,
    eval_dataset=tokenized.select(range(1000)),  
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,0.1389,0.079118,0.977




In [47]:
from transformers import pipeline

model_name = "eidrieenbe/bert-novex-reviews"  

classifier = pipeline("text-classification", model=model_name, tokenizer="bert-base-uncased")

review = "This product is amazing! Really exceeded my expectations."
prediction = classifier(review)
print(prediction)


config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Device set to use cuda:0


[{'label': 'LABEL_1', 'score': 0.9990335702896118}]
