In [1]:
import numpy as np
import torch
from datasets import Dataset, DatasetDict, load_dataset
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from transformers import AutoTokenizer
# Load the labelled reviews; a single CSV file is exposed by `datasets` as the 'train' split
dataset = load_dataset('csv', data_files='labeled_reviews_en.csv')
# Convert to a pandas DataFrame so scikit-learn can perform the split
df = dataset['train'].to_pandas()

# Hold out 10% of the rows for evaluation.
# The split is seeded so Restart & Run All reproduces it, and stratified on the label so
# every class appears in the training split (the class weights below need one entry per class).
SEED = 42
train_df, test_df = train_test_split(
    df,
    test_size=0.1,
    random_state=SEED,
    stratify=df['label'],
)

# Convert DataFrames back to Hugging Face datasets.
# `from_pandas` is a classmethod, so chaining it after `class_encode_column(...)` discarded the
# encoded dataset; build the datasets directly instead. `preserve_index=False` keeps the shuffled
# pandas index from being stored as an extra `__index_level_0__` column.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

# NOTE(review): the label column is assumed to already hold integer class ids — confirm against the CSV
labels = np.asarray(train_dataset['label'])

# Calculate class weights ('balanced' = inversely proportional to class frequency).
# The resulting weights are ordered like np.unique(labels).
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(labels),
    y=labels
)
class_weights = torch.tensor(class_weights, dtype=torch.float)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from transformers import AutoTokenizer
# Tokenizer for the multilingual sentiment checkpoint
# (alternative checkpoint tried earlier: "distilbert-base-uncased")
tokenizer_checkpoint = "nlptown/bert-base-multilingual-uncased-sentiment"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

# Preprocess function
def preprocess_function(examples):
    """Tokenize a batch of reviews.

    Args:
        examples: a batch as passed by ``Dataset.map(..., batched=True)``;
            its ``"text"`` entry holds the review strings.

    Returns:
        The tokenizer output for the batch, truncated to the tokenizer's
        maximum length.

    No padding is applied here. Padding inside a batched ``map`` pads every
    example to the longest review of the whole map batch, which wastes compute
    on every training step. The Trainer is given the tokenizer and pads each
    mini-batch dynamically instead (its default collator when a tokenizer is
    supplied).
    """
    return tokenizer(examples["text"], truncation=True)

# Tokenize both splits (train first, then test) and rebind the split variables to the results
train_dataset, test_dataset = (
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, test_dataset)
)

# Bundle the tokenized splits under the keys the Trainer cell looks up
tokenized_datasets = DatasetDict({'train': train_dataset, 'test': test_dataset})

Map: 100%|██████████| 351/351 [00:00<00:00, 25299.46 examples/s]
Map: 100%|██████████| 40/40 [00:00<00:00, 10052.26 examples/s]


In [3]:
from transformers import AutoModelForSequenceClassification

# Problem categories keyed by class id; the reverse mapping and the label count are derived from it
id2label = {0: "None", 1: "Other", 2: "Incorrect size", 3: "Lack of instructions"}
label2id = {category: class_id for class_id, category in id2label.items()}
num_labels = len(id2label)

# ignore_mismatched_sizes allows loading the checkpoint although its classification head
# has a different number of outputs than num_labels; the head is then newly initialised.
model = AutoModelForSequenceClassification.from_pretrained(
    "nlptown/bert-base-multilingual-uncased-sentiment",
    num_labels=num_labels,
    ignore_mismatched_sizes=True,
    id2label=id2label,
    label2id=label2id,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlptown/bert-base-multilingual-uncased-sentiment and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([5, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([5]) in the checkpoint and torch.Size([4]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
from transformers import TrainingArguments, Trainer

# Fine-tuning hyper-parameters. Evaluation and checkpointing both happen once per epoch,
# and the best checkpoint is loaded back when training ends.
training_config = {
    "output_dir": "./models/review_classifier",
    "learning_rate": 2e-5,
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 8,
    "num_train_epochs": 10,
    "weight_decay": 0.01,
    "evaluation_strategy": "epoch",
    "save_strategy": "epoch",
    "load_best_model_at_end": True,
}
training_args = TrainingArguments(**training_config)

# Accuracy is the metric reported at every evaluation
import evaluate
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Turn the model's evaluation output into an accuracy score.

    Args:
        eval_pred: pair of (per-class scores, reference label ids).

    Returns:
        Whatever ``accuracy.compute`` returns for the arg-max predictions
        compared with the reference labels.
    """
    scores, references = eval_pred
    predicted_ids = scores.argmax(-1)  # highest-scoring class per example
    return accuracy.compute(predictions=predicted_ids, references=references)

# Baseline: plain (unweighted) Trainer, kept commented out for reference
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=tokenized_datasets["train"],
#     eval_dataset=tokenized_datasets["test"],
#     tokenizer=tokenizer,
#     compute_metrics=compute_metrics
# )

class WeightedTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by the notebook-level ``class_weights``."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the class-weighted cross-entropy loss for one batch.

        Args:
            model: the model being trained or evaluated.
            inputs: mapping of model inputs; must contain ``"labels"``.
            return_outputs: when true, return ``(loss, outputs)`` instead of just the loss.
            num_items_in_batch: accepted (and ignored) because newer versions of
                ``Trainer`` pass this keyword to ``compute_loss``; without it the
                override raises ``TypeError`` on those versions.

        Returns:
            The scalar loss, or ``(loss, outputs)`` if ``return_outputs`` is true.
        """
        labels = inputs.get("labels")
        # Run the forward pass without the labels so the model does not also compute
        # its own unweighted loss, which would simply be discarded.
        model_inputs = {key: value for key, value in inputs.items() if key != "labels"}
        outputs = model(**model_inputs)
        logits = outputs.get('logits')
        # Convert class weights to the same device as logits
        class_weights_device = class_weights.to(logits.device)
        loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights_device)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

# Wire the model, both tokenized splits, the tokenizer and the metric into the class-weighted trainer
train_split = tokenized_datasets["train"]
eval_split = tokenized_datasets["test"]

trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=eval_split,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Downloading builder script: 100%|██████████| 4.20k/4.20k [00:00<00:00, 17.3MB/s]


In [6]:
# Fine-tune the model; per training_args this evaluates and saves a checkpoint after every epoch
trainer.train()

                                                
 10%|█         | 44/440 [00:21<05:03,  1.30it/s]

{'eval_loss': 1.0518457889556885, 'eval_accuracy': 0.725, 'eval_runtime': 0.3959, 'eval_samples_per_second': 101.037, 'eval_steps_per_second': 12.63, 'epoch': 1.0}


                                                
 20%|██        | 88/440 [00:35<01:30,  3.90it/s]

{'eval_loss': 0.995893657207489, 'eval_accuracy': 0.7, 'eval_runtime': 0.2863, 'eval_samples_per_second': 139.727, 'eval_steps_per_second': 17.466, 'epoch': 2.0}


                                                 
 30%|███       | 132/440 [00:50<01:18,  3.90it/s]

{'eval_loss': 1.2797338962554932, 'eval_accuracy': 0.7, 'eval_runtime': 0.2873, 'eval_samples_per_second': 139.204, 'eval_steps_per_second': 17.4, 'epoch': 3.0}


                                                 
 40%|████      | 176/440 [01:04<01:07,  3.93it/s]

{'eval_loss': 1.1215875148773193, 'eval_accuracy': 0.775, 'eval_runtime': 0.2861, 'eval_samples_per_second': 139.809, 'eval_steps_per_second': 17.476, 'epoch': 4.0}


                                                 
 50%|█████     | 220/440 [01:18<00:59,  3.71it/s]

{'eval_loss': 1.0500085353851318, 'eval_accuracy': 0.775, 'eval_runtime': 0.2879, 'eval_samples_per_second': 138.953, 'eval_steps_per_second': 17.369, 'epoch': 5.0}


                                                 
 60%|██████    | 264/440 [01:33<00:45,  3.90it/s]

{'eval_loss': 0.9944307208061218, 'eval_accuracy': 0.775, 'eval_runtime': 0.2859, 'eval_samples_per_second': 139.908, 'eval_steps_per_second': 17.489, 'epoch': 6.0}


                                                 
 70%|███████   | 308/440 [01:47<00:33,  3.92it/s]

{'eval_loss': 1.1747077703475952, 'eval_accuracy': 0.75, 'eval_runtime': 0.3055, 'eval_samples_per_second': 130.943, 'eval_steps_per_second': 16.368, 'epoch': 7.0}


                                                 
 80%|████████  | 352/440 [02:01<00:22,  3.90it/s]

{'eval_loss': 1.1956474781036377, 'eval_accuracy': 0.775, 'eval_runtime': 0.2856, 'eval_samples_per_second': 140.033, 'eval_steps_per_second': 17.504, 'epoch': 8.0}


                                                 
 90%|█████████ | 396/440 [02:15<00:11,  3.92it/s]

{'eval_loss': 1.2687416076660156, 'eval_accuracy': 0.775, 'eval_runtime': 0.2868, 'eval_samples_per_second': 139.491, 'eval_steps_per_second': 17.436, 'epoch': 9.0}


                                                 
100%|██████████| 440/440 [02:30<00:00,  3.90it/s]

{'eval_loss': 1.2940075397491455, 'eval_accuracy': 0.775, 'eval_runtime': 0.2849, 'eval_samples_per_second': 140.386, 'eval_steps_per_second': 17.548, 'epoch': 10.0}


100%|██████████| 440/440 [02:33<00:00,  2.86it/s]

{'train_runtime': 153.6133, 'train_samples_per_second': 22.85, 'train_steps_per_second': 2.864, 'train_loss': 0.4938087463378906, 'epoch': 10.0}





TrainOutput(global_step=440, training_loss=0.4938087463378906, metrics={'train_runtime': 153.6133, 'train_samples_per_second': 22.85, 'train_steps_per_second': 2.864, 'total_flos': 126264740565600.0, 'train_loss': 0.4938087463378906, 'epoch': 10.0})

In [7]:
# Persist the fine-tuned model, then reload it from the same directory
model_dir = "./models/review_classifier"
trainer.save_model(model_dir)

# To load:
model = AutoModelForSequenceClassification.from_pretrained(model_dir)

In [10]:
from transformers import pipeline

# Wrap the fine-tuned model and its tokenizer in a text-classification pipeline
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)

# Sample reviews for a quick smoke test; exactly one is active at a time.
# (Previously "Very easy to knit!" was assigned and immediately overwritten — a dead store.)
# review = "Nice cut but I only recommend it to experienced hares. It's a pity that there is no information about the addition of food. You only find out about this after you buy it, so anyone who buys it should bear this in mind. Therefore, one point deduction."
# review = "Very easy to knit!"
review = "Instructions were quite complicated, I couldn't do it."
print(classifier(review))

[{'label': 'Lack of instructions', 'score': 0.5856142640113831}]


In [None]:
import pandas as pd 
from tqdm import trange
# Classify every translated review and export the table with a new "problem_category" column
data_reviews = pd.read_csv('reviews_with_sentiment3.csv')
translated_reviews = data_reviews["translated_text"]

# Collect the categories in a list and assign the column once, instead of writing
# row by row with .loc. The row count comes from the column that is actually read.
problem_categories = []
for row in trange(len(translated_reviews)):
    # Positional access: does not depend on the frame having a default 0..n-1 index
    designer_review = translated_reviews.iloc[row]
    if not isinstance(designer_review, str):
        # Missing review (read by pandas as NaN): the pipeline cannot classify it, leave the category empty
        problem_categories.append("")
        continue
    # truncation=True keeps reviews longer than the model's maximum input length from raising
    problem_category = classifier(designer_review, truncation=True)[0]["label"]
    if problem_category == "None":
        problem_category = "No problem"
    problem_categories.append(problem_category)

data_reviews["problem_category"] = problem_categories

# index=False: otherwise the row index is written too and comes back as an
# "Unnamed: 0" column when the file is read again
data_reviews.to_csv("reviews_with_sentiment_with_problem_category.csv", index=False)

In [None]:
# Read the exported CSV back from disk and display it to inspect what was written
data_reviews2 = pd.read_csv("reviews_with_sentiment_with_problem_category.csv")
data_reviews2