# import library

In [26]:
import torch
import torch.nn as nn
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from sklearn.metrics import f1_score, classification_report
import numpy as np
import pandas as pd
import os

# Î™®Îç∏ Ï†ïÏùò

In [27]:
# Dataset class
class MultiLabelDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        labels = self.labels[idx]
        
        encoding = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        
        labels = [label + 1 for label in labels]
        encoding['labels'] = torch.tensor(labels, dtype=torch.long)
        return encoding

# Model class
class CustomRobertaForMultiLabelClassification(nn.Module):
    def __init__(self, num_labels_per_category=3, num_categories=5):
        super(CustomRobertaForMultiLabelClassification, self).__init__()
        self.roberta = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=num_labels_per_category)
        self.num_categories = num_categories
        self.classifiers = nn.ModuleList([nn.Linear(self.roberta.config.hidden_size, num_labels_per_category) for _ in range(num_categories)])

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.roberta.roberta(input_ids=input_ids, attention_mask=attention_mask)
        logits = torch.stack([classifier(outputs.last_hidden_state[:, 0, :]) for classifier in self.classifiers], dim=1)
        
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            losses = [loss_fct(logits[:, i, :], labels[:, i]) for i in range(self.num_categories)]
            loss = sum(losses) / self.num_categories
        
        return {"loss": loss, "logits": logits}


# Ìï®Ïàò Ï†ïÏùò

## def train

In [28]:
# Training function
def train(model, dataloader, optimizer, device):
    model.train()
    total_loss = 0
    
    progress_bar = tqdm(dataloader, desc="Training", leave=False)
    for batch in progress_bar:
        optimizer.zero_grad()
        
        input_ids = batch['input_ids'].squeeze(1).to(device)
        attention_mask = batch['attention_mask'].squeeze(1).to(device)
        labels = batch['labels'].to(device)
        
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs['loss']
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        progress_bar.set_postfix(loss=loss.item())
    
    avg_loss = total_loss / len(dataloader)
    return avg_loss



## def prediction

In [30]:
# Prediction function
def predict(model, sentence, tokenizer, device):
    model.eval()
    
    inputs = tokenizer(
        sentence,
        padding='max_length',
        truncation=True,
        max_length=128,
        return_tensors="pt"
    )
    
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)
    
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs['logits']
        predictions = torch.argmax(logits, dim=-1).cpu().numpy().squeeze() - 1
    
    labels = ['Product_Quality', 'Delivery', 'Price', 'Repurchase_Intention', 'etc']
    prediction_dict = {label: pred for label, pred in zip(labels, predictions)}
    

    return prediction_dict

# Data Load

In [31]:
data = pd.read_csv('../data/comment_translation_labeling.csv')
data.head()

Unnamed: 0,review_comment_message,review_comment_message_en,Product_Quality,Delivery,Price,Repurchase_Intention,etc
0,"Produto preto, reembalado na caixa de um branc...","Black product, repackaged in a white box. Very...",0.0,-1.0,0.0,0.0,0.0
1,FICAMOS ESPERANDO UMA ENCOMENDA PAGA QUE NUNCA...,WE ARE WAITING FOR A PAID ORDER THAT NEVER ARR...,0.0,-1.0,0.0,0.0,-1.0
2,Otimo muito bom,Excellent very good,0.0,0.0,0.0,0.0,1.0
3,Entrega antes do prazo. Produto muito bom e bo...,Delivery ahead of schedule. Very good and beau...,1.0,1.0,0.0,0.0,0.0
4,excelente.,excellent.,0.0,0.0,0.0,0.0,1.0


## ÎùºÎ≤®ÎßÅ ÎêòÏñ¥ ÏûàÎäî data Ï∂îÏ∂ú

In [45]:
df = data.iloc[:1100,:]
df.tail()

Unnamed: 0,review_comment_message,review_comment_message_en,Product_Quality,Delivery,Price,Repurchase_Intention,etc
1095,muito bom!,very good!,0.0,0.0,0.0,0.0,1.0
1096,"Produto comprado foi Moringa em c√°psulas, a qu...","The product purchased was Moringa in capsules,...",-1.0,0.0,0.0,0.0,0.0
1097,Loja nota 10,Store note 10,0.0,0.0,0.0,0.0,1.0
1098,M/ refer√™ncia se dar√° s√≥ qto a compra e entreg...,The reference will only be given when the prod...,0.0,1.0,0.0,0.0,0.0
1099,Muito bom chegou antes do previsto,Very good arrived earlier than expected,0.0,1.0,0.0,0.0,1.0


## textÏôÄ label Î¶¨Ïä§Ìä∏ Ï∂îÏ∂ú

In [46]:
texts = df['review_comment_message_en'].tolist()
labels = df[['Product_Quality', 'Delivery', 'Price', 'Repurchase_Intention', 'etc']].values.tolist()

print("Texts:", texts)
print("Labels:", labels)

Texts: ["Black product, repackaged in a white box. Very dented at the bottom in a way that no one would notice, it doesn't stabilize. Box in perfect condition, was crushed before being delivered to the post office.", 'WE ARE WAITING FOR A PAID ORDER THAT NEVER ARRIVED AND THE PURCHASE VALUE HAS STILL NOT BEEN REFUNDED', 'Excellent very good', 'Delivery ahead of schedule. Very good and beautiful product.', 'excellent.', 'Delivery came in half', "I ordered two tubs and only received one. I'm waiting for an answer on how I can resolve this, but the delivery was on time and the one I received is of good quality.", 'Product delivered in perfect condition and well before the stipulated delivery date', 'Great product', "Good evening, I received the goods partially because a tub is broken, what should I do? I'm at a loss or who is going to replace this broken situation?", 'Ameiiii!!! excellent product...excellent quality, delivered very well ahead of schedule, thank you', 'In economic shipping

# ÌÜ†ÌÅ¨ÎÇòÏù¥Ï†Ä Î°úÎìú Î∞è Ï≤òÎ¶¨

In [47]:
# # Sample data
# texts = ["Black product, repackaged in a white box...", "WE ARE WAITING FOR A PAID ORDER..."]
# labels = [[0, -1, 0, 0, 0], [0, -1, 0, 0, -1]]
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
dataset = MultiLabelDataset(texts, labels, tokenizer)
train_loader = DataLoader(dataset, batch_size=8, shuffle=True)

# Î™®Îç∏ Ï¥àÍ∏∞Ìôî

In [48]:
model = CustomRobertaForMultiLabelClassification()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# train

In [49]:
num_epochs = 5
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}")
    avg_loss = train(model, train_loader, optimizer, device)
    print(f"Training Loss: {avg_loss}")

Epoch 1/5


Training:   0%|          | 0/138 [00:00<?, ?it/s]

                                                                       

Training Loss: 0.515200050736683
Epoch 2/5


                                                                       

Training Loss: 0.27009070891401044
Epoch 3/5


                                                                        

Training Loss: 0.1852645972824615
Epoch 4/5


                                                                        

Training Loss: 0.1337185048268757
Epoch 5/5


                                                                        

Training Loss: 0.10150474597416494




# Î™®Îç∏ Ï†ÄÏû•

In [50]:
# Î™®Îç∏ Í∞ÄÏ§ëÏπòÎ•º Ï†ÄÏû•Ìï† Í≤ΩÎ°ú ÏßÄÏ†ï
model_save_path = "../model/custom_roberta_model_1100_5epoch.pth"

# Î™®Îç∏ Í∞ÄÏ§ëÏπò Ï†ÄÏû•
torch.save(model.state_dict(), model_save_path)
print(f"Model saved to {model_save_path}")


Model saved to ../model/custom_roberta_model_1100_5epoch.pth


# ÌååÏù∏ÌäúÎãù Î™®Îç∏ LOAD

## model load Ìï®Ïàò

In [41]:
def load_model(model_path, device = torch.device("cuda" if torch.cuda.is_available() else "cpu")):
    model = CustomRobertaForMultiLabelClassification()
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.to(device)
    model.eval()  # ÌèâÍ∞Ä Î™®ÎìúÎ°ú ÏÑ§Ï†ï
    print(f"Model loaded from {model_path}")
    return model

# f1 score

## Ìï®Ïàò

Ï†ÑÏ≤¥ ÎÇòÏó¥, ÎùºÎ≤®Î≥Ñ, Î¶¨Î∑∞Î≥Ñ ÏàúÏúºÎ°ú f1 score Î¶¨ÌÑ¥

In [14]:
# 3Í∞ÄÏßÄ F1-Score Í≥ÑÏÇ∞ Ìï®Ïàò Ï†ïÏùò
def calculate_f1_scores(test_labels, test_predictions):
    # Ï†ÑÏ≤¥ Î†àÏù¥Î∏îÏùÑ ÌèâÌÉÑÌôîÌïòÏó¨ F1-Score Í≥ÑÏÇ∞
    flattened_true_labels = [label for sublist in test_labels for label in sublist]
    flattened_predictions = [pred for sublist in test_predictions for pred in sublist]
    overall_f1 = f1_score(flattened_true_labels, flattened_predictions, average="micro")

    # Í∞Å Î†àÏù¥Î∏îÎ≥Ñ F1-Score Í≥ÑÏÇ∞
    label_names = ['Product_Quality', 'Delivery', 'Price', 'Repurchase_Intention', 'etc']
    label_f1_scores = {}
    for i, label_name in enumerate(label_names):
        true_labels = [label[i] for label in test_labels]
        pred_labels = [pred[i] for pred in test_predictions]
        label_f1 = f1_score(true_labels, pred_labels, average="micro")
        label_f1_scores[label_name] = label_f1

    # Î¶¨Î∑∞ Îã®ÏúÑ Ï†ïÌôïÌïú Îß§Ïπ≠ Í∏∞Î∞ò F1-Score Í≥ÑÏÇ∞
    review_matches = [1 if true == pred else 0 for true, pred in zip(test_labels, test_predictions)]
    true_review_matches = [1] * len(review_matches)
    review_level_f1 = f1_score(true_review_matches, review_matches, average="binary")

    return overall_f1, label_f1_scores, review_level_f1

## test data load

In [54]:
test_df = data.iloc[1100:1200, :]
test_df.tail()


Unnamed: 0,review_comment_message,review_comment_message_en,Product_Quality,Delivery,Price,Repurchase_Intention,etc
1195,"Maravilhoso, e a entrega super rapido, recomendo.","Wonderful, and super fast delivery, I recommen...",0.0,1.0,0.0,1.0,0.0
1196,chegou tudo certo,everything arrived ok,0.0,0.0,0.0,0.0,1.0
1197,parece bom ainda n√£o usei.,"It looks good, I haven't used it yet.",1.0,0.0,0.0,0.0,0.0
1198,"n√£o recebi o produto completo como deveria, co...",I didn't receive the complete product as I sho...,0.0,0.0,0.0,0.0,1.0
1199,√≥timo produto.\nchegou bem r√°pido .\nem menos ...,great product.\narrived very quickly.\nin less...,1.0,1.0,0.0,0.0,0.0


In [55]:
# Î¶¨Î∑∞ ÌÖçÏä§Ìä∏ÏôÄ Î†àÏù¥Î∏î Ï∂îÏ∂ú
test_texts = test_df['review_comment_message_en'].tolist()
test_labels = test_df[['Product_Quality', 'Delivery', 'Price', 'Repurchase_Intention', 'etc']].values.tolist()


## Îã®Ïùº Î™®Îç∏ f1-score

### Î™®Îç∏ Î∂àÎü¨Ïò§Í∏∞

In [56]:
model_path = "../model/custom_roberta_model_1100_5epoch.pth"

In [57]:
loaded_model = load_model(model_path)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load(model_path, map_location=device))


Model loaded from ../model/custom_roberta_model_1100_5epoch.pth


### f1 score Ï∂úÎ†•(Îã®Ïùº Î™®Îç∏)

In [58]:
# ÏòàÏ∏° ÏàòÌñâ
test_predictions = []
for sentence in test_texts:
    prediction = predict(loaded_model, sentence, tokenizer, device)
    test_predictions.append([
        prediction['Product_Quality'],
        prediction['Delivery'],
        prediction['Price'],
        prediction['Repurchase_Intention'],
        prediction['etc']
    ])

# 3. F1-Score Í≥ÑÏÇ∞
overall_f1, label_f1_scores, review_level_f1 = calculate_f1_scores(test_labels, test_predictions)

# Í≤∞Í≥º Ï∂úÎ†•
print(f"Results for model {model_path}:")
print(f"  Overall F1-Score: {overall_f1:.4f}")
for label_name, f1 in label_f1_scores.items():
    print(f"  {label_name} F1-Score: {f1:.4f}")
print(f"  Review-level Exact Match F1-Score: {review_level_f1:.4f}")
print("--------------------------------------------------")

Results for model ../model/custom_roberta_model_1100_5epoch.pth:
  Overall F1-Score: 0.9100
  Product_Quality F1-Score: 0.9000
  Delivery F1-Score: 0.9000
  Price F1-Score: 0.9900
  Repurchase_Intention F1-Score: 0.9800
  etc F1-Score: 0.7800
  Review-level Exact Match F1-Score: 0.7879
--------------------------------------------------


## Î™®Îì† Î™®Îç∏ f1-score

### Î™®Îç∏ Î¶¨Ïä§Ìä∏ Î∂àÎü¨Ïò§Í∏∞

In [40]:
model_folder = "../model"
model_files = [f for f in os.listdir(model_folder) if f.endswith(".pth")]


### f1 score Ï∂úÎ†• (Î™®Îì† Î™®Îç∏)

In [25]:
for model_file in tqdm(model_files):
    # Î™®Îç∏ Î∂àÎü¨Ïò§Í∏∞
    model_path = os.path.join(model_folder, model_file)
    model = CustomRobertaForMultiLabelClassification()
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.to(device)
    model.eval()
    print(f"\nLoaded model: {model_file}")

    # ÏòàÏ∏° ÏàòÌñâ
    test_predictions = []
    for sentence in test_texts:
        prediction = predict(model, sentence, tokenizer, device)
        test_predictions.append([
            prediction['Product_Quality'],
            prediction['Delivery'],
            prediction['Price'],
            prediction['Repurchase_Intention'],
            prediction['etc']
        ])

    # 3. F1-Score Í≥ÑÏÇ∞
    overall_f1, label_f1_scores, review_level_f1 = calculate_f1_scores(test_labels, test_predictions)

    # Í≤∞Í≥º Ï∂úÎ†•
    print(f"Results for model {model_file}:")
    print(f"  Overall F1-Score: {overall_f1:.4f}")
    for label_name, f1 in label_f1_scores.items():
        print(f"  {label_name} F1-Score: {f1:.4f}")
    print(f"  Review-level Exact Match F1-Score: {review_level_f1:.4f}")
    print("--------------------------------------------------")

  0%|          | 0/5 [00:00<?, ?it/s]Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load(model_path, map_location=device))



Loaded model: custom_roberta_model_1000_3epoch.pth


 20%|‚ñà‚ñà        | 1/5 [01:31<06:06, 91.61s/it]

Results for model custom_roberta_model_1000_3epoch.pth:
  Overall F1-Score: 0.9060
  Product_Quality F1-Score: 0.8500
  Delivery F1-Score: 0.8950
  Price F1-Score: 0.9800
  Repurchase_Intention F1-Score: 0.9700
  etc F1-Score: 0.8350
  Review-level Exact Match F1-Score: 0.7879
--------------------------------------------------


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load(model_path, map_location=device))



Loaded model: custom_roberta_model_600.pth


 40%|‚ñà‚ñà‚ñà‚ñà      | 2/5 [02:56<04:23, 87.67s/it]

Results for model custom_roberta_model_600.pth:
  Overall F1-Score: 0.8980
  Product_Quality F1-Score: 0.8750
  Delivery F1-Score: 0.9000
  Price F1-Score: 0.9800
  Repurchase_Intention F1-Score: 0.9550
  etc F1-Score: 0.7800
  Review-level Exact Match F1-Score: 0.7578
--------------------------------------------------


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load(model_path, map_location=device))



Loaded model: custom_roberta_model_700.pth


 60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 3/5 [04:20<02:51, 85.76s/it]

Results for model custom_roberta_model_700.pth:
  Overall F1-Score: 0.8990
  Product_Quality F1-Score: 0.8650
  Delivery F1-Score: 0.8800
  Price F1-Score: 0.9800
  Repurchase_Intention F1-Score: 0.9550
  etc F1-Score: 0.8150
  Review-level Exact Match F1-Score: 0.7768
--------------------------------------------------


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load(model_path, map_location=device))



Loaded model: custom_roberta_model_800.pth


 80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 4/5 [05:45<01:25, 85.46s/it]

Results for model custom_roberta_model_800.pth:
  Overall F1-Score: 0.8980
  Product_Quality F1-Score: 0.8550
  Delivery F1-Score: 0.9000
  Price F1-Score: 0.9850
  Repurchase_Intention F1-Score: 0.9700
  etc F1-Score: 0.7800
  Review-level Exact Match F1-Score: 0.7692
--------------------------------------------------


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load(model_path, map_location=device))



Loaded model: custom_roberta_model_900.pth


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [07:09<00:00, 85.93s/it]

Results for model custom_roberta_model_900.pth:
  Overall F1-Score: 0.8870
  Product_Quality F1-Score: 0.8800
  Delivery F1-Score: 0.8950
  Price F1-Score: 0.9800
  Repurchase_Intention F1-Score: 0.9400
  etc F1-Score: 0.7400
  Review-level Exact Match F1-Score: 0.7578
--------------------------------------------------





# Î™®Îç∏ ÏÑ±Îä• ÌèâÍ∞Ä

1. Results for model custom_roberta_model_600.pth:   
  - Overall F1-Score: 0.8980    
  - Product_Quality F1-Score: 0.8750    
  - Delivery F1-Score: 0.9000   
  - Price F1-Score: 0.9800    
  - Repurchase_Intention F1-Score: 0.9550   
  - etc F1-Score: 0.7800    
  - Review-level Exact Match F1-Score: 0.7578   
--------------------------------------------------

2. Results for model custom_roberta_model_700.pth:
  - Overall F1-Score: 0.8990
  - Product_Quality F1-Score: 0.8650
  - Delivery F1-Score: 0.8800
  - Price F1-Score: 0.9800
  - Repurchase_Intention F1-Score: 0.9550
  - etc F1-Score: 0.8150
  - Review-level Exact Match F1-Score: 0.7768
--------------------------------------------------

3. Results for model custom_roberta_model_800.pth:
  - Overall F1-Score: 0.8980
  - Product_Quality F1-Score: 0.8550
  - Delivery F1-Score: 0.9000
  - Price F1-Score: 0.9850
  - Repurchase_Intention F1-Score: 0.9700
  - etc F1-Score: 0.7800
  - Review-level Exact Match F1-Score: 0.7692
--------------------------------------------------

4. Results for model custom_roberta_model_900.pth:
  - Overall F1-Score: 0.8870
  - Product_Quality F1-Score: 0.8800
  - Delivery F1-Score: 0.8950
  - Price F1-Score: 0.9800
  - Repurchase_Intention F1-Score: 0.9400
  - etc F1-Score: 0.7400
  - Review-level Exact Match F1-Score: 0.7578
--------------------------------------------------

5. <b>Results for model custom_roberta_model_1000_3epoch.pth</b>:
  - Overall F1-Score: 0.9060
  - Product_Quality F1-Score: 0.8500
  - Delivery F1-Score: 0.8950
  - Price F1-Score: 0.9800
  - Repurchase_Intention F1-Score: 0.9700
  - <u>etc F1-Score: 0.8350</u>
  - Review-level Exact Match F1-Score: 0.7879
--------------------------------------------------

6. Results for model ../model/custom_roberta_model_1000_5epoch.pth:
  - Overall F1-Score: 0.9070
  - Product_Quality F1-Score: 0.9000
  - Delivery F1-Score: 0.9250
  - Price F1-Score: 0.9850
  - Repurchase_Intention F1-Score: 0.9500
  - etc F1-Score: 0.7750
  - Review-level Exact Match F1-Score: 0.7842
--------------------------------------------------

7. Results for model ../model/custom_roberta_model_1100_5epoch.pth:
  - Overall F1-Score: 0.9100
  - Product_Quality F1-Score: 0.9000
  - Delivery F1-Score: 0.9000
  - Price F1-Score: 0.9900
  - Repurchase_Intention F1-Score: 0.9800
  - etc F1-Score: 0.7800
  - Review-level Exact Match F1-Score: 0.7879
--------------------------------------------------

