# Import Libraries

In [1]:
import torch
import torch.nn as nn
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from sklearn.metrics import f1_score, classification_report
import numpy as np
import pandas as pd
import os

  from .autonotebook import tqdm as notebook_tqdm


# 모델 로드

In [2]:
# Device setup: use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


## 모델 클래스 정의

In [3]:
class CustomRobertaForMultiLabelClassification(nn.Module):
    """RoBERTa encoder followed by one independent linear head per category.

    Every head receives the hidden state of the first token and emits
    ``num_labels_per_category`` logits, so the stacked output has shape
    ``(batch, num_categories, num_labels_per_category)``.

    Args:
        num_labels_per_category: Number of classes each head predicts.
        num_categories: Number of heads (one per category).
    """

    def __init__(self, num_labels_per_category=3, num_categories=5):
        super().__init__()
        # Only the inner encoder (`self.roberta.roberta`) is used in forward();
        # the wrapper's own classification head is never called.
        # NOTE(review): presumably the wrapper is kept so parameter names match
        # previously saved checkpoints - confirm before swapping it out.
        self.roberta = RobertaForSequenceClassification.from_pretrained(
            "roberta-base",
            num_labels=num_labels_per_category,
        )
        self.num_categories = num_categories

        hidden_size = self.roberta.config.hidden_size
        heads = []
        for _ in range(num_categories):
            heads.append(nn.Linear(hidden_size, num_labels_per_category))
        self.classifiers = nn.ModuleList(heads)

    def forward(self, input_ids, attention_mask, labels=None):
        """Compute per-category logits and, when labels are given, the mean loss.

        Args:
            input_ids: Token ids passed through to the encoder.
            attention_mask: Attention mask passed through to the encoder.
            labels: Optional class indices indexed as ``labels[:, category]``.

        Returns:
            dict with ``"loss"`` (``None`` when ``labels`` is ``None``, else the
            cross-entropy averaged over the categories) and ``"logits"``.
        """
        encoder_out = self.roberta.roberta(input_ids=input_ids, attention_mask=attention_mask)
        # Hidden state of the first token of every sequence feeds all heads.
        first_token_state = encoder_out.last_hidden_state[:, 0, :]
        per_head_logits = [head(first_token_state) for head in self.classifiers]
        logits = torch.stack(per_head_logits, dim=1)

        if labels is None:
            return {"loss": None, "logits": logits}

        criterion = nn.CrossEntropyLoss()
        summed = sum(
            criterion(logits[:, k, :], labels[:, k])
            for k in range(self.num_categories)
        )
        return {"loss": summed / self.num_categories, "logits": logits}


In [4]:
model_folder = "../model"
model_file = "custom_roberta_model_1000_3epoch.pth"  # name of the checkpoint file to load
model_path = os.path.join(model_folder, model_file)

# Rebuild the architecture, then restore the saved weights into it.
model = CustomRobertaForMultiLabelClassification()
# The checkpoint is passed straight to load_state_dict(), i.e. it is a plain
# state dict, so it can be read with weights_only=True. That restricts
# unpickling to tensors and basic containers instead of running arbitrary
# pickle code from the file.
state_dict = torch.load(model_path, map_location=device, weights_only=True)
model.load_state_dict(state_dict)
model.to(device)
model.eval()  # inference mode: disables dropout
print(f"Loaded model: {model_file}")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load(model_path, map_location=device))


Loaded model: custom_roberta_model_1000_3epoch.pth


## 토크나이저 로드

In [5]:
# Load the tokenizer that belongs to the "roberta-base" checkpoint.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# predict 함수

In [6]:
def predict(model, sentence, tokenizer, device):
    """Classify one sentence and return a score per review category.

    The sentence is tokenized (padded/truncated to 128 tokens), run through
    ``model`` without gradient tracking, and the arg-max class index of each
    category is shifted down by one before being returned.

    Args:
        model: Callable returning a mapping with a ``'logits'`` entry; it is
            switched to eval mode first.
        sentence: Text handed to ``tokenizer``.
        tokenizer: Callable producing ``'input_ids'`` and ``'attention_mask'``
            tensors.
        device: Device the input tensors are moved to.

    Returns:
        dict mapping 'Product_Quality', 'Delivery', 'Price',
        'Repurchase_Intention' and 'etc' to the shifted class index.
        NOTE(review): with three classes per category this yields -1/0/1,
        presumably negative/neutral/positive - confirm against the training labels.
    """
    model.eval()

    encoded = tokenizer(
        sentence,
        padding='max_length',
        truncation=True,
        max_length=128,
        return_tensors="pt"
    )
    ids = encoded['input_ids'].to(device)
    mask = encoded['attention_mask'].to(device)

    with torch.no_grad():
        logits = model(input_ids=ids, attention_mask=mask)['logits']
        class_indices = torch.argmax(logits, dim=-1).cpu().numpy().squeeze()
        # Shift class indices down by one (e.g. 0..2 becomes -1..1).
        shifted = class_indices - 1

    category_names = ['Product_Quality', 'Delivery', 'Price', 'Repurchase_Intention', 'etc']
    return dict(zip(category_names, shifted))

# Data Load

In [43]:
# Read the labelling CSV into a DataFrame (path is relative to the working directory).
df = pd.read_csv("../data/comment_translation_labeling.csv")

In [44]:
# Collect the English review texts from index label 1200 onward as a plain list.
texts = df.loc[1200:, 'review_comment_message_en'].tolist()

In [7]:
# Read the translation CSV into a DataFrame.
# NOTE(review): this rebinds `df`, replacing the frame read from
# comment_translation_labeling.csv - only one of the two loads should be run.
df = pd.read_csv("../data/comment_translation.csv")

In [9]:
# Display the loaded frame.
df

Unnamed: 0,review_comment_message,review_comment_message_en
0,"Produto preto, reembalado na caixa de um branc...","Black product, repacked in a white box, lots o..."
1,FICAMOS ESPERANDO UMA ENCOMENDA PAGA QUE NUNCA...,WE'RE STILL EXPERIENCED WITH A RECOMMENDATION ...
2,Otimo muito bom,Very good.
3,Entrega antes do prazo. Produto muito bom e bo...,"It arrives by the deadline. Very good product,..."
4,excelente.,excellent.
...,...,...
35170,"Muito obrigada , uma atenção nota 10, profissi...","Thank you very much, a note 10 for your attent..."
35171,Produto entregue dentro do prazo . Gostei muito.,I got it delivered on time. I appreciate it.
35172,Essa capa não encaixa direito. Não consigo tro...,That layer doesn't fit right. I can't change it.
35173,"Mim deixou esperar muito tempo, para depois vi...",So he waited a long time for me to let him kno...


In [10]:
# Collect every English review text as a plain list.
# NOTE(review): this rebinds `texts`, replacing the "1200 onward" slice built elsewhere.
texts = df.loc[:,'review_comment_message_en'].tolist()

# Labeling

## 오류 해결: null 값 처리
- 오류 원인: `review_comment_message_en` 열에 null 값 존재

In [11]:
# Count missing English texts from index label 1200 onward.
df.loc[1200:, 'review_comment_message_en'].isnull().sum()

0

In [46]:
# Wrap the English-text column (label 1200 onward) in a one-column frame for the null check.
t = pd.DataFrame(df.loc[1200:, 'review_comment_message_en'])

In [47]:
# Build a row mask that is True wherever any column of `t` is null, then select those rows.
null_row_mask = t.isnull().any(axis=1)
rows_with_null = t[null_row_mask]

# Print the rows that contain a null value.
print("Rows with null values:\n", rows_with_null)

Rows with null values:
       review_comment_message_en
21811                       NaN
28507                       NaN


In [48]:
# Inspect the first row reported as null (position 21811).
df.iloc[21811]

review_comment_message       Nenhuma
review_comment_message_en        NaN
Product_Quality                  NaN
Delivery                         NaN
Price                            NaN
Repurchase_Intention             NaN
etc                              NaN
Name: 21811, dtype: object

In [49]:
# Inspect the second row reported as null (position 28507).
df.iloc[28507]

review_comment_message       N/D
review_comment_message_en    NaN
Product_Quality              NaN
Delivery                     NaN
Price                        NaN
Repurchase_Intention         NaN
etc                          NaN
Name: 28507, dtype: object

In [50]:
# Restore the two English texts that are NaN (positional rows 21811 and 28507).
# NOTE(review): presumably the CSV holds the literal strings "None" / "N/A",
# which pandas' default NA parsing converts to NaN on read - confirm; see
# read_csv's `keep_default_na` / `na_values` options for a fix at load time.
# The column is resolved by name rather than by a hard-coded position, so the
# write cannot land in the wrong column if the column layout changes.
en_col_pos = df.columns.get_loc('review_comment_message_en')
df.iloc[21811, en_col_pos] = 'None'
df.iloc[28507, en_col_pos] = 'N/A'

In [52]:
# Re-inspect row 21811 to check the English text after the fill.
df.iloc[21811]

review_comment_message       Nenhuma
review_comment_message_en       None
Product_Quality                  NaN
Delivery                         NaN
Price                            NaN
Repurchase_Intention             NaN
etc                              NaN
Name: 21811, dtype: object

In [51]:
# Re-inspect row 28507 to check the English text after the fill.
df.iloc[28507]

review_comment_message       N/D
review_comment_message_en    N/A
Product_Quality              NaN
Delivery                     NaN
Price                        NaN
Repurchase_Intention         NaN
etc                          NaN
Name: 28507, dtype: object

In [53]:
# Rebuild the text list (label 1200 onward) so it reflects the current contents of `df`.
texts = df.loc[1200:, 'review_comment_message_en'].tolist()

## predict label

In [12]:
# Predict the five category labels for every sentence in `texts`.
# `predicted_labels` always ends up with exactly one row per sentence: a failed
# prediction is reported and stored as a row of NaN instead of being skipped,
# so the list stays aligned with `texts` when it is written back to the frame.
label_columns = ['Product_Quality', 'Delivery', 'Price', 'Repurchase_Intention', 'etc']
predicted_labels = []
for i, sentence in enumerate(texts):
    try:
        prediction = predict(model, sentence, tokenizer, device)
    except Exception as exc:  # not a bare except: let KeyboardInterrupt/SystemExit propagate
        # Report the offending position, its text and the reason, then continue.
        print(i)
        print(sentence)
        print(f"{type(exc).__name__}: {exc}")
        predicted_labels.append([np.nan] * len(label_columns))
        continue
    predicted_labels.append([prediction[name] for name in label_columns])

In [56]:
# Add the predictions directly to the existing DataFrame (rows from label 1200 onward).
# NOTE(review): this only lines up when `texts` was built from df.loc[1200:, ...];
# another cell writes the same columns for every row - run only the matching one.
for col_pos, col_name in enumerate(['Product_Quality', 'Delivery', 'Price', 'Repurchase_Intention', 'etc']):
    df.loc[1200:, col_name] = [row[col_pos] for row in predicted_labels]


In [15]:
# Add the predictions directly to the existing DataFrame (all rows).
# NOTE(review): this only lines up when `texts` was built from every row of `df`;
# another cell writes the same columns for rows 1200 onward - run only the matching one.
for col_pos, col_name in enumerate(['Product_Quality', 'Delivery', 'Price', 'Repurchase_Intention', 'etc']):
    df.loc[:, col_name] = [row[col_pos] for row in predicted_labels]


In [57]:

# Print the first five rows (label 1200 onward) of the text column next to the predicted labels.
print(df.loc[1200:, ['review_comment_message_en', 'Product_Quality', 'Delivery', 'Price', 'Repurchase_Intention', 'etc']].head())

                              review_comment_message_en  Product_Quality  \
1200  The entire note was not delivered, I bought tw...              0.0   
1201  I purchased two vases, but only received one v...              0.0   
1202  I received the product different from what was...             -1.0   
1203  I bought a scale but it was damaged, I changed...             -1.0   
1204                         I will return the product.             -1.0   

      Delivery  Price  Repurchase_Intention  etc  
1200       0.0    0.0                   0.0 -1.0  
1201       0.0    0.0                   0.0 -1.0  
1202       0.0    0.0                   0.0  0.0  
1203       0.0    0.0                   0.0  0.0  
1204       0.0    0.0                   0.0  0.0  


# csv 저장

In [17]:
# Write the frame to CSV without the index column.
df.to_csv("../data/comment_translation_predict_label.csv", index=False)

In [16]:
# Preview the first five rows of the frame.
df.head()

Unnamed: 0,review_comment_message,review_comment_message_en,Product_Quality,Delivery,Price,Repurchase_Intention,etc
0,"Produto preto, reembalado na caixa de um branc...","Black product, repacked in a white box, lots o...",-1,0,0,0,0
1,FICAMOS ESPERANDO UMA ENCOMENDA PAGA QUE NUNCA...,WE'RE STILL EXPERIENCED WITH A RECOMMENDATION ...,0,0,0,0,-1
2,Otimo muito bom,Very good.,0,0,0,0,1
3,Entrega antes do prazo. Produto muito bom e bo...,"It arrives by the deadline. Very good product,...",1,1,0,0,0
4,excelente.,excellent.,0,0,0,0,1
