In [1]:
import os
import logging
import joblib
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from datetime import datetime
from preprocessing.text_processor import TextProcessor

In [2]:
# Labeled review dataset that the evaluation cells read.
data_path = "data/labeled/reviews.csv"

# Resource files handed to the text preprocessor.
teencode_path, stopwords_path, phrases_path = (
    "resources/teencode.csv",
    "resources/stopwords.txt",
    "resources/phrase_mapping.csv",
)

# Output location; presumably for saved model artifacts (not referenced in the cells shown here).
output_path = "models"

In [3]:
# Build the shared text preprocessor from the three resource files
# configured in the path cell.
_processor_resources = {
    "teencode_path": teencode_path,
    "stopword_path": stopwords_path,
    "phrase_mapping_path": phrases_path,
}
processor = TextProcessor(**_processor_resources)

In [4]:
# Quick manual check: run the preprocessor on one short informal sentence
# and display the normalized result as the cell output.
processor.preprocess("t·∫•t c·∫£ r·∫•t okk")

't·∫•t_c·∫£ r·∫•t ·ªïn'

In [5]:
import torch
from transformers import RobertaForSequenceClassification, AutoTokenizer

# Load the model and tokenizer from one checkpoint id so the two cannot
# drift apart if the checkpoint is ever changed.
checkpoint = "wonrax/phobert-base-vietnamese-sentiment"

model = RobertaForSequenceClassification.from_pretrained(checkpoint)
# use_fast=False requests the Python ("slow") tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast=False)

In [6]:
# import torch
# from transformers import RobertaForSequenceClassification, AutoTokenizer
# from underthesea import word_tokenize  # Or use pyvi if you prefer

# comments = [
#     "Giao h√†ng nhanh, s·∫£n ph·∫©m ƒë√∫ng nh∆∞ m√¥ t·∫£.",
#     "S√°ch b·ªã r√°ch v√† giao sai h√†ng.",
#     "Ch·∫•t l∆∞·ª£ng ·ªïn, gi√° c·∫£ h·ª£p l√Ω.",
#     "Th√°i ƒë·ªô nh√¢n vi√™n kh√¥ng t·ªët.",
#     "ƒê√≥ng g√≥i c·∫©n th·∫≠n, r·∫•t h√†i l√≤ng.",
#     "Xu·∫•t s·∫Øc",
# ]

# # Mapping ID -> Label
# id2label = {0: "neg", 1: "pos", 2: "neu"}

# print("üì¶ D·ª± ƒëo√°n c·∫£m x√∫c:")
# for cmt in comments:
#     segmented = word_tokenize(cmt, format="text")  # Word-segment the text with underthesea
#     input_ids = torch.tensor([tokenizer.encode(segmented)])
#     with torch.no_grad():
#         outputs = model(input_ids)
#         probs = outputs.logits.softmax(dim=-1).squeeze()
#         top_id = torch.argmax(probs).item()
#     label = id2label[top_id]
#     print(f'üó®Ô∏è "{cmt}" ‚Üí üìå {label}')

In [7]:
# Load the labeled reviews from the CSV configured in data_path.
df = pd.read_csv(data_path)

In [8]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,productID,userId,rating,comment,label
0,74021317,7991785,5,M·ªôt quy·ªÉn s√°ch hay,pos
1,187827003,18150739,5,"M√¨nh ƒë√£ t·ª´ng ƒë·ªçc s∆° n·ªôi dung s√°ch, r·∫•t hay, r·∫•...",pos
2,271380890,497788,5,"Quy·ªÉn s√°ch ƒë·∫πp v·ªÅ h√¨nh th·ª©c, n·ªôi dung m·ªõi ƒë·ªçc ...",pos
3,74021317,19165924,5,"S√°ch ƒë·∫πp, h√†i l√≤ng",pos
4,105483727,10170816,5,"s√°ch ƒë√≥ng g√≥i c·∫©n th·∫≠n, giao h√†nh nhanh",pos


In [9]:
# from sklearn.metrics import classification_report
# import torch

# true = []
# pred = []

# label2id = {"neg": 0, "pos": 1, "neu": 2}
# id2label = {v: k for k, v in label2id.items()}

# for row in df.itertuples(index=False, name=None):
#     prdId, userId, rating, comment, label = row

#     # Tokenize with proper padding and truncation
#     inputs = tokenizer(comment, return_tensors="pt", truncation=True, padding=True)

#     with torch.no_grad():
#         outputs = model(**inputs)
#         probs = outputs.logits.softmax(dim=-1).squeeze()
#         top_id = torch.argmax(probs).item()

#     pred_label = id2label[top_id]

#     # Append ground truth and prediction
#     true.append(label2id[label.lower()])
#     pred.append(label2id[pred_label])

# # Print evaluation report
# print(classification_report(true, pred, target_names=["neg", "pos", "neu"]))

In [10]:
# Keep only non-neutral reviews that actually have comment text.
# reset_index returns a new frame rather than modifying df, so it is chained
# into the assignment; otherwise df would keep the gapped index left behind
# by the filter while the displayed frame showed a contiguous one.
df = (
    df[df["label"] != "neu"]
    .dropna(subset=["comment"])
    .reset_index(drop=True)
)
df

Unnamed: 0,productID,userId,rating,comment,label
0,74021317,7991785,5,M·ªôt quy·ªÉn s√°ch hay,pos
1,187827003,18150739,5,"M√¨nh ƒë√£ t·ª´ng ƒë·ªçc s∆° n·ªôi dung s√°ch, r·∫•t hay, r·∫•...",pos
2,271380890,497788,5,"Quy·ªÉn s√°ch ƒë·∫πp v·ªÅ h√¨nh th·ª©c, n·ªôi dung m·ªõi ƒë·ªçc ...",pos
3,74021317,19165924,5,"S√°ch ƒë·∫πp, h√†i l√≤ng",pos
4,105483727,10170816,5,"s√°ch ƒë√≥ng g√≥i c·∫©n th·∫≠n, giao h√†nh nhanh",pos
...,...,...,...,...,...
4019,47161879,8006396,1,G·ª≠i 2 quy·ªÉn gi·ªëng y h·ªát nhau trong khi m√¨nh ƒë·∫∑...,neg
4020,4080373,11079688,1,Mua tiki r·∫•t nhi·ªÅu l·∫ßn m√† ƒë√≥ng g√≥i v·∫≠y l√† th·∫•y...,neg
4021,3953475,7940086,1,Nh·∫≠n thi·∫øu 1 quy·ªÉn. Tiki thu h·ªìi l·∫°i t·ª´ ng√†y 1...,neg
4022,3953475,6206297,1,C√°ch ƒë√≥ng g√≥i kh√¥ng th·ªÉ ch·∫•p nh·∫≠n ƒë∆∞·ª£c,neg


In [11]:
from sklearn.metrics import classification_report
import torch
import pandas as pd

# Label <-> class-id mapping assumed for the checkpoint's output logits.
# NOTE(review): hard-coded here; confirm it matches model.config.id2label.
label2id = {"neg": 0, "pos": 1, "neu": 2}
id2label = {v: k for k, v in label2id.items()}

# Columns consumed per review, selected by name so that extra or reordered
# columns in df cannot silently shift the values unpacked in the loop.
REVIEW_COLUMNS = ["productID", "userId", "rating", "comment", "label"]
# Column layout of the misclassification export; passing it explicitly keeps
# the CSV header present even when there are no misclassified rows.
WRONG_COLUMNS = [
    "prdId",
    "userId",
    "rating",
    "comment",
    "true_label",
    "predicted_label",
]


def predict_label(text):
    """Classify one raw comment as "neg" or "pos".

    Args:
        text: Comment string. It is tokenized as-is; the preprocessing call
            below is left disabled.

    Returns:
        "neg" or "pos". A "neu" prediction is folded into "pos" so the result
        stays within the two classes reported on.
        NOTE(review): folding "neu" into "pos" favours the positive class;
        confirm this is the intended evaluation protocol.
    """
    # processed = processor.preprocess(text)
    # NOTE(review): truncation=True relies on the tokenizer's configured
    # maximum length; confirm it matches the model's position limit.
    # NOTE(review): the raw comment is passed without word segmentation,
    # unlike the disabled word_tokenize experiment earlier in this notebook;
    # confirm which input form the checkpoint expects.
    inputs = tokenizer(text, return_tensors="pt", truncation=True)

    with torch.no_grad():
        logits = model(**inputs).logits

    # Softmax is monotonic, so the argmax of the raw logits is the same class.
    top_id = logits.argmax(dim=-1).item()
    top_label = id2label[top_id]
    return "pos" if top_label == "neu" else top_label


true = []  # ground-truth class ids, one per review
pred = []  # predicted class ids, aligned with `true`
wrong_predictions = []  # one record per misclassified review

for prdId, userId, rating, comment, label in df[REVIEW_COLUMNS].itertuples(
    index=False, name=None
):
    true_label = label.lower()
    pred_label = predict_label(comment)

    # Append ground truth and prediction
    true.append(label2id[true_label])
    pred.append(label2id[pred_label])

    if pred_label != true_label:
        wrong_predictions.append(
            {
                "prdId": prdId,
                "userId": userId,
                "rating": rating,
                "comment": comment,
                "true_label": true_label,
                "predicted_label": pred_label,
            }
        )

# Passing `labels` pins the two reported classes, so the report does not fail
# on the target_names length check if one class happens to be absent from
# both the ground truth and the predictions.
print(
    classification_report(
        true,
        pred,
        labels=[label2id["neg"], label2id["pos"]],
        target_names=["neg", "pos"],
    )
)

df_wrong = pd.DataFrame(wrong_predictions, columns=WRONG_COLUMNS)
df_wrong.to_csv(
    "misclassified_reviews.csv", index=False, encoding="utf-8-sig"
)

              precision    recall  f1-score   support

         neg       0.92      0.79      0.85      2113
         pos       0.80      0.92      0.85      1911

    accuracy                           0.85      4024
   macro avg       0.86      0.85      0.85      4024
weighted avg       0.86      0.85      0.85      4024

