# In [2]:
# Notebook: Fine-Tune mô hình ResNet + PhoBERT + Metadata để phân loại image_helpfulness


# 1. Cài đặt và import

import os, json, requests
from io import BytesIO
from PIL import Image
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, Subset
from torchvision import transforms, models
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score
from sklearn.utils.class_weight import compute_class_weight

## 2. Đọc và tiền xử lý dữ liệu

def _extract_label(item):
    """Return the ``Image_helpfulness`` choice of *item* as an int, or None.

    Every annotation result whose ``from_name`` is ``'Image_helpfulness'`` is
    read; when several are present the last one encountered wins. Annotations
    without a ``result`` list and results without a ``from_name`` are skipped
    instead of raising ``KeyError``.
    """
    lbl = None
    for ann in item.get('annotations', []):
        for r in ann.get('result', []):
            if r.get('from_name') == 'Image_helpfulness':
                lbl = int(r['value']['choices'][0])
    return lbl


# Read the labelled JSON export. The encoding is given explicitly so that the
# (Vietnamese) text does not depend on the platform's default encoding.
data_path = 'all_labeled.json'
with open(data_path, encoding='utf-8') as f:
    raw = json.load(f)

# Convert the tasks into flat records, one per usable task.
records = []
for item in raw:
    data = item['data']
    lbl = _extract_label(item)
    # Tasks without a label, or with a label below 2, are not used.
    if lbl is None or lbl < 2:
        continue
    imgs = data.get('images', [])
    if not imgs:
        continue
    # Only the first image of each product is used.
    records.append({
        'url': imgs[0],
        'name': data.get('product_name', ''),
        'rating': data.get('rating', 0),
        'category': data.get('product_category', ''),
        'label': lbl,
    })
df = pd.DataFrame(records)

# Drop records without a valid URL. ``.copy()`` makes the result an independent
# frame so the column assignments below do not write into a slice of the
# unfiltered frame (which triggers pandas' SettingWithCopyWarning).
df = df[df['url'].apply(lambda x: isinstance(x, str) and x.startswith('http'))].copy()

# Scale the rating into [0, 1].
scaler = MinMaxScaler()
df['rating_norm'] = scaler.fit_transform(df[['rating']])

# Map every category string to an integer index (order of first appearance).
cats = df['category'].unique().tolist()
cat2idx = {c: i for i, c in enumerate(cats)}
df['cat_idx'] = df['category'].map(cat2idx)

# Shift the labels so that they start at 0 (label 2 -> class 0).
df['label_idx'] = df['label'] - 2
print(df['label_idx'].value_counts())

## 3. Dataset và Transform


# Save time during training: download every image to a local folder up front.
# A file that already exists is treated as cached and is not fetched again.
os.makedirs('images', exist_ok=True)
local_paths = []
for idx, url in enumerate(df['url']):
    local_file = os.path.join('images', f'image_{idx}.jpg')
    if not os.path.exists(local_file):
        # Write to a temporary name first and rename on success, so that an
        # interrupted download never leaves a truncated file that would later
        # be mistaken for a valid cached image.
        tmp_file = local_file + '.part'
        try:
            resp = requests.get(url, timeout=5)
            # Without this check an HTTP error page (404, 500, ...) would be
            # stored on disk as if it were the image.
            resp.raise_for_status()
            with open(tmp_file, 'wb') as f:
                f.write(resp.content)
            os.replace(tmp_file, local_file)
        except (requests.RequestException, OSError) as e:
            # Best effort: report the failure and carry on with the next URL.
            print(f"Không tải được {url}: {e}")
    # The path is recorded even when the download failed so that the list
    # stays aligned with the rows of ``df``.
    local_paths.append(local_file)
# Attach the local path column to the DataFrame.
df['img_path'] = local_paths

# Image preprocessing pipeline: resize to 224x224, convert to a tensor, then
# normalise each channel with the statistics below (presumably the ImageNet
# mean/std expected by the pretrained ResNet -- confirm).
mean = [0.485, 0.456, 0.406]
std = [0.229, 0.224, 0.225]
transform = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean, std),
    ]
)

# PhoBERT tokenizer used for the product names.
tokenizer = AutoTokenizer.from_pretrained('vinai/phobert-base')

# Fix the torch RNG seed before the dataset and model are created.
torch.manual_seed(42)
class MultiModalDataset(Dataset):
    """Dataset over the preprocessed DataFrame, reading images from local files.

    Each item is the tuple ``(image, input_ids, attention_mask, rating,
    category, label)`` built from one row of the frame. The module-level
    ``transform`` and ``tokenizer`` objects are used to prepare the image and
    the product name.
    """

    def __init__(self, df):
        # Items are fetched positionally, so work on a frame with a fresh
        # 0..n-1 index.
        self.df = df.reset_index(drop=True)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, i):
        sample = self.df.iloc[i]

        # Image: opened from the local file instead of fetched over the network.
        picture = Image.open(sample['img_path']).convert('RGB')
        image_tensor = transform(picture)

        # Text: product name padded/truncated to 32 tokens; squeeze(0) removes
        # the leading dimension of the returned tensors.
        encoded = tokenizer(
            sample['name'],
            padding='max_length',
            truncation=True,
            max_length=32,
            return_tensors='pt',
        )
        token_ids = encoded['input_ids'].squeeze(0)
        attention = encoded['attention_mask'].squeeze(0)

        # Metadata and target.
        rating_value = torch.tensor(sample['rating_norm'], dtype=torch.float32)
        category_index = torch.tensor(sample['cat_idx'], dtype=torch.long)
        target = torch.tensor(sample['label_idx'], dtype=torch.long)

        return image_tensor, token_ids, attention, rating_value, category_index, target

# Create the dataset from the preprocessed DataFrame.
dataset = MultiModalDataset(df)

# NOTE: the image transform (``mean``/``std``/``transform``), the PhoBERT
# ``tokenizer`` and the RNG seed are already set up earlier in this file with
# exactly the same values. They are deliberately not repeated here: repeating
# them only reloaded the tokenizer a second time without changing any state.
class MultiModalDataset(Dataset):
    """Dataset yielding image, tokenised product name, metadata and label.

    Each item is the tuple ``(image, input_ids, attention_mask, rating,
    category, label)`` built from one row of the frame. The module-level
    ``transform`` and ``tokenizer`` objects are used to prepare the image and
    the product name.

    Images are read from the row's ``img_path`` when that file exists and can
    be decoded; only otherwise is the image fetched from ``url``. Fetching on
    every access would repeat one HTTP request per sample per epoch.
    """

    # Seconds to wait for the image server before giving up on a sample, so a
    # stalled connection cannot hang the training loop indefinitely.
    REQUEST_TIMEOUT = 10

    def __init__(self, df):
        # Items are fetched positionally, so work on a frame with a fresh
        # 0..n-1 index.
        self.df = df.reset_index(drop=True)

    def __len__(self):
        return len(self.df)

    def _load_image(self, row):
        """Return the RGB image for *row*, preferring the local copy.

        Raises:
            requests.RequestException: if the network fallback fails or the
                server answers with an HTTP error status.
        """
        path = row.get('img_path')
        if isinstance(path, str) and os.path.exists(path):
            try:
                return Image.open(path).convert('RGB')
            except OSError:
                # The cached file is not a decodable image (e.g. a saved error
                # page or a truncated download); fall back to the network.
                pass
        resp = requests.get(row['url'], timeout=self.REQUEST_TIMEOUT)
        resp.raise_for_status()
        return Image.open(BytesIO(resp.content)).convert('RGB')

    def __getitem__(self, i):
        row = self.df.iloc[i]
        # Image
        img_t = transform(self._load_image(row))
        # Text: product name padded/truncated to 32 tokens; squeeze(0) removes
        # the leading dimension of the returned tensors.
        enc = tokenizer(row['name'], padding='max_length', truncation=True,
                        max_length=32, return_tensors='pt')
        input_ids = enc['input_ids'].squeeze(0)
        attn = enc['attention_mask'].squeeze(0)
        # Metadata and target
        rating = torch.tensor(row['rating_norm'], dtype=torch.float32)
        cat = torch.tensor(row['cat_idx'], dtype=torch.long)
        label = torch.tensor(row['label_idx'], dtype=torch.long)
        return img_t, input_ids, attn, rating, cat, label

# Instantiate the dataset over the preprocessed DataFrame.
dataset = MultiModalDataset(df)

## 4. Mô hình đa phương thức

class MultiModalModel(nn.Module):
    """Classifier that fuses image, text, category and rating features.

    Args:
        num_categories: number of rows of the category embedding table.
        embed_dim: size of each category embedding vector.
        num_classes: number of output logits.
    """

    def __init__(self, num_categories, embed_dim=8, num_classes=4):
        super().__init__()
        # Image branch: pretrained ResNet-50 whose final fc layer is replaced
        # by the identity, so the network outputs its pooled features.
        self.resnet = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)
        self.resnet.fc = nn.Identity()
        # Text branch: pretrained PhoBERT encoder.
        self.bert = AutoModel.from_pretrained('vinai/phobert-base')
        # Learned embedding for the product category index.
        self.cat_emb = nn.Embedding(num_categories, embed_dim)
        # Classification head over the concatenated features:
        # 2048 (image) + 768 (text) + embed_dim (category) + 1 (rating).
        fused_dim = 2048 + 768 + embed_dim + 1
        self.fc1 = nn.Linear(fused_dim, 256)
        self.drop = nn.Dropout(0.3)
        self.fc2 = nn.Linear(256, num_classes)

    def forward(self, img, ids, mask, rating, cat):
        """Return the class logits for one batch."""
        image_vec = self.resnet(img)
        # The hidden state of the first token is used as the text feature.
        encoder_out = self.bert(input_ids=ids, attention_mask=mask)
        text_vec = encoder_out.last_hidden_state[:, 0, :]
        category_vec = self.cat_emb(cat)
        # The rating gets a trailing dimension so it can be concatenated.
        rating_col = rating.unsqueeze(1)
        fused = torch.cat((image_vec, text_vec, category_vec, rating_col), dim=1)
        hidden = self.drop(F.relu(self.fc1(fused)))
        return self.fc2(hidden)

## 5. Fine-tuning: huấn luyện với Stratified K-Fold, scheduler và EarlyStopping

def _evaluate(model, loader, device):
    """Run *model* over *loader* in eval mode without tracking gradients.

    Returns:
        ``(trues, preds)``: the true labels and the argmax predictions, as two
        lists of Python ints in loader order.
    """
    model.eval()
    preds, trues = [], []
    with torch.no_grad():
        for imgs, ids, mask, rating, cat, labels in loader:
            imgs, ids, mask, rating, cat = [
                x.to(device) for x in (imgs, ids, mask, rating, cat)
            ]
            out = model(imgs, ids, mask, rating, cat)
            preds.extend(out.argmax(dim=1).cpu().tolist())
            trues.extend(labels.tolist())
    return trues, preds


def _run_fold(train_idx, val_idx, fold, epochs=5, batch_size=16, lr=2e-5, patience=2):
    """Train a fresh model on one fold and return its best validation result.

    Returns:
        ``(acc, f1_macro, state)``: accuracy and macro-F1 of the best epoch
        (the one with the highest validation macro-F1) together with a CPU
        copy of the model's ``state_dict`` at that epoch.

    Raises:
        ValueError: if ``epochs`` is smaller than 1.
    """
    if epochs < 1:
        raise ValueError("epochs must be at least 1")

    # Subsets & data loaders
    train_loader = DataLoader(Subset(dataset, train_idx), batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(Subset(dataset, val_idx), batch_size=batch_size)

    # Model & device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = MultiModalModel(num_categories=len(cats)).to(device)

    # Class-weighted loss computed from the training labels of this fold.
    # The indices are positional, hence the lookup on the raw array.
    y_train = df['label_idx'].to_numpy()[train_idx]
    cw = compute_class_weight('balanced', classes=np.arange(4), y=y_train)
    weights = torch.tensor(cw, dtype=torch.float32).to(device)
    criterion = nn.CrossEntropyLoss(weight=weights)

    # Optimizer & linear schedule with 10% warm-up steps
    optimizer = optim.AdamW(model.parameters(), lr=lr)
    total_steps = len(train_loader) * epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=total_steps // 10,
        num_training_steps=total_steps,
    )

    # Early-stopping state. best_f1 starts below any reachable score so the
    # first epoch always records a checkpoint, even when its F1 is 0.
    best_f1, best_acc, best_state = -1.0, 0.0, None
    wait = 0
    for epoch in range(epochs):
        # Train
        model.train()
        for imgs, ids, mask, rating, cat, labels in train_loader:
            imgs, ids, mask, rating, cat, labels = [
                x.to(device) for x in (imgs, ids, mask, rating, cat, labels)
            ]
            optimizer.zero_grad()
            out = model(imgs, ids, mask, rating, cat)
            loss = criterion(out, labels)
            loss.backward()
            optimizer.step()
            scheduler.step()

        # Validate
        trues, preds = _evaluate(model, val_loader, device)
        f1 = f1_score(trues, preds, average='macro')
        print(f"Fold {fold} E{epoch} F1-macro: {f1:.4f}")

        # Early stopping
        if f1 > best_f1:
            best_f1, wait = f1, 0
            best_acc = accuracy_score(trues, preds)
            # state_dict() returns references to the live parameters, which
            # later optimizer steps keep modifying; clone them so the
            # checkpoint really is a snapshot of this epoch.
            best_state = {
                k: v.detach().cpu().clone() for k, v in model.state_dict().items()
            }
        else:
            wait += 1
        if wait >= patience:
            break

    return best_acc, best_f1, best_state


def train_fold(train_idx, val_idx, fold, epochs=5, batch_size=16, lr=2e-5, patience=2):
    """Train and validate one cross-validation fold.

    Args:
        train_idx: positional indices of the training samples in ``dataset``.
        val_idx: positional indices of the validation samples in ``dataset``.
        fold: fold number, used only in the progress messages.
        epochs: maximum number of training epochs.
        batch_size: batch size of both data loaders.
        lr: AdamW learning rate.
        patience: number of consecutive epochs without a macro-F1 improvement
            after which training stops.

    Returns:
        ``(acc, f1_macro)`` measured on the validation split at the best
        epoch, i.e. for the weights that early stopping selects (not for
        whatever epoch happened to run last).
    """
    acc, f1m, _ = _run_fold(train_idx, val_idx, fold, epochs=epochs,
                            batch_size=batch_size, lr=lr, patience=patience)
    return acc, f1m


# Stratified K-Fold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
results = []
# Weights of the best fold seen so far. Only one checkpoint besides the fold
# currently being trained is kept in memory. Folds are validated on different
# splits, so picking the highest macro-F1 is a heuristic.
best_model = None
best_fold_f1 = -1.0
for i, (tr, va) in enumerate(skf.split(np.arange(len(dataset)), df['label_idx'])):
    acc, f1, fold_state = _run_fold(tr, va, i + 1)
    results.append((acc, f1))
    if f1 > best_fold_f1:
        best_fold_f1, best_model = f1, fold_state
    del fold_state
# Print the results
accs, f1s = zip(*results)
print("Mean Acc:", np.mean(accs), "+/-", np.std(accs))
print("Mean F1-macro:", np.mean(f1s), "+/-", np.std(f1s))

## 6. Save the model

# ``best_model`` is the state_dict of the best fold selected above. It is
# defined at module level so that this call can see it.
torch.save(best_model, 'best_multimodal_model.pth')




# --- Recorded cell output (the run was interrupted after fold 3) ---
# label_idx
# 1    444
# 3    326
# 0    252
# 2    124
# Name: count, dtype: int64
# Fold 1 E0 F1-macro: 0.3506
# Fold 1 E1 F1-macro: 0.3173
# Fold 1 E2 F1-macro: 0.3141
# Fold 2 E0 F1-macro: 0.3474
# Fold 2 E1 F1-macro: 0.3330
# Fold 2 E2 F1-macro: 0.3624
# Fold 2 E3 F1-macro: 0.3550
# Fold 2 E4 F1-macro: 0.3765
# Fold 3 E0 F1-macro: 0.2767
# Fold 3 E1 F1-macro: 0.2915
# Fold 3 E2 F1-macro: 0.2968
# Fold 3 E3 F1-macro: 0.3380
# Fold 3 E4 F1-macro: 0.3390
#
#
# KeyboardInterrupt: