In [18]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupShuffleSplit
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import torch
from transformers import AutoTokenizer, AutoModel
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon', quiet=True)

True

In [19]:
GOLD_FILE = r"/home/check/DATA/university/yr3, hk1/DS102 - ML for Statistic/Đồ án/goodread/DS102/DATA/gold/v1/gold_1.csv"
OUTPUT_FOLDER = r"/home/check/DATA/university/yr3, hk1/DS102 - ML for Statistic/Đồ án/goodread/DS102/DATA/gold/v2"
TRAIN_CSV = "train_2.csv"
TEST_CSV = "test_2.csv"

In [20]:
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
bert_model = AutoModel.from_pretrained('distilbert-base-uncased')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
bert_model.to(device)
bert_model.eval()

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): DistilBertSdpaAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): L

In [21]:
def mean_pooling(last_hidden_state, attention_mask):
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
    return torch.sum(last_hidden_state * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

In [22]:
def get_bert_embeddings(texts, batch_size=8, max_length=256):
    embeddings = []
    texts = texts.fillna('').astype(str).tolist()
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        inputs = tokenizer(batch, return_tensors='pt', truncation=True, padding=True, max_length=max_length)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = bert_model(**inputs)
        batch_emb = mean_pooling(outputs.last_hidden_state, inputs['attention_mask']).cpu().numpy()
        embeddings.append(batch_emb)
        if (i // batch_size) % 10 == 0:
            print(f"   Processed {i + len(batch)}/{len(texts)}")
    return np.vstack(embeddings)

In [None]:
print("Đang load gold_1.csv...")
df = pd.read_csv(GOLD_FILE)
print(f"Original shape: {df.shape}")

# Book ID
df['book_id'] = df['title'].astype(str).str.strip() + "|||" + df['author'].astype(str).str.strip()

# Sentiment & length
sia = SentimentIntensityAnalyzer()
df['review_sentiment'] = df['review_text'].fillna('').astype(str).apply(lambda x: sia.polarity_scores(x)['compound'])
df['review_length'] = df['review_text'].fillna('').astype(str).apply(lambda x: len(x.split()))

# BERT embeddings
print("\nTính BERT embedding cho review_text...")
review_emb = get_bert_embeddings(df['review_text'])
print("Tính BERT embedding cho description...")
desc_emb = get_bert_embeddings(df['description'])

# Chuyển embedding thành nhiều cột
for i in range(review_emb.shape[1]):
    df[f"review_bert_{i}"] = review_emb[:, i]

for i in range(desc_emb.shape[1]):
    df[f"desc_bert_{i}"] = desc_emb[:, i]

# Targets
target_cols = ['Commercial_success', 'Popular_success', 'Critical_success']
y = df[target_cols]

# Features with book_id
X_with_id = df.drop(columns=target_cols)

Đang load gold_1.csv...
Original shape: (2058, 26)

Tính BERT embedding cho review_text...
   Processed 8/2058
   Processed 88/2058
   Processed 168/2058
   Processed 248/2058
   Processed 328/2058
   Processed 408/2058
   Processed 488/2058
   Processed 568/2058
   Processed 648/2058
   Processed 728/2058
   Processed 808/2058
   Processed 888/2058
   Processed 968/2058
   Processed 1048/2058
   Processed 1128/2058
   Processed 1208/2058
   Processed 1288/2058
   Processed 1368/2058
   Processed 1448/2058
   Processed 1528/2058
   Processed 1608/2058
   Processed 1688/2058
   Processed 1768/2058
   Processed 1848/2058
   Processed 1928/2058
   Processed 2008/2058
Tính BERT embedding cho description...
   Processed 8/2058
   Processed 88/2058
   Processed 168/2058
   Processed 248/2058
   Processed 328/2058
   Processed 408/2058
   Processed 488/2058
   Processed 568/2058
   Processed 648/2058
   Processed 728/2058
   Processed 808/2058
   Processed 888/2058
   Processed 968/2058
   Pr

  df[f"review_bert_{i}"] = review_emb[:, i]
  df[f"review_bert_{i}"] = review_emb[:, i]
  df[f"review_bert_{i}"] = review_emb[:, i]
  df[f"review_bert_{i}"] = review_emb[:, i]
  df[f"review_bert_{i}"] = review_emb[:, i]
  df[f"review_bert_{i}"] = review_emb[:, i]
  df[f"review_bert_{i}"] = review_emb[:, i]
  df[f"review_bert_{i}"] = review_emb[:, i]
  df[f"review_bert_{i}"] = review_emb[:, i]
  df[f"review_bert_{i}"] = review_emb[:, i]
  df[f"review_bert_{i}"] = review_emb[:, i]
  df[f"review_bert_{i}"] = review_emb[:, i]
  df[f"review_bert_{i}"] = review_emb[:, i]
  df[f"review_bert_{i}"] = review_emb[:, i]
  df[f"review_bert_{i}"] = review_emb[:, i]
  df[f"review_bert_{i}"] = review_emb[:, i]
  df[f"review_bert_{i}"] = review_emb[:, i]
  df[f"review_bert_{i}"] = review_emb[:, i]
  df[f"review_bert_{i}"] = review_emb[:, i]
  df[f"review_bert_{i}"] = review_emb[:, i]
  df[f"review_bert_{i}"] = review_emb[:, i]
  df[f"review_bert_{i}"] = review_emb[:, i]
  df[f"review_bert_{i}"] = revie

In [24]:
print("\nGroup split...")
groups = df['book_id']
splitter = GroupShuffleSplit(test_size=0.2, n_splits=1, random_state=42)
train_idx, test_idx = next(splitter.split(X_with_id, y, groups=groups))


Group split...


In [25]:
X_train_full = X_with_id.iloc[train_idx].copy()
X_test_full = X_with_id.iloc[test_idx].copy()
y_train = y.iloc[train_idx].copy()
y_test = y.iloc[test_idx].copy()

In [26]:
X_train = X_train_full.drop(columns=['book_id'])
X_test = X_test_full.drop(columns=['book_id'])

In [27]:
def log_transform(X):
    return np.log1p(X)

log_transformer = FunctionTransformer(log_transform)

In [28]:
num_normal_cols = [
    'year', 'publication_year', 'total_weeks', 'best_rank', 'worst_rank', 'mean_rank',
    'debut_rank', 'average_rating', 'rating', 'is_expert',
    'review_sentiment', 'review_length'
]

# Skew columns → log transform
skew_cols = [
    'Units_Sold', 'Gross_Sales', 'Sale_Price', 'Sales_Rank',
    'ratings_count', 'n_votes'
]

In [29]:
preprocessor = ColumnTransformer([
    ('num_normal', StandardScaler(), num_normal_cols),
    ('num_skew', Pipeline([('log', log_transformer), ('scaler', StandardScaler())]), skew_cols)
], remainder='passthrough')

In [None]:
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

In [31]:
feature_names = num_normal_cols.copy()
feature_names += [f"log_{c}" for c in skew_cols]
feature_names += [f"review_bert_{i}" for i in range(768)]
feature_names += [f"desc_bert_{i}" for i in range(768)]

In [32]:
train_df = pd.DataFrame(X_train_processed, columns=feature_names, index=X_train.index)
train_df[target_cols] = y_train.values

test_df = pd.DataFrame(X_test_processed, columns=feature_names, index=X_test.index)
test_df[target_cols] = y_test.values

train_path = os.path.join(OUTPUT_FOLDER, TRAIN_CSV)
test_path = os.path.join(OUTPUT_FOLDER, TEST_CSV)

train_df.to_csv(train_path, index=False)
test_df.to_csv(test_path, index=False)

print("\nHOÀN TẤT!")
print(f"→ {train_path} ({train_df.shape})")
print(f"→ {test_path} ({test_df.shape})")
print("Chạy train_bert_model.py để train XGBoost trên BERT data.")

ValueError: Shape of passed values is (1641, 1561), indices imply (1641, 1554)