In [30]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupShuffleSplit
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon', quiet=True)

True

In [31]:
# Root data directory. The default is the original local path; set the
# DS102_DATA_DIR environment variable to run the notebook on another machine.
data_path = os.environ.get(
    "DS102_DATA_DIR",
    r'/home/check/DATA/university/yr3, hk1/DS102 - ML for Statistic/Đồ án/goodread/DS102/DATA/',
)

# os.path.join works whether or not the root ends with a path separator,
# unlike plain string concatenation.
GOLD_FILE = os.path.join(data_path, "gold/v1/gold_1.csv")  # replace with your gold file name (gold_2.csv, ...)
OUTPUT_FOLDER = os.path.join(data_path, "gold/v1")
TRAIN_CSV = "train_1.csv"
TEST_CSV = "test_1.csv"

# Make sure the output directory exists before anything is written to it.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

In [32]:
# Load the gold dataset from the CSV path configured in GOLD_FILE.
df = pd.read_csv(GOLD_FILE)

In [33]:
# Quick sanity report: frame dimensions and the number of distinct titles.
# The title count is only an estimate of the number of books (see the label).
raw_shape = df.shape
n_unique_titles = df['title'].nunique()
print(f"Original shape: {raw_shape}")
print(f"Unique books: {n_unique_titles} (ước lượng)")

Original shape: (2058, 26)
Unique books: 319 (ước lượng)


In [34]:
# Build a per-book grouping key from the whitespace-stripped title and author,
# joined by "|||". Both parts are cast to str first, so missing values become
# the literal text of their string form rather than NaN.
title_key = df['title'].astype(str).str.strip()
author_key = df['author'].astype(str).str.strip()
df['book_id'] = title_key.str.cat(author_key, sep="|||")


In [35]:
sia = SentimentIntensityAnalyzer()

# Normalise the review column once: missing reviews become empty strings.
review_text_clean = df['review_text'].fillna('').astype(str)


def _compound_sentiment(text):
    """Return the 'compound' entry of the VADER polarity scores for one review."""
    return sia.polarity_scores(text)['compound']


# Sentiment score per review.
df['review_sentiment'] = review_text_clean.apply(_compound_sentiment)
# Review length measured in whitespace-separated tokens.
df['review_length'] = review_text_clean.apply(lambda text: len(text.split()))


In [36]:
# The three success labels are the prediction targets; every other column
# (including book_id, which is still needed for the grouped split) stays in X.
target_cols = [
    'Commercial_success',
    'Popular_success',
    'Critical_success',
]
X_with_id = df.drop(target_cols, axis=1)
y = df.loc[:, target_cols]

In [37]:
# Split by book so that all rows sharing a book_id land on the same side.
groups = df['book_id']
splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(splitter.split(X_with_id, y, groups=groups))

# Materialise independent copies of each partition.
X_train_full, X_test_full = (X_with_id.iloc[rows].copy() for rows in (train_idx, test_idx))
y_train, y_test = (y.iloc[rows].copy() for rows in (train_idx, test_idx))

# Report partition sizes and verify that no book appears on both sides.
train_book_ids = set(X_train_full['book_id'])
test_book_ids = set(X_test_full['book_id'])
print(f"Train samples: {len(X_train_full)} (books: {X_train_full['book_id'].nunique()})")
print(f"Test samples:  {len(X_test_full)} (books: {X_test_full['book_id'].nunique()})")
print(f"No leakage: {train_book_ids.isdisjoint(test_book_ids)}")

# book_id was only needed for grouping; it is not a model feature.
X_train = X_train_full.drop('book_id', axis=1)
X_test = X_test_full.drop('book_id', axis=1)

Train samples: 1641 (books: 273)
Test samples:  417 (books: 69)
No leakage: True


In [47]:
def log_transform(X):
    """Return ``log(1 + x)`` computed element-wise over ``X``.

    ``X`` is passed straight to ``numpy.log1p``, so any array-like that numpy
    accepts (ndarray, pandas Series/DataFrame, ...) can be used.
    """
    logged = np.log1p(X)
    return logged

# Wrap log_transform so it can be used as a pipeline step.
log_transformer = FunctionTransformer(func=log_transform)

# Numeric columns that are standard-scaled as-is.
num_normal_cols = [
    'year',
    'publication_year',
    'total_weeks',
    'best_rank',
    'worst_rank',
    'mean_rank',
    'debut_rank',
    'average_rating',
    'rating',
    'is_expert',
    'review_sentiment',
    'review_length',
]

# Numeric columns that go through log1p before scaling.
skew_cols = ['Units_Sold', 'Gross_Sales', 'Sale_Price', 'Sales_Rank', 'ratings_count', 'n_votes']

# Category order handed to the ordinal encoder for Author_Rating
# (one inner list per encoded column).
author_rating_categories = [
    ['Novice', 'Intermediate', 'Famous', 'Excellent'],
]

In [48]:
# Individual transformers, named here so the ColumnTransformer below reads as a table.
skew_pipeline = Pipeline(steps=[
    ('log', log_transformer),
    ('scaler', StandardScaler()),
])
author_rating_encoder = OrdinalEncoder(categories=author_rating_categories)
genre_encoder = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')
desc_vectorizer = TfidfVectorizer(max_features=2000, stop_words='english')
review_vectorizer = TfidfVectorizer(max_features=3000, stop_words='english', ngram_range=(1, 2))

# (name, transformer, columns). The two text columns are given as plain strings
# rather than lists, which is how the vectorizers are fed here.
# Columns not listed are dropped; sparse_threshold=0 is passed unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('num_normal', StandardScaler(), num_normal_cols),
        ('num_skew', skew_pipeline, skew_cols),
        ('author_rating_ordinal', author_rating_encoder, ['Author_Rating']),
        ('genre_onehot', genre_encoder, ['Genre']),
        ('desc_tfidf', desc_vectorizer, 'description'),
        ('review_tfidf', review_vectorizer, 'review_text'),
    ],
    remainder='drop',
    sparse_threshold=0,
)

In [50]:
# Fit every transformer on the training split only, then transform that split.
print("Đang fit preprocessor trên train...")
X_train_processed = preprocessor.fit_transform(X_train)

# Apply the already-fitted preprocessor to the test split (no refitting).
print("Đang transform test...")
X_test_processed = preprocessor.transform(X_test)

Đang fit preprocessor trên train...
Đang transform test...


In [51]:
print("Đang tạo tên feature thủ công...")

# Column names are rebuilt by hand, in the same order as the transformers in
# `preprocessor`: scaled numerics, log-scaled numerics, then the ordinal author rating.
feature_names = [
    *num_normal_cols,
    *(f"log_{col}" for col in skew_cols),
    'Author_Rating_ordinal',
]

Đang tạo tên feature thủ công...


In [52]:
# Genre one-hot: rebuild the output column names in the encoder's own category order.
genre_encoder_fitted = preprocessor.named_transformers_['genre_onehot']
genre_categories = genre_encoder_fitted.categories_[0]
genre_dropped = genre_encoder_fitted.drop_idx_

# drop_idx_ is None when no category is dropped at all; otherwise it holds one
# entry per encoded feature (entry may itself be None). Genre is the only
# feature given to this encoder, so its entry is at position 0.
genre_drop_pos = None if genre_dropped is None else genre_dropped[0]

if genre_drop_pos is None:
    # Nothing was dropped: every category produced an output column.
    genre_kept = list(genre_categories)
else:
    genre_kept = [cat for i, cat in enumerate(genre_categories) if i != genre_drop_pos]
feature_names += [f"Genre_{cat}" for cat in genre_kept]

In [53]:
# Vocabulary terms learned by each fitted TF-IDF vectorizer, in output-column order.
fitted_transformers = preprocessor.named_transformers_
desc_vocab = fitted_transformers['desc_tfidf'].get_feature_names_out()
review_vocab = fitted_transformers['review_tfidf'].get_feature_names_out()

# Prefix each term with its source column so the two vocabularies stay distinguishable.
feature_names.extend(f"desc_{word}" for word in desc_vocab)
feature_names.extend(f"review_{word}" for word in review_vocab)

In [54]:
# The hand-built name list must line up one-to-one with the processed matrix columns.
n_feature_names = len(feature_names)
n_processed_cols = X_train_processed.shape[1]
print(f"Total features: {n_feature_names}")
assert n_feature_names == n_processed_cols, "Số cột không khớp!"

Total features: 5021


In [57]:
def _build_split_frame(processed, row_index, targets, columns):
    """Combine a processed feature matrix with its target columns.

    Parameters
    ----------
    processed : array-like or sparse matrix
        Output of the preprocessor for one split; densified via ``toarray()``
        when that method is available.
    row_index : pandas.Index
        Row labels of the split the matrix was computed from.
    targets : pandas.DataFrame
        Target columns for the same rows; expected to carry the same index as
        ``row_index`` so the column-wise concat lines up row for row.
    columns : list of str
        One name per column of ``processed``.

    Returns
    -------
    pandas.DataFrame
        Feature columns followed by the target columns.
    """
    dense = processed.toarray() if hasattr(processed, "toarray") else processed
    features = pd.DataFrame(dense, columns=columns, index=row_index)
    # Concatenate the target frame directly rather than rebuilding it from
    # `.values`, which would coerce all target columns to one common dtype.
    return pd.concat([features, targets], axis=1)


# Train
train_df = _build_split_frame(X_train_processed, X_train.index, y_train, feature_names)

# Test
test_df = _build_split_frame(X_test_processed, X_test.index, y_test, feature_names)

In [58]:
train_path = os.path.join(OUTPUT_FOLDER, TRAIN_CSV)
test_path = os.path.join(OUTPUT_FOLDER, TEST_CSV)

# (destination, frame) pairs, written in this order and reported in the same order.
saved_outputs = [(train_path, train_df), (test_path, test_df)]

# Write both files first, without the index column.
for out_path, out_frame in saved_outputs:
    out_frame.to_csv(out_path, index=False)

# Then print a summary of what was saved.
print("\nHoàn tất lưu file!")
for out_path, out_frame in saved_outputs:
    n_rows, n_cols = out_frame.shape
    print(f"   → {out_path}")
    print(f"     Shape: {n_rows} rows × {n_cols} columns")


Hoàn tất lưu file!
   → /home/check/DATA/university/yr3, hk1/DS102 - ML for Statistic/Đồ án/goodread/DS102/DATA/gold/v1/train_1.csv
     Shape: 1641 rows × 5024 columns
   → /home/check/DATA/university/yr3, hk1/DS102 - ML for Statistic/Đồ án/goodread/DS102/DATA/gold/v1/test_1.csv
     Shape: 417 rows × 5024 columns
