In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GroupShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

In [2]:
# Root directory of the "gold" datasets (read from v1/, written to v2/).
# The original machine-specific location stays as the default so existing
# runs are unchanged; set the DS102_GOLD_DIR environment variable to point the
# notebook at another checkout without editing code.
_DEFAULT_GOLD_DIR = r'/home/check/DATA/university/yr3, hk1/DS102 - ML for Statistic/Đồ án/goodread/DS102/DATA/gold/'

# Later cells build file names with plain string concatenation
# (data_path + 'v1/...'), so the value must be a str ending in a separator;
# os.path.join(path, '') appends one only when it is missing.
data_path = os.path.join(os.environ.get('DS102_GOLD_DIR') or _DEFAULT_GOLD_DIR, '')

In [3]:
# Load the v1 gold dataset that the rest of the notebook preprocesses.
gold_csv_path = data_path + 'v1/gold_1.csv'
df = pd.read_csv(gold_csv_path)

In [4]:
# Discard the 'Unnamed: 0' column when the CSV carries one; errors='ignore'
# makes this a no-op if it is absent.
df = df.drop(columns='Unnamed: 0', errors='ignore')

# Columns that are replaced by log(1 + x) before any scaling happens.
log_features = ['Units_Sold', 'Gross_Sales', 'Sale_Price', 'Sales_Rank', 'ratings_count', 'n_votes']

# Transform all listed columns in one vectorised assignment.
# NOTE: this overwrites the columns of `df`, so re-running the cell without
# reloading `df` applies the transform a second time.
df[log_features] = np.log1p(df[log_features])

In [5]:
# Single grouped shuffle split with a fixed seed: rows are grouped by 'title',
# so all rows of one title land on the same side, and test_size=0.2 is the
# share of groups held out for testing.
gss = GroupShuffleSplit(test_size=0.2, n_splits=1, random_state=42)
title_groups = df['title']
train_idx, test_idx = next(gss.split(df, groups=title_groups))

# Materialise independent copies so later edits never write back into `df`.
train_df, test_df = (df.iloc[positions].copy() for positions in (train_idx, test_idx))

In [6]:
# Columns routed to the numeric branch of the preprocessor: the plain numeric
# columns first, followed by the log-transformed columns in `log_features`.
numerical_features = [
    'year', 'publication_year', 'total_weeks',
    'best_rank', 'worst_rank', 'mean_rank', 'debut_rank',
    'average_rating', 'rating', 'is_expert',
    *log_features,
]

In [7]:
# Categorical columns, split by the encoding each one receives.
categorical_onehot = ['Genre']
categorical_ordinal = ['Author_Rating']

# Explicit category order for 'Author_Rating'; the encoder maps each level to
# its position in this list (0 for 'Novice' up to 3 for 'Famous').
author_rating_levels = ['Novice', 'Intermediate', 'Excellent', 'Famous']
ordinal_encoder = OrdinalEncoder(categories=[author_rating_levels])

In [8]:
# Dense one-hot encoding; categories not seen during fit encode as all zeros.
onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# One (name, transformer, columns) entry per feature family.
column_transformers = [
    ('num', StandardScaler(), numerical_features),
    ('cat_onehot', onehot_encoder, categorical_onehot),
    ('cat_ordinal', ordinal_encoder, categorical_ordinal),
]

# remainder='passthrough' keeps every column not listed above (the text
# columns) unchanged and appends it after the transformed ones. The target
# columns are not part of the input: they are dropped before fit/transform.
preprocessor = ColumnTransformer(
    transformers=column_transformers,
    remainder='passthrough',
)

In [9]:
# Targets are removed first so they are neither scaled nor passed through.
target_cols = ['Commercial_success', 'Popular_success', 'Critical_success']
train_features = train_df.drop(columns=target_cols)
test_features = test_df.drop(columns=target_cols)

# Fit on the training split only, then reuse the fitted transformers on the
# test split.
X_train = preprocessor.fit_transform(train_features)
X_test = preprocessor.transform(test_features)

In [10]:
# Rebuild the column names of the transformed arrays, following the order in
# which the preprocessor emits them: numeric, one-hot, ordinal, passthrough.
num_cols = numerical_features
fitted_onehot = preprocessor.named_transformers_['cat_onehot']
cat_onehot_cols = fitted_onehot.get_feature_names_out(categorical_onehot)
cat_ordinal_cols = categorical_ordinal

# Every column that is either transformed explicitly or is a target; whatever
# is left over in the training frame is a passthrough column, in frame order.
handled_cols = {
    *numerical_features,
    *categorical_onehot,
    *categorical_ordinal,
    'Commercial_success', 'Popular_success', 'Critical_success',
}
remainder_cols = [col for col in train_df.columns if col not in handled_cols]

all_cols = [*num_cols, *cat_onehot_cols, *cat_ordinal_cols, *remainder_cols]

In [11]:
# Target columns; they were dropped before the transform, so they are
# re-attached here from the untransformed split frames.
target_cols = ['Commercial_success', 'Popular_success', 'Critical_success']


def _to_processed_frame(features, source_df, columns):
    """Wrap a transformed feature array in a DataFrame and re-attach targets.

    Parameters
    ----------
    features : array-like of shape (n_rows, len(columns))
        Output of the fitted preprocessor for `source_df`.
    source_df : pandas.DataFrame
        The split the features were computed from; supplies the row index and
        the target columns.
    columns : sequence of str
        Column names for `features`, in output order.

    Returns
    -------
    pandas.DataFrame
        Feature columns followed by the target columns, indexed like
        `source_df`.
    """
    processed = pd.DataFrame(features, columns=columns, index=source_df.index)
    # Text columns are passed through the preprocessor, so its stacked output
    # is a single object-dtype array and every column starts out as object.
    # infer_objects() restores numeric dtypes for the numeric/encoded columns
    # and leaves the text columns as they are.
    processed = processed.infer_objects()
    # Assignment aligns on the shared index, so each row keeps its own targets.
    processed[target_cols] = source_df[target_cols]
    return processed


train_df_processed = _to_processed_frame(X_train, train_df, all_cols)
test_df_processed = _to_processed_frame(X_test, test_df, all_cols)

In [12]:
# Write both processed splits to the v2 folder, without the row index.
for processed_frame, relative_path in (
    (train_df_processed, 'v2/train_2.csv'),
    (test_df_processed, 'v2/test_2.csv'),
):
    processed_frame.to_csv(data_path + relative_path, index=False)

In [13]:
# Sanity check: (rows, columns) of the processed training frame.
train_df_processed.shape

(1645, 27)

In [14]:
# Preview the first rows of the processed training frame (scaled numeric
# columns, encoded categoricals, passthrough text columns, then targets).
train_df_processed.head()

Unnamed: 0,year,publication_year,total_weeks,best_rank,worst_rank,mean_rank,debut_rank,average_rating,rating,is_expert,...,Genre_Fiction,Genre_Nonfiction,Author_Rating,title,author,description,review_text,Commercial_success,Popular_success,Critical_success
1,0.009426,0.549663,-0.724165,1.970348,0.518978,1.851557,1.259357,1.092161,0.291625,-0.769707,...,0.0,0.0,2.0,High Five,Janet Evanovich,What's Stephanie up to now?\r\nHer Uncle Fred ...,"People aren't jumping bail, so Stephanie's wor...",5,1,4
4,0.009426,0.549663,-0.724165,1.970348,0.518978,1.851557,1.259357,1.092161,0.291625,-0.769707,...,0.0,0.0,2.0,High Five,Janet Evanovich,What's Stephanie up to now?\r\nHer Uncle Fred ...,"People aren't jumping bail, so Stephanie's wor...",5,1,3
6,-0.332604,-3.273354,-0.472687,0.320991,0.518978,0.044592,-0.75238,-3.378736,0.291625,1.299195,...,0.0,0.0,2.0,The Fires Of Heaven,Robert Jordan,"When Imelda opened her antique shop, Charles W...","Interesting tale, and I learned a lot about an...",4,3,2
8,-0.332604,-3.273354,-0.472687,0.320991,0.518978,0.044592,-0.75238,-3.378736,0.291625,1.299195,...,0.0,0.0,2.0,The Fires Of Heaven,Robert Jordan,"When Imelda opened her antique shop, Charles W...","Interesting tale, and I learned a lot about an...",4,3,2
9,-0.788644,-0.38026,0.596094,-0.857121,0.518978,-0.269663,-1.646485,-0.232549,-0.480876,-0.769707,...,0.0,0.0,1.0,Lucky,Jackie Collins,In a memoir hailed for its searing candor and ...,"A little too dry for me, a little too factual,...",3,4,3


In [15]:
# Show the final column order of the processed training frame.
train_df_processed.columns

Index(['year', 'publication_year', 'total_weeks', 'best_rank', 'worst_rank',
       'mean_rank', 'debut_rank', 'average_rating', 'rating', 'is_expert',
       'Units_Sold', 'Gross_Sales', 'Sale_Price', 'Sales_Rank',
       'ratings_count', 'n_votes', 'Genre_Children', 'Genre_Fiction',
       'Genre_Nonfiction', 'Author_Rating', 'title', 'author', 'description',
       'review_text', 'Commercial_success', 'Popular_success',
       'Critical_success'],
      dtype='object')