In [2]:
import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score


In [3]:
class HourToXY(BaseEstimator, TransformerMixin):
    """Embed 'hour' -> (x, y) on Oxy, scheme='ml' (chu kỳ 24h)."""
    def __init__(self, period=24):
        self.period = period

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = np.asarray(X)
        h = X if X.ndim == 1 else X[:, 0]
        h = h.astype(float)
        theta = 2*np.pi*h/self.period
        x = np.cos(theta)
        y = np.sin(theta)
        print(x, y)
        return np.c_[x, y] 

    def get_feature_names_out(self, input_features=None):
        base = (input_features[0] if (input_features is not None and len(input_features))
                else "hour")
        print(base)
        return np.array([f"{base}_x", f"{base}_y"])
HourToXY(period=24)

0,1,2
,period,24


In [4]:
num_cols  = ["age", "income"]
cat_cols  = ["city"]
hour_col  = ["hour"]

num_pipe = Pipeline([
    ("impute", SimpleImputer(strategy="median")),
    ("scale", StandardScaler())
])

num_pipe

0,1,2
,steps,"[('impute', ...), ('scale', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [5]:

cat_pipe = Pipeline([
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])
cat_pipe


0,1,2
,steps,"[('impute', ...), ('ohe', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [6]:


preprocess = ColumnTransformer(
    transformers=[
        ("hour_xy", HourToXY(period=24), hour_col),     # -> 2 cột hour_x, hour_y
        ("num",     num_pipe,           num_cols),      # scale cột số
        ("cat",     cat_pipe,           cat_cols),      # OHE cột danh mục
    ],
    remainder="drop",
    verbose_feature_names_out=True
)

preprocess

0,1,2
,transformers,"[('hour_xy', ...), ('num', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,period,24

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [7]:
model = LogisticRegression(max_iter=1000)

pipe = Pipeline([
    ("prep", preprocess),  # bước tiền xử lý
    ("clf",  model)        # mô hình 
])
pipe

0,1,2
,steps,"[('prep', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('hour_xy', ...), ('num', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,period,24

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [11]:
# Ví dụ dữ liệu
df = pd.DataFrame({
    "hour":  [6, 18, 0, 9, 21, 14, 3, 12, 7, 16],
    "age":   [25, 40, 33, 29, 51, 36, 27, 42, 24, 31],
    "income":[10.5, 22.0, 14.3, 12.1, 30.0, 18.2, 9.7, 25.4, 11.2, 16.5],
    "city":  ["HCM","HN","DN","HCM","HN","DN","HCM","HN","DN","HCM"],
    "y":     [1,0,0,1,0,0,1,0,1,0]
})
X = df.drop(columns=["y"])
y = df["y"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.3, random_state=42
)
print(X_train.shape)
print(getattr(y_train, "shape", None))
print(type(X_train), getattr(X_train, "shape", None))
print(type(y_train), getattr(y_train, "shape", None))


pipe.fit(X_train, y_train)           # <-- FIT: học tham số biến đổi & mô hình trên TRAIN
print("Train acc:", pipe.score(X_train, y_train))

# Dự đoán/đánh giá trên TEST (chỉ transform bằng tham số đã học)
print("Test  acc:", pipe.score(X_test, y_test))



(7, 4)
(7,)
<class 'pandas.core.frame.DataFrame'> (7, 4)
<class 'pandas.core.series.Series'> (7,)
[ 7.07106781e-01 -2.58819045e-01 -1.83697020e-16 -7.07106781e-01
  7.07106781e-01 -5.00000000e-01  1.00000000e+00] [-0.70710678  0.96592583 -1.          0.70710678  0.70710678 -0.8660254
  0.        ]
[ 7.07106781e-01 -2.58819045e-01 -1.83697020e-16 -7.07106781e-01
  7.07106781e-01 -5.00000000e-01  1.00000000e+00] [-0.70710678  0.96592583 -1.          0.70710678  0.70710678 -0.8660254
  0.        ]
Train acc: 1.0
[-1.00000000e+00 -8.66025404e-01  6.12323400e-17] [ 1.2246468e-16 -5.0000000e-01  1.0000000e+00]
Test  acc: 1.0


In [9]:
feat_names = pipe.named_steps["prep"].get_feature_names_out()
print("Features:", feat_names)

hour
Features: ['hour_xy__hour_x' 'hour_xy__hour_y' 'num__age' 'num__income'
 'cat__city_DN' 'cat__city_HCM' 'cat__city_HN']


In [10]:
cv_scores = cross_val_score(pipe, X, y, cv=5, scoring="accuracy")
print("CV mean acc:", cv_scores.mean())
from sklearn.model_selection import GridSearchCV

param_grid = {
    "prep__num__impute__strategy": ["median", "mean"],
    "clf__C": [0.1, 1.0, 5.0]
}
gs = GridSearchCV(pipe, param_grid, cv=5, scoring="accuracy")
gs.fit(X, y)
print(gs.best_params_, gs.best_score_)


[ 1.         -0.70710678  0.70710678 -0.8660254   0.70710678 -1.
 -0.25881905 -0.5       ] [ 0.00000000e+00  7.07106781e-01 -7.07106781e-01 -5.00000000e-01
  7.07106781e-01  1.22464680e-16  9.65925826e-01 -8.66025404e-01]
[ 6.1232340e-17 -1.8369702e-16] [ 1. -1.]
[ 6.12323400e-17 -1.83697020e-16  7.07106781e-01 -8.66025404e-01
  7.07106781e-01 -1.00000000e+00 -2.58819045e-01 -5.00000000e-01] [ 1.00000000e+00 -1.00000000e+00 -7.07106781e-01 -5.00000000e-01
  7.07106781e-01  1.22464680e-16  9.65925826e-01 -8.66025404e-01]
[ 1.         -0.70710678] [0.         0.70710678]
[ 6.12323400e-17 -1.83697020e-16  1.00000000e+00 -7.07106781e-01
 -8.66025404e-01 -1.00000000e+00 -2.58819045e-01 -5.00000000e-01] [ 1.00000000e+00 -1.00000000e+00  0.00000000e+00  7.07106781e-01
 -5.00000000e-01  1.22464680e-16  9.65925826e-01 -8.66025404e-01]
[0.70710678 0.70710678] [-0.70710678  0.70710678]
[ 6.12323400e-17 -1.83697020e-16  1.00000000e+00 -7.07106781e-01
  7.07106781e-01  7.07106781e-01 -1.00000000e+0



[ 6.12323400e-17 -1.83697020e-16  7.07106781e-01 -8.66025404e-01
  7.07106781e-01 -1.00000000e+00 -2.58819045e-01 -5.00000000e-01] [ 1.00000000e+00 -1.00000000e+00 -7.07106781e-01 -5.00000000e-01
  7.07106781e-01  1.22464680e-16  9.65925826e-01 -8.66025404e-01]
[ 1.         -0.70710678] [0.         0.70710678]
[ 6.12323400e-17 -1.83697020e-16  1.00000000e+00 -7.07106781e-01
 -8.66025404e-01 -1.00000000e+00 -2.58819045e-01 -5.00000000e-01] [ 1.00000000e+00 -1.00000000e+00  0.00000000e+00  7.07106781e-01
 -5.00000000e-01  1.22464680e-16  9.65925826e-01 -8.66025404e-01]
[0.70710678 0.70710678] [-0.70710678  0.70710678]
[ 6.12323400e-17 -1.83697020e-16  1.00000000e+00 -7.07106781e-01
  7.07106781e-01  7.07106781e-01 -1.00000000e+00 -5.00000000e-01] [ 1.00000000e+00 -1.00000000e+00  0.00000000e+00  7.07106781e-01
 -7.07106781e-01  7.07106781e-01  1.22464680e-16 -8.66025404e-01]
[-0.8660254  -0.25881905] [-0.5         0.96592583]
[ 6.12323400e-17 -1.83697020e-16  1.00000000e+00 -7.07106781e-