In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score,StratifiedKFold,RepeatedStratifiedKFold, cross_validate
from copy import deepcopy

from time import time
from sklearn.preprocessing import FunctionTransformer

In [2]:
def train_model(list_model,X_train,y_train,X_test,y_test,metric,cv,scorer,pipeline):
    """Spot-check several estimators inside a shared pipeline.

    For every entry of ``list_model`` a deep copy of ``pipeline`` gets the entry
    plugged into its ``"model"`` step, is cross-validated on the training data,
    refitted on the full training data and finally scored on the test and the
    training data.

    Parameters
    ----------
    list_model : mapping of name -> estimator
        Candidates to evaluate; iterated in the mapping's own order.
    X_train, y_train, X_test, y_test
        Training and hold-out data handed to the pipeline as-is.
    metric : callable
        Called as ``metric(y_true, y_pred, squared=False)``, so it has to accept
        the ``squared`` keyword.
    cv
        Forwarded to ``cross_val_score`` as ``cv``.
    scorer
        Forwarded to ``cross_val_score`` as ``scoring``. The "cv" row stores the
        mean of the scores exactly as returned, so its sign convention is the
        scorer's, not the one of ``metric``.
    pipeline
        Object with a ``"model"`` parameter; it is deep-copied per candidate and
        never modified itself.

    Returns
    -------
    pandas.DataFrame
        Columns ``model_name``, ``set_data``, ``score`` and ``model`` with three
        rows per candidate ("test", "cv", "train"). All three rows of a
        candidate reference the same fitted pipeline object.
    """
    columns = ["model_name","set_data","score","model"]
    set_data = ["test","cv","train"]
    # Rows are collected in a plain list and turned into a frame once at the end.
    # Concatenating onto an (initially empty) frame in every iteration copies the
    # whole table each time and triggers a pandas FutureWarning.
    rows = []

    for m in list_model:
        pipeline_copy = deepcopy(pipeline)
        pipeline_copy.set_params(model = list_model[m])
        spot_check = cross_val_score(pipeline_copy,X_train,y_train,cv = cv,scoring = scorer,n_jobs= -1)
        spot_check = spot_check.mean()

        model = pipeline_copy.fit(X_train,y_train)
        score = metric(y_test,model.predict(X_test),squared = False)
        score_train = metric(y_train,model.predict(X_train),squared = False)

        rows.extend(zip([m] * 3,set_data,[score,spot_check,score_train],[model] * 3))
        print(f"model {m} selesai di training")
        print(f"score test {score}")
        print(f"score cv {spot_check}")
        print(f"score train {score_train}")
        print("=====================================")

    return pd.DataFrame(rows,columns = columns)

def rfecv(X, y, pipeline,min_features_to_select=3, cv = 3,step=3,scoring_metric="f1",scoring_decimals=3,random_state=42,groups = None):
    """Recursive feature elimination scored by cross-validation.

    The pipeline without its last step is fitted once to obtain the names of the
    transformed features. Then, repeatedly, the pipeline is cross-validated with
    an extra step (inserted just before the final estimator) that drops the
    features eliminated so far, and the least important features of the
    best-scoring fold estimator are eliminated next.

    Parameters
    ----------
    X, y
        Data passed to ``cross_validate``.
    pipeline
        Pipeline whose steps before the last one return a DataFrame (``.columns``
        is read) and whose last step exposes ``feature_importances_`` after
        fitting. It is deep-copied and never modified.
    min_features_to_select : int
        Smallest feature count that is still evaluated.
    cv, groups, scoring_metric
        Forwarded to ``cross_validate`` as ``cv``, ``groups`` and ``scoring``.
        The fold with the highest test score provides the importances, and the
        round with the highest mean score wins, so larger must mean better.
    step : int
        Maximum number of features eliminated per round; fewer are removed when
        that would go below ``min_features_to_select`` (but always at least one).
    scoring_decimals : int or None
        Number of decimals the mean score is rounded to; ``None`` disables
        rounding.
    random_state
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    best_features : list
        Transformed feature names that were present in the best-scoring round
        (ties are resolved in favour of the round with the fewest features).
    best_score : float
        Highest (rounded) mean cross-validation score.
    ranks : list
        Every eliminated feature, most recently eliminated first.
    scores : list
        Mean score of each round, in evaluation order.

    Raises
    ------
    ValueError
        If there are fewer transformed features than ``min_features_to_select``,
        so that no round could be evaluated.
    """
    # Fit everything but the final estimator to learn the transformed feature names.
    estimator = deepcopy(pipeline)
    estimator.steps.pop(-1)
    all_features = estimator.fit_transform(X,y).columns.tolist()
    survivors = list(all_features)
    ranks = []
    scores = []
    # Number of already eliminated features at the time of each evaluation. The
    # rounds do not all remove `step` features, so this cannot be derived from the
    # round index afterwards.
    removed_before_round = []
    while len(survivors) >= min_features_to_select:
        # Snapshot of the eliminated features so the transformer does not depend on
        # later mutations of `ranks`.
        dropped = list(ranks)
        remove_column_transformer = FunctionTransformer(lambda x, cols=dropped: x.drop(cols, axis=1))
        estimator = deepcopy(pipeline)
        estimator.steps.insert(-1, ('remove_column_transformer', remove_column_transformer))
        removed_before_round.append(len(ranks))
        print("[%.2f] evaluating %i features ..." % (time(), len(survivors)))
        cv_result = cross_validate(estimator, X, y,
                                cv=cv,
                                groups = groups,
                                scoring=scoring_metric,
                                return_estimator=True)
        score = np.mean(cv_result["test_score"])
        if scoring_decimals is None:
            scores.append(score)
        else:
            scores.append(round(score, scoring_decimals))
        print("[%.2f] ... score %f." % (time(), scores[-1]))

        best_estimator = cv_result["estimator"][np.argmax(cv_result["test_score"])]
        if isinstance(best_estimator, Pipeline):
            weights = best_estimator[-1].feature_importances_
        else:
            weights = best_estimator.feature_importances_
        # `weights` is positionally aligned with `survivors`; both are popped together.
        weights = list(np.power(weights, 2))
        for _ in range(max(min(step, len(survivors) - min_features_to_select), 1)):
            idx = np.argmin(weights)
            ranks.insert(0, survivors.pop(idx))
            weights.pop(idx)

    if not scores:
        raise ValueError(
            "no feature subset was evaluated: %i features available but "
            "min_features_to_select=%i" % (len(all_features), min_features_to_select)
        )

    # Last index of the maximum score, i.e. the smallest subset among tied rounds.
    last_max_idx = len(scores) - np.argmax(list(reversed(scores))) - 1
    # `ranks` is newest-first; reversed it is the elimination order, whose prefix is
    # exactly what had been dropped when the best round was evaluated.
    elimination_order = list(reversed(ranks))
    removed_features = set(elimination_order[:removed_before_round[last_max_idx]])
    # Filter the transformed feature names: these are the names elimination works on
    # (the raw columns of X differ as soon as the preprocessing renames or expands them).
    best_features = [f for f in all_features if f not in removed_features]
    return best_features, max(scores), ranks, scores

In [3]:
# Load the cleaned dataset, using the first CSV column as the row index,
# and preview the first rows.
df = pd.read_csv(
    '../../data/processed/data_clean.csv',
    index_col=0,
)
df.head()

Unnamed: 0_level_0,Make,Vehicle_Class,Engine_Size,Cylinders,Transmission,Fuel_Type,Fuel_Consumption_City,Fuel_Consumption_Hwy,Fuel_Consumption_Comb,CO2_Emissions
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,FOLD,PICKUP TRUCK - STANDARD,3.5,6.0,A6,X,8.64,6.14,7.514791,306.0
2,CHEVO,PICKUP TRUCK - STANDARD,5.3,,A6,E,27.27,30.76,28.84,283.0
3,BMV,SUBCOMPACT,4.4,,M6,Z,17.0,2.03,,329.0
4,KIO,SUV - SMALL,,4.0,AS6,X,6.52,15.9,,270.0
5,BARUSU,MINICOMPACT,3.0,6.0,,Z,20.15,6.0,13.8,193.0


In [4]:
# Hold out 20% of the rows as a test set with a fixed seed.
# Note: no `stratify` argument is passed, so this is a plain shuffled split.
y = df['CO2_Emissions']
X = df.drop(columns="CO2_Emissions")

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((109712, 9), (27429, 9), (109712,), (27429,))

In [5]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PowerTransformer,TargetEncoder,OneHotEncoder

# Split the feature columns by dtype: object columns are treated as categorical,
# int64/float64 columns as numerical.
num_columns = X_train.select_dtypes(include=['int64','float64']).columns.tolist()
cat_columns = X_train.select_dtypes(include=['object']).columns.tolist()

# Numerical branch: missing values become the constant -1, then a power transform is applied.
num_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value=-1)),
    ("scaler", PowerTransformer()),
])

# Categorical branch: missing values become the literal category "missing", then the
# column is one-hot encoded into a dense array; categories unseen during fit are ignored.
cat_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

# Combine both branches; columns not listed are dropped and the output is a DataFrame
# whose column names carry no transformer prefix.
preprocessor = ColumnTransformer(
    transformers=[
        ("numerical", num_pipe, num_columns),
        ("categorical", cat_pipe, cat_columns),
    ],
    remainder="drop",
    verbose_feature_names_out=False,
).set_output(transform="pandas")

In [6]:
# Sanity check: run only the preprocessing step on the training split and
# preview the first rows of the transformed frame.
prep_pipeline = Pipeline(steps=[("preprocessor", preprocessor)])
prep_pipeline.fit_transform(X_train, y_train).head()

Unnamed: 0_level_0,Engine_Size,Cylinders,Fuel_Consumption_City,Fuel_Consumption_Hwy,Fuel_Consumption_Comb,Make_ASURA,Make_BARUSU,Make_BMV,Make_CADILUXE,Make_CHEVO,...,Transmission_M5,Transmission_M6,Transmission_M7,Transmission_missing,Fuel_Type_D,Fuel_Type_E,Fuel_Type_N,Fuel_Type_X,Fuel_Type_Z,Fuel_Type_missing
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
13536,1.895495,1.562882,2.65171,1.176824,2.057599,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
113916,0.690964,-1.851319,-0.233282,1.557414,0.606105,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
15762,0.027124,-0.166484,-0.343532,1.27146,0.676208,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
39814,-0.189079,-0.166484,2.084307,0.552586,-2.253447,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
67554,0.081608,-0.166484,0.013662,0.449044,0.532666,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [7]:
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor, VotingRegressor

from sklearn.model_selection import KFold


# Two-step pipeline; the "model" slot is a placeholder that is filled in per candidate.
pipeline = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", None),
])


def _base_learners():
    """Build fresh (name, estimator) pairs for the three boosted-tree regressors."""
    return [
        ("lgbm", LGBMRegressor(verbose=-1, random_state=42)),
        ("xgb", XGBRegressor(verbosity=0, random_state=42, tree_method="gpu_hist", gpu_id=1)),
        ("catboost", CatBoostRegressor(verbose=0, random_state=42, n_estimators=200)),
    ]


# Candidates: each base learner on its own, plus a stacking and a voting ensemble
# built from new instances of the same three learners.
list_model = dict(_base_learners())
list_model["stc"] = StackingRegressor(
    estimators=_base_learners(),
    final_estimator=LGBMRegressor(verbose=-1, random_state=42),
)
list_model["voting"] = VotingRegressor(estimators=_base_learners())

from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer


# Shuffled 3-fold cross-validation with a fixed seed, shared by all candidates.
cv = KFold(n_splits=3, shuffle=True, random_state=42)
df_model = train_model(
    list_model,
    X_train,
    y_train,
    X_test,
    y_test,
    mean_squared_error,
    cv,
    'neg_root_mean_squared_error',
    pipeline,
)

model lgbm selesai di training
score test 50.96816767352168
score cv -49.130649145143934
score train 48.50518753092247


  df_model = pd.concat([df_model,tes],ignore_index = True)


model xgb selesai di training
score test 50.06805278299131
score cv -48.86565510220296
score train 45.266758490752814
model catboost selesai di training
score test 49.64853168829076
score cv -48.47036514780103
score train 46.88798356787975
model stc selesai di training
score test 49.41748805550373
score cv -48.247481269523036
score train 46.51921545221389
model voting selesai di training
score test 49.813512055658194
score cv -48.44459493956297
score train 46.535787640157295


In [9]:
import optuna 

def objective(trial):
    """Optuna objective: mean cross-validated score of an XGBRegressor configuration.

    The hyperparameters are drawn from ``trial``; tree method, GPU id, verbosity
    and seed are fixed. The regressor is placed into the ``"model"`` step of the
    module-level ``pipeline`` (which is modified in place) and scored with
    ``cross_val_score`` on ``X_train``/``y_train`` using the module-level ``cv``
    and the 'neg_root_mean_squared_error' scorer.

    Returns the mean of the fold scores.
    """
    # Search space; the suggest calls are kept in a fixed order.
    tuned = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.1),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 1.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 1.0),
        "gamma": trial.suggest_float("gamma", 0.0, 1.0),
    }
    # Settings shared by every trial.
    fixed = {
        "tree_method": "gpu_hist",
        "gpu_id": 1,
        "verbosity": 0,
        "random_state": 42,
    }
    pipeline.set_params(model=XGBRegressor(**tuned, **fixed))
    fold_scores = cross_val_score(
        pipeline,
        X_train,
        y_train,
        cv=cv,
        scoring='neg_root_mean_squared_error',
        n_jobs=-1,
    )
    return fold_scores.mean()

# Search for the configuration that maximises the value returned by `objective`.
# The sampler is seeded so that re-running the cell proposes the same sequence of
# trials; without a seed every run explores a different set of configurations.
# NOTE(review): fully identical scores additionally require the model training
# itself to be deterministic on the GPU in use - confirm.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=30)
study.best_params

[I 2023-12-09 17:44:06,306] A new study created in memory with name: no-name-2d1f8b1b-5e1d-44f4-ac4a-8f0d3485090d
[I 2023-12-09 17:44:25,169] Trial 0 finished with value: -48.23086239234372 and parameters: {'n_estimators': 510, 'max_depth': 9, 'learning_rate': 0.025400101139197398, 'subsample': 0.9510147126668833, 'colsample_bytree': 0.6988678491648668, 'min_child_weight': 9, 'reg_alpha': 0.37040936830990745, 'reg_lambda': 0.7112027199158223, 'gamma': 0.1962723962873616}. Best is trial 0 with value: -48.23086239234372.
[I 2023-12-09 17:44:29,924] Trial 1 finished with value: -52.578718431621695 and parameters: {'n_estimators': 234, 'max_depth': 3, 'learning_rate': 0.038672605661881854, 'subsample': 0.8241644883557868, 'colsample_bytree': 0.7506458777967808, 'min_child_weight': 7, 'reg_alpha': 0.640253723210617, 'reg_lambda': 0.451942856121075, 'gamma': 0.692382900427864}. Best is trial 0 with value: -48.23086239234372.
[I 2023-12-09 17:44:38,958] Trial 2 finished with value: -48.306105

{'n_estimators': 795,
 'max_depth': 7,
 'learning_rate': 0.045161738574672705,
 'subsample': 0.9038318346188969,
 'colsample_bytree': 0.5418690146576922,
 'min_child_weight': 10,
 'reg_alpha': 0.9067210719920366,
 'reg_lambda': 0.9997454365647964,
 'gamma': 0.6300980868809254}

In [10]:
# Fit the final model with the best hyperparameters found by the study.
# `study.best_params` only contains the values drawn through `trial.suggest_*`;
# the settings that `objective` hard-codes for every trial (tree method, GPU id,
# verbosity, seed) are not part of it and are therefore re-applied here, so the
# saved model is configured exactly like the trial that produced the best score.
pipeline.set_params(model = XGBRegressor(
    tree_method="gpu_hist",
    gpu_id=1,
    verbosity=0,
    random_state=42,
    **study.best_params,
))
pipeline.fit(X_train,y_train)

# Persist the fitted pipeline (preprocessing + model) to disk.
import joblib
joblib.dump(pipeline,'../../artifact/model/xgboost_baseline_9des.pkl')

['../../artifact/model/xgboost_baseline_9des.pkl']