In [5]:
import pandas as pd
pd.set_option("display.max_columns",None)

import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler
from mrmr import mrmr_regression
from xgboost import XGBRegressor

def dataPreparation(df):
    X  = df.drop("MEDV",axis=1)
    y  = np.log(df["MEDV"])

    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3)
    feature_list = mrmr_regression(X_train,y_train,K=8)
    print(f"Feature yang berpengaruh terhadap MEDV :{','.join(feature_list)}")
    X_train  = X_train[feature_list]
    X_test   = X_test[feature_list]
    return X_train, X_test, y_train, y_test   

def processingPipeline():
    pipeline = Pipeline(steps=[
        ('preprocessor', StandardScaler()),
        ('regressor', XGBRegressor(random_state=42))
    ])

    return pipeline

def randomizedSearch(pipeline, X_train, y_train):
   
    param_distributions = {
        'regressor__max_depth': [3, 4, 5, 6,7,8,9,10],
        'regressor__learning_rate': [0.001, 0.01, 0.1],
    }

   
    random_search = RandomizedSearchCV(
        estimator=pipeline,
        param_distributions=param_distributions,
        n_iter=10,
        scoring='neg_mean_squared_error',
        cv=5,
        verbose=1,
        random_state=42
    )

  
    random_search.fit(X_train, y_train)

    print(f"Best Parameters: {random_search.best_params_}")
    print(f"Best Score: {random_search.best_score_}")

    return random_search.best_estimator_

def eval(model,X_train,X_test,y_train,y_test):
    pred_train = model.predict(X_train)
    pred_test  = model.predict(X_test)

    print(f"R2 score Train -> {r2_score(y_train,pred_train)}")
    print(f"RMSE Train -> {mean_squared_error(np.exp(y_train),np.exp(pred_train))}\n")
    print(f"R2 score test -> {r2_score(y_test,pred_test)}")
    print(f"RMSE Train -> {mean_squared_error(np.exp(y_test),np.exp(pred_test))}")

In [6]:
df = pd.read_csv("boston.csv")
df.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063,22.532806
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062,9.197104
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95,17.025
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36,21.2
75%,3.677083,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955,25.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0


In [7]:
X_train, X_test, y_train, y_test = dataPreparation(df)

100%|██████████| 8/8 [00:02<00:00,  3.85it/s]

Feature yang berpengaruh terhadap MEDV :LSTAT,PTRATIO,RM,CRIM,TAX,INDUS,NOX,B





In [8]:
pipeline = processingPipeline()
best_model = randomizedSearch(pipeline, X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Parameters: {'regressor__max_depth': 3, 'regressor__learning_rate': 0.1}
Best Score: -0.02989010753357576


In [9]:
eval(best_model, X_train, X_test, y_train, y_test)

R2 score Train -> 0.9568555884733914
RMSE Train -> 3.535836477664312

R2 score test -> 0.899481903707269
RMSE Train -> 7.997854119617216


In [10]:
import pickle
with open('best_model.pkl', 'wb') as file:
    pickle.dump(best_model, file)
