In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.preprocessing import  OneHotEncoder, LabelEncoder
from sklearn.compose import make_column_transformer, make_column_selector
from tqdm import tqdm

In [3]:
housing = pd.read_csv("C:/Python/Datasets/Housing.csv")
y = housing['price']
X = housing.drop('price', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=25)

In [None]:
ohe = OneHotEncoder(sparse_output=False, drop='first').set_output(transform='pandas')
trns = make_column_transformer((ohe, make_column_selector(dtype_include=object) ), 
                               remainder='passthrough', verbose_feature_names_out=False)
trns = trns.set_output(transform='pandas')
X_train_trns = trns.fit_transform(X_train)
X_test_trns = trns.transform(X_test)

XGBoost

In [7]:
rates = np.linspace(0.01, 0.8, 20)
n_est = [50, 100, 200]
depths = [3, 5, None]
scores = []
for r in tqdm(rates):
    for n in n_est:
        for d in depths:
            gbm = XGBRegressor(random_state=25,n_estimators=n,
                                             max_depth=d, learning_rate=r)
            gbm.fit(X_train_trns, y_train)
            y_pred_prob = gbm.predict(X_test_trns)
            scores.append( [r, n, d, r2_score(y_test, y_pred_prob)] )
df_scores = pd.DataFrame( scores, columns=['rate','n_est','depth','score'] )
df_scores.sort_values( 'score', ascending=False )  

100%|██████████| 20/20 [00:13<00:00,  1.48it/s]


Unnamed: 0,rate,n_est,depth,score
27,0.134737,50,3.0,0.617014
12,0.051579,100,3.0,0.615300
18,0.093158,50,3.0,0.611801
15,0.051579,200,3.0,0.611479
54,0.259474,50,3.0,0.610581
...,...,...,...,...
161,0.716842,200,,0.402892
158,0.716842,100,,0.402768
1,0.010000,50,5.0,0.340971
2,0.010000,50,,0.331190


Light GBM

In [16]:
rates = np.linspace(0.01, 0.8, 20)
n_est = [50, 100, 200]
depths = [3, 5, None]
scores = []
for r in tqdm(rates):
    for n in n_est:
        for d in depths:
            gbm = LGBMRegressor(random_state=25,n_estimators=n,verbose=-1,
                                             max_depth=d, learning_rate=r)
            gbm.fit(X_train_trns, y_train)
            y_pred = gbm.predict(X_test_trns)
            scores.append( [r, n, d, r2_score(y_test, y_pred)] )
df_scores = pd.DataFrame( scores, columns=['rate','n_est','depth','score'] )
df_scores.sort_values( 'score', ascending=False )  

100%|██████████| 20/20 [00:04<00:00,  4.22it/s]


Unnamed: 0,rate,n_est,depth,score
14,0.051579,100,,0.645855
13,0.051579,100,5.0,0.641495
19,0.093158,50,5.0,0.638120
20,0.093158,50,,0.638059
12,0.051579,100,3.0,0.631664
...,...,...,...,...
168,0.758421,200,3.0,0.437423
179,0.800000,200,,0.400612
2,0.010000,50,,0.340341
1,0.010000,50,5.0,0.337951


Cat Boost

In [17]:
rates = np.linspace(0.01, 0.8, 20)
n_est = [50, 100, 200]
depths = [3, 5, None]
scores = []
for r in tqdm(rates):
    for n in n_est:
        for d in depths:
            gbm = CatBoostRegressor(random_state=25,n_estimators=n,verbose=0,
                                             max_depth=d, learning_rate=r)
            gbm.fit(X_train_trns, y_train)
            y_pred = gbm.predict(X_test_trns)
            scores.append( [r, n, d, r2_score(y_test, y_pred)] )
df_scores = pd.DataFrame( scores, columns=['rate','n_est','depth','score'] )
df_scores.sort_values( 'score', ascending=False )  

100%|██████████| 20/20 [00:27<00:00,  1.38s/it]


Unnamed: 0,rate,n_est,depth,score
55,0.259474,50,5.0,0.642308
73,0.342632,50,5.0,0.642011
76,0.342632,100,5.0,0.638129
31,0.134737,100,5.0,0.638059
28,0.134737,50,5.0,0.635771
...,...,...,...,...
3,0.010000,100,3.0,0.414968
177,0.800000,200,3.0,0.414925
2,0.010000,50,,0.286049
1,0.010000,50,5.0,0.282310


Cat Boost without One Hot Encoding

In [19]:
X_train.head()

Unnamed: 0,lotsize,bedrooms,bathrms,stories,driveway,recroom,fullbase,gashw,airco,garagepl,prefarea
236,3630,4,1,2,yes,no,no,no,no,3,no
238,3000,2,1,1,yes,no,yes,no,no,2,no
392,5020,3,1,4,yes,no,no,no,yes,0,yes
395,7200,3,1,2,yes,yes,yes,no,no,1,yes
36,4032,2,1,1,yes,no,yes,no,no,0,no


In [18]:
cat_features = list( X.columns[X.dtypes==object] )
gbm = CatBoostRegressor(random_state=25,verbose=0)
gbm.fit(X_train, y_train, cat_features=cat_features)
y_pred = gbm.predict(X_test)
r2_score(y_test, y_pred)

0.6158439632175752

In [20]:
for r in tqdm(rates):
    for n in n_est:
        for d in depths:
            gbm = CatBoostRegressor(random_state=25,n_estimators=n,verbose=0,
                                             max_depth=d, learning_rate=r)
            gbm.fit(X_train, y_train, cat_features=cat_features)
            y_pred = gbm.predict(X_test)
            scores.append( [r, n, d, r2_score(y_test, y_pred)] )
df_scores = pd.DataFrame( scores, columns=['rate','n_est','depth','score'] )
df_scores.sort_values( 'score', ascending=False )  

100%|██████████| 20/20 [00:26<00:00,  1.35s/it]


Unnamed: 0,rate,n_est,depth,score
55,0.259474,50,5.0,0.642308
73,0.342632,50,5.0,0.642011
212,0.134737,100,,0.638831
76,0.342632,100,5.0,0.638129
31,0.134737,100,5.0,0.638059
...,...,...,...,...
181,0.010000,50,5.0,0.283400
1,0.010000,50,5.0,0.282310
182,0.010000,50,,0.275816
180,0.010000,50,3.0,0.268231
