In [1]:
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import graphviz

## Importing Data

In [2]:
# Load the processed Ames housing CSV and preview its first five rows.
ames_csv = "../datasets/ames_processed.csv"
data = pd.read_csv(filepath_or_buffer=ames_csv)
data.head(n=5)

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,Remodeled,GrLivArea,BsmtFullBath,BsmtHalfBath,...,HouseStyle_1.5Unf,HouseStyle_1Story,HouseStyle_2.5Fin,HouseStyle_2.5Unf,HouseStyle_2Story,HouseStyle_SFoyer,HouseStyle_SLvl,PavedDrive_P,PavedDrive_Y,SalePrice
0,60,65.0,8450,7,5,2003,0,1710,1,0,...,0,0,0,0,1,0,0,0,1,208500
1,20,80.0,9600,6,8,1976,0,1262,0,1,...,0,1,0,0,0,0,0,0,1,181500
2,60,68.0,11250,7,5,2001,1,1786,1,0,...,0,0,0,0,1,0,0,0,1,223500
3,70,60.0,9550,7,5,1915,1,1717,1,0,...,0,0,0,0,1,0,0,0,1,140000
4,60,84.0,14260,8,5,2000,0,2198,1,0,...,0,0,0,0,1,0,0,0,1,250000


In [3]:
# Split the frame into features (every column but the last) and the target
# (the last column, SalePrice).
#
# The CSV carries an "Unnamed: 0" column whose previewed values (0, 1, 2, ...)
# mirror the row position -- presumably a saved DataFrame index. It holds no
# information about the house, so it is dropped rather than fed to the model
# as a feature. `errors="ignore"` keeps the cell working if the column is
# absent (e.g. the CSV is later re-exported without the index).
X = data.iloc[:, :-1].drop(columns=["Unnamed: 0"], errors="ignore")
y = data.iloc[:, -1]

## Tuning Boost Rounds

In [4]:
# Build the DMatrix consumed by xgb.cv
housing_dmatrix = xgb.DMatrix(data=X, label=y)

# Booster parameters shared by every run
params = {"objective": "reg:squarederror"}

# Candidate numbers of boosting rounds to compare
num_rounds = [5, 10, 15]
final_rmse_per_round = []

# Run 3-fold cross-validation once per candidate round count
for n_rounds in num_rounds:
    cv_results = xgb.cv(
        params=params,
        dtrain=housing_dmatrix,
        num_boost_round=n_rounds,
        nfold=3,
        metrics="rmse",
        as_pandas=True,
        seed=123,
    )

    # Keep the mean test RMSE reached at the last boosting round
    last_round_rmse = cv_results["test-rmse-mean"].iloc[-1]
    final_rmse_per_round.append(last_round_rmse)

# Pair each round count with its RMSE and show the result as a table
num_rounds_rmses = list(zip(num_rounds, final_rmse_per_round))
rounds_table = pd.DataFrame(num_rounds_rmses, columns=["num_boosting_rounds", "rmse"])
print(rounds_table)

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


   num_boosting_rounds          rmse
0                    5  50255.430990
1                   10  33649.873698
2                   15  32310.655599


## Early Stopping

In [5]:
# Cross-validate for up to 50 boosting rounds with early stopping enabled
# (early_stopping_rounds=3), then print the per-round metrics table.
cv_results = xgb.cv(
    params=params,
    dtrain=housing_dmatrix,
    num_boost_round=50,
    early_stopping_rounds=3,
    nfold=3,
    metrics="rmse",
    as_pandas=True,
    seed=123,
)
print(cv_results)

    train-rmse-mean  train-rmse-std  test-rmse-mean  test-rmse-std
0     141760.250000      417.063866   142563.786458     767.913074
1     102689.510417      119.365058   104973.697917     555.198377
2      75277.820313       94.420467    79158.578125     660.994649
3      56039.442708      256.405207    61316.019531     978.407785
4      42472.858073      292.832123    50255.430990    1647.820732
5      33088.195964      296.927749    43090.341146    1564.794478
6      26456.964844      239.914145    38370.485677    1657.281064
7      21774.842448      213.561776    35764.682292    1518.867810
8      18601.770182      166.787600    34405.319010    1500.544670
9      16316.602865      123.658397    33649.873698    1080.104688
10     14794.597005      195.186413    33088.815104     968.454480
11     13700.329101      167.398953    32754.619141     886.383157
12     12839.569336      194.174784    32583.311198     804.897372
13     12238.123698      209.772627    32355.342448     759.82

## Tuning Learning Rate (eta)

In [6]:
# Build the DMatrix consumed by xgb.cv
housing_dmatrix = xgb.DMatrix(data=X, label=y)

# Base booster parameters; "eta" is filled in per iteration below
params = {"objective": "reg:squarederror"}

# Learning rates (eta) to compare
learning_rate = [0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.8]
best_rmse = []

for eta in learning_rate:
    # The shared params dict is updated in place, so it keeps the last eta
    # tried once the loop finishes.
    params["eta"] = eta

    # 3-fold cross-validation with a fixed budget of 16 boosting rounds
    cv_results = xgb.cv(
        params=params,
        dtrain=housing_dmatrix,
        num_boost_round=16,
        nfold=3,
        metrics="rmse",
        as_pandas=True,
        seed=123,
    )

    # Despite the list's name, this records the mean test RMSE of the final
    # round, not the minimum over all rounds.
    best_rmse.append(cv_results["test-rmse-mean"].iloc[-1])

eta_table = pd.DataFrame(list(zip(learning_rate, best_rmse)), columns=["eta", "best_rmse"])
print(eta_table)

     eta      best_rmse
0  0.001  194631.526042
1  0.010  170108.604167
2  0.100   51658.468750
3  0.200   32967.347656
4  0.300   32212.396485
5  0.400   34148.996745
6  0.500   34020.478516
7  0.800   36716.585938


## Tuning Max Depth

In [7]:
# Base booster parameters; "max_depth" is filled in per iteration below
params = {"objective": "reg:squarederror", "eta": 0.3}

# Tree depths to compare
max_depths = [2, 5, 6, 10, 20]
best_rmse = []

for depth in max_depths:
    # Updated in place: params keeps the last depth tried after the loop
    params["max_depth"] = depth

    # 3-fold cross-validation with a fixed budget of 16 boosting rounds
    cv_results = xgb.cv(
        dtrain=housing_dmatrix,
        params=params,
        num_boost_round=16,
        nfold=3,
        metrics="rmse",
        seed=123,
        as_pandas=True,
    )

    # Record the mean test RMSE of the final boosting round
    final_rmse = cv_results["test-rmse-mean"].iloc[-1]
    best_rmse.append(final_rmse)

depth_table = pd.DataFrame(list(zip(max_depths, best_rmse)), columns=["max_depth", "best_rmse"])
print(depth_table)

   max_depth     best_rmse
0          2  35089.841146
1          5  31822.169271
2          6  32212.396485
3         10  33150.076172
4         20  33911.895833


## Colsample bytree

In [8]:
# Base booster parameters; "colsample_bytree" is filled in per iteration below
params = {"objective": "reg:squarederror", "max_depth": 5}

# Column-subsampling values to compare
colsample_bytree_vals = [0.1, 0.5, 0.8, 1]
best_rmse = []

for colsample in colsample_bytree_vals:
    # Updated in place: params keeps the last value tried after the loop
    params["colsample_bytree"] = colsample

    # 3-fold cross-validation with a fixed budget of 16 boosting rounds
    cv_results = xgb.cv(
        params=params,
        dtrain=housing_dmatrix,
        num_boost_round=16,
        nfold=3,
        metrics="rmse",
        seed=123,
        as_pandas=True,
    )

    # Record the mean test RMSE of the final boosting round
    best_rmse.append(cv_results["test-rmse-mean"].iloc[-1])

colsample_table = pd.DataFrame(
    list(zip(colsample_bytree_vals, best_rmse)),
    columns=["colsample_bytree", "best_rmse"],
)
print(colsample_table)

   colsample_bytree     best_rmse
0               0.1  34665.914063
1               0.5  29662.524739
2               0.8  32316.436849
3               1.0  31822.169271


## Grid Search

In [9]:
from sklearn.model_selection import GridSearchCV

In [10]:
# Exhaustive grid search over a small hyperparameter grid.
# The scikit-learn wrapper is fitted on X and y directly, so no DMatrix is
# built in this cell (the original rebuilt one here and never used it).

# 2 x 1 x 2 = 4 candidate combinations
gbm_param_grid = {
    'colsample_bytree': [0.3, 0.7],
    'n_estimators': [50],
    'max_depth': [2, 5]
}

gbm = xgb.XGBRegressor(objective="reg:squarederror")

# 4-fold CV per candidate; the score is negated MSE, so larger is better
grid_mse = GridSearchCV(estimator=gbm, param_grid=gbm_param_grid,
                       scoring='neg_mean_squared_error', cv=4, verbose=1)
grid_mse.fit(X, y)


print("Best parameters found: ", grid_mse.best_params_)
# best_score_ is a negated MSE: take the absolute value, then the square root
print("Lowest RMSE found: ", np.sqrt(np.abs(grid_mse.best_score_)))

Fitting 4 folds for each of 4 candidates, totalling 16 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \


Best parameters found:  {'colsample_bytree': 0.3, 'max_depth': 5, 'n_estimators': 50}
Lowest RMSE found:  29307.853447661928


[Parallel(n_jobs=1)]: Done  16 out of  16 | elapsed:    2.7s finished
  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


## Random Search

In [11]:
from sklearn.model_selection import RandomizedSearchCV

In [12]:
# Randomized search: sample 5 of the 10 possible max_depth values.
gbm_param_grid = {
    'n_estimators': [25],
    'max_depth': range(2, 12)
}


# NOTE: n_estimators=10 here is overridden by the search space above, which
# sets n_estimators to 25 for every candidate that is fitted.
gbm = xgb.XGBRegressor(n_estimators=10, objective="reg:squarederror")

# random_state pins which candidates are sampled, so the reported best
# parameters and RMSE are reproducible across runs (123 matches the seed
# used for xgb.cv elsewhere in this notebook).
randomized_mse = RandomizedSearchCV(param_distributions=gbm_param_grid, estimator=gbm, scoring="neg_mean_squared_error", n_iter=5, cv=4, verbose=1, random_state=123)


randomized_mse.fit(X, y)


print("Best parameters found: ", randomized_mse.best_params_)
# best_score_ is a negated MSE: take the absolute value, then the square root
print("Lowest RMSE found: ", np.sqrt(np.abs(randomized_mse.best_score_)))

Fitting 4 folds for each of 5 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
 

Best parameters found:  {'n_estimators': 25, 'max_depth': 5}
Lowest RMSE found:  36636.35808132903
