In [4]:
#data analysis
import numpy as np
import pandas as pd

#visualization
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno

In [5]:
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import (train_test_split,GridSearchCV,cross_val_predict,cross_val_score)
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from warnings import filterwarnings
from tqdm import tqdm
from sklearn.ensemble import GradientBoostingRegressor
import xgboost as xgb
from xgboost import XGBRegressor
import time

In [6]:
df = pd.read_csv("../input/son-deneme/3_preprocessed.csv")

In [None]:
df.info()

In [7]:
df.head()

Unnamed: 0,Price,m2_gross,m2_net,Balcony,Furnished,Available_for_Loan,Swap,Front_West,Front_East,Front_South,...,Number_of_bathrooms_4,Number_of_bathrooms_5,Number_of_bathrooms_6,Using_status_0,Using_status_1,Using_status_2,From_who_0,From_who_1,From_who_2,From_who_3
1,165000,50,40.0,0,1,1,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
2,207000,42,42.0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0
3,153000,55,45.0,0,0,1,1,0,0,0,...,0,0,0,0,1,0,0,1,0,0
4,205000,62,46.0,0,0,1,1,0,0,0,...,0,0,0,0,1,0,0,1,0,0
5,150000,70,55.0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0


**We split the data to train and test**

In [8]:
y = df["Price"]
X = df.drop(['Price'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.25, 
                                                    random_state=42)

**We will use XGBoost alghorithm**

In [9]:
xgb = XGBRegressor()

In [7]:
xgb.fit(X_train,y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

#### **RMSE**

In [None]:
y_pred = xgb.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))

#### **Score**

In [None]:
xgb.score(X_test,y_test)

### **After Tuning**

In [24]:
xgb_grid = {
     'colsample_bytree': [0.5,0.6,0.8,1], 
     'n_estimators':[1000,2000],
     'max_depth': [4,5,6],
     'learning_rate': [0.05,0.1,0.3,0.5],
    "tree_method":["gpu_hist"]
}

In [25]:
xgb = XGBRegressor()

xgb_cv = GridSearchCV(xgb, 
                      param_grid = xgb_grid, 
                      cv = 5, 
                      n_jobs = -1,
                      verbose = 5,)


xgb_cv.fit(X_train, y_train)

Fitting 5 folds for each of 96 candidates, totalling 480 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  68 tasks      | elapsed:  7.6min
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed: 18.6min
[Parallel(n_jobs=-1)]: Done 284 tasks      | elapsed: 34.6min
[Parallel(n_jobs=-1)]: Done 446 tasks      | elapsed: 58.4min
[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed: 63.8min finished


GridSearchCV(cv=5,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None, gamma=None,
                                    gpu_id=None, importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=100, n_jobs=None,
                                    num_parallel_tree=None, random_state=None,
                                    reg_alpha=None, reg_lambda=None,
                                    scale_pos_weight=None, subsample=None,
                                    tree_method=None, validate_para

In [26]:
xgb_cv.best_params_

{'colsample_bytree': 0.5,
 'learning_rate': 0.05,
 'max_depth': 6,
 'n_estimators': 2000,
 'tree_method': 'gpu_hist'}

In [28]:
xgb_tuned = XGBRegressor(colsample_bytree = 0.5, 
                         learning_rate = 0.05, 
                         max_depth = 6, 
                         n_estimators = 2000,
                        tree_method = "gpu_hist") 

xgb_tuned = xgb_tuned.fit(X_train,y_train)

#### **RMSE**

In [29]:
y_pred = xgb_tuned.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
rmse

73015.76409807723

#### **Score**

In [30]:
xgb_tuned.score(X_test,y_test)

0.7722175408077631