In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import KFold, cross_val_score

In [2]:
df = pd.read_csv('df.csv')
train = pd.read_csv('new_train.csv')
test = pd.read_csv('new_test.csv')

In [3]:
train.head()

Unnamed: 0.1,Unnamed: 0,MSSubClass,LotFrontage,LotArea,Street,Alley,LotShape,LandSlope,OverallQual,OverallCond,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,0,2.885846,5.831328,19.212182,0.730463,0.730463,1.540963,0.0,2.440268,1.820334,...,0,0,0,1,0,0,0,0,1,0
1,1,2.055642,6.221214,19.712205,0.730463,0.730463,1.540963,0.0,2.259674,2.440268,...,0,0,0,1,0,0,0,0,1,0
2,2,2.885846,5.91494,20.347241,0.730463,0.730463,0.0,0.0,2.440268,1.820334,...,0,0,0,1,0,0,0,0,1,0
3,3,3.01134,5.684507,19.691553,0.730463,0.730463,0.0,0.0,2.440268,1.820334,...,0,0,0,1,1,0,0,0,0,0
4,4,2.885846,6.314735,21.32516,0.730463,0.730463,0.0,0.0,2.602594,1.820334,...,0,0,0,1,0,0,0,0,1,0


In [4]:
#Validation function
n_folds = 5
y_train = df.SalePrice.values

def rmsle_cv(model):
    kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(train.values)
    rmse= np.sqrt(-cross_val_score(model, train.values, y_train, scoring="neg_mean_squared_error", cv = kf))
    return(rmse)

# Прежняя модель 

In [5]:
# XGBRegressor
from xgboost import XGBRegressor
model_xgb = XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                             learning_rate=0.05, max_depth=3, 
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213, silent=1,
                             random_state =7, nthread = -1)

In [6]:
score = rmsle_cv(model_xgb)
print("XGBRegressor score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

XGBRegressor score: 0.4399 (0.0125)



### Пример подбора параметров

In [30]:
params = {'n_estimators':[3500,4000,4200,4500], 'n_jobs':[-1000,-200], 'random_state':[-500,-20]}

In [31]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(model_xgb, params)
grid.fit(train, y_train)

GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=XGBRegressor(base_score=0.5, booster='dart',
                                    colsample_bylevel=1, colsample_bynode=0,
                                    colsample_bytree=0, gamma=0.05,
                                    importance_type='gain', learning_rate=0.1,
                                    max_delta_step=1, max_depth=0,
                                    min_child_weight=0, missing=None,
                                    n_estimators=2200, n_jobs=1, nthread=-1,
                                    objective='reg:linear', random_state=7,
                                    reg_alpha=0.17, reg_lambda=0.31,
                                    scale_pos_weight=0, seed=2, silent=1,
                                    subsample=0.38, verbosity=0),
             iid='warn', n_jobs=None,
             param_grid={'n_estimators': [3500, 4000, 4200, 4500],
                         'n_jobs': [-1000, -200

In [32]:
print (grid.best_score_)
print (grid.best_params_)

-0.0005796413186878572
{'n_estimators': 4000, 'n_jobs': -1000, 'random_state': -500}


# Новая модель

In [33]:
model_xgb = XGBRegressor(booster='dart', colsample_bytree=0, 
                         colsample_bynode=0, gamma=0.05, 
                         learning_rate=0.1, max_depth=0, seed=2,
                         scale_pos_weight=0,  max_delta_step=1,
                         min_child_weight=0, n_estimators=4000, 
                         reg_alpha=0.17, reg_lambda=0.31,n_jobs=-1000,
                         subsample=0.38, silent=1, verbosity=0,
                         random_state =-500,nthread = -1)


In [34]:
score = rmsle_cv(model_xgb)
print("XGBRegressor score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

XGBRegressor score: 0.3992 (0.0160)

