In [None]:
import numpy as np
import pandas as pd

from sklearn import model_selection, preprocessing
import xgboost as xgb

# Load the raw competition tables.
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')

# Feature frames: identifier and timestamp are dropped from both sets,
# and the target column additionally from the training set.
x_train = train.drop(["id", "timestamp", "price_doc"], axis=1)
x_test = test.drop(["id", "timestamp"], axis=1)

# Target: price_doc rescaled by a fixed factor plus a constant offset.
y_train = train["price_doc"] * .969 + 10

# Test ids are kept aside for the submission file.
id_test = test["id"]

# Train rows first, test rows after, so cleaning/encoding is applied to both at once.
x_all = pd.concat([x_train, x_test])


########### clean data
# Out-of-range values become NaN. Existing NaN stay NaN because every
# comparison against NaN evaluates to False.

# build_year must lie strictly between 1800 and 2018.
x_all['build_year'] = x_all['build_year'].where(
    (x_all['build_year'] < 2018) & (x_all['build_year'] > 1800)
)

# kitch_sq must be below 1000; full_sq above 9; life_sq strictly positive.
x_all['kitch_sq'] = x_all['kitch_sq'].where(x_all['kitch_sq'] < 1000)
x_all['full_sq'] = x_all['full_sq'].where(x_all['full_sq'] > 9)
x_all['life_sq'] = x_all['life_sq'].where(x_all['life_sq'] > 0)

# Row-level sanity rules: implausible values are replaced with NaN or with a corrected value.
def kitch_1(full_sq, life_sq, kitch_sq):
    """Return ``kitch_sq``, or NaN when the value is implausible.

    A value is rejected when it equals ``full_sq``, exceeds 500, or is
    below 3.0. A NaN ``kitch_sq`` fails every comparison and is returned
    as is. ``life_sq`` is accepted but not used.
    """
    implausible = kitch_sq == full_sq or kitch_sq > 500 or kitch_sq < 3.0
    return np.nan if implausible else kitch_sq

def life_1(full_sq, life_sq, kitch_sq):
    """Return a sanitised ``life_sq``.

    * ``life_sq`` larger than ``full_sq`` is capped to ``full_sq``;
    * otherwise a ``life_sq`` below 10.0 becomes NaN;
    * anything else (including NaN) is returned unchanged.

    ``kitch_sq`` is accepted but not used.
    """
    if life_sq > full_sq:
        return full_sq
    return np.nan if life_sq < 10.0 else life_sq

def max_floor_1(floor, max_floor):
    """Return ``max_floor``, or NaN when ``floor`` is greater than ``max_floor``.

    If either value is NaN the comparison is False and ``max_floor`` is
    returned unchanged.
    """
    return np.nan if floor > max_floor else max_floor

def full_sq_1(full_sq, life_sq, kitch_sq):
    """Scale down implausibly large ``full_sq`` values.

    Rules, checked in order:

    * ``full_sq`` above 1000 is divided by 100;
    * ``full_sq`` above 300 that is more than ten times ``life_sq`` is
      divided by 10;
    * anything else is returned unchanged.

    A NaN or zero ``life_sq`` gives no usable ratio, so the second rule does
    not fire and ``full_sq`` is returned as is. ``kitch_sq`` is accepted but
    not used.
    """
    if full_sq > 1000:
        return full_sq / 100
    # Guard the ratio: with plain Python numbers a zero life_sq would raise
    # ZeroDivisionError. A zero is treated like a missing life_sq.
    if full_sq > 300 and life_sq != 0 and full_sq / life_sq > 10:
        return full_sq / 10
    return full_sq

    
# Apply the row-level rules. Only the columns each rule reads are iterated
# (rather than DataFrame.apply(axis=1), which builds a full-width Series for
# every row), so the values are the same but much cheaper to compute.
# Order matters: each rule sees the columns as updated by the rules above it,
# e.g. full_sq_1 reads the life_sq already corrected by life_1.
x_all['kitch_sq'] = [
    kitch_1(full_sq, life_sq, kitch_sq)
    for full_sq, life_sq, kitch_sq in zip(x_all['full_sq'], x_all['life_sq'], x_all['kitch_sq'])
]
x_all['life_sq'] = [
    life_1(full_sq, life_sq, kitch_sq)
    for full_sq, life_sq, kitch_sq in zip(x_all['full_sq'], x_all['life_sq'], x_all['kitch_sq'])
]
x_all['max_floor'] = [
    max_floor_1(floor, max_floor)
    for floor, max_floor in zip(x_all['floor'], x_all['max_floor'])
]
x_all['full_sq'] = [
    full_sq_1(full_sq, life_sq, kitch_sq)
    for full_sq, life_sq, kitch_sq in zip(x_all['full_sq'], x_all['life_sq'], x_all['kitch_sq'])
]



In [None]:
# Integer-encode every object-dtype column. The encoder is fitted on the
# stacked train+test frame, so both parts share one code per category.
for c in x_all.columns:
    if x_all[c].dtype != 'object':
        continue
    lbl = preprocessing.LabelEncoder()
    # list(...) is kept on purpose: the encoder receives a plain Python list.
    x_all[c] = lbl.fit_transform(list(x_all[c].values))

# Undo the stacking: the first len(y_train) rows are train, the rest are test.
num_train = len(y_train)
x_train = x_all.iloc[:num_train]
x_test = x_all.iloc[num_train:]
print('x_train:', x_train.shape)
print('x_test:', x_test.shape)

In [None]:
# Booster hyper-parameters shared by cross-validation and the final fit.
# NOTE(review): 'reg:linear' and 'silent' are reported as deprecated by newer
# xgboost releases (replacements: 'reg:squarederror' / 'verbosity') — confirm
# the installed version before changing them.
xgb_params = {
    'eta': 0.05,
    'max_depth': 5,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'silent': 1
}

dtrain = xgb.DMatrix(x_train, y_train)
dtest = xgb.DMatrix(x_test)

# Cross-validate for up to 1000 rounds, stopping early after 20 rounds
# without improvement; progress is reported every 50 rounds.
cv_output = xgb.cv(
    xgb_params,
    dtrain,
    num_boost_round=1000,
    early_stopping_rounds=20,
    verbose_eval=50,
    show_stdv=False,
)

# The length of the CV history is used as the number of rounds for the final model.
num_boost_rounds = len(cv_output)
print('num_boost_rounds:',num_boost_rounds)
model = xgb.train(
    dict(xgb_params, silent=1),
    dtrain,
    num_boost_round=num_boost_rounds,
)

# Predict the test set and write the submission file (id, price_doc).
y_predict = model.predict(dtest)
output = pd.DataFrame({'id': id_test, 'price_doc': y_predict})
output.to_csv('xgbSub.csv', index=False)

print('done!' )