In [None]:
# NOTE(review): nothing visible above this cell defines `df_train` (it is first
# assigned in the data-loading cell further down), so on a fresh kernel this
# line would presumably raise NameError - confirm and consider removing the cell.
df_train["full_sq"].median()

# Naive XGB with Imputer

### Modified from https://www.kaggle.com/bguberfain/naive-xgb-lb-0-317 (thanks!)

In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt

In [None]:
# Earlier candidate list (most significant macro features), kept for reference.
# ref: https://www.kaggle.com/yitzhakr/moscow-houses-prices-analysis
#macro_cols = ['oil_urals', 'gdp_quart_growth', 'cpi', 'usdrub', \
#                'salary_growth', 'unemployment', 'mortgage_rate', \
#                 'deposits_rate','rent_price_3room_bus']

# Macro columns actually used, taken from:
# https://www.kaggle.com/robertoruiz/sberbank-russian-housing-market/dealing-with-multicollinearity/notebook
macro_cols = [
    "balance_trade",
    "balance_trade_growth",
    "eurrub",
    "average_provision_of_build_contract",
    "micex_rgbi_tr",
    "micex_cbi_tr",
    "deposits_rate",
    "mortgage_value",
    "mortgage_rate",
    "income_per_cap",
    "rent_price_4+room_bus",
    "museum_visitis_per_100_cap",
    "apartment_build",
]

In [None]:
# `timestamp` is parsed as a datetime column in all three inputs.
date_cols = ['timestamp']

df_train = pd.read_csv("../input/train.csv", parse_dates=date_cols)
df_test = pd.read_csv("../input/test.csv", parse_dates=date_cols)

# Only the pre-selected macro columns (plus the timestamp join key) are read.
df_macro = pd.read_csv(
    "../input/macro.csv",
    parse_dates=date_cols,
    usecols=date_cols + macro_cols,
)
#df_macro = pd.read_csv("../input/macro.csv", parse_dates=['timestamp'])

df_train.head()

In [None]:
# 50-bin histogram of the `price_doc` column of the training frame; the returned
# matplotlib Axes is kept in `ax`.
ax = df_train['price_doc'].hist(bins=50)

In [None]:
# Median of the `full_sq` column in the training frame, taken before the
# out-of-range values are blanked out in the feature-engineering cell.
df_train["full_sq"].median()

## FEATURE ENGINEERING BELOW

In [None]:
# The regression target is log(1 + price_doc), as suggested by
# https://github.com/dmlc/xgboost/issues/446#issuecomment-135555130
ylog_train_all = np.log1p(df_train['price_doc'].values)
id_test = df_test['id']

# NOTE: these drops mutate df_train / df_test in place, so this cell can only be
# re-run after the data-loading cell has been re-run.
df_train.drop(['id', 'price_doc'], axis=1, inplace=True)
df_test.drop(['id'], axis=1, inplace=True)

# Build df_all = (df_train + df_test) left-joined with df_macro on `timestamp`.
num_train = len(df_train)
df_all = pd.concat([df_train, df_test])
# A plain left merge keeps the rows in their concatenated order (train rows first,
# then test rows). pd.merge_ordered sorts the result by the join key, which would
# silently break every positional split made later (X_all[:num_train], and the
# alignment with ylog_train_all / id_test) unless the inputs were already sorted
# by timestamp.
df_all = pd.merge(df_all, df_macro, on='timestamp', how='left')
assert len(df_all) == num_train + len(df_test), "macro join changed the number of rows"
print(df_all.shape)

# full_sq is the most decisive feature for the score.
# Values outside [10, 250] are treated as noise; replacing them with NaN gave
# better results than keeping them.
df_all.loc[df_all['full_sq']>250, 'full_sq'] = np.nan
df_all.loc[df_all['full_sq']<10, 'full_sq'] = np.nan

# Some build_year values are noisy as well; this clean-up is currently disabled.
#df_all.loc[df_all['build_year'] > 2017, 'build_year'] = np.nan

# A floor / max_floor of 0 is treated as missing (also avoids dividing by zero in
# rel_floor below). NaN gave better results.
df_all.loc[df_all['floor'] == 0, 'floor'] = np.nan
df_all.loc[df_all['max_floor'] == 0, 'max_floor'] = np.nan

# Number of rows (train + test) sharing the same calendar month, keyed as YYYYMM.
month_year = (df_all.timestamp.dt.month + df_all.timestamp.dt.year * 100)
month_year_cnt_map = month_year.value_counts().to_dict()
df_all['month_year_cnt'] = month_year.map(month_year_cnt_map)

# Number of rows (train + test) sharing the same week-of-year / year key.
# NOTE(review): `Series.dt.weekofyear` was deprecated in later pandas releases -
# verify against the pandas version this notebook runs on.
week_year = (df_all.timestamp.dt.weekofyear + df_all.timestamp.dt.year * 100)
week_year_cnt_map = week_year.value_counts().to_dict()
df_all['week_year_cnt'] = week_year.map(week_year_cnt_map)

# Month and day-of-week of the timestamp.
df_all['month'] = df_all.timestamp.dt.month
df_all['dow'] = df_all.timestamp.dt.dayofweek

# Ratio features: floor relative to max_floor, kitch_sq relative to full_sq.
df_all['rel_floor'] = df_all['floor'] / df_all['max_floor'].astype(float)
df_all['rel_kitch_sq'] = df_all['kitch_sq'] / df_all['full_sq'].astype(float)

## Population feature engineering source : https://www.kaggle.com/philippsp/a-collection-of-new-features
## (currently disabled)

#df_all['young_proportion'] = df_all['young_all']/df_all['full_all'].astype(float)
#df_all['work_proportion'] = df_all['work_all']/df_all['full_all'].astype(float)
#df_all['retire_proportion'] = df_all['ekder_all']/df_all['full_all'].astype(float)

#df_all['ratio_preschool'] = df_all['children_preschool'] / df_all['preschool_quota'].astype(float)
#df_all['ratio_school'] = df_all['children_school'] / df_all['school_quota'].astype(float)

# Remove the raw timestamp column (may overfit the model in train).
df_all.drop(['timestamp'], axis=1, inplace=True)

## -- END FEATURE ENGINEERING

In [None]:
# Inspect the combined train + test frame after feature engineering.
df_all

In [None]:
#df_all = df_all.dropna(axis=1,thresh=25000)

In [None]:
# Split the frame by dtype: object (string) columns are integer-encoded,
# every other column is passed through unchanged.
df_numeric = df_all.select_dtypes(exclude=['object'])
df_obj = df_all.select_dtypes(include=['object']).copy()

for col_name in df_obj.columns:
    # pd.factorize returns (codes, uniques); only the integer codes are kept.
    # Missing values receive the sentinel code -1 (pandas default).
    codes = pd.factorize(df_obj[col_name])[0]
    df_obj[col_name] = codes

# Numeric columns first, encoded categorical columns after them.
df_values = pd.concat([df_numeric, df_obj], axis=1)

In [None]:
# Remove all infinities produced by the ratio features. Both +inf and -inf are
# replaced with NaN (comparing against `inf` alone would leave -inf in place).
from numpy import inf, nan

df_values = df_values.replace([inf, -inf], nan)

In [None]:
#from fancyimpute import SimpleFill,SoftImpute

#from sklearn.preprocessing import StandardScaler, RobustScaler

#df_values_temp = pd.DataFrame(SimpleFill(fill_method="median").complete(df_values))

#df_values_temp.columns = df_values.columns
#df_values_temp.index = df_values.index

#df_values = df_values_temp

In [None]:
# Preview the first rows of the fully numeric feature table.
df_values.head()

In [None]:
# Convert to numpy values
X_all = df_values.values
print(X_all.shape)

# Hold out the last 20% of the training rows (in row order) as a validation set.
num_val = int(num_train * 0.2)
# One explicit boundary for features and targets. Negative slices such as
# [:-num_val] / [-num_val:] misbehave when num_val == 0 (empty train target,
# full validation target), so they are avoided here.
split_at = num_train - num_val

X_train_all = X_all[:num_train]
X_train = X_all[:split_at]
X_val = X_all[split_at:num_train]
ylog_train = ylog_train_all[:split_at]
ylog_val = ylog_train_all[split_at:]

# Everything after the training rows belongs to the test set.
X_test = X_all[num_train:]

df_columns = df_values.columns

# Features and targets must stay aligned row for row.
assert len(X_train_all) == len(ylog_train_all), "feature/target length mismatch (all)"
assert len(X_train) == len(ylog_train), "feature/target length mismatch (train)"
assert len(X_val) == len(ylog_val), "feature/target length mismatch (val)"

print('X_train_all shape is', X_train_all.shape)
print('X_train shape is', X_train.shape)
print('y_train shape is', ylog_train.shape)
print('X_val shape is', X_val.shape)
print('y_val shape is', ylog_val.shape)
print('X_test shape is', X_test.shape)

In [None]:
def _to_dmatrix(features, target=None):
    """Wrap a feature matrix (and optional target) in a DMatrix that carries the column names."""
    return xgb.DMatrix(features, target, feature_names=df_columns)


# Full training data, the train/validation parts of it, and the unlabeled test data.
dtrain_all = _to_dmatrix(X_train_all, ylog_train_all)
dtrain = _to_dmatrix(X_train, ylog_train)
dval = _to_dmatrix(X_val, ylog_val)
dtest = _to_dmatrix(X_test)

In [None]:
# Parameters for the native xgboost training API.
# 'n_estimators' belongs to the scikit-learn wrapper and has no effect in
# xgb.train (the number of trees is set by `num_boost_round`), so it is not
# listed here.
xgb_params = {
    'eta': 0.05,
    'max_depth': 5,
    'subsample': 1,
    'colsample_bytree': 0.7,
    'min_child_weight': 5,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'silent': 1,
}

# Tune the number of boosting rounds: train on the first part of the training
# data and stop once the validation RMSE has not improved for 30 rounds.
partial_model = xgb.train(xgb_params, dtrain, num_boost_round=1000, evals=[(dval, 'val')],
                       early_stopping_rounds=30, verbose_eval=30)

# best_iteration is the 0-based index of the best round, so the number of
# rounds needed to reproduce it is best_iteration + 1.
num_boost_round = partial_model.best_iteration + 1

In [None]:
# Feature importance of the early-stopped model, limited to the top 50 features.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(8, 16))
xgb.plot_importance(
    partial_model,
    ax=ax,
    height=0.5,
    max_num_features=50,
)

In [None]:
# best_iteration is a 0-based round index; the matching number of boosting
# rounds is therefore one more than that.
num_boost_round = partial_model.best_iteration + 1

In [None]:
# Refit on all training rows with the tuned number of rounds; same parameters
# as before except that 'silent' is overridden to 0.
final_params = dict(xgb_params)
final_params['silent'] = 0
model = xgb.train(final_params, dtrain_all, num_boost_round=num_boost_round)

In [None]:
# Feature importance of the final model, limited to the top 50 features.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(8, 16))
xgb.plot_importance(
    model,
    ax=ax,
    height=0.5,
    max_num_features=50,
)

In [None]:
#stacking with random forest

#from sklearn.ensemble import AdaBoostRegressor,RandomForestRegressor

#ABR = AdaBoostRegressor(base_estimator = RandomForestRegressor(max_depth = 3,n_estimators = 5),
#                        n_estimators = 100,random_state = 777, learning_rate = 0.05)

#ABR.fit(X_train, ylog_train)

In [None]:
##ABR.score(X_val, ylog_val)

In [None]:
#meta_feature1 = model.predict(dtrain)

In [None]:
# Predict in log space, then map back to prices. np.expm1 is the exact inverse
# of the np.log1p transform applied to the training target.
ylog_pred = model.predict(dtest)
y_pred = np.expm1(ylog_pred)

# One row per test id, written without the DataFrame index.
df_sub = pd.DataFrame({'id': id_test, 'price_doc': y_pred})

df_sub.to_csv('output.csv', index=False)