In [1]:
import catboost
import sklearn
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import matplotlib

In [324]:
def prepare_x_nan(X_full, X_test):
    """Fill missing values in the train and test frames, column by column.

    Each frame is imputed independently, from its own data:

    * text columns (``object`` or string dtype) receive the placeholder
      ``'NAN_str'``;
    * every other column receives that frame's own column median.

    The inputs are left untouched; filled copies are returned.

    Parameters
    ----------
    X_full, X_test : pandas.DataFrame
        Frames to impute.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_full_filled, X_test_filled)``, in the same order as the
        arguments.

    Notes
    -----
    A non-text column without a single observed value has a NaN median and
    therefore stays NaN.
    """
    filled = []
    for frame in (X_full, X_test):
        # Work on a copy so the caller's frame (which may itself be a slice
        # of another frame) is never written to.
        frame = frame.copy()
        for col in frame.columns[frame.isnull().any()]:
            series = frame[col]
            # Recognise dedicated string dtypes as well as plain ``object``;
            # taking a median of either would raise.
            is_text = (pd.api.types.is_object_dtype(series)
                       or pd.api.types.is_string_dtype(series))
            if is_text:
                frame[col] = series.fillna('NAN_str')
            else:
                frame[col] = series.fillna(series.median())
        filled.append(frame)
    return filled[0], filled[1]


def find_drop_columns(X_full, X_test):
    """Pick the columns that carry too little information to keep.

    Train and test are stacked, and a column is selected when, over the
    stacked rows, either

    * it has fewer than two distinct non-null values, or
    * it is not of ``object`` dtype, has fewer than three distinct non-null
      values, and more than 30% of its entries are missing.

    Returns the selected column labels as a pandas ``Index``.
    """
    stacked = pd.concat([X_full, X_test], sort=False, ignore_index=True)
    distinct = stacked.nunique()
    missing = stacked.isnull().sum()

    is_constant = distinct < 2
    is_sparse_flag = (
        (distinct < 3)
        & (stacked.dtypes != 'object')
        & (len(stacked) * 0.3 < missing)
    )
    return stacked.columns[is_constant | is_sparse_flag]


def preparenan(X_full, X_test):
    """Drop uninformative columns from both frames, then impute what is left.

    The columns chosen by ``find_drop_columns`` are removed from train and
    test alike, the remaining gaps are filled by ``prepare_x_nan``, and the
    number of nulls still present in the processed training frame is printed
    as a quick check.

    Returns the processed ``(X_full, X_test)`` pair.
    """
    useless = find_drop_columns(X_full, X_test)
    train_kept = X_full.drop(columns=useless)
    test_kept = X_test.drop(columns=useless)
    train_filled, test_filled = prepare_x_nan(train_kept, test_kept)
    remaining_nulls = train_filled.isnull().sum().sum()
    print(remaining_nulls)
    return train_filled, test_filled

def column_index(df, query_cols):
    """Return the positional indices of ``query_cols`` within ``df.columns``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column order defines the positions.
    query_cols : label or sequence of labels
        Column label(s) to look up.

    Returns
    -------
    numpy.ndarray of int (a scalar when a single label is passed)
        Position of each requested label, in the order requested.

    Raises
    ------
    KeyError
        If any requested label is not a column of ``df``. Without this check
        the binary search below would return the insertion point instead,
        i.e. the position of some unrelated column, or index out of bounds.
    """
    cols = np.asarray(df.columns)
    known = set(cols)
    requested = np.atleast_1d(query_cols).tolist()
    missing = [label for label in requested if label not in known]
    if missing:
        raise KeyError("columns not found in DataFrame: {}".format(missing))

    # Binary search over the sorted labels, mapped back to original positions.
    sidx = np.argsort(cols)
    return sidx[np.searchsorted(cols, query_cols, sorter=sidx)]

def toorder(X_full_nord, X_test_nord, col_name, y):
    """Ordinal-encode ``col_name`` by the per-category median of ``y``.

    The categories of ``col_name`` found in the training frame are ranked by
    the median of column ``y`` within each category (ascending) and replaced
    by their rank, starting at 1. The same category-to-rank mapping is then
    applied to the test frame.

    Parameters
    ----------
    X_full_nord : pandas.DataFrame
        Training frame; must contain both ``col_name`` and ``y``.
    X_test_nord : pandas.DataFrame
        Test frame; must contain ``col_name``.
    col_name : label
        Column to encode.
    y : label
        Column of ``X_full_nord`` whose medians define the ordering.

    Returns
    -------
    tuple of pandas.DataFrame
        The very same two frame objects that were passed in. Both are
        modified in place, and callers may rely on that.

    Notes
    -----
    Values that are not a category of the training column (including
    categories that only occur in the test frame) become NaN.
    """
    # Select the target *before* aggregating: a frame-wide ``median()`` also
    # tries to aggregate every other column, which raises on text columns in
    # recent pandas and is wasted work otherwise.
    medians = X_full_nord.groupby(col_name)[y].median().sort_values()
    name_dict = {name: label for label, name in enumerate(medians.index, start=1)}
    X_full_nord[col_name] = X_full_nord[col_name].map(name_dict)
    X_test_nord[col_name] = X_test_nord[col_name].map(name_dict)
    return X_full_nord, X_test_nord

def extremerep(df, name):
    """Pull extreme values of column ``name`` back towards its tails, in place.

    Nothing happens unless the column has more than 100 distinct values.
    Otherwise:

    * values above both the 99% quantile and ``median + 3 * IQR`` are
      replaced by the 99% quantile (``interpolation='higher'``);
    * values below both the 1% quantile and ``median - 3 * IQR`` are
      replaced by the 1% quantile (``interpolation='lower'``).

    ``df`` is modified directly; the function returns ``None``.
    """
    if df[name].nunique() <= 100:
        return

    def _centre_and_reach():
        # Median and three times the inter-quartile range of the column as
        # it currently stands.
        col = df[name]
        return col.quantile(0.50), 3 * (col.quantile(0.75) - col.quantile(0.25))

    # Upper tail.
    centre, reach = _centre_and_reach()
    upper_cap = df[name].quantile(0.99, interpolation='higher')
    too_high = (df[name] > df[name].quantile(0.99)) & (df[name] > reach + centre)
    df.loc[too_high, name] = upper_cap

    # Lower tail: statistics are read again, after the upper tail was capped.
    centre, reach = _centre_and_reach()
    lower_cap = df[name].quantile(0.01, interpolation='lower')
    too_low = (df[name] < df[name].quantile(0.01)) & (df[name] < centre - reach)
    df.loc[too_low, name] = lower_cap

def add_f(X_full_y):
    """Add binary indicator features to ``X_full_y`` in place and return it.

    Two 0/1 columns are created:

    * ``haspool``      -- 1 where ``PoolArea`` is greater than zero;
    * ``hasfireplace`` -- 1 where ``Fireplaces`` is greater than zero.
    """
    # Disabled experiments, kept for reference only:
    #   YrBltAndRemod     = YearBuilt + YearRemodAdd
    #   TotalSF           = TotalBsmtSF + 1stFlrSF + 2ndFlrSF
    #   Total_sqr_footage = BsmtFinSF1 + BsmtFinSF2 + 1stFlrSF + 2ndFlrSF
    #   Total_Bathrooms   = FullBath + 0.5 * HalfBath
    #                       + BsmtFullBath + 0.5 * BsmtHalfBath
    #   Total_porch_sf    = OpenPorchSF + 3SsnPorch + EnclosedPorch
    #                       + ScreenPorch + WoodDeckSF
    #   has2ndfloor / hasgarage / hasbsmt
    #                     = 1 where 2ndFlrSF / GarageArea / TotalBsmtSF > 0
    #   dropping the columns BsmtFinSF2, BsmtHalfBath, 3SsnPorch, PoolArea
    indicator_sources = (
        ('haspool', 'PoolArea'),
        ('hasfireplace', 'Fireplaces'),
    )
    for flag_name, source_name in indicator_sources:
        X_full_y[flag_name] = (X_full_y[source_name] > 0).astype('int64')
    return X_full_y

In [351]:
# Read the training and test tables from the working directory.
full_data = pd.read_csv("train.csv")
X_test_nan = pd.read_csv("test.csv")

# Target column of the training table.
y_full = full_data['SalePrice']

# Same object as ``full_data`` (no copy); it still contains ``SalePrice``.
X_full_nan = full_data

In [352]:

# Placeholder; superseded by the list assigned to the same name just below.
drop_columns = []
# Further drop candidates; not referenced anywhere else in the visible code.
plus_drop = ['LotConfig', 'BsmtFinSF2','3SsnPorch','PoolArea', 'BsmtHalfBath',
             'LandSlope', 'RoofStyle', 'Condition1', 'EnclosedPorch' , 'BldgType', 'OverallCond','ExterCond', 'Functional']
# Columns removed from both train and test before any other processing.
drop_columns = [
                'Id', 'Street', 'Utilities', 'Condition2', 'RoofMatl', 'BsmtHalfBath',
                 'Heating', 'LowQualFinSF', 'EnclosedPorch', 'BsmtFinSF2', 
                'MiscFeature', 'MiscVal', 'MoSold', 'YrSold'
                ] 
# NOTE(review): these two lines overwrite the frames they read, so the cell
# is not re-runnable -- a second run tries to drop columns that are already
# gone. Re-run the loading cell first.
X_full_nan = X_full_nan.drop(columns = drop_columns)
X_test_nan = X_test_nan.drop(columns = drop_columns)
# Drop low-information columns and impute missing values; preparenan itself
# prints the null count of the processed training frame, which the next line
# prints once more.
X_full_nord, X_test_nord = preparenan(X_full_nan, X_test_nan)
print(X_full_nord.isnull().sum().sum())
# Replace every text column by ranks ordered by median SalePrice. toorder
# writes into the frames it receives and returns those same objects, so after
# the loop X_full / X_test are aliases of X_full_nord / X_test_nord.
# NOTE(review): X_full and X_test are only bound here if at least one
# object-dtype column exists.
for col in X_full_nord.columns[X_full_nord.dtypes == 'object']:
    X_full, X_test = toorder(X_full_nord, X_test_nord, col, 'SalePrice')
print(X_full.isnull().sum().sum())
# MSSubClass is not an object column, so it is rank-encoded explicitly.
X_full, X_test = toorder(X_full, X_test_nord, 'MSSubClass', 'SalePrice')
# The target was only needed inside the training frame for the encodings above.
X_full = X_full.drop(['SalePrice'], axis = 1)
# X_full = add_f(X_full)
# X_test = add_f(X_test)
# Cap outliers column by column. extremerep edits the frame it is given in
# place and derives its thresholds from that same frame, so train and test
# are capped with their own quantiles.
for col in X_full.columns:
    extremerep(X_full, col)
    extremerep(X_test, col)
# Columns with fewer than 5 distinct values in train are treated as
# categorical; their positions are what the Pool constructors receive.
cat_features = X_full.columns[X_full.nunique() < 5]
cat_features_inds = column_index(X_full, cat_features)

0
0
0


In [426]:
# Second imputation pass over the test frame: text columns get the
# 'NAN_str' placeholder, every other column its own median.
# NOTE(review): presumably needed because toorder maps values that are not a
# training category to NaN -- confirm.
for col in X_test.columns[X_test.isnull().any()]:
    X_test[col] = X_test[col].fillna(
        'NAN_str' if X_test.dtypes[col] == 'object' else X_test[col].median()
    )

In [428]:
# Total number of missing cells left in the test frame.
X_test.isna().sum().sum()

0

In [432]:
from catboost import Pool, CatBoostRegressor
from sklearn.model_selection import train_test_split

# The pools below carry the natural log of the sale price as their label.
y_full_log = np.log(y_full)

# 80/20 hold-out split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X_full, y_full_log, test_size=0.2, random_state=42
)

# Every pool declares the same categorical columns, by position.
train_dataset = Pool(data=X_train, label=y_train, cat_features=cat_features_inds)
eval_dataset = Pool(data=X_val, label=y_val, cat_features=cat_features_inds)
full_dataset = Pool(data=X_full, label=y_full_log, cat_features=cat_features_inds)

# The test pool has no label.
test_dataset = Pool(data=X_test, cat_features=cat_features_inds)



In [452]:
# 0.13 897 RMSE in test
# no add 0.11239 in val with:
#   bootstrap_type = 'Bernoulli', subsample = 1, iterations = 40000,
#   one_hot_max_size = 10, random_strength = 1, l2_leaf_reg = 25,
#   random_state = 42, depth = 2, loss_function='MAE', custom_metric = 'RMSE'
model = CatBoostRegressor(
    bootstrap_type='Bernoulli',
    subsample=1,
    iterations=10000,
    one_hot_max_size=10,
    random_strength=1.0,
    l2_leaf_reg=25,
    depth=2,
    loss_function='MAE',
    custom_metric='RMSE',
)

# Fit on the whole training set (no hold-out).
model.fit(full_dataset, verbose=False, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostRegressor at 0x27ef20eb780>

In [461]:
# 0.14 740 RMSE in test
# 0.121441 in cv - 20 folds, parameters by cv - min lr
model = CatBoostRegressor(
    bootstrap_type='Bernoulli',
    subsample=1,
    learning_rate=0.001,
    iterations=43285,
    one_hot_max_size=15,
    random_strength=1,
    l2_leaf_reg=25,
    random_state=42,
    depth=2,
    loss_function='MAE',
    custom_metric='RMSE',
)

# Fit on the whole training set (no hold-out).
model.fit(full_dataset, verbose=False, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostRegressor at 0x27ef54bdef0>

In [467]:
# 0.14 467 RMSE in test
# 0.12 in cv - 20 folds, parameters by cv - lr = 0.03, it - 934
model = CatBoostRegressor(
    bootstrap_type='Bernoulli',
    subsample=1,
    iterations=19631,
    one_hot_max_size=15,
    random_strength=1,
    l2_leaf_reg=25,
    random_state=42,
    depth=2,
    loss_function='MAE',
    custom_metric='RMSE',
)

# Fit on the whole training set (no hold-out).
model.fit(full_dataset, verbose=False, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostRegressor at 0x27ef20ebd68>

In [468]:
# The model's label is log(SalePrice), so predictions are exponentiated.
y_test_log = model.predict(X_test)
y_test = np.exp(y_test_log)

# 'Id' was dropped from the working frames during preprocessing, so it is
# taken from a fresh read of the raw test file.
X_test_nan = pd.read_csv("test.csv")
output = pd.DataFrame({'Id': X_test_nan['Id'], 'SalePrice': y_test})
output.to_csv('submission.csv', index=False)

In [466]:
from catboost import Pool, cv

# Pool over the whole training set. Unlike the other pools in this notebook,
# the categorical columns are passed by name rather than by position.
cv_dataset = Pool(data=X_full, label=y_full_log, cat_features=cat_features)

# Earlier settings:
#   bootstrap_type = 'Bernoulli', subsample = 1, iterations = 30000,
#   one_hot_max_size = 10, random_strength = 1.0, l2_leaf_reg = 25,
#   random_state = 42, depth = 2, loss_function='MAE', custom_metric = 'RMSE'

# Not used anywhere else in the visible code.
scrores_list = []

params = dict(
    bootstrap_type='Bernoulli',
    subsample=1,
    iterations=30000,
    l2_leaf_reg=25,
    random_state=42,
    one_hot_max_size=10,
    random_strength=1,
    depth=2,
    loss_function='MAE',
    custom_metric='RMSE',
)

# 20-fold cross-validation, logging every 1000 iterations.
scores = cv(cv_dataset, params, fold_count=20, plot=True, verbose=1000)

# Disabled grid search:
# model.grid_search(param_grid, cv_dataset,
#             partition_random_seed=0,
#             cv = 10,
#             refit=True,
#             verbose = True)


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 11.6638950	test: 11.6639530	best: 11.6639530 (0)	total: 493ms	remaining: 4h 6m 20s
1000:	learn: 0.0698078	test: 0.0820712	best: 0.0820712 (1000)	total: 10m 20s	remaining: 4h 59m 34s
2000:	learn: 0.0645988	test: 0.0807114	best: 0.0806930 (1936)	total: 22m 57s	remaining: 5h 21m 9s
3000:	learn: 0.0617380	test: 0.0802072	best: 0.0802013 (2997)	total: 37m 13s	remaining: 5h 34m 54s
4000:	learn: 0.0597171	test: 0.0801303	best: 0.0801155 (3971)	total: 52m 49s	remaining: 5h 43m 15s
5000:	learn: 0.0583445	test: 0.0800852	best: 0.0800654 (4375)	total: 1h 9m 31s	remaining: 5h 47m 32s
6000:	learn: 0.0571120	test: 0.0801257	best: 0.0799985 (5470)	total: 1h 27m 45s	remaining: 5h 50m 58s
7000:	learn: 0.0560559	test: 0.0800874	best: 0.0799609 (6624)	total: 1h 47m 2s	remaining: 5h 51m 39s
8000:	learn: 0.0552134	test: 0.0800049	best: 0.0799315 (7845)	total: 2h 7m 33s	remaining: 5h 50m 43s
9000:	learn: 0.0544559	test: 0.0799997	best: 0.0799315 (7845)	total: 2h 28m 43s	remaining: 5h 46m 58s
10000

In [459]:
# Show the cross-validation metrics stored at row 43285 of ``scores``.
# NOTE(review): the cv cell visible in this notebook sets iterations = 30000,
# so this row presumably comes from a different, longer run -- confirm before
# re-running.
scores.iloc[43285]

iterations         43285.000000
test-MAE-mean          0.082053
test-MAE-std           0.013186
train-MAE-mean         0.068284
train-MAE-std          0.001266
test-RMSE-mean         0.121441
test-RMSE-std          0.032556
train-RMSE-mean        0.105531
train-RMSE-std         0.002903
Name: 43285, dtype: float64

In [368]:
# no add (presumably: without the add_f features -- confirm)
model = CatBoostRegressor(
    bagging_temperature=9,
    iterations=40000,
    one_hot_max_size=15,
    random_strength=1,
    l2_leaf_reg=27,
    random_state=42,
    depth=2,
    loss_function='MAE',
    custom_metric='RMSE',
)

# Train on the 80% split and keep the iteration that scores best on the
# hold-out pool.
model.fit(
    train_dataset,
    use_best_model=True,
    eval_set=eval_dataset,
    verbose=False,
    plot=True,
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

KeyboardInterrupt: 

In [349]:
# no add (presumably: without the add_f features -- confirm)
model = CatBoostRegressor(
    bagging_temperature=9,
    iterations=6000,
    one_hot_max_size=15,
    random_strength=1,
    l2_leaf_reg=27,
    random_state=42,
)

# Train on the 80% split and keep the iteration that scores best on the
# hold-out pool.
model.fit(
    train_dataset,
    use_best_model=True,
    eval_set=eval_dataset,
    verbose=False,
    plot=True,
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostRegressor at 0x27ea151e470>

In [273]:
# Importance of every feature according to the model fitted last.
fi = model.feature_importances_
# Sanity check: one importance value per column of the training frame.
print(fi.shape)
print(len(X_full.columns))
# Importance table in the original column order of ``X_full``.
fi_df = pd.DataFrame({'feature' : X_full.columns, 'fi': fi})
# Names of the 31 most important features. ``sort_values`` returns a new
# frame, so its result has to be used directly: slicing ``fi_df`` itself
# would just take the first 31 columns in frame order.
i_columns = fi_df.sort_values(by = 'fi', ascending = False).iloc[:31]['feature']


(70,)
70


'MSZoning'

In [235]:
# Preview the first rows of the feature-importance table.
fi_df.head()

Unnamed: 0,feature,fi
0,MSSubClass,0.572902
1,MSZoning,0.110223
2,LotFrontage,0.918301
3,LotArea,2.144329
4,Alley,0.031666
