# Using MLPRegressor

I tuned the hyperparameters locally. The results don't look promising anyway...

Your submission scored 0.38225, which is not an improvement of your best score. Keep trying!


In [None]:
import os
import sys
import warnings

from sklearn.metrics import mean_squared_log_error, mean_squared_error, mean_absolute_error

if not sys.warnoptions:
    warnings.simplefilter("ignore")

import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import RobustScaler


# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

# Loading data

Loading the data from the train and test files. The test file provides only the input features; I'll predict its prices with a model.

In [None]:
# Read the training and test CSV files (in that order) into DataFrames.
train_data, test_data = map(pd.read_csv, ('../input/train.csv', '../input/test.csv'))

def get_cat_cols(df):
    """Return the names of the columns of *df* whose dtype is ``object``.

    The names are returned as a list, in the order they appear in
    ``df.columns``.
    """
    def has_object_dtype(name):
        return df[name].dtype == 'object'

    return list(filter(has_object_dtype, df.columns))


# The model is trained on log1p(SalePrice); predictions are mapped back with
# expm1 before they are written to the submission file.
y = np.log1p(train_data.SalePrice)
# The test file is meant for predictions and contains no price data.
cand_train_predictors = train_data.drop(['Id', 'SalePrice'], axis=1)
cand_test_predictors = test_data.drop(['Id'], axis=1)

cat_cols = get_cat_cols(cand_train_predictors)

# A missing category becomes its own level instead of staying NaN.
cand_train_predictors[cat_cols] = cand_train_predictors[cat_cols].fillna('NotAvailable')
cand_test_predictors[cat_cols] = cand_test_predictors[cat_cols].fillna('NotAvailable')

encoders = {}

for col in cat_cols:
    encoders[col] = LabelEncoder()
    # Fit on train + test values so that levels occurring only in the test
    # file still receive a code.
    val = cand_train_predictors[col].tolist()
    val.extend(cand_test_predictors[col].tolist())
    encoders[col].fit(val)
    # Shift the codes so that they start at 1 instead of 0.
    cand_train_predictors[col] = encoders[col].transform(cand_train_predictors[col]) + 1
    cand_test_predictors[col] = encoders[col].transform(cand_test_predictors[col]) + 1

# Optional diagnostics (needs matplotlib's pyplot imported as plt):
# for column in cand_train_predictors.columns:
#     cand_train_predictors[column].value_counts().sort_index().plot(kind="bar",legend=column,figsize = (24,8))
#     plt.show()

# Absolute pairwise correlations between the training predictors.
corr_matrix = cand_train_predictors.corr().abs()
# with pd.option_context('display.max_rows',None, 'display.max_columns',None):
#     print(corr_matrix.head())

# Keep only the strict upper triangle so every pair is inspected once and the
# diagonal is ignored. The builtin `bool` is used because the `np.bool` alias
# no longer exists in current NumPy.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
# A column is dropped when it correlates above 0.8 with any earlier column.
cols_to_drop = [column for column in upper.columns if (upper[column] > 0.8).any()]
print('Highly correlated features(will be droped):', cols_to_drop)

cand_train_predictors = cand_train_predictors.drop(cols_to_drop, axis=1)
cand_test_predictors = cand_test_predictors.drop(cols_to_drop, axis=1)

print(cand_train_predictors.shape)
print(cand_test_predictors.shape)

# Give the test frame exactly the columns of the training frame.
train_set, test_set = cand_train_predictors.align(cand_test_predictors, join='left', axis=1)
# log1p is applied to every predictor, including the label-encoded codes.
train_set = np.log1p(train_set)
test_set = np.log1p(test_set)


# Model

Using a pipeline to process the input features:
1. Missing feature values are imputed.
2. Features are scaled with RobustScaler, which is robust to outliers.
3. Features are selected by importance, computed with a Lasso regression fitted via cross-validation.


In [None]:
# Empty grid: GridSearchCV only cross-validates the single pipeline
# configuration defined below and refits it on the full training set.
params = {}
# Missing values are left as real NaN; the imputer at the head of the pipeline
# fills them. Writing the string 'NaN' into the frame would only turn the
# numeric columns into object dtype.

score_results = []
# Unshuffled folds. `random_state` only has a meaning together with
# shuffle=True (recent scikit-learn releases reject it otherwise), so it is
# not passed.
kfold = KFold(n_splits=10)
# NOTE(review): axis=1 imputes along rows (a gap gets the mean of its own
# row), not per feature -- confirm this is intended. `Imputer` has also been
# replaced by sklearn.impute.SimpleImputer in newer scikit-learn, which has no
# `axis` option, so that question has to be settled before migrating.
imputer = Imputer(axis=1, strategy='mean')
scaler = RobustScaler(with_centering=False, with_scaling=True, quantile_range=(20.0, 80.0))
select = SelectFromModel(LassoCV(cv=kfold, random_state=1), threshold='0.5*median')
regressor = MLPRegressor(random_state=1,
                         activation='logistic',
                         solver='sgd',
                         learning_rate='adaptive',
                         learning_rate_init=0.013000000000000001,
                         early_stopping=True,
                         hidden_layer_sizes=(140, 140),
                         max_iter=10000,
                         momentum=0.9697272727272728
                         )

pipe = make_pipeline(imputer, scaler, select, regressor)
# `y` is already log1p(SalePrice), so the plain squared error on `y` is the
# squared log error on prices. The *_log_error metrics would apply log1p a
# second time.
my_model = GridSearchCV(pipe,
                        params,
                        cv=kfold,
                        scoring='neg_mean_squared_error',
                        verbose=10,
                        n_jobs=2,
                        error_score=-1000.)

my_model.fit(train_set, y)
# score() follows the "greater is better" convention and returns the negated
# error, hence the sign flip.
print(-1 * my_model.score(train_set, y))
print(my_model.best_params_)

# Training-set diagnostics: rmsle is measured in log space, rmse and mae on
# the original price scale.
train_pred = my_model.predict(train_set)
print('rmsle: ', np.sqrt(mean_squared_error(y, train_pred)))
print('rmse: ', np.sqrt(mean_squared_error(train_data.SalePrice, np.expm1(train_pred))))
print('mae: ', mean_absolute_error(train_data.SalePrice, np.expm1(train_pred)))


# Predicting and submitting

Now it's time to predict the prices for the test set.

In [None]:
# NOTE: judged by MAE this model has worse results, but it produces better
# results in the submission.
# Missing values in test_set stay as real NaN; the imputer at the head of the
# fitted pipeline fills them, so no placeholder string is written into the
# numeric columns.
# The model predicts log1p(price); expm1 maps the predictions back to prices.
predicted_prices = np.expm1(my_model.predict(test_set))
print(predicted_prices[:5])

# One row per test Id, with the Id stored as an integer.
my_submission = pd.DataFrame({'Id': test_data.Id.astype(int), 'SalePrice': predicted_prices})
# Any filename could be used; submission.csv is chosen here.
my_submission.to_csv('submission.csv', index=False)