In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import scipy
import scipy.stats # submodules must be imported explicitly; `import scipy` alone does not guarantee scipy.stats is loaded

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns

In [None]:
# Load the competition train and test splits from the Kaggle input directory.
train_csv_path = '../input/train.csv'
test_csv_path = '../input/test.csv'

train_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)

In [None]:
!head ../input/sample_submission.csv

In [None]:
# Preview the first rows of the training data.
train_df.head()

## Split categorical and Numerical features

The challenge already provides information about each feature, so let's **use it**

**OBS**: If the data dictionary were not provided, one could use [Benford's Law](https://en.wikipedia.org/wiki/Benford%27s_law) to check whether a numerical feature encodes a true numerical measurement or a categorical one, e.g. by measuring the [Kullback-Leibler divergence](https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence) between the feature's leading-digit distribution and the distribution predicted by Benford's Law.

In [None]:
# Columns treated as categorical features.
# The order is kept as-is because it determines the order of the plots below.
cat_features = [
    'MSZoning', 'Street', 'Alley', 'LotShape',
    'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
    'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
    'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
    'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond',
    'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
    'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC',
    'CentralAir', 'Electrical', 'KitchenQual', 'Functional',
    'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
    'GarageCond', 'PavedDrive', 'PoolQC', 'Fence',
    'MiscFeature', 'SaleType', 'SaleCondition',
]

Not all numerical features encode a true numerical measurement, so, given the data dictionary provided by the challenge, let's split the numerical features correctly.

In [None]:
# Numeric columns that are treated as true numerical measurements.
true_num_features = [
    'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
    'MiscVal', 'MoSold', 'LotArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
    'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath',
    'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
    'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'LotFrontage', 'MasVnrArea',
]

In [None]:
# Numerically encoded columns that are handled as categorical features.
cat_num_features = [
    'MSSubClass',
    'OverallQual',
    'OverallCond',
    'YearBuilt',
    'YearRemodAdd',
    'YrSold',
]

## Check Null

In [None]:
# Print, for every categorical feature, the fraction of training rows where it is null.
n_rows = len(train_df)
for col_name in cat_features:
    null_rows = train_df[train_df[col_name].isnull()]
    print("{} {}".format(col_name, len(null_rows) / n_rows))

Let's take a look at **PoolQC**, **Fence**, **Alley** and **MiscFeature**, since they are the ones with the highest fraction of null values

In [None]:
# Store the target as 32-bit floats.
train_df['SalePrice'] = train_df['SalePrice'].astype(np.float32)

In [None]:
# One figure per high-null feature: a normalized SalePrice histogram for each
# category, built only from rows where the feature is not null.
high_null_features = ['PoolQC', 'Fence', 'Alley', 'MiscFeature']

plt.clf()
for fig_num, feat_name in enumerate(high_null_features):
    plt.figure(fig_num, figsize=(10, 5))
    plt.title(feat_name)
    known = train_df[train_df[feat_name].notnull()][[feat_name, 'SalePrice']]
    for group_name, group in known.groupby(feat_name):
        if len(group) <= 1:
            # Categories with a single row are not plotted.
            continue
        sns.distplot(group['SalePrice'], kde=False, label=group_name, norm_hist=True)
    plt.legend()
    plt.show()


As these features have a high fraction of null values and are not strongly related to the target variable **SalePrice**, let's **discard them**.

I prefer to drop these features in order to keep the ML model **lighter**.

Let's take a look at the categorical variables

In [None]:
def _plot_price_hist_by_category(frame, column, fig_num):
    """Draw one figure with a normalized SalePrice histogram per category of `column`.

    Rows where `column` is null are ignored, and categories with a single row
    are not plotted.
    """
    plt.figure(fig_num, figsize=(10, 5))
    plt.title(column)
    known = frame[frame[column].notnull()][[column, 'SalePrice']]
    for category, sales in known.groupby(column):
        if len(sales) > 1:
            sns.distplot(sales['SalePrice'], kde=False, label=category, norm_hist=True)
    plt.legend()
    plt.show()


plt.clf()
for i, feat_name in enumerate(cat_features):
    _plot_price_hist_by_category(train_df, feat_name, i)


Only the feature **Utilities** possesses a single value, so let's drop it, since it is not discriminative.

Although the other features show highly overlapping **SalePrice** distributions across their possible values, we hope that there is some multivariate relationship between them and the target variable

Lets take a look at numerical features

In [None]:
# Pearson correlation between every true numerical feature and the target.
# Rows where the feature is null are excluded before computing the coefficient.
for feat_name in true_num_features:
    feat_non_null = train_df[[feat_name, 'SalePrice']].dropna(subset=[feat_name])
    # pearsonr returns the coefficient and the two-sided p-value.
    corr, p_value = scipy.stats.pearsonr(feat_non_null[feat_name], feat_non_null['SalePrice'])
    print('Pearson Correlation coefficient between {} and {}: r={:.4f} (p-value={:.3g})'.format(
        'SalePrice', feat_name, corr, p_value))

The features **3SsnPorch**, **PoolArea**, **MiscVal**, **MoSold**, **YrSold**, **BsmtFinSF2**, **LowQualFinSF** and **BsmtHalfBath** have a pretty low correlation with the target, so let's **drop them** and make the ML **model** even **lighter**

In [None]:
# Categorical features kept for the model.
# Dropped relative to cat_features:
#   - Alley, PoolQC, Fence, MiscFeature: high fraction of null values
#   - Utilities: not discriminative (see the analysis of the categorical plots)
filtered_cat_features = ['MSZoning','Street','LotShape','LandContour',
                'LotConfig','LandSlope','Neighborhood','Condition1',
                'Condition2','BldgType','HouseStyle','RoofStyle','RoofMatl',
                'Exterior1st','Exterior2nd','MasVnrType','ExterQual',
                'ExterCond','Foundation','BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1',
                'BsmtFinType2','Heating','HeatingQC','CentralAir','Electrical',
                'KitchenQual','Functional','FireplaceQu','GarageType','GarageFinish',
                'GarageQual','GarageCond','PavedDrive','SaleType','SaleCondition']

# Numerical features kept for the model.
# Dropped relative to true_num_features (low correlation with SalePrice):
#   3SsnPorch, PoolArea, MiscVal, MoSold, BsmtFinSF2, LowQualFinSF, BsmtHalfBath
filtered_num_features = ['WoodDeckSF','OpenPorchSF','EnclosedPorch','ScreenPorch',
                     'LotArea','BsmtFinSF1',
                     'BsmtUnfSF','TotalBsmtSF','1stFlrSF','2ndFlrSF','GrLivArea',
                     'BsmtFullBath','FullBath','HalfBath','BedroomAbvGr',
                     'KitchenAbvGr','TotRmsAbvGrd','Fireplaces','GarageYrBlt','GarageCars','GarageArea','LotFrontage','MasVnrArea']

# Numerically encoded categorical features kept for the model (YrSold dropped).
filtered_cat_num_features = ['MSSubClass','OverallQual','OverallCond','YearBuilt','YearRemodAdd']

In [None]:
# Keep only the selected feature columns (plus the target and Id for training, Id for test).
# .copy() makes the filtered frames independent of train_df/test_df, so the
# column assignments performed later do not operate on a slice of the raw frames
# (which triggers pandas' SettingWithCopyWarning).
selected_features = filtered_cat_features + filtered_num_features + filtered_cat_num_features

filtered_train_df = train_df[selected_features + ['SalePrice', 'Id']].copy()
filtered_test_df = test_df[selected_features + ['Id']].copy()

Log-transform the target variable (`log1p`) to avoid numerical issues in gradient descent

In [None]:
# Replace the target with log(1 + SalePrice).
# The transform is computed from the untouched train_df column (same index as
# filtered_train_df), so re-running this cell does not apply the log twice, and
# assign() returns a new frame instead of writing into a slice.
filtered_train_df = filtered_train_df.assign(SalePrice=np.log1p(train_df['SalePrice']))

### Other approaches for feature selection

One may use other, more sophisticated approaches for feature selection, such as:

* XGBoost by Feature Importance
* Dimensionality reduction using PCA and/or t-SNE
* Coefficient analysis of a linear regressor

As my time is short, let's stick with this simple, but not bad, approach.

### Missing values

As we now have a low percentage of missing values, let's fill them

In [None]:
# Build one fill value per column, always derived from the TRAINING set so that
# no information from the test set leaks into preprocessing:
#   - categorical features: the literal string 'nan' (missing becomes its own category)
#   - numerical features: the training mean
#   - numerically encoded categorical features: the most frequent training value,
#     which keeps the column's dtype and is guaranteed to be an existing category
fill_values = {feat: 'nan' for feat in filtered_cat_features}
fill_values.update(filtered_train_df[filtered_num_features].mean().to_dict())
for feat in filtered_cat_num_features:
    fill_values[feat] = filtered_train_df[feat].mode().iloc[0]

# DataFrame.fillna with a dict fills each listed column with its own value and
# returns a new frame; this avoids the chained `df[col].fillna(..., inplace=True)`
# pattern, which may operate on a temporary copy and leave the frame unchanged.
filtered_train_df = filtered_train_df.fillna(fill_values)
filtered_test_df = filtered_test_df.fillna(fill_values)

### Feature Normalization

Let's apply z-normalization to the numerical features and one-hot encoding to the categorical ones

But why **one-hot encoding**??? Embeddings have already been discovered!!!!! I don't think embeddings are worth applying to this problem, since the categorical features have low cardinality

We don't need to perform one-hot encoding by hand if we are going to use [TensorFlow Feature Columns](https://www.tensorflow.org/guide/feature_columns)

In [None]:
# Z-normalize every numerical feature using the mean and standard deviation of
# the TRAINING set for both frames, so train and test share the same scale.
# Working on fresh copies keeps the assignments from writing into a slice of
# another frame.
filtered_train_df = filtered_train_df.copy()
filtered_test_df = filtered_test_df.copy()

for feat in filtered_num_features:
    feat_mean = filtered_train_df[feat].mean()
    feat_std = filtered_train_df[feat].std()
    # Vectorized column arithmetic instead of an element-wise apply(lambda ...).
    filtered_train_df[feat] = (filtered_train_df[feat] - feat_mean) / feat_std
    filtered_test_df[feat] = (filtered_test_df[feat] - feat_mean) / feat_std

## TensorFlow Model

In [None]:
import tensorflow as tf

### Define feature columns

In [None]:
# One vocabulary-list categorical column per categorical feature (including the
# numerically encoded ones); each vocabulary is the set of values seen in training.
feat_column_cat = [
    tf.feature_column.categorical_column_with_vocabulary_list(
        feat_name,
        list(filtered_train_df[feat_name].unique()),
    )
    for feat_name in filtered_cat_features + filtered_cat_num_features
]

In [None]:
# One numeric feature column per selected numerical feature.
feat_column_num = [
    tf.feature_column.numeric_column(feat_name)
    for feat_name in filtered_num_features
]

In [None]:
# Number of numeric feature columns that were built.
len(feat_column_num)

### Define input function

In [None]:
# Training input: all columns except the target as features, SalePrice as label,
# shuffled and repeated for 300 epochs in batches of 128.
train_input_fn = tf.estimator.inputs.pandas_input_fn(
    x=filtered_train_df.drop('SalePrice', axis=1),
    y=filtered_train_df['SalePrice'],
    num_epochs=300,
    batch_size=128,
    shuffle=True,
)

# Prediction input: a single unshuffled pass over the test set. Shuffling adds
# nothing at inference time and would make the order of the predictions (and of
# the submission rows) differ from run to run.
test_input_fn = tf.estimator.inputs.pandas_input_fn(
    x=filtered_test_df,
    batch_size=128,
    num_epochs=1,
    shuffle=False,
)

### Define model

In [None]:
# Wide & deep regressor: the linear part receives the categorical columns, the
# DNN part receives the numeric columns and uses three hidden layers.
model = tf.estimator.DNNLinearCombinedRegressor(
    model_dir='./wide_n_deep/',
    linear_feature_columns=feat_column_cat,
    dnn_feature_columns=feat_column_num,
    dnn_hidden_units=[23, 16, 4],
)

In [None]:
# Wrap the estimator so the 'Id' feature is forwarded into each prediction dict;
# the prediction loop further down reads prediction['Id'] to build the submission.
model = tf.contrib.estimator.forward_features(model,'Id')

In [None]:
!rm -rf wide_n_deep/

In [None]:
# Fit the model; epochs, batch size and shuffling are configured in train_input_fn.
model.train(train_input_fn)

## Make Predictions

In [None]:
# Collect every prediction, then build the submission columns:
#   - Id: the forwarded 'Id' feature of each example
#   - SalePrice: the prediction mapped back with expm1 (inverse of log1p)
predictions = list(model.predict(test_input_fn))

submission_dict = {
    'Id': [pred['Id'] for pred in predictions],
    'SalePrice': [np.expm1(pred['predictions'][0]) for pred in predictions],
}
submission = pd.DataFrame(submission_dict)

In [None]:
# Preview the first rows of the submission frame.
submission.head()

In [None]:
# Write the Id/SalePrice submission to the working directory without the
# DataFrame index (the file name was previously misspelled as 'submussion.csv').
submission.to_csv('submission.csv', index=False)