In [None]:
import numpy as np 
import pandas as pd 

from pctl_scale import PercentileScaler  # pip install pctl-scale
from sklearn.preprocessing import RobustScaler

## Summary
* X: Only ratio-scale variables with many distinct values (e.g. areas measured in square feet). Nominal and ordinal-scale variables are not used, nor are ratio-scale variables with few distinct values (e.g. number of bathrooms).
* Data prep: `pctl_scale.PercentileScaler` scales all values between the 5th and 95th percentiles like a MinMax scaler, and maps all outliers towards 0 or 1 with saturating growth formulas.
* Missing values are set to `0.0`, assuming that the feature just doesn't exist for the example. For example, if the measure "kitchen size in square feet" is missing, maybe there is no kitchen at all.
* y: sklearn's `RobustScaler` is used
* Model assumptions: The model multiplies weights with the input data in some way. Thus, multiplying with `0.0` will automatically ignore missing values (which are set to `0.0`).

What model?

* Linear Regression
* Baseline model
    * Identify high correlations between target and predictors $|\rho(y, x_i)|>0.4$ with p-values below 0.01
    * For the given $x_i$ (see above), find pairs $(x_i, x_j)$ with a high p-value, indicating a weak relationship between the two predictors
    * Estimate $y=\theta_0 + \theta_1 x_i + \sum_{j=2}^{?} \theta_j x_j + \epsilon$


## Data Prep

In [None]:
def dataprep_fit(df):
    """Fit one scaler per modelling column of ``df``.

    Every predictor column listed below gets its own ``PercentileScaler``
    (lower=0.05, upper=0.95, naimpute=0), fitted on the column as a Series.
    Every target column gets its own ``RobustScaler``, fitted on the column
    reshaped to a single-column 2-D array.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain all predictor columns listed below and 'SalePrice'.

    Returns
    -------
    transformer : dict
        Maps each column name to its fitted scaler object.
    col_predictor : list of str
        Names of the predictor columns, in the order they were fitted.
    col_target : list of str
        Names of the target columns.
    """
    col_predictor = [
        'LotArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
        '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'GarageArea',
        'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch',
        'PoolArea', 'MiscVal', 'LotFrontage', 'MasVnrArea', 'GarageYrBlt']
    col_target = ['SalePrice']

    # Percentile bounds and the fill value for missing entries, shared by
    # all predictor scalers.
    lo, up, naimpute = .05, .95, 0

    transformer = dict()

    # X: one PercentileScaler per predictor
    for name in col_predictor:
        scaler = PercentileScaler(upper=up, lower=lo, naimpute=naimpute)
        scaler.fit(df[name])
        transformer[name] = scaler

    # y: one RobustScaler per target (fit() is given a 2-D column vector)
    for name in col_target:
        scaler = RobustScaler()
        scaler.fit(df[name].values.reshape(-1, 1))
        transformer[name] = scaler

    return transformer, col_predictor, col_target

In [None]:
def dataprep_transform(df, transformer, xcols, ycols):
    """Apply previously fitted scalers to the selected columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data; must contain every column named in ``xcols``/``ycols``.
    transformer : dict
        Maps a column name to an object exposing ``transform``; each object
        is called with the column reshaped to a single-column 2-D array.
    xcols : iterable of str
        Predictor columns to scale.
    ycols : iterable of str or None
        Target columns to scale; pass ``None`` (or an empty list) when the
        data has no target, e.g. for a test set.

    Returns
    -------
    x : pandas.DataFrame
        Scaled predictors, sharing the index of ``df``.
    y : pandas.DataFrame or None
        Scaled targets, or ``None`` when ``ycols`` is falsy.
    """
    def _scale(columns):
        # Build a fresh frame on df's index, one scaled column at a time.
        scaled = pd.DataFrame(index=df.index)
        for name in columns:
            column_2d = df[name].values.reshape(-1, 1)
            scaled[name] = transformer[name].transform(column_2d)
        return scaled

    features = _scale(xcols)
    targets = _scale(ycols) if ycols else None
    return features, targets

In [None]:
# Load the training data with pandas' inferred dtypes.
# (Reading every column as a string was tried and is kept for reference.)
#df = pd.read_csv('../input/train.csv', dtype=str)  # throws errors
df = pd.read_csv('../input/train.csv')

# Fit the per-column scalers on the training data, then apply them to the
# same data to obtain the scaled predictors (x0) and scaled target (y0).
transformer, xcols, ycols = dataprep_fit(df)
x0, y0 = dataprep_transform(df, transformer, xcols, ycols)

In [None]:
#y0['SalePrice'].describe()
#x0[col_predictor].applymap(lambda e: e if e>0 else np.nan).describe()

In [None]:
# Plain NumPy arrays of the scaled frames: y holds the single target column
# ('SalePrice'), x holds one column per predictor in the order of `xcols`.
y = y0.values #.reshape(-1,1)
x = x0.values

## Find the highest correlation with y

In [None]:
from scipy.stats import pearsonr

# One correlation coefficient and one p-value per predictor column of x.
n = x.shape[1]
rho = np.empty(shape=(n,))
pval = np.empty(shape=(n,))

for i in range(n):
    # Only rows where the scaled predictor is strictly positive are used;
    # rows at 0.0 (the fill value for missing entries) are left out.
    idx = x[:,i] > 0
    rho[i], pval[i] = pearsonr(y[idx].ravel(), x[idx,i].ravel())

The results are still a little bit messy

In [None]:
# Two columns (correlation, p-value), one row per predictor, rounded for display.
np.c_[rho, pval].round(3)

Let's flag sufficient correlations

In [None]:
# only consider absolute correlation above 0.4 with p-values below 0.01
candidates = np.logical_and(np.abs(rho) > 0.4, pval < 0.01)
candidates

Sort by the absolute correlation and set insufficient correlation to zero while sorting (so they appear at the bottom)

In [None]:
# Multiplying by the boolean mask zeroes the score of non-candidates, so after
# reversing the ascending argsort they end up at the bottom of the ranking.
idx = np.argsort(np.abs(rho) * candidates)[::-1]
idx

In [None]:
# Ranked table: predictor name, correlation and p-value, ordered by `idx`.
# The names are taken from x0's columns, which are in the same order as the
# columns of `x`. (`col_predictor` only exists inside dataprep_fit, so
# referring to it here raised a NameError on a fresh kernel.)
pd.DataFrame(index=idx, data=np.c_[list(x0.columns), rho.round(3), pval.round(4)][idx])

`GrLivArea` (column index 8, i.e. `x_8`) will be the first predictor.
Having a large above-ground living area seems to be a very big selling point.

## Find a second predictor that is uncorrelated to the first predictor

In [None]:
def compute_corr(y, x, colnam, rhomin=None, pmax=None, rhomax=None, pmin=None, sort='desc'):
    """Correlate ``y`` with every column of ``x`` and rank the columns.

    For column ``i`` only the rows where ``x[:, i] > 0`` enter the Pearson
    correlation. A column with fewer than two such rows cannot be correlated;
    its correlation and p-value are reported as NaN instead of raising.

    Parameters
    ----------
    y : numpy.ndarray
        Reference variable, one value per row of ``x``.
    x : numpy.ndarray
        2-D array with one column per variable to compare against ``y``.
    colnam : sequence of str
        Name of each column of ``x``, in column order.
    rhomin, rhomax : float or None
        Keep only columns with ``|rho| > rhomin`` / ``|rho| < rhomax``.
    pmax, pmin : float or None
        Keep only columns with ``p < pmax`` / ``p > pmin``.
        ``None`` disables a filter; ``0.0`` is a real threshold.
    sort : str
        With ``'desc'`` (the default) candidates are listed by *increasing*
        ``|rho|``; with any other value they are listed by *decreasing*
        ``|rho|``. Columns that fail a filter come last in both modes.
        NOTE(review): the flag name reads inverted relative to the resulting
        order; the behaviour is kept as-is because existing calls rely on it.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``x``, indexed by the original column position,
        with four unnamed columns: candidate flag, column name, correlation,
        p-value. The values are stacked together with the string names via
        ``np.c_``, so they are stored as text.
    """
    from scipy.stats import pearsonr

    n = x.shape[1]
    # NaN marks "could not be computed" for columns skipped below.
    rho = np.full(shape=(n,), fill_value=np.nan)
    pval = np.full(shape=(n,), fill_value=np.nan)

    for i in range(n):
        idx = x[:, i] > 0
        if np.count_nonzero(idx) < 2:
            # pearsonr needs at least two observations
            continue
        rho[i], pval[i] = pearsonr(y[idx].ravel(), x[idx, i].ravel())

    # Start with every column as a candidate and apply the requested filters.
    # Comparisons against NaN are False, so uncomputable columns drop out of
    # any active filter.
    cand = np.ones(shape=(n,), dtype=bool)
    if rhomin is not None: cand = np.logical_and(cand, np.abs(rho) > rhomin)
    if pmax is not None: cand = np.logical_and(cand, pval < pmax)
    if rhomax is not None: cand = np.logical_and(cand, np.abs(rho) < rhomax)
    if pmin is not None: cand = np.logical_and(cand, pval > pmin)

    # Sort key for an ascending argsort: rejected columns get either 0 (against
    # negative candidate keys) or +10 (against candidate keys in [0, 1]).
    arr = -1 * np.abs(rho) * cand
    if sort=='desc': arr = np.abs(rho) * cand + 10 * np.logical_not(cand)
    idx = np.argsort(arr, axis=0)

    return pd.DataFrame(index=idx, data=np.c_[cand, colnam, rho, pval][idx])


In [None]:
# The string literal below is a disabled earlier attempt, kept for reference;
# as a bare expression it has no effect when the cell runs.
"""this usually never works: find pval(xi,xj)>0.05 and both high cor 
xidx = (6,9,4,5,19,1,0,18)
res = compute_corr(
    x[:,8], x[:, xidx], 
    [s for i,s in enumerate(col_predictor) if i in xidx], 
    pmin=0.01, sort='asc')
"""
# Correlate the first predictor (column 8 of x) with every predictor and keep
# those whose p-value is above 0.05. Column names come from x0, whose columns
# are in the same order as x; `col_predictor` is local to dataprep_fit and is
# not defined at notebook scope.
res = compute_corr(x[:,8], x, list(x0.columns), pmin=0.05, sort='desc')
res.head(5)

All these high p-value variables have a lot of missing data.
Thus, I will use them as a kind of dummy variable.

In [None]:
colnam2 = ['BsmtFinSF2', '3SsnPorch', 'MiscVal', 'LowQualFinSF', 'PoolArea']
# Replace every non-positive entry with NaN so that describe() only summarises
# the strictly positive values (its `count` row then shows how sparse each
# column is). `where` keeps values for which the condition holds and is the
# vectorised equivalent of the element-wise applymap, which newer pandas
# versions deprecate.
tmp = x0[colnam2].where(x0[colnam2] > 0)
tmp.describe()

## Build a model

In [None]:
# Final feature set: the first predictor plus the five sparse columns chosen above.
# NOTE: this rebinds `xcols`, which until here held the full predictor list
# returned by dataprep_fit.
xcols = ['GrLivArea', 'BsmtFinSF2', '3SsnPorch', 'MiscVal', 'LowQualFinSF', 'PoolArea']
y = y0.values
X = x0[xcols].values

Split the data into a training set and a hold-out validation set.

In [None]:
# The first `n_train` rows train the model; every remaining row is held out.
# Both slices use the same boundary so that no row falls between the two sets
# (the previous `1201:` start silently dropped row 1200).
n_train = 1200

y_train = y[:n_train]
X_train = X[:n_train,:]

y_valid = y[n_train:]
X_valid = X[n_train:,:]

### Linear Regression

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression

# Empty grid: the search cross-validates the single configuration given below.
hyperparam = {
    #"fit_intercept": [True, False]
}

# `normalize` is deliberately not passed: its default was False, and the
# parameter no longer exists in current scikit-learn releases, where passing
# it raises a TypeError.
opti = GridSearchCV(
    estimator = LinearRegression(
        copy_X=True,
        fit_intercept=True
    ),
    param_grid = hyperparam, 
    cv = 5,
    n_jobs = -1,
    return_train_score = True
)

opti.fit(X=X_train, y=y_train)

print(opti.best_estimator_, "\n",
      opti.best_params_, "\n")

print("{0:8.4f} [CV average score of the best model]".format(
      opti.best_score_ ) )

# R^2 on the rows that were held out of the grid search entirely.
bestmodel = opti.best_estimator_
print("{0:8.4f} [Performance on the hold-out validation/test set]".format(
      r2_score(y_valid, bestmodel.predict(X_valid))) )

Submit it

In [None]:
# Score the test file with the fitted model and write the submission CSV.
df_test = pd.read_csv('../input/test.csv')

# Same six predictors the model was trained on; the test data has no target,
# so no target columns are requested (None) and the second return is discarded.
xcols = ['GrLivArea', 'BsmtFinSF2', '3SsnPorch', 'MiscVal', 'LowQualFinSF', 'PoolArea']
x_test, _ = dataprep_transform(df_test, transformer, xcols, None)

# The model predicts on the scaled target; undo the target scaling with the
# scaler that was fitted on 'SalePrice'.
y_output = bestmodel.predict(x_test.values)
y_predicted = transformer['SalePrice'].inverse_transform(y_output)

result = pd.DataFrame(columns=['Id', 'SalePrice'], index=df_test.index)
result['Id'] = df_test['Id']
result['SalePrice'] = y_predicted

#result
result.to_csv('linear-regression-1.csv', index=False)