In [31]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

import patsy
import statsmodels.api as sm

from sklearn import metrics
from sklearn.metrics import r2_score, recall_score, make_scorer, f1_score, mean_squared_error

from sklearn import linear_model
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, RidgeCV, LassoCV, ElasticNetCV

from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, GridSearchCV , RandomizedSearchCV
from sklearn.feature_selection import SelectKBest, VarianceThreshold, f_regression

from sklearn.preprocessing import StandardScaler, PolynomialFeatures #PowerTransformer

from sklearn.neighbors import KNeighborsRegressor


%config InlineBackend.figure_format = 'retina'
%matplotlib inline

pd.set_option('max_columns',300)

In [32]:
X = pd.read_csv('../datasets/data_clean.csv')

In [33]:
y = X['SalePrice']
X.drop('SalePrice', axis=1, inplace=True)

Seems like the significant variables do not really mirror the skew of Sale Price, so I'll go ahead and apply the log transform to Sale Price only.

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X,y)

ss = StandardScaler()
X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)

y_train_log = np.log(y_train)
y_test_log = np.log(y_test)

param_grid_1 = {
    'alpha':[.1,.3,.5,.6,.7,.8,1,1.5],
    'l1_ratio':[0,.3,.5,.7,1]
}

gs_simple = GridSearchCV(ElasticNet(),param_grid_1,cv=5,verbose=1)

gs_simple.fit(X_train_sc,y_train_log)  

Fitting 5 folds for each of 40 candidates, totalling 200 fits




[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:   41.3s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'alpha': [0.1, 0.3, 0.5, 0.6, 0.7, 0.8, 1, 1.5], 'l1_ratio': [0, 0.3, 0.5, 0.7, 1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [35]:
gs_simple.best_score_

0.8611790233367005

In [36]:
gs_simple.best_estimator_

ElasticNet(alpha=0.6, copy_X=True, fit_intercept=True, l1_ratio=0,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

In [37]:
pd.DataFrame(gs_simple.best_estimator_.coef_,index=X_train.columns).sort_values(0)

Unnamed: 0,0
Functional,-0.018079
Misc Val,-0.017049
Neighborhood_dummies_Edwards,-0.013726
MS SubClass_30,-0.012446
Neighborhood_dummies_MeadowV,-0.012098
Neighborhood_dummies_IDOTRR,-0.010716
MS SubClass_160,-0.008626
Garage Type_Detchd,-0.008398
Roof Style_Mansard,-0.007980
Neighborhood_dummies_OldTown,-0.007914


In [38]:
X_train, X_test, y_train, y_test = train_test_split(X,y)

ss = StandardScaler()
X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)

y_train_log = np.log(y_train)
y_test_log = np.log(y_test)

gs_simple.best_estimator_.score(X_test_sc,y_test_log)

0.9203114210423116

### Actual Test Application, Model 2 (GridSearch with Elastic Net and Log Transform of Y)

In [39]:
testset = pd.read_csv('../datasets/test_clean.csv')

In [40]:
gs_en_log = gs_simple.best_estimator_

ss = StandardScaler()
X_sc = ss.fit_transform(X)
testset_sc = ss.transform(testset)

y_log = np.log(y)

gs_en_log.fit(X_sc,y_log)  

predictions = gs_en_log.predict(testset_sc)



In [41]:
predictions

array([11.64108652, 12.03624794, 12.30645547, 11.54222873, 12.08697937,
       11.48838196, 11.59925705, 11.92017284, 12.04918059, 11.95252513,
       12.00575364, 11.61766091, 11.84869135, 12.54359076, 11.54143859,
       11.78286629, 11.95026805, 11.65438132, 12.14789456, 12.10206124,
       12.02101808, 11.79003559, 12.14719851, 12.09578963, 12.07413466,
       11.75556037, 11.86096435, 11.7187531 , 12.03067389, 11.22148033,
       11.65705885, 11.45040454, 11.76393241, 11.95769813, 12.29947196,
       12.06585918, 11.6246999 , 11.3926478 , 11.85031007, 12.14298747,
       12.03834325, 12.26951025, 11.91427193, 11.86772853, 12.2829612 ,
       11.46825504, 12.310402  , 11.79120177, 11.82937814, 11.72258125,
       11.58609863, 12.07748795, 12.36761613, 11.71756567, 11.67519363,
       11.94442407, 11.94122668, 11.84220161, 11.95026626, 12.35026689,
       12.59161953, 11.67787434, 11.89245201, 12.0010943 , 12.15270728,
       12.65605221, 11.55453751, 12.225162  , 11.52978344, 12.02

In [46]:
predictions = np.exp(predictions)
predictions

array([113673.60653694, 168762.5446938 , 221118.8173172 , 102973.68263861,
       177544.99806037,  97575.52389591, 109016.77472666, 150267.57890912,
       170959.26465083, 155208.57408794, 163693.92360355, 111041.68043158,
       139901.14531945, 280292.99864961, 102892.35191148, 130988.69417478,
       154858.65092187, 115194.96515528, 188696.38234206, 180243.00346862,
       166211.78691861, 131931.16587417, 188565.08607954, 179116.12746413,
       175279.06921724, 127460.31927609, 141628.7322337 , 122854.14327708,
       167824.46959788,  74718.30063438, 115503.81589968,  93939.34040329,
       128531.90128889, 156013.54827671, 219580.01129329, 173834.53586043,
       111826.05955587,  88667.42061081, 140127.79050175, 187772.6997244 ,
       169116.52415519, 213098.6000709 , 149383.47386619, 142589.98107904,
       215984.34359665,  95631.26069137, 221993.1924109 , 132085.10982685,
       137225.12949578, 123325.34764279, 107591.68366733, 175867.81615917,
       235064.71388999, 1

In [48]:
testset['SalePrice'] = predictions

In [49]:
submission = testset[['Id','SalePrice']]

In [50]:
submission.to_csv('../datasets/ef_ames_12_6_B.csv',index=False)

In [51]:
submission

Unnamed: 0,Id,SalePrice
0,2658,113673.606537
1,2718,168762.544694
2,2414,221118.817317
3,1989,102973.682639
4,625,177544.998060
5,333,97575.523896
6,1327,109016.774727
7,858,150267.578909
8,95,170959.264651
9,1568,155208.574088
