In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

import patsy
import statsmodels.api as sm

from sklearn import metrics
from sklearn.metrics import r2_score, recall_score, make_scorer, f1_score, mean_squared_error

from sklearn import linear_model
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, RidgeCV, LassoCV, ElasticNetCV

from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, GridSearchCV , RandomizedSearchCV
from sklearn.feature_selection import SelectKBest, VarianceThreshold, f_regression

from sklearn.preprocessing import StandardScaler, PolynomialFeatures #PowerTransformer

from sklearn.neighbors import KNeighborsRegressor


%config InlineBackend.figure_format = 'retina'
%matplotlib inline

pd.set_option('max_columns',300)

In [2]:
X = pd.read_csv('../datasets/data_clean_final.csv')

### Polynomial Features

In [3]:
y = X[['SalePrice']]
X.drop('SalePrice', axis=1, inplace=True)

In [4]:
poly = PolynomialFeatures(include_bias=False)
X_poly = poly.fit_transform(X)

In [5]:
X_poly.shape

(2048, 22790)

In [6]:
y.shape

(2048, 1)

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X_poly,y)

ss = StandardScaler()
X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)

y_train_log = np.log(y_train)
y_test_log = np.log(y_test)

param_grid_1 = {
    'alpha':[.1,.3,.5,.7,1,5,10],
    'l1_ratio':[0,.3,.5,.7,1]
}

gs_simple = GridSearchCV(ElasticNet(),param_grid_1,cv=5,verbose=1)

gs_simple.fit(X_train_sc,y_train_log)

Fitting 5 folds for each of 35 candidates, totalling 175 fits




[Parallel(n_jobs=1)]: Done 175 out of 175 | elapsed: 49.7min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'alpha': [0.1, 0.3, 0.5, 0.7, 1, 5, 10], 'l1_ratio': [0, 0.3, 0.5, 0.7, 1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [29]:
gs_simple.best_estimator_

ElasticNet(alpha=5, copy_X=True, fit_intercept=True, l1_ratio=0,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

In [30]:
gs_simple.best_score_

0.8900284350012736

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X_poly,y)

ss = StandardScaler()
X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)

y_train_log = np.log(y_train)
y_test_log = np.log(y_test)

gs_simple.best_estimator_.score(X_test_sc,y_test_log)

0.4743659869174198

Clearly this is woefully overfit.  My next step will be to just look at the highest correlated interaction effects and squares:

### Identifying Useful Polynomial Features and Adding Them to X

In [4]:
X = pd.read_csv('../datasets/data_clean.csv')

In [5]:
y = X[['SalePrice']]
X.drop('SalePrice', axis=1, inplace=True)

In [7]:
features = list(X.columns[2:])

In [8]:
X1 = X[features]

In [9]:
X_poly = poly.fit_transform(X1)

In [11]:
len(poly.get_feature_names(features))

22365

In [10]:
X_poly = pd.DataFrame(X_poly,columns=poly.get_feature_names(features))

In [11]:
X_poly['SalePrice'] = y

In [17]:
X_poly_corrs = X_poly.corrwith(X_poly['SalePrice']).sort_values(ascending=False)

In [None]:
X_poly = pd.DataFrame(X_poly,columns=poly.get_feature_names(features))
X_poly['SalePrice'] = y

In [28]:
abs(X_poly_corrs).sort_values(ascending=False)[1:50]

Overall Qual Gr Liv Area        0.836925
Overall Qual^2                  0.826386
Overall Qual Garage Cars        0.820607
Overall Qual Bsmt Qual          0.819747
Overall Qual Garage Area        0.813707
Bsmt Qual Gr Liv Area           0.811035
Overall Qual Year Built         0.807690
Overall Qual Garage Yr Blt      0.806738
Overall Qual Year Remod/Add     0.805478
Overall Qual                    0.800975
Overall Qual Yr Sold            0.800953
Overall Qual TotRms AbvGrd      0.795133
Gr Liv Area Garage Cars         0.793304
Overall Qual 1st Flr SF         0.792189
Overall Qual Street_Pave        0.791525
Garage Cars Kitchen Qual 2      0.790388
Garage Area Kitchen Qual 2      0.788109
Garage Cars Exter Qual 2        0.781557
Gr Liv Area Kitchen Qual 2      0.776863
Garage Area Exter Qual 2        0.776824
Gr Liv Area Exter Qual 2        0.774572
Bsmt Qual Garage Area           0.770933
Bsmt Qual Garage Cars           0.770507
Overall Qual Total Bsmt SF      0.770244
Total Bsmt SF Ga

In [29]:
new_features = abs(X_poly_corrs).sort_values(ascending=False)[1:50].index

In [30]:
X = pd.read_csv('../datasets/data_clean.csv')

In [31]:
X[new_features] = X_poly[new_features]

In [32]:
X.head()

Unnamed: 0.1,Unnamed: 0,Id,PID,Lot Frontage,Lot Area,Alley,Lot Shape,Utilities,Land Slope,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Mas Vnr Area,Exter Qual,Exter Cond,Bsmt Qual,Bsmt Cond,Bsmt Exposure,BsmtFin Type 1,BsmtFin SF 1,BsmtFin Type 2,BsmtFin SF 2,Bsmt Unf SF,Total Bsmt SF,Heating QC,Electrical,1st Flr SF,2nd Flr SF,Low Qual Fin SF,Gr Liv Area,Bsmt Full Bath,Bsmt Half Bath,Full Bath,Half Bath,Bedroom AbvGr,Kitchen AbvGr,Kitchen Qual,TotRms AbvGrd,Functional,Fireplaces,Fireplace Qu,Garage Yr Blt,Garage Finish,Garage Cars,Garage Area,Garage Qual,Garage Cond,Paved Drive,Wood Deck SF,Open Porch SF,Enclosed Porch,3Ssn Porch,Screen Porch,Pool Area,Pool QC,Fence,Misc Val,Yr Sold,SalePrice,MS SubClass_30,MS SubClass_40,MS SubClass_45,MS SubClass_50,MS SubClass_60,MS SubClass_70,MS SubClass_75,MS SubClass_80,MS SubClass_85,MS SubClass_90,MS SubClass_120,MS SubClass_160,MS SubClass_180,MS SubClass_190,MS Zoning_FV,MS Zoning_I (all),MS Zoning_RH,MS Zoning_RL,MS Zoning_RM,Street_Pave,Land Contour_HLS,Land Contour_Low,Land Contour_Lvl,Condition 1_Feedr,Condition 1_Norm,Condition 1_PosA,Condition 1_PosN,Condition 1_RRAe,Condition 1_RRAn,Condition 1_RRNe,Condition 1_RRNn,Condition 2_Norm,Condition 2_PosA,Condition 2_PosN,House Style_1.5Unf,House Style_1Story,House Style_2.5Fin,House Style_2.5Unf,House Style_2Story,House Style_SFoyer,House Style_SLvl,Lot Config_CulDSac,Lot Config_FR2,Lot Config_FR3,Lot Config_Inside,Bldg Type_2fmCon,Bldg Type_Duplex,Bldg Type_Twnhs,Bldg Type_TwnhsE,Neighborhood_dummies_Blueste,Neighborhood_dummies_BrDale,Neighborhood_dummies_BrkSide,Neighborhood_dummies_ClearCr,Neighborhood_dummies_CollgCr,Neighborhood_dummies_Crawfor,Neighborhood_dummies_Edwards,Neighborhood_dummies_Gilbert,Neighborhood_dummies_Greens,Neighborhood_dummies_IDOTRR,Neighborhood_dummies_MeadowV,Neighborhood_dummies_Mitchel,Neighborhood_dummies_NAmes,Neighborhood_dummies_NPkVill,Neighborhood_dummies_NWAmes,Neighborhood_dummies_NoRidge,Neighborhood_dummies_NridgHt,Neighborhood_dummies_OldTown,Neighborhood_dummies_SWISU,Neighborhood_dummies_Sawyer,Neighborhood_dummies_SawyerW,Neighborhood_dummies_Somerst,Neighborhood_dummies_StoneBr,Neighborhood_dummies_Timber,Neighborhood_dummies_Veenker,Roof Style_Gable,Roof Style_Gambrel,Roof Style_Hip,Roof Style_Mansard,Roof Style_Shed,Roof Matl_CompShg,Roof Matl_Membran,Roof Matl_Tar&Grv,Roof Matl_WdShake,Roof Matl_WdShngl,Exterior 1st_AsphShn,Exterior 1st_BrkComm,Exterior 1st_BrkFace,Exterior 1st_CemntBd,Exterior 1st_HdBoard,Exterior 1st_MetalSd,Exterior 1st_Plywood,Exterior 1st_Stucco,Exterior 1st_VinylSd,Exterior 1st_Wd Sdng,Exterior 1st_WdShing,Exterior 2nd_AsphShn,Exterior 2nd_Brk Cmn,Exterior 2nd_BrkFace,Exterior 2nd_CBlock,Exterior 2nd_CmentBd,Exterior 2nd_HdBoard,Exterior 2nd_ImStucc,Exterior 2nd_MetalSd,Exterior 2nd_Plywood,Exterior 2nd_Stucco,Exterior 2nd_VinylSd,Exterior 2nd_Wd Sdng,Exterior 2nd_Wd Shng,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood,Heating_GasW,Heating_Grav,Heating_OthW,Central Air_Y,Sale Type_CWD,Sale Type_Con,Sale Type_ConLD,Sale Type_ConLI,Sale Type_ConLw,Sale Type_New,Sale Type_Oth,Sale Type_WD,Season_Spring,Season_Summer,Season_Winter,Mo Sold_2,Mo Sold_3,Mo Sold_4,Mo Sold_5,Mo Sold_6,Mo Sold_7,Mo Sold_8,Mo Sold_9,Mo Sold_10,Mo Sold_11,Mo Sold_12,Exter Qual 2,Exter Cond 2,Kitchen Qual 2,Garage Type_Attchd,Garage Type_Basment,Garage Type_BuiltIn,Garage Type_CarPort,Garage Type_Detchd,Garage Type_None,Alley Grvl,Alley Pave,Misc Feature_Gar2,Misc Feature_Othr,Misc Feature_Shed,Mas Vnr Type_BrkCmn,Mas Vnr Type_BrkFace,Mas Vnr Type_Stone,Overall Qual Gr Liv Area,Overall Qual^2,Overall Qual Garage Cars,Overall Qual Bsmt Qual,Overall Qual Garage Area,Bsmt Qual Gr Liv Area,Overall Qual Year Built,Overall Qual Garage Yr Blt,Overall Qual Year Remod/Add,Overall Qual Yr Sold,Overall Qual TotRms AbvGrd,Gr Liv Area Garage Cars,Overall Qual 1st Flr SF,Overall Qual Street_Pave,Garage Cars Kitchen Qual 2,Garage Area Kitchen Qual 2,Garage Cars Exter Qual 2,Gr Liv Area Kitchen Qual 2,Garage Area Exter Qual 2,Gr Liv Area Exter Qual 2,Bsmt Qual Garage Area,Bsmt Qual Garage Cars,Overall Qual Total Bsmt SF,Total Bsmt SF Garage Cars,Overall Qual Kitchen Qual 2,Overall Qual Exter Qual 2,1st Flr SF Garage Cars,TotRms AbvGrd Exter Qual 2,1st Flr SF Kitchen Qual 2,Overall Qual Full Bath,Gr Liv Area Garage Area,1st Flr SF Exter Qual 2,Total Bsmt SF Kitchen Qual 2,Bsmt Qual 1st Flr SF,Bsmt Qual Exter Qual 2,TotRms AbvGrd Kitchen Qual 2,Bsmt Qual Kitchen Qual 2,Kitchen Qual Garage Cars,Full Bath Kitchen Qual 2,Gr Liv Area Garage Finish,Garage Finish Kitchen Qual 2,Total Bsmt SF Exter Qual 2,Garage Finish Garage Area,Exter Qual 2 Kitchen Qual 2,Kitchen Qual Garage Area,Garage Finish Exter Qual 2,Full Bath Garage Area,Fireplace Qu Exter Qual 2
0,0,109,533352170,72.774648,13517,0,1,0,0,6.0,8,1976,2005,289.0,1,0,2.5,2.5,0,4.0,533.0,1.5,0.0,192.0,725.0,2,0.0,725,754,0,1479,0.0,0.0,2,1,3,1,1,6,0,0,0.0,1976.0,2,2.0,475.0,2.5,2.5,2,0,44,0,0,0,0,0,0,0,2010,130500,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,8874.0,36.0,12.0,15.0,2850.0,3697.5,11856.0,11856.0,12030.0,12060.0,36.0,2958.0,4350.0,6.0,2.0,475.0,2.0,1479.0,475.0,1479.0,1187.5,5.0,4350.0,1450.0,6.0,6.0,1450.0,6.0,725.0,12.0,702525.0,725.0,725.0,1812.5,2.5,6.0,2.5,2.0,2.0,2958.0,2.0,725.0,950.0,1.0,475.0,2.0,950.0,0.0
1,1,544,531379050,43.0,11492,0,1,0,0,7.0,5,1996,1997,132.0,1,0,3.0,2.5,0,4.0,637.0,1.5,0.0,276.0,913.0,2,0.0,913,1209,0,2122,1.0,0.0,2,1,4,1,1,8,0,1,2.5,1997.0,2,2.0,559.0,2.5,2.5,2,0,74,0,0,0,0,0,0,0,2009,220000,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,14854.0,49.0,14.0,21.0,3913.0,6366.0,13972.0,13979.0,13979.0,14063.0,56.0,4244.0,6391.0,7.0,2.0,559.0,2.0,2122.0,559.0,2122.0,1677.0,6.0,6391.0,1826.0,7.0,7.0,1826.0,8.0,913.0,14.0,1186198.0,913.0,913.0,2739.0,3.0,8.0,3.0,2.0,2.0,4244.0,2.0,913.0,1118.0,1.0,559.0,2.0,1118.0,2.5
2,2,153,535304180,68.0,7922,0,0,0,0,5.0,7,1953,2007,0.0,0,1,2.5,2.5,0,4.0,731.0,1.5,0.0,326.0,1057.0,0,0.0,1057,0,0,1057,1.0,0.0,1,0,3,1,1,5,0,0,0.0,1953.0,1,1.0,246.0,2.5,2.5,2,0,52,0,0,0,0,0,0,0,2010,109000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,5285.0,25.0,5.0,12.5,1230.0,2642.5,9765.0,9765.0,10035.0,10050.0,25.0,1057.0,5285.0,5.0,1.0,246.0,0.0,1057.0,0.0,0.0,615.0,2.5,5285.0,1057.0,5.0,0.0,1057.0,0.0,1057.0,5.0,260022.0,0.0,1057.0,2642.5,0.0,5.0,2.5,1.0,1.0,1057.0,1.0,0.0,246.0,0.0,246.0,0.0,246.0,0.0
3,3,318,916386060,73.0,9802,0,0,0,0,5.0,5,2006,2007,0.0,0,0,3.0,2.5,0,1.5,0.0,1.5,0.0,384.0,384.0,1,0.0,744,700,0,1444,0.0,0.0,2,1,3,1,0,7,0,0,0.0,2007.0,3,2.0,400.0,2.5,2.5,2,100,0,0,0,0,0,0,0,0,2010,174000,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,7220.0,25.0,10.0,15.0,2000.0,4332.0,10030.0,10035.0,10035.0,10050.0,35.0,2888.0,3720.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,1200.0,6.0,1920.0,768.0,0.0,0.0,1488.0,0.0,0.0,10.0,577600.0,0.0,0.0,2232.0,0.0,0.0,0.0,0.0,0.0,4332.0,0.0,0.0,1200.0,0.0,0.0,0.0,800.0,0.0
4,4,255,906425045,82.0,14235,0,1,0,0,6.0,8,1900,1993,0.0,0,0,2.0,3.0,0,1.5,0.0,1.5,0.0,676.0,676.0,0,0.0,831,614,0,1445,0.0,0.0,2,0,3,1,0,6,0,0,0.0,1957.0,1,2.0,484.0,2.5,2.5,0,0,59,0,0,0,0,0,0,0,2010,138500,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,8670.0,36.0,12.0,12.0,2904.0,2890.0,11400.0,11742.0,11958.0,12060.0,36.0,2890.0,4986.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,968.0,4.0,4056.0,1352.0,0.0,0.0,1662.0,0.0,0.0,12.0,699380.0,0.0,0.0,1662.0,0.0,0.0,0.0,0.0,0.0,1445.0,0.0,0.0,484.0,0.0,0.0,0.0,968.0,0.0


In [34]:
X[['Overall Qual','Gr Liv Area','Overall Qual Gr Liv Area']].head()

Unnamed: 0,Overall Qual,Gr Liv Area,Overall Qual Gr Liv Area
0,6.0,1479,8874.0
1,7.0,2122,14854.0
2,5.0,1057,5285.0
3,5.0,1444,7220.0
4,6.0,1445,8670.0


In [35]:
X.to_csv('../datasets/data_clean_poly.csv')

### Grid Search ElasticNet with Poly Features

In [36]:
y = X[['SalePrice']]
X.drop('SalePrice', axis=1, inplace=True)

In [37]:
X.shape

(2048, 263)

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X,y)

ss = StandardScaler()
X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)

y_train_log = np.log(y_train)
y_test_log = np.log(y_test)

param_grid_1 = {
    'alpha':[.1,.3,.5,.6,.7,.8,1,1.5],
    'l1_ratio':[0,.3,.5,.7,1]
}

gs_simple = GridSearchCV(ElasticNet(),param_grid_1,cv=5,verbose=1)

gs_simple.fit(X_train_sc,y_train_log)

Fitting 5 folds for each of 40 candidates, totalling 200 fits




[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:   42.3s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'alpha': [0.1, 0.3, 0.5, 0.6, 0.7, 0.8, 1, 1.5], 'l1_ratio': [0, 0.3, 0.5, 0.7, 1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [39]:
gs_simple.best_score_

0.8736310937539941

In [40]:
pd.DataFrame(gs_simple.best_estimator_.coef_,index=X_train.columns).sort_values(0)

Unnamed: 0,0
Functional,-0.014798
Pool QC,-0.013482
MS SubClass_30,-0.012647
Neighborhood_dummies_IDOTRR,-0.012406
Heating_Grav,-0.011886
Neighborhood_dummies_Edwards,-0.010549
Neighborhood_dummies_MeadowV,-0.009846
MS SubClass_160,-0.008376
Total Bsmt SF Exter Qual 2,-0.008303
Neighborhood_dummies_OldTown,-0.008217


In [41]:
X_train, X_test, y_train, y_test = train_test_split(X,y)

ss = StandardScaler()
X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)

y_train_log = np.log(y_train)
y_test_log = np.log(y_test)

gs_simple.best_estimator_.score(X_test_sc,y_test_log)

0.9229760124729974

### Apply Changes to the Test.csv for Model 3

In [58]:
testset3 = pd.read_csv('../datasets/test_clean.csv')

In [47]:
#I need to create all the polynomial features for the test set, and then pass in the feature names list
#from the top correlated that I took from the train.csv set.

In [59]:
def changes_to_testset(X):
    
    features = list(X.columns[2:])
    X1 = X[features]
    new_poly = poly.fit_transform(X1)
    poly.get_feature_names(features)
    new_poly = pd.DataFrame(X_poly,columns=poly.get_feature_names(features))
    return new_poly   

In [60]:
testset3_poly = changes_to_testset(testset3)

In [61]:
#this uses the new_features list from the train set selected correlations and adds them onto 
testset3[new_features] = testset3_poly[new_features]

In [62]:
testset3.head()

Unnamed: 0.1,Unnamed: 0,Id,PID,Lot Frontage,Lot Area,Alley,Lot Shape,Utilities,Land Slope,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Mas Vnr Area,Exter Qual,Exter Cond,Bsmt Qual,Bsmt Cond,Bsmt Exposure,BsmtFin Type 1,BsmtFin SF 1,BsmtFin Type 2,BsmtFin SF 2,Bsmt Unf SF,Total Bsmt SF,Heating QC,Electrical,1st Flr SF,2nd Flr SF,Low Qual Fin SF,Gr Liv Area,Bsmt Full Bath,Bsmt Half Bath,Full Bath,Half Bath,Bedroom AbvGr,Kitchen AbvGr,Kitchen Qual,TotRms AbvGrd,Functional,Fireplaces,Fireplace Qu,Garage Yr Blt,Garage Finish,Garage Cars,Garage Area,Garage Qual,Garage Cond,Paved Drive,Wood Deck SF,Open Porch SF,Enclosed Porch,3Ssn Porch,Screen Porch,Pool Area,Pool QC,Fence,Misc Val,Yr Sold,MS SubClass_30,MS SubClass_40,MS SubClass_45,MS SubClass_50,MS SubClass_60,MS SubClass_70,MS SubClass_75,MS SubClass_80,MS SubClass_85,MS SubClass_90,MS SubClass_120,MS SubClass_160,MS SubClass_180,MS SubClass_190,MS Zoning_FV,MS Zoning_I (all),MS Zoning_RH,MS Zoning_RL,MS Zoning_RM,Street_Pave,Land Contour_HLS,Land Contour_Low,Land Contour_Lvl,Condition 1_Feedr,Condition 1_Norm,Condition 1_PosA,Condition 1_PosN,Condition 1_RRAe,Condition 1_RRAn,Condition 1_RRNe,Condition 1_RRNn,Condition 2_Norm,Condition 2_PosA,Condition 2_PosN,House Style_1.5Unf,House Style_1Story,House Style_2.5Fin,House Style_2.5Unf,House Style_2Story,House Style_SFoyer,House Style_SLvl,Lot Config_CulDSac,Lot Config_FR2,Lot Config_FR3,Lot Config_Inside,Bldg Type_2fmCon,Bldg Type_Duplex,Bldg Type_Twnhs,Bldg Type_TwnhsE,Neighborhood_dummies_Blueste,Neighborhood_dummies_BrDale,Neighborhood_dummies_BrkSide,Neighborhood_dummies_ClearCr,Neighborhood_dummies_CollgCr,Neighborhood_dummies_Crawfor,Neighborhood_dummies_Edwards,Neighborhood_dummies_Gilbert,Neighborhood_dummies_Greens,Neighborhood_dummies_IDOTRR,Neighborhood_dummies_MeadowV,Neighborhood_dummies_Mitchel,Neighborhood_dummies_NAmes,Neighborhood_dummies_NPkVill,Neighborhood_dummies_NWAmes,Neighborhood_dummies_NoRidge,Neighborhood_dummies_NridgHt,Neighborhood_dummies_OldTown,Neighborhood_dummies_SWISU,Neighborhood_dummies_Sawyer,Neighborhood_dummies_SawyerW,Neighborhood_dummies_Somerst,Neighborhood_dummies_StoneBr,Neighborhood_dummies_Timber,Neighborhood_dummies_Veenker,Roof Style_Gable,Roof Style_Gambrel,Roof Style_Hip,Roof Style_Mansard,Roof Style_Shed,Roof Matl_Metal,Roof Matl_Roll,Roof Matl_Tar&Grv,Roof Matl_WdShake,Roof Matl_WdShngl,Exterior 1st_AsphShn,Exterior 1st_BrkComm,Exterior 1st_BrkFace,Exterior 1st_CemntBd,Exterior 1st_HdBoard,Exterior 1st_MetalSd,Exterior 1st_Plywood,Exterior 1st_Stucco,Exterior 1st_VinylSd,Exterior 1st_Wd Sdng,Exterior 1st_WdShing,Exterior 2nd_AsphShn,Exterior 2nd_Brk Cmn,Exterior 2nd_BrkFace,Exterior 2nd_CBlock,Exterior 2nd_CmentBd,Exterior 2nd_HdBoard,Exterior 2nd_ImStucc,Exterior 2nd_MetalSd,Exterior 2nd_Plywood,Exterior 2nd_Stucco,Exterior 2nd_VinylSd,Exterior 2nd_Wd Sdng,Exterior 2nd_Wd Shng,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood,Heating_GasA,Heating_GasW,Heating_Grav,Central Air_Y,Sale Type_CWD,Sale Type_Con,Sale Type_ConLD,Sale Type_ConLI,Sale Type_ConLw,Sale Type_New,Sale Type_Oth,Sale Type_WD,Season_Spring,Season_Summer,Season_Winter,Mo Sold_2,Mo Sold_3,Mo Sold_4,Mo Sold_5,Mo Sold_6,Mo Sold_7,Mo Sold_8,Mo Sold_9,Mo Sold_10,Mo Sold_11,Mo Sold_12,Exter Qual 2,Exter Cond 2,Kitchen Qual 2,Garage Type_Attchd,Garage Type_Basment,Garage Type_BuiltIn,Garage Type_CarPort,Garage Type_Detchd,Garage Type_None,Alley Grvl,Alley Pave,Misc Feature_Gar2,Misc Feature_Othr,Misc Feature_Shed,Mas Vnr Type_BrkCmn,Mas Vnr Type_BrkFace,Mas Vnr Type_Stone,Overall Qual Gr Liv Area,Overall Qual^2,Overall Qual Garage Cars,Overall Qual Bsmt Qual,Overall Qual Garage Area,Bsmt Qual Gr Liv Area,Overall Qual Year Built,Overall Qual Garage Yr Blt,Overall Qual Year Remod/Add,Overall Qual Yr Sold,Overall Qual TotRms AbvGrd,Gr Liv Area Garage Cars,Overall Qual 1st Flr SF,Overall Qual Street_Pave,Garage Cars Kitchen Qual 2,Garage Area Kitchen Qual 2,Garage Cars Exter Qual 2,Gr Liv Area Kitchen Qual 2,Garage Area Exter Qual 2,Gr Liv Area Exter Qual 2,Bsmt Qual Garage Area,Bsmt Qual Garage Cars,Overall Qual Total Bsmt SF,Total Bsmt SF Garage Cars,Overall Qual Kitchen Qual 2,Overall Qual Exter Qual 2,1st Flr SF Garage Cars,TotRms AbvGrd Exter Qual 2,1st Flr SF Kitchen Qual 2,Overall Qual Full Bath,Gr Liv Area Garage Area,1st Flr SF Exter Qual 2,Total Bsmt SF Kitchen Qual 2,Bsmt Qual 1st Flr SF,Bsmt Qual Exter Qual 2,TotRms AbvGrd Kitchen Qual 2,Bsmt Qual Kitchen Qual 2,Kitchen Qual Garage Cars,Full Bath Kitchen Qual 2,Gr Liv Area Garage Finish,Garage Finish Kitchen Qual 2,Total Bsmt SF Exter Qual 2,Garage Finish Garage Area,Exter Qual 2 Kitchen Qual 2,Kitchen Qual Garage Area,Garage Finish Exter Qual 2,Full Bath Garage Area,Fireplace Qu Exter Qual 2
0,0,2658,902301120,69.0,9142,1,0,0,0,6.0,8,1910,1950,0.0,0,-1,2.0,2.5,0,1.5,0,1.5,0,1020,1020,1,3.0,908,1020,0,1928,0,0,2,0,4,2,-1,9,0,0,0.0,1910.0,1,1,440,1.5,1.5,2,0,60,112,0,0,0,0,0,0,2006,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,-1,-1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,8874.0,36.0,12.0,15.0,2850.0,3697.5,11856.0,11856.0,12030.0,12060.0,36.0,2958.0,4350.0,6.0,2.0,475.0,2.0,1479.0,475.0,1479.0,1187.5,5.0,4350.0,1450.0,6.0,6.0,1450.0,6.0,725.0,12.0,702525.0,725.0,725.0,1812.5,2.5,6.0,2.5,2.0,2.0,2958.0,2.0,725.0,950.0,1.0,475.0,2.0,950.0,0.0
1,1,2718,905108090,79.222222,9662,0,1,0,0,7.0,4,1977,1977,0.0,0,0,3.0,2.5,0,1.5,0,1.5,0,1967,1967,0,0.0,1967,0,0,1967,0,0,2,0,6,2,0,10,0,0,0.0,1977.0,3,2,580,2.5,2.5,2,170,0,0,0,0,0,0,0,0,2006,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,14854.0,49.0,14.0,21.0,3913.0,6366.0,13972.0,13979.0,13979.0,14063.0,56.0,4244.0,6391.0,7.0,2.0,559.0,2.0,2122.0,559.0,2122.0,1677.0,6.0,6391.0,1826.0,7.0,7.0,1826.0,8.0,913.0,14.0,1186198.0,913.0,913.0,2739.0,3.0,8.0,3.0,2.0,2.0,4244.0,2.0,913.0,1118.0,1.0,559.0,2.0,1118.0,2.5
2,2,2414,528218130,58.0,17104,0,1,0,0,5.0,5,2006,2006,0.0,1,0,3.0,3.0,2,4.0,554,1.5,0,100,654,2,0.0,664,832,0,1496,1,0,2,1,3,1,1,7,0,1,3.0,2006.0,2,2,426,2.5,2.5,2,100,24,0,0,0,0,0,0,0,2006,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,5285.0,25.0,5.0,12.5,1230.0,2642.5,9765.0,9765.0,10035.0,10050.0,25.0,1057.0,5285.0,5.0,1.0,246.0,0.0,1057.0,0.0,0.0,615.0,2.5,5285.0,1057.0,5.0,0.0,1057.0,0.0,1057.0,5.0,260022.0,0.0,1057.0,2642.5,0.0,5.0,2.5,1.0,1.0,1057.0,1.0,0.0,246.0,0.0,246.0,0.0,246.0,0.0
3,3,1989,902207150,60.0,8520,0,0,0,0,5.0,6,1923,2006,0.0,1,0,2.5,2.5,0,1.5,0,1.5,0,968,968,0,0.0,968,0,0,968,0,0,1,0,2,1,0,5,0,0,0.0,1935.0,1,2,480,2.0,2.5,0,0,0,184,0,0,0,0,0,0,2007,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,7220.0,25.0,10.0,15.0,2000.0,4332.0,10030.0,10035.0,10035.0,10050.0,35.0,2888.0,3720.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,1200.0,6.0,1920.0,768.0,0.0,0.0,1488.0,0.0,0.0,10.0,577600.0,0.0,0.0,2232.0,0.0,0.0,0.0,0.0,0.0,4332.0,0.0,0.0,1200.0,0.0,0.0,0.0,800.0,0.0
4,4,625,535105100,75.254545,9500,0,1,0,0,6.0,5,1963,1963,247.0,0,0,3.0,2.5,0,3.0,609,1.5,0,785,1394,1,0.0,1394,0,0,1394,1,0,1,1,3,1,0,6,0,2,3.0,1963.0,2,2,514,2.5,2.5,2,0,76,0,0,185,0,0,0,0,2009,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,8670.0,36.0,12.0,12.0,2904.0,2890.0,11400.0,11742.0,11958.0,12060.0,36.0,2890.0,4986.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,968.0,4.0,4056.0,1352.0,0.0,0.0,1662.0,0.0,0.0,12.0,699380.0,0.0,0.0,1662.0,0.0,0.0,0.0,0.0,0.0,1445.0,0.0,0.0,484.0,0.0,0.0,0.0,968.0,0.0


In [63]:
testset3.shape

(879, 263)

In [64]:
X.drop('Unnamed: 0',axis=1,inplace=True)  #accidentally hit this a second time

KeyError: "['Unnamed: 0'] not found in axis"

In [65]:
testset3.drop('Unnamed: 0',axis=1,inplace=True)

In [73]:
X.shape

(2048, 262)

In [71]:
testset3.shape

(879, 262)

### Actual Submission, Model 3

In [74]:
testset3.to_csv('../datasets/test_clean_poly.csv')

In [75]:
gs_en_log_poly = gs_simple.best_estimator_

ss_test = StandardScaler()
X_sc = ss.fit_transform(X)
testset3_sc = ss.transform(testset3)

y_log = np.log(y)

gs_en_log_poly.fit(X_sc,y_log)  

predictions = gs_en_log_poly.predict(testset3_sc)



In [76]:
predictions

array([11.73541679, 12.12896269, 12.05203513, 11.54565722, 12.0385615 ,
       11.7050931 , 11.7335174 , 11.88116265, 11.79428742, 11.89316642,
       11.8558757 , 11.775183  , 11.74039709, 12.45715757, 11.65119489,
       11.90098515, 11.87528078, 11.65670309, 12.10775112, 12.08956197,
       11.98210892, 12.10128173, 12.33381761, 12.15274905, 11.87936011,
       11.86930511, 11.79729579, 11.67867531, 11.90761092, 11.38789746,
       12.063503  , 11.7076796 , 11.75875204, 11.92887618, 12.44701633,
       11.90566031, 11.63279859, 11.66797182, 11.95548763, 11.88706975,
       11.82320583, 12.64420147, 11.88934125, 11.81639554, 11.99337789,
       11.77355023, 12.08709947, 11.90997128, 11.81332501, 11.9290921 ,
       11.7191654 , 11.88485217, 12.32900784, 11.56288499, 11.76619743,
       11.9176775 , 12.01876024, 11.77638604, 11.85901645, 12.18531023,
       12.5821723 , 12.05297766, 12.03618827, 11.87765005, 11.96750422,
       12.20168176, 11.74054615, 12.19255963, 12.18060351, 12.09

In [78]:
predictions = np.exp(predictions)
predictions

array([124918.49877786, 185157.60996243, 171447.9724859 , 103327.33278304,
       169153.43764948, 121187.36575472, 124681.45498443, 144518.47811062,
       132493.30818311, 146263.69775981, 140909.86363189, 129986.12548885,
       125542.18200816, 257083.8476565 , 114828.48811187, 147411.77679606,
       143670.93337452, 115462.73209972, 181271.48845273, 178004.1191956 ,
       159868.82542675, 180102.55642019, 227252.63528331, 189614.63452358,
       144258.21144574, 142814.96413936, 132892.49720258, 118027.78139706,
       148391.73521408,  88247.21879877, 173425.43135486, 121501.22227705,
       127867.78038975, 151581.11672768, 254489.87568214, 148102.56415538,
       112735.38129627, 116771.20897699, 155669.0606297 , 145374.68863457,
       136380.74307836, 309960.91029444, 145705.28335361, 135455.10601626,
       161680.57260548, 129774.06190087, 177566.3234651 , 148742.40703842,
       135039.82507726, 151613.84965072, 122904.80582439, 145052.66566528,
       226162.22788899, 1

In [79]:
testset3['SalePrice'] = predictions

In [80]:
submission = testset3[['Id','SalePrice']]

In [81]:
submission

Unnamed: 0,Id,SalePrice
0,2658,124918.498778
1,2718,185157.609962
2,2414,171447.972486
3,1989,103327.332783
4,625,169153.437649
5,333,121187.365755
6,1327,124681.454984
7,858,144518.478111
8,95,132493.308183
9,1568,146263.697760


In [82]:
submission.to_csv('../datasets/ef_ames_12_6_C.csv',index=False)