# -------------------------------------
# *Regression Challenge Section*
# -------------------------------------

In [186]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso, Ridge, ElasticNet, LassoCV, RidgeCV, ElasticNetCV
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.pipeline import Pipeline, FeatureUnion # These ones are new!
from sklearn.preprocessing import PolynomialFeatures, FunctionTransformer, Imputer

%matplotlib inline

sns.set_style('darkgrid')

### Reading in the data

In [187]:
train = pd.read_csv('train.csv')
train.head(3)

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,,,,0,3,2010,WD,Normal,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,,,,0,4,2009,WD,Normal,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,,,,0,1,2010,WD,Abnorml,109000


In [188]:
test = pd.read_csv('test.csv')
test.head(3)

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,3Ssn Porch,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type
0,2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,...,0,0,0,,,,0,4,2006,WD
1,2718,905108090,90,RL,,9662,Pave,,IR1,Lvl,...,0,0,0,,,,0,8,2006,WD
2,2414,528218130,60,RL,58.0,17104,Pave,,IR1,Lvl,...,0,0,0,,,,0,9,2006,New


### Let's do some EDA + cleaning!

In [189]:
train.columns

Index(['Id', 'PID', 'MS SubClass', 'MS Zoning', 'Lot Frontage', 'Lot Area',
       'Street', 'Alley', 'Lot Shape', 'Land Contour', 'Utilities',
       'Lot Config', 'Land Slope', 'Neighborhood', 'Condition 1',
       'Condition 2', 'Bldg Type', 'House Style', 'Overall Qual',
       'Overall Cond', 'Year Built', 'Year Remod/Add', 'Roof Style',
       'Roof Matl', 'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type',
       'Mas Vnr Area', 'Exter Qual', 'Exter Cond', 'Foundation', 'Bsmt Qual',
       'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin SF 1',
       'BsmtFin Type 2', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF',
       'Heating', 'Heating QC', 'Central Air', 'Electrical', '1st Flr SF',
       '2nd Flr SF', 'Low Qual Fin SF', 'Gr Liv Area', 'Bsmt Full Bath',
       'Bsmt Half Bath', 'Full Bath', 'Half Bath', 'Bedroom AbvGr',
       'Kitchen AbvGr', 'Kitchen Qual', 'TotRms AbvGrd', 'Functional',
       'Fireplaces', 'Fireplace Qu', 'Garage Type', 'Garage Yr Blt',
       'G

#### A reference for the features: https://www.kaggle.com/c/dsi-west-3-project-2-regression-challenge/data

#### For this competition we are trying to predict 'SalePrice', so that's our y.

In [190]:
# First I want the column names to be uniformly formatted just for consistency.

train.rename(columns= {col: col.lower().replace(' ','_') for col in train.columns}, inplace=True)
test.rename(columns= {col: col.lower().replace(' ','_') for col in test.columns}, inplace=True)

In [191]:
train.head(2)

Unnamed: 0,id,pid,ms_subclass,ms_zoning,lot_frontage,lot_area,street,alley,lot_shape,land_contour,...,pool_area,pool_qc,fence,misc_feature,misc_val,mo_sold,yr_sold,sale_type,sale_condition,saleprice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,,,,0,3,2010,WD,Normal,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,,,,0,4,2009,WD,Normal,220000


In [192]:
test.head(2)

Unnamed: 0,id,pid,ms_subclass,ms_zoning,lot_frontage,lot_area,street,alley,lot_shape,land_contour,...,3ssn_porch,screen_porch,pool_area,pool_qc,fence,misc_feature,misc_val,mo_sold,yr_sold,sale_type
0,2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,...,0,0,0,,,,0,4,2006,WD
1,2718,905108090,90,RL,,9662,Pave,,IR1,Lvl,...,0,0,0,,,,0,8,2006,WD


In [193]:
# Next let's check out some high level summaries.

train.shape

(2051, 82)

In [194]:
train.dtypes.value_counts()

object     43
int64      28
float64    11
dtype: int64

In [195]:
# SO many object types, it would be awesome if we could convert those to ints or floats
# to use in our model if they are reasonable...

In [196]:
# But for now it looks like we've got quite a few incomplete features, let's see the worst offenders.

train.isnull().sum().sort_values(ascending = False)[0:10]

pool_qc          2042
misc_feature     1986
alley            1911
fence            1651
fireplace_qu     1000
lot_frontage      330
garage_cond       114
garage_finish     114
garage_yr_blt     114
garage_qual       114
dtype: int64

In [197]:
# So far dropping 'pool_qc' , 'misc_feature' , 'alley' , 'fence' since basically at least 
# 3/4 of the data is a null or missing value.

train.drop(['pool_qc', 'misc_feature', 'alley', 'fence', 'fireplace_qu'], axis=1, inplace=True)
train.shape

(2051, 77)

In [198]:
# Lazy feature hunting

nonnull_numeric_shared = (train._get_numeric_data().isnull().sum() == 0) & (test._get_numeric_data().isnull().sum() == 0)
nonnull_numeric_shared[0:6]

1st_flr_sf         True
2nd_flr_sf         True
3ssn_porch         True
bedroom_abvgr      True
bsmt_full_bath    False
bsmt_half_bath    False
dtype: bool

In [199]:
type(nonnull_numeric_shared)

pandas.core.series.Series

In [200]:
lazy_features = ['1st_flr_sf', '2nd_flr_sf', '3ssn_porch', 'bedroom_abvgr', 'enclosed_porch', 'fireplaces', 'full_bath', 'half_bath', 'id', 'kitchen_abvgr', 'lot_area', 'low_qual_fin_sf', 'misc_val', 'mo_sold', 'ms_subclass', 'open_porch_sf', 'overall_cond', 'pid', 'pool_area', 'screen_porch', 'totrms_abvgrd', 'wood_deck_sf', 'year_built', 'year_remod/add', 'yr_sold']

In [201]:
# Now let's change some of those dtypes, especially ones we can swap to ORDERED CATEGORIES...
# Let's switch exter_qual
train.exter_qual.value_counts()

TA    1247
Gd     697
Ex      81
Fa      26
Name: exter_qual, dtype: int64

In [202]:
# simple_qual_encoded = {
#     'Ex': 5,
#     'Gd': 4,
#     'TA': 3,
#     'Fa': 2,
#     'Po': 1,
#     np.nan: 0
# }
# train['exter_qual'] = train['exter_qual'].map(simple_qual_encoded)
# train['kitchen_qual'] = train['kitchen_qual'].map(simple_qual_encoded)
# train['fireplace_qu'] = train['fireplace_qu'].map(simple_qual_encoded)
# train['garage_cond'] = train['garage_cond'].map(simple_qual_encoded)
# train['garage_qual'] = train['garage_qual'].map(simple_qual_encoded)

In [203]:
train['exter_qual'] = train['exter_qual'].astype('category', categories = ['Po', 'Fa', 'TA', 'Gd', 'Ex'], ordered = True)
train = pd.get_dummies(train, columns = ['exter_qual'], drop_first = True)
train.head(2)

Unnamed: 0,id,pid,ms_subclass,ms_zoning,lot_frontage,lot_area,street,lot_shape,land_contour,utilities,...,misc_val,mo_sold,yr_sold,sale_type,sale_condition,saleprice,exter_qual_Fa,exter_qual_TA,exter_qual_Gd,exter_qual_Ex
0,109,533352170,60,RL,,13517,Pave,IR1,Lvl,AllPub,...,0,3,2010,WD,Normal,130500,0,0,1,0
1,544,531379050,60,RL,43.0,11492,Pave,IR1,Lvl,AllPub,...,0,4,2009,WD,Normal,220000,0,0,1,0


In [204]:
train['kitchen_qual'] = train['kitchen_qual'].astype('category', categories = ['Po', 'Fa', 'TA', 'Gd', 'Ex'], ordered = True)
train = pd.get_dummies(train, columns = ['kitchen_qual'], drop_first = True)
train.head(2)

Unnamed: 0,id,pid,ms_subclass,ms_zoning,lot_frontage,lot_area,street,lot_shape,land_contour,utilities,...,sale_condition,saleprice,exter_qual_Fa,exter_qual_TA,exter_qual_Gd,exter_qual_Ex,kitchen_qual_Fa,kitchen_qual_TA,kitchen_qual_Gd,kitchen_qual_Ex
0,109,533352170,60,RL,,13517,Pave,IR1,Lvl,AllPub,...,Normal,130500,0,0,1,0,0,0,1,0
1,544,531379050,60,RL,43.0,11492,Pave,IR1,Lvl,AllPub,...,Normal,220000,0,0,1,0,0,0,1,0


In [205]:
# train['fireplace_qu'] = train['fireplace_qu'].astype('category', categories = ['Po', 'Fa', 'TA', 'Gd', 'Ex'], ordered = True)
# train = pd.get_dummies(train, columns = ['fireplace_qu'], drop_first = True)
# train.head(2)

In [206]:
train['garage_cond'] = train['garage_cond'].astype('category', categories = ['Po', 'Fa', 'TA', 'Gd', 'Ex'], ordered = True)
train = pd.get_dummies(train, columns = ['garage_cond'], drop_first = True)
train.head(2)

Unnamed: 0,id,pid,ms_subclass,ms_zoning,lot_frontage,lot_area,street,lot_shape,land_contour,utilities,...,exter_qual_Gd,exter_qual_Ex,kitchen_qual_Fa,kitchen_qual_TA,kitchen_qual_Gd,kitchen_qual_Ex,garage_cond_Fa,garage_cond_TA,garage_cond_Gd,garage_cond_Ex
0,109,533352170,60,RL,,13517,Pave,IR1,Lvl,AllPub,...,1,0,0,0,1,0,0,1,0,0
1,544,531379050,60,RL,43.0,11492,Pave,IR1,Lvl,AllPub,...,1,0,0,0,1,0,0,1,0,0


In [207]:
train['garage_qual'] = train['garage_qual'].astype('category', categories = ['Po', 'Fa', 'TA', 'Gd', 'Ex'], ordered = True)
train = pd.get_dummies(train, columns = ['garage_qual'], drop_first = True)
train.head(2)

Unnamed: 0,id,pid,ms_subclass,ms_zoning,lot_frontage,lot_area,street,lot_shape,land_contour,utilities,...,kitchen_qual_Gd,kitchen_qual_Ex,garage_cond_Fa,garage_cond_TA,garage_cond_Gd,garage_cond_Ex,garage_qual_Fa,garage_qual_TA,garage_qual_Gd,garage_qual_Ex
0,109,533352170,60,RL,,13517,Pave,IR1,Lvl,AllPub,...,1,0,0,1,0,0,0,1,0,0
1,544,531379050,60,RL,43.0,11492,Pave,IR1,Lvl,AllPub,...,1,0,0,1,0,0,0,1,0,0


In [208]:
train['neighborhood'] = train['neighborhood'].astype('category')
train = pd.get_dummies(train, columns = ['neighborhood'], drop_first = True)
train.head(2)

Unnamed: 0,id,pid,ms_subclass,ms_zoning,lot_frontage,lot_area,street,lot_shape,land_contour,utilities,...,neighborhood_NoRidge,neighborhood_NridgHt,neighborhood_OldTown,neighborhood_SWISU,neighborhood_Sawyer,neighborhood_SawyerW,neighborhood_Somerst,neighborhood_StoneBr,neighborhood_Timber,neighborhood_Veenker
0,109,533352170,60,RL,,13517,Pave,IR1,Lvl,AllPub,...,0,0,0,0,1,0,0,0,0,0
1,544,531379050,60,RL,43.0,11492,Pave,IR1,Lvl,AllPub,...,0,0,0,0,0,1,0,0,0,0


In [209]:
# lot_shape_encoded = {
#     'Reg': 4,
#     'IR1': 3,
#     'IR2': 2,
#     'IR3': 1
# }

# train['lot_shape'] = train['lot_shape'].map(lot_shape_encoded)
# # train = pd.get_dummies(train, columns = ['lot_shape'], drop_first = True)
# train.head(2)

In [210]:
# train['ms_zoning'] = train['ms_zoning'].astype('category')
# train = pd.get_dummies(train, columns = ['ms_zoning'], drop_first = True)
# train.head(2)

In [211]:
train.corr()['saleprice'].abs().sort_values(ascending= False)[0:20];

In [212]:
train.isnull().sum().sort_values(ascending=False);

In [213]:
# Are they all numeric / usable / non-null for our model..?

In [214]:
train.dropna(subset = ['garage_area'], how = 'any', inplace=True)

In [215]:
train.shape

(2050, 115)

In [216]:
train.loc[train['total_bsmt_sf'] == np.nan, :]

Unnamed: 0,id,pid,ms_subclass,ms_zoning,lot_frontage,lot_area,street,lot_shape,land_contour,utilities,...,neighborhood_NoRidge,neighborhood_NridgHt,neighborhood_OldTown,neighborhood_SWISU,neighborhood_Sawyer,neighborhood_SawyerW,neighborhood_Somerst,neighborhood_StoneBr,neighborhood_Timber,neighborhood_Veenker


In [217]:
train.dropna(subset = ['total_bsmt_sf'], how = 'any', inplace=True)

In [218]:
train.shape

(2049, 115)

In [219]:
lazy_features

['1st_flr_sf',
 '2nd_flr_sf',
 '3ssn_porch',
 'bedroom_abvgr',
 'enclosed_porch',
 'fireplaces',
 'full_bath',
 'half_bath',
 'id',
 'kitchen_abvgr',
 'lot_area',
 'low_qual_fin_sf',
 'misc_val',
 'mo_sold',
 'ms_subclass',
 'open_porch_sf',
 'overall_cond',
 'pid',
 'pool_area',
 'screen_porch',
 'totrms_abvgrd',
 'wood_deck_sf',
 'year_built',
 'year_remod/add',
 'yr_sold']

In [245]:
# Cleaned up just top corr features
top_corr_features = ['overall_qual', 
                     #'gr_liv_area', 
                     #'garage_area', 
                     'garage_cars', 
                     #'total_bsmt_sf', 
                     'bsmt_half_bath', 
                     'bsmt_full_bath']

In [253]:
qual_features = [col for col in train.columns if ('_qual_' in col) & (col != 'low_qual_fin_sf')]
qual_features

['exter_qual_Fa',
 'exter_qual_TA',
 'exter_qual_Gd',
 'exter_qual_Ex',
 'kitchen_qual_Fa',
 'kitchen_qual_TA',
 'kitchen_qual_Gd',
 'kitchen_qual_Ex',
 'garage_qual_Fa',
 'garage_qual_TA',
 'garage_qual_Gd',
 'garage_qual_Ex']

In [222]:
# simple_qual_features = ['exter_qual', 'kitchen_qual', 'fireplace_qu', 'garage_cond', 'garage_qual']

In [223]:
neighborhood_features = [col for col in train.columns if 'neighborhood' in col]

In [224]:
# ms_zoning_features = [col for col in train.columns if 'zoning' in col]

In [225]:
# lot_shape_features = [col for col in train.columns if 'lot_shape_' in col]

In [226]:
# lot_shape_encoded_features = ['lot_shape']

In [227]:
only_top_corr_dummy_features = [
    'overall_qual',
    'gr_liv_area',
    'garage_area',
    'garage_cars',
    'total_bsmt_sf',
    '1st_flr_sf',
    'exter_qual_TA',
    'year_built',
    'kitchen_qual_Ex',
    'year_remod/add',
    'kitchen_qual_TA',
    'full_bath',
    'garage_yr_blt',
    'mas_vnr_area',
    'totrms_abvgrd',
    'exter_qual_Ex',
    'fireplaces',
    'neighborhood_NridgHt',
    'exter_qual_Gd',
    'bsmtfin_sf_1',
    'fireplace_qu_Gd',
    'lot_frontage',
    'open_porch_sf',
    'wood_deck_sf',
    'kitchen_qual_Gd',
    'lot_shape_4',
    'lot_area',
    'bsmt_full_bath',
    'half_bath',
]

In [228]:
train['total_porch_area'] = train['wood_deck_sf'] + train['open_porch_sf'] + train['enclosed_porch'] + train['3ssn_porch'] + train['screen_porch']

train.drop(['wood_deck_sf', 'open_porch_sf', 'enclosed_porch', '3ssn_porch', 'screen_porch'], axis=1, inplace=True)

# Seem to be quite a few dealing with area / square footage:

train['master_sq_ft'] = train['lot_area'] + train['total_bsmt_sf'] + train['low_qual_fin_sf'] + \
train['gr_liv_area'] + train['garage_area'] + train['total_porch_area'] +\
train['pool_area'] + train['mas_vnr_area'] + train['bsmtfin_sf_1'] + train['bsmtfin_sf_2'] +\
train['bsmt_unf_sf'] + train['1st_flr_sf'] + train['2nd_flr_sf']

train.drop(['lot_area','total_bsmt_sf','low_qual_fin_sf', 'gr_liv_area', 'garage_area',\
            'total_porch_area', 'pool_area', 'mas_vnr_area', 'bsmtfin_sf_1', \
            'bsmtfin_sf_2', 'bsmt_unf_sf', '1st_flr_sf', '2nd_flr_sf'], axis=1, inplace=True)

In [229]:
master_feature = ['master_sq_ft']

In [230]:
updated_features = master_feature + top_corr_features + qual_features + neighborhood_features

updated_features[0:5]

['master_sq_ft', 'overall_qual', 'gr_liv_area', 'garage_area', 'garage_cars']

In [231]:
# train.dropna(subset = ['bsmt_half_bath'], how = 'any', inplace=True)

In [232]:
train.corr()['saleprice'].abs().sort_values(ascending=False);

In [233]:
only_topcorr_features_encoded = [
    'overall_qual','exter_qual','gr_liv_area','kitchen_qual','garage_area','garage_cars','total_bsmt_sf', '1st_flr_sf', 'year_built','year_remod/add', 'fireplace_qu','full_bath','garage_yr_blt','mas_vnr_area','totrms_abvgrd'
]

In [234]:
train.isnull().sum().sort_values(ascending = False);

# Cleaning the test data in the same way

In [235]:
# test.dropna(subset = ['garage_area'], how = 'any', inplace=True)
# test.dropna(subset = ['total_bsmt_sf'], how = 'any', inplace=True)
# test.dropna(subset = ['bsmt_half_bath'], how = 'any', inplace=True)

In [236]:
# test['exter_qual'] = test['exter_qual'].map(simple_qual_encoded)
# test['kitchen_qual'] = test['kitchen_qual'].map(simple_qual_encoded)
# test['fireplace_qu'] = test['fireplace_qu'].map(simple_qual_encoded)
# test['garage_cond'] = test['garage_cond'].map(simple_qual_encoded)
# test['garage_qual'] = test['garage_qual'].map(simple_qual_encoded)
# test['lot_shape'] = test['lot_shape'].map(lot_shape_encoded)
# test = pd.get_dummies(test, columns = ['lot_shape'], drop_first = True)

In [237]:
test['exter_qual'] = test['exter_qual'].astype('category', categories = ['Po', 'Fa', 'TA', 'Gd', 'Ex'], ordered = True)
test = pd.get_dummies(test, columns = ['exter_qual'], drop_first = True)

test['kitchen_qual'] = test['kitchen_qual'].astype('category', categories = ['Po', 'Fa', 'TA', 'Gd', 'Ex'], ordered = True)
test = pd.get_dummies(test, columns = ['kitchen_qual'], drop_first = True)

test['fireplace_qu'] = test['fireplace_qu'].astype('category', categories = ['Po', 'Fa', 'TA', 'Gd', 'Ex'], ordered = True)
test = pd.get_dummies(test, columns = ['fireplace_qu'], drop_first = True)

test['garage_cond'] = test['garage_cond'].astype('category', categories = ['Po', 'Fa', 'TA', 'Gd', 'Ex'], ordered = True)
test = pd.get_dummies(test, columns = ['garage_cond'], drop_first = True)

test['garage_qual'] = test['garage_qual'].astype('category', categories = ['Po', 'Fa', 'TA', 'Gd', 'Ex'], ordered = True)
test = pd.get_dummies(test, columns = ['garage_qual'], drop_first = True)

In [238]:
test['neighborhood'] = test['neighborhood'].astype('category')
test = pd.get_dummies(test, columns = ['neighborhood'], drop_first = True)

In [239]:
test['total_porch_area'] = test['wood_deck_sf'] + test['open_porch_sf'] + test['enclosed_porch'] + test['3ssn_porch'] + test['screen_porch']
test.drop(['wood_deck_sf', 'open_porch_sf', 'enclosed_porch', '3ssn_porch', 'screen_porch'], axis=1, inplace=True)

test['master_sq_ft'] = test['lot_area'] + test['total_bsmt_sf'] + test['low_qual_fin_sf'] + \
test['gr_liv_area'] + test['garage_area'] + test['total_porch_area'] +\
test['pool_area'] + test['mas_vnr_area'] + test['bsmtfin_sf_1'] + test['bsmtfin_sf_2'] +\
test['bsmt_unf_sf'] + test['1st_flr_sf'] + test['2nd_flr_sf']
test.drop(['lot_area','total_bsmt_sf','low_qual_fin_sf', 'gr_liv_area', 'garage_area',\
            'total_porch_area', 'pool_area', 'mas_vnr_area', 'bsmtfin_sf_1', \
            'bsmtfin_sf_2', 'bsmt_unf_sf', '1st_flr_sf', '2nd_flr_sf'], axis=1, inplace=True)

In [240]:
# test['ms_zoning'] = test['ms_zoning'].astype('category')
# test = pd.get_dummies(test, columns = ['ms_zoning'], drop_first = True)
# test.head(2)

In [241]:
# (train._get_numeric_data().isnull().sum() == 0) & (test._get_numeric_data().isnull().sum() == 0)

# Let's set up and run a quickie model!

In [254]:
updated_features = master_feature + top_corr_features + qual_features + neighborhood_features

In [255]:
X = train[updated_features]
y = train['saleprice']

In [256]:
# Train data
# Holdout set (mini test)
# True test (don't show model)

X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, random_state = 42)

In [None]:
# features = FeatureUnion([
#     ('basic_features_tf', basic_features_tf),
#     ('encode_gender_tf', encode_gender_tf),
#     ('dummy_embarked_tf', dummy_embarked_tf)
# ])

# features.transform(X_train)

In [257]:
imp = Imputer()
ss = StandardScaler()

lasso = Lasso(max_iter=2500)
ridge = Ridge(max_iter=2000)
enet = ElasticNet()

pipe = Pipeline([                               # Things need to be in order like a factory
#     ('features', features),
    ('imp', imp),
#     ('poly', poly),
    ('ss', ss),
#     ('lasso', lasso),
#     ('ridge', ridge),
    ('enet', enet),
])

In [258]:
params = {
    'imp__strategy': ['mean', 'median','most_frequent'],
#     'lasso__alpha': np.arange(.001, .15, .0025),
#     'ridge__alpha': np.logspace(0, 5, 200),
    'enet__alpha' : np.arange(.01, 1.0, .005),
    'enet__l1_ratio': [.1, .5, .7,.9, .95, .99, 1]
}
rs = RandomizedSearchCV(pipe, param_distributions=params, n_iter=10)
rs.fit(X_train, y_train)
print (rs.best_score_)
print (rs.best_params_)

0.803955810951
{'imp__strategy': 'median', 'enet__l1_ratio': 0.9, 'enet__alpha': 0.68499999999999994}


In [259]:
# # Scoring it on the holdout set to make sure we're not overfitting
# # Compare to the cross_val_score

# rs.score(X_hs, y_holdout)

# # probably not great

In [260]:
test.shape

(879, 103)

In [261]:
test['neighborhood_GrnHill'] = [0 for x in range(1, 880)]
test['neighborhood_Landmrk'] = [0 for x in range(1, 880)]
test['ms_zoning_C (all)'] = [0 for x in range(1, 880)]

In [262]:
# ss = StandardScaler()
X_test = test[updated_features]
# ss.fit(X_test)
# X_test_scaled = ss.transform(X_test)
# X_test_scaled

In [263]:
predictions = rs.best_estimator_.predict(X_test)

In [264]:
test['SalePrice'] = predictions

In [265]:
test = test.rename(columns={'id': 'Id'})

In [266]:
test.head(3)

Unnamed: 0,Id,pid,ms_subclass,ms_zoning,lot_frontage,street,alley,lot_shape,land_contour,utilities,...,neighborhood_SawyerW,neighborhood_Somerst,neighborhood_StoneBr,neighborhood_Timber,neighborhood_Veenker,master_sq_ft,neighborhood_GrnHill,neighborhood_Landmrk,ms_zoning_C (all),SalePrice
0,2658,902301120,190,RM,69.0,Pave,Grvl,Reg,Lvl,AllPub,...,0,0,0,0,0,15650.0,0,0,0,136631.525332
1,2718,905108090,90,RL,,Pave,,IR1,Lvl,AllPub,...,0,0,0,0,0,18280.0,0,0,0,142157.364683
2,2414,528218130,60,RL,58.0,Pave,,IR1,Lvl,AllPub,...,0,0,0,0,0,21954.0,0,0,0,224500.248808


In [267]:
# test[['Id', 'SalePrice']].to_csv('p2_submission_1.csv', index=False)
# test[['Id', 'SalePrice']].to_csv('p2_reg_2.csv', index=False)
# test[['Id', 'SalePrice']].to_csv('p2_reg_3.csv', index=False)
# test[['Id', 'SalePrice']].to_csv('p2_reg_4.csv', index=False)
# test[['Id', 'SalePrice']].to_csv('p2_reg_5.csv', index=False)
# test[['Id', 'SalePrice']].to_csv('p2_reg_6.csv', index=False)
# test[['Id', 'SalePrice']].to_csv('p2_reg_7.csv', index=False)
# test[['Id', 'SalePrice']].to_csv('p2_reg_8.csv', index=False)
test[['Id', 'SalePrice']].to_csv('p2_reg_23.csv', index=False)

In [390]:
submission = pd.read_csv('p2_reg_9.csv')
submission.head(30)

Unnamed: 0,Id,SalePrice
0,2658,130730.379354
1,2718,189352.05346
2,2414,214767.7885
3,1989,132172.008573
4,625,185008.28323


# ----------------------------------------
# *Classification Challenge Section*
# ----------------------------------------

In [268]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [272]:
train = pd.read_csv('train.csv')
train.head(3)

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,,,,0,3,2010,WD,Normal,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,,,,0,4,2009,WD,Normal,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,,,,0,1,2010,WD,Abnorml,109000


In [273]:
test = pd.read_csv('test.csv')
test.head(3)

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,3Ssn Porch,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type
0,2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,...,0,0,0,,,,0,4,2006,WD
1,2718,905108090,90,RL,,9662,Pave,,IR1,Lvl,...,0,0,0,,,,0,8,2006,WD
2,2414,528218130,60,RL,58.0,17104,Pave,,IR1,Lvl,...,0,0,0,,,,0,9,2006,New


In [274]:
train.rename(columns= {col: col.lower().replace(' ','_') for col in train.columns}, inplace=True)
test.rename(columns= {col: col.lower().replace(' ','_') for col in test.columns}, inplace=True)

In [275]:
train['sale_condition'].value_counts(normalize=True)

Normal     0.826914
Partial    0.079961
Abnorml    0.064359
Family     0.014139
Alloca     0.009264
AdjLand    0.005363
Name: sale_condition, dtype: float64

In [276]:
# Abnormal is JUST 'Abnorml', everything else is Normal

encoded_conditon = {
    'Abnorml': 1,
    'Partial': 0,
    'AdjLand': 0,
    'Family': 0,
    'Normal': 0,
    'Alloca': 0
}
train['sale_condition'] = train['sale_condition'].map(encoded_conditon)
train.head()

Unnamed: 0,id,pid,ms_subclass,ms_zoning,lot_frontage,lot_area,street,alley,lot_shape,land_contour,...,pool_area,pool_qc,fence,misc_feature,misc_val,mo_sold,yr_sold,sale_type,sale_condition,saleprice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,,,,0,3,2010,WD,0,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,,,,0,4,2009,WD,0,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,,,,0,1,2010,WD,1,109000
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,,,,0,4,2010,WD,0,174000
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,,,,0,3,2010,WD,0,138500


In [278]:
train.sale_condition.value_counts(normalize=True)

# Our baseline is about 93.5%

0    0.935641
1    0.064359
Name: sale_condition, dtype: float64

In [279]:
nonnull_numeric_shared = (train._get_numeric_data().isnull().sum() == 0) & (test._get_numeric_data().isnull().sum() == 0)
nonnull_numeric_shared

1st_flr_sf          True
2nd_flr_sf          True
3ssn_porch          True
bedroom_abvgr       True
bsmt_full_bath     False
bsmt_half_bath     False
bsmt_unf_sf        False
bsmtfin_sf_1       False
bsmtfin_sf_2       False
enclosed_porch      True
fireplaces          True
full_bath           True
garage_area        False
garage_cars        False
garage_yr_blt      False
gr_liv_area         True
half_bath           True
id                  True
kitchen_abvgr       True
lot_area            True
lot_frontage       False
low_qual_fin_sf     True
mas_vnr_area       False
misc_val            True
mo_sold             True
ms_subclass         True
open_porch_sf       True
overall_cond        True
overall_qual        True
pid                 True
pool_area           True
sale_condition     False
saleprice          False
screen_porch        True
total_bsmt_sf      False
totrms_abvgrd       True
wood_deck_sf        True
year_built          True
year_remod/add      True
yr_sold             True


In [280]:
features = ['1st_flr_sf', '2nd_flr_sf', '3ssn_porch', 'bedroom_abvgr', 'enclosed_porch', 'fireplaces', 'full_bath', 'gr_liv_area', 'half_bath', 'id', 'kitchen_abvgr', 'lot_area', 'low_qual_fin_sf', 'misc_val', 'mo_sold', 'ms_subclass', 'open_porch_sf', 'overall_cond', 'overall_qual', 'pid', 'pool_area', 'screen_porch', 'totrms_abvgrd', 'wood_deck_sf', 'year_built', 'year_remod/add', 'yr_sold']

In [281]:
train.corr()['sale_condition'].abs().sort_values(ascending=False)[0:7]

sale_condition    1.000000
year_remod/add    0.148806
saleprice         0.145943
overall_qual      0.143246
year_built        0.132671
garage_cars       0.128731
garage_yr_blt     0.110609
Name: sale_condition, dtype: float64

In [285]:
train.bsmt_unf_sf.isnull().sum()

train.loc[train['bsmt_unf_sf'].isnull(), :]

train.shape

# Let's just drop it! YOLO!

train.dropna(subset = ['bsmt_unf_sf'], how = 'any', inplace=True)

train.shape

(2050, 82)

In [287]:
train.garage_yr_blt.isnull().sum()

# Let's get really drop happy

train.dropna(subset = ['garage_yr_blt'], how = 'any', inplace=True)

train.shape

(1936, 82)

In [288]:
train.yr_sold.isnull().sum()

corr_features = ['bsmt_unf_sf', 'overall_cond', 'garage_yr_blt', 'total_bsmt_sf', 'yr_sold']

# Model time!

In [None]:
# Alternate reality aka v2

X = train[corr_features]
y = train['sale_condition']

# First reality aka v1

X = train[features]
y = train['sale_condition']

X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, random_state = 69, stratify=y)

ss = StandardScaler()
X_ts = ss.fit_transform(X_train) #ONLY FIT TO TRAINING DATA
X_hs = ss.transform(X_holdout) #TRANSFORM BOTH

knn = KNeighborsClassifier()

cross_val_score(knn, X_ts, y_train).mean()

# So like about the same as the baseline...

params = {
    'n_neighbors': range(2,30),
    'p': [1,2],
    'weights': ['uniform', 'distance']
}
gs = GridSearchCV(knn, params)
gs.fit(X_ts, y_train)

print (gs.best_params_)
print (gs.best_score_)

# Still like the same as baseline...

# Scoring it on the holdout set to make sure we're not overfitting
# Compare to the cross_val_score

gs.score(X_hs, y_holdout)

# Slightly better?

test.head(3)

# corr_features

# test.shape

# test.garage_yr_blt.isnull().sum()

# median_garage_yr_blt = train['garage_yr_blt'].median()

# test['garage_yr_blt'] = test['garage_yr_blt'].fillna(median_garage_yr_blt)

# X_test = test[features]
# X_test_scaled = ss.transform(X_test)
# X_test_scaled

test['sale_condition'] = gs.predict(X_test_scaled)

test.head()

test = test.rename(columns={'id': 'Id', 'sale_condition': 'Sale Condition'})

test.head()

# test[['Id', 'Sale Condition']].to_csv('p2_class_1.csv', index=False)

# test[['Id', 'Sale Condition']].to_csv('p2_class_2.csv', index=False)

test[['Id', 'Sale Condition']].to_csv('p2_class_3.csv', index=False)

submission = pd.read_csv('p2_class_3.csv')
submission.head(2)
submission['Sale Condition'].value_counts()