# Project 2: Ames Housing Data and Kaggle Challenge

## Setup
All libraries used in this notebook are imported here.

In [1]:
# import libraries
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression, RidgeCV, LassoCV, ElasticNetCV

## Loading Data

Import the datasets required.

In [2]:
# Test file from ../datasets; its 'Id' column is used later when the submission frames are built.
test_id = pd.read_csv('../datasets/test.csv')

# Relative locations of the prepared datasets (input) and of the submission files (output).
input_path, output_path = '../datasets_final', '../kaggle_submissions'

# Make sure the output folder exists; exist_ok=True keeps re-runs of this cell from failing.
os.makedirs(output_path, exist_ok=True)

In [3]:
# loader for the final training data
def load_housing_train_data(filename='train_final.csv'):
    """Read `filename` from the `input_path` folder and return it as a DataFrame.

    Parameters
    ----------
    filename : str
        CSV file name inside `input_path` (defaults to 'train_final.csv').

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV file.
    """
    return pd.read_csv(os.path.join(input_path, filename))

# loader for the final test data
def load_housing_test_data(filename='test_final.csv'):
    """Read `filename` from the `input_path` folder and return it as a DataFrame.

    Parameters
    ----------
    filename : str
        CSV file name inside `input_path` (defaults to 'test_final.csv').

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV file.
    """
    return pd.read_csv(os.path.join(input_path, filename))

In [4]:
# Read both prepared datasets using the loaders' default file names.
train_df, test_df = load_housing_train_data(), load_housing_test_data()

In [5]:
# Work on an independent (deep) copy so train_df itself is left as loaded.
housing = train_df.copy(deep=True)

## Train/Test Split
Create the predictor (X) and target (y) variables, then split them 80/20 into training and hold-out sets.

In [6]:
# Predictors: every column except the target; target: the SalePrice column.
X = housing.drop(columns=['SalePrice'])
y = housing['SalePrice']

In [7]:
# Hold out 20% of the rows; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Show the shapes of the four resulting pieces as a quick sanity check.
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

(1639, 211) (410, 211) (1639,) (410,)


## Feature Scaling

In [8]:
# Columns treated as dummy/indicator features. Anything listed here is left out of
# `scale_cols` and therefore not passed through the StandardScaler.
# The list is grouped by feature family, one family per comment heading.
dummy_cols = [
    # engineered flags
    'Remodeled', 'IsNew', 'HasFence', 'HasPool',
    # MS SubClass
    'MS SubClass_20', 'MS SubClass_30', 'MS SubClass_40', 'MS SubClass_45',
    'MS SubClass_50', 'MS SubClass_60', 'MS SubClass_70', 'MS SubClass_75',
    'MS SubClass_80', 'MS SubClass_85', 'MS SubClass_90', 'MS SubClass_120',
    'MS SubClass_150', 'MS SubClass_160', 'MS SubClass_180', 'MS SubClass_190',
    # MS Zoning
    'MS Zoning_A (agr)', 'MS Zoning_C (all)', 'MS Zoning_FV', 'MS Zoning_I (all)',
    'MS Zoning_RH', 'MS Zoning_RL', 'MS Zoning_RM',
    # Street
    'Street_Grvl', 'Street_Pave',
    # Alley
    'Alley_Grvl', 'Alley_None', 'Alley_Pave',
    # Land Contour
    'Land Contour_Bnk', 'Land Contour_HLS', 'Land Contour_Low', 'Land Contour_Lvl',
    # Lot Config
    'Lot Config_Corner', 'Lot Config_CulDSac', 'Lot Config_FR2', 'Lot Config_FR3',
    'Lot Config_Inside',
    # Neighborhood
    'Neighborhood_Blmngtn', 'Neighborhood_Blueste', 'Neighborhood_BrDale',
    'Neighborhood_BrkSide', 'Neighborhood_ClearCr', 'Neighborhood_CollgCr',
    'Neighborhood_Crawfor', 'Neighborhood_Edwards', 'Neighborhood_Gilbert',
    'Neighborhood_Greens', 'Neighborhood_GrnHill', 'Neighborhood_IDOTRR',
    'Neighborhood_Landmrk', 'Neighborhood_MeadowV', 'Neighborhood_Mitchel',
    'Neighborhood_NAmes', 'Neighborhood_NPkVill', 'Neighborhood_NWAmes',
    'Neighborhood_NoRidge', 'Neighborhood_NridgHt', 'Neighborhood_OldTown',
    'Neighborhood_SWISU', 'Neighborhood_Sawyer', 'Neighborhood_SawyerW',
    'Neighborhood_Somerst', 'Neighborhood_StoneBr', 'Neighborhood_Timber',
    'Neighborhood_Veenker',
    # Condition 1
    'Condition 1_Artery', 'Condition 1_Feedr', 'Condition 1_Norm', 'Condition 1_PosA',
    'Condition 1_PosN', 'Condition 1_RRAe', 'Condition 1_RRAn', 'Condition 1_RRNe',
    'Condition 1_RRNn',
    # Condition 2
    'Condition 2_Artery', 'Condition 2_Feedr', 'Condition 2_Norm', 'Condition 2_PosA',
    'Condition 2_PosN', 'Condition 2_RRAe', 'Condition 2_RRAn', 'Condition 2_RRNn',
    # Bldg Type
    'Bldg Type_1Fam', 'Bldg Type_2fmCon', 'Bldg Type_Duplex', 'Bldg Type_Twnhs',
    'Bldg Type_TwnhsE',
    # House Style
    'House Style_1.5Fin', 'House Style_1.5Unf', 'House Style_1Story',
    'House Style_2.5Fin', 'House Style_2.5Unf', 'House Style_2Story',
    'House Style_SFoyer', 'House Style_SLvl',
    # Roof Style
    'Roof Style_Flat', 'Roof Style_Gable', 'Roof Style_Gambrel', 'Roof Style_Hip',
    'Roof Style_Mansard', 'Roof Style_Shed',
    # Roof Matl
    'Roof Matl_CompShg', 'Roof Matl_Membran', 'Roof Matl_Tar&Grv', 'Roof Matl_WdShake',
    'Roof Matl_WdShngl',
    # Exterior 1st
    'Exterior 1st_AsbShng', 'Exterior 1st_AsphShn', 'Exterior 1st_BrkComm',
    'Exterior 1st_BrkFace', 'Exterior 1st_CBlock', 'Exterior 1st_CemntBd',
    'Exterior 1st_HdBoard', 'Exterior 1st_ImStucc', 'Exterior 1st_MetalSd',
    'Exterior 1st_Plywood', 'Exterior 1st_Stone', 'Exterior 1st_Stucco',
    'Exterior 1st_VinylSd', 'Exterior 1st_Wd Sdng', 'Exterior 1st_WdShing',
    # Exterior 2nd
    'Exterior 2nd_AsbShng', 'Exterior 2nd_AsphShn', 'Exterior 2nd_Brk Cmn',
    'Exterior 2nd_BrkFace', 'Exterior 2nd_CBlock', 'Exterior 2nd_CmentBd',
    'Exterior 2nd_HdBoard', 'Exterior 2nd_ImStucc', 'Exterior 2nd_MetalSd',
    'Exterior 2nd_Plywood', 'Exterior 2nd_Stone', 'Exterior 2nd_Stucco',
    'Exterior 2nd_VinylSd', 'Exterior 2nd_Wd Sdng', 'Exterior 2nd_Wd Shng',
    # Mas Vnr Type
    'Mas Vnr Type_BrkCmn', 'Mas Vnr Type_BrkFace', 'Mas Vnr Type_None',
    'Mas Vnr Type_Stone',
    # Foundation
    'Foundation_BrkTil', 'Foundation_CBlock', 'Foundation_PConc', 'Foundation_Slab',
    'Foundation_Stone', 'Foundation_Wood',
    # Heating
    'Heating_GasA', 'Heating_GasW', 'Heating_Grav', 'Heating_OthW', 'Heating_Wall',
    # Central Air
    'Central Air_N', 'Central Air_Y',
    # Garage Type
    'Garage Type_2Types', 'Garage Type_Attchd', 'Garage Type_Basment',
    'Garage Type_BuiltIn', 'Garage Type_CarPort', 'Garage Type_Detchd',
    'Garage Type_None',
    # Misc Feature
    'Misc Feature_Gar2', 'Misc Feature_None', 'Misc Feature_Othr', 'Misc Feature_Shed',
    'Misc Feature_TenC',
    # Sale Type (note: 'Sale Type_WD ' really does end with a space)
    'Sale Type_COD', 'Sale Type_CWD', 'Sale Type_Con', 'Sale Type_ConLD',
    'Sale Type_ConLI', 'Sale Type_ConLw', 'Sale Type_New', 'Sale Type_Oth',
    'Sale Type_WD ',
    # Yr Sold
    'Yr Sold_2006', 'Yr Sold_2007', 'Yr Sold_2008', 'Yr Sold_2009', 'Yr Sold_2010',
    # Mo Sold
    'Mo Sold_1', 'Mo Sold_2', 'Mo Sold_3', 'Mo Sold_4', 'Mo Sold_5', 'Mo Sold_6',
    'Mo Sold_7', 'Mo Sold_8', 'Mo Sold_9', 'Mo Sold_10', 'Mo Sold_11', 'Mo Sold_12',
]

In [9]:
# Columns to standardise: every test_df column that is not listed in dummy_cols
# (original column order is kept).
scale_cols = test_df.columns[~test_df.columns.isin(dummy_cols)].tolist()

In [10]:
# Start from copies so the unscaled frames stay available.
Z_train = X_train.copy()
Z_test = X_test.copy()
Z_test_df = test_df.copy()

# Fit the standard scaler on the training split only ...
sc = StandardScaler()
sc.fit(X_train[scale_cols])

# ... then apply that same fitted scaler to the training split, the hold-out
# split and the test frame, overwriting only the columns in scale_cols.
Z_train[scale_cols] = sc.transform(X_train[scale_cols])
Z_test[scale_cols] = sc.transform(X_test[scale_cols])
Z_test_df[scale_cols] = sc.transform(test_df[scale_cols])

### Ridge Regression Submission

In [11]:
# Candidate ridge penalties: np.logspace spaces 100 exponents evenly between
# 0 and 5, giving alphas from 10**0 up to 10**5.
r_alpha = np.logspace(0, 5, num=100)

# Let RidgeCV choose among those alphas by 5-fold cross-validation scored with
# negative RMSE, fitting on the scaled training split. fit() returns the
# fitted estimator, so construction and fitting are chained.
ridge_cv = RidgeCV(
    alphas=r_alpha,
    scoring='neg_root_mean_squared_error',
    cv=5,
).fit(Z_train, y_train)

In [12]:
# Regularisation strength (alpha) that RidgeCV selected from r_alpha by cross-validation
ridge_cv.alpha_

23.101297000831593

In [13]:
# Generate predictions for the scaled test frame (Z_test_df) with the fitted ridge model
y_pred_ridge_tuned =  ridge_cv.predict(Z_test_df)

In [14]:
# Assemble the submission: the 'Id' column from test_id next to the predicted SalePrice.
# np.exp is applied to the predictions. NOTE(review): presumably this undoes a log
# transform of SalePrice made when train_final.csv was prepared; confirm upstream.
submission_ridge = pd.DataFrame({
    'Id': test_id['Id'],
    'SalePrice': np.exp(y_pred_ridge_tuned),
})

# Build the file path with os.path.join (as the data loaders do) instead of
# concatenating strings with a hard-coded separator.
submission_ridge.to_csv(os.path.join(output_path, 'submission_ridge.csv'), index=False)

In [15]:
# Display the ridge submission frame as this cell's output
submission_ridge

Unnamed: 0,Id,SalePrice
0,2658,124032.884982
1,2718,148397.291837
2,2414,212566.703668
3,1989,105095.182844
4,625,169438.545540
...,...,...
873,1662,175104.937809
874,1234,209250.116256
875,1373,123005.301044
876,1672,111916.351669


### Lasso Regression Submission

In [16]:
# LassoCV evaluates a path of 100 alphas with 5-fold cross-validation and keeps
# the best one; max_iter is raised to 30000 (presumably so the solver converges;
# confirm). fit() returns the fitted estimator, so the calls are chained.
lasso_cv = LassoCV(n_alphas=100, cv=5, max_iter=30000).fit(Z_train, y_train)

In [17]:
# Regularisation strength (alpha) that LassoCV selected by cross-validation
lasso_cv.alpha_

0.000787466392895743

In [18]:
# Generate predictions for the scaled test frame (Z_test_df) with the fitted lasso model
y_pred_lasso_tuned =  lasso_cv.predict(Z_test_df)

In [19]:
# Assemble the submission: the 'Id' column from test_id next to the predicted SalePrice.
# np.exp is applied to the predictions. NOTE(review): presumably this undoes a log
# transform of SalePrice made when train_final.csv was prepared; confirm upstream.
submission_lasso = pd.DataFrame({
    'Id': test_id['Id'],
    'SalePrice': np.exp(y_pred_lasso_tuned),
})

# Build the file path with os.path.join (as the data loaders do) instead of
# concatenating strings with a hard-coded separator.
submission_lasso.to_csv(os.path.join(output_path, 'submission_lasso.csv'), index=False)

In [20]:
# Display the lasso submission frame as this cell's output
submission_lasso

Unnamed: 0,Id,SalePrice
0,2658,125769.131255
1,2718,148136.597105
2,2414,213092.058386
3,1989,106078.992001
4,625,171965.430764
...,...,...
873,1662,173846.390481
874,1234,210053.516963
875,1373,123732.111189
876,1672,110833.012985


### Kaggle Submissions

![kaggle_submission.PNG](../assets/kaggle_submission.PNG)