### Overview

This first pass will include standard linear regression models (regular, lasso, ridge, and elasticnet). However, we will keep the preprocessing steps simple by just transforming skewed variables. Whichever model has the lowest RMSE will be submitted as a first pass.

Further iterations will include more advanced variable selection and feature engineering.

In [1]:
import pandas as pd
import numpy as np
from scipy.stats import skew
from sklearn.linear_model import LinearRegression

In [57]:
# Read in the data

train = pd.read_csv("train.csv", index_col = 0)
test = pd.read_csv("test.csv", index_col = 0)

train.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


In [58]:
# concatenate the data together

all_data = pd.concat((train.loc[:,'MSSubClass':'SaleCondition'],test.loc[:,'MSSubClass':'SaleCondition']))
print all_data.shape

(2919, 79)


In [59]:
# Check which columns have a lot of NA's in training data

#train.fillna(value=-999.0,inplace=True)
#test.fillna(value=-999.0,inplace=True)

print "Numeric Variables:"
list_feature_Nan = []
for i in train.select_dtypes(exclude=['object']).columns:
    if (train[i].isnull()).astype(int).sum() > 0:
        print i, "has", round(((train[i].isnull()).astype(int).sum()/1460.0)*100), "% of NaN"
        list_feature_Nan.append(i)
print

print "Categorical Variables:"
for i in train.select_dtypes(include=['object']).columns:
    if (train[i].isnull()).astype(int).sum() > 0:
        print i, "has", round(((train[i].isnull()).astype(int).sum()/1460.0)*100), "% of NaN"

Numeric Variables:
LotFrontage has 18.0 % of NaN
MasVnrArea has 1.0 % of NaN
GarageYrBlt has 6.0 % of NaN

Categorical Variables:
Alley has 94.0 % of NaN
MasVnrType has 1.0 % of NaN
BsmtQual has 3.0 % of NaN
BsmtCond has 3.0 % of NaN
BsmtExposure has 3.0 % of NaN
BsmtFinType1 has 3.0 % of NaN
BsmtFinType2 has 3.0 % of NaN
Electrical has 0.0 % of NaN
FireplaceQu has 47.0 % of NaN
GarageType has 6.0 % of NaN
GarageFinish has 6.0 % of NaN
GarageQual has 6.0 % of NaN
GarageCond has 6.0 % of NaN
PoolQC has 100.0 % of NaN
Fence has 81.0 % of NaN
MiscFeature has 96.0 % of NaN


In [60]:
# Define variable list for numeric variables and categorical variables for later preprocessing
non_num_cat_vars = ["MSSubclass"]
cat_vars = train.select_dtypes(include=['object']).columns.tolist()
cat_vars.append(non_num_cat_vars)
num_vars = train.select_dtypes(exclude=['object']).columns.tolist()
num_vars.remove("SalePrice")

In [61]:
# Find all numeric variables that are skewed and transform them   

# Compute skewness
skewed_vars = train[num_vars].apply(lambda x: skew(x.dropna()))
skewed_vars = skewed_vars[skewed_vars > 0.8]
skewed_vars = skewed_vars.index

all_data[skewed_vars] = np.log1p(all_data[skewed_vars])

In [69]:
from sklearn.preprocessing import scale

# For numeric variables, Impute missing values with mean
all_data[num_vars] = all_data[num_vars].fillna(all_data[num_vars].mean())

# Standardize the numeric variables


    MSSubClass MSZoning  LotFrontage   LotArea Street Alley LotShape  \
Id                                                                     
1     4.110874       RL     4.189655  9.042040   Pave   NaN      Reg   
2     3.044522       RL     4.394449  9.169623   Pave   NaN      Reg   
3     4.110874       RL     4.234107  9.328212   Pave   NaN      IR1   
4     4.262680       RL     4.110874  9.164401   Pave   NaN      IR1   
5     4.110874       RL     4.442651  9.565284   Pave   NaN      IR1   
6     3.931826       RL     4.454347  9.555064   Pave   NaN      IR1   
7     3.044522       RL     4.330733  9.218804   Pave   NaN      Reg   
8     4.110874       RL     4.196175  9.247925   Pave   NaN      IR1   
9     3.931826       RM     3.951244  8.719481   Pave   NaN      Reg   
10    5.252273       RL     3.931826  8.912069   Pave   NaN      Reg   

   LandContour Utilities LotConfig      ...       ScreenPorch PoolArea PoolQC  \
Id                                      ...           

In [77]:
df[num_vars] = df[num_vars].apply(lambda x: x - np.mean(x) / np.std(x))

In [79]:
print np.std(df["1stFlrSF"])

0.322594268532


In [64]:
# Transform the target variable (log)
train["log_SalePrice"] = np.log1p(train["SalePrice"])

In [None]:
# Figure out what to do with missing categorical variables

In [11]:
# Get dummy variables for all categorical variables

In [13]:
# Import all models from sklearn into environment

# Import all the right evaluation metrics and cross-validation modules

In [14]:
# Define a function for RMSLE (see Kaggle's formula)

In [15]:
# Test out generic LinearRegression to get baseline RMSLE

In [16]:
# Test out Lasso Regression with different values of lambda

# Find optimal lambda

# Plot results


In [17]:
# Test out Ridge Regression with different values of lambda

# Find optimal lambda
# Plot results

In [None]:
# Test out Elastic Net with different values of alpha

# Find optimal alpha

# Plot results