# Preprocessing Data

In [2]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.linear_model import Lasso, LassoCV, LinearRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, VarianceThreshold, f_regression, SelectPercentile
from sklearn.grid_search import GridSearchCV
np.random.seed(42)


%matplotlib inline

# Load in training data

In [4]:
ss = StandardScaler()
lr = LinearRegression()
lasso = Lasso()

In [6]:
df = pd.read_csv('../Data/train.csv')
kaggle = pd.read_csv('../Data/test.csv', index_col = 'Id')

In [7]:
df.drop(labels =182, axis = 0, inplace=True)

In [8]:
df.drop(labels =1554, axis = 0, inplace=True)

In [6]:
df.columns

Index(['Id', 'PID', 'MS SubClass', 'MS Zoning', 'Lot Frontage', 'Lot Area',
       'Street', 'Alley', 'Lot Shape', 'Land Contour', 'Utilities',
       'Lot Config', 'Land Slope', 'Neighborhood', 'Condition 1',
       'Condition 2', 'Bldg Type', 'House Style', 'Overall Qual',
       'Overall Cond', 'Year Built', 'Year Remod/Add', 'Roof Style',
       'Roof Matl', 'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type',
       'Mas Vnr Area', 'Exter Qual', 'Exter Cond', 'Foundation', 'Bsmt Qual',
       'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin SF 1',
       'BsmtFin Type 2', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF',
       'Heating', 'Heating QC', 'Central Air', 'Electrical', '1st Flr SF',
       '2nd Flr SF', 'Low Qual Fin SF', 'Gr Liv Area', 'Bsmt Full Bath',
       'Bsmt Half Bath', 'Full Bath', 'Half Bath', 'Bedroom AbvGr',
       'Kitchen AbvGr', 'Kitchen Qual', 'TotRms AbvGrd', 'Functional',
       'Fireplaces', 'Fireplace Qu', 'Garage Type', 'Garage Yr Blt',
       'G

In [7]:
df_num_corr = df.corr()['SalePrice']
df_num_corr[abs(df_num_corr) > 0.5]

Overall Qual      0.800183
Year Built        0.571815
Year Remod/Add    0.550272
Mas Vnr Area      0.512203
Total Bsmt SF     0.629054
1st Flr SF        0.618512
Gr Liv Area       0.696982
Full Bath         0.537850
TotRms AbvGrd     0.503963
Garage Yr Blt     0.533886
Garage Cars       0.648445
Garage Area       0.650651
SalePrice         1.000000
Name: SalePrice, dtype: float64

In [8]:
df_int = df.select_dtypes(include = ['object'])
kaggle_int = kaggle.select_dtypes(include = ['object'])

In [9]:
df_int.columns

Index(['MS Zoning', 'Street', 'Alley', 'Lot Shape', 'Land Contour',
       'Utilities', 'Lot Config', 'Land Slope', 'Neighborhood', 'Condition 1',
       'Condition 2', 'Bldg Type', 'House Style', 'Roof Style', 'Roof Matl',
       'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type', 'Exter Qual',
       'Exter Cond', 'Foundation', 'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure',
       'BsmtFin Type 1', 'BsmtFin Type 2', 'Heating', 'Heating QC',
       'Central Air', 'Electrical', 'Kitchen Qual', 'Functional',
       'Fireplace Qu', 'Garage Type', 'Garage Finish', 'Garage Qual',
       'Garage Cond', 'Paved Drive', 'Pool QC', 'Fence', 'Misc Feature',
       'Sale Type'],
      dtype='object')

In [10]:
df_int_d = pd.get_dummies(df_int, drop_first=True)
kaggle_int_d = pd.get_dummies(kaggle_int, drop_first=True)

In [11]:
df_int.columns

Index(['MS Zoning', 'Street', 'Alley', 'Lot Shape', 'Land Contour',
       'Utilities', 'Lot Config', 'Land Slope', 'Neighborhood', 'Condition 1',
       'Condition 2', 'Bldg Type', 'House Style', 'Roof Style', 'Roof Matl',
       'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type', 'Exter Qual',
       'Exter Cond', 'Foundation', 'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure',
       'BsmtFin Type 1', 'BsmtFin Type 2', 'Heating', 'Heating QC',
       'Central Air', 'Electrical', 'Kitchen Qual', 'Functional',
       'Fireplace Qu', 'Garage Type', 'Garage Finish', 'Garage Qual',
       'Garage Cond', 'Paved Drive', 'Pool QC', 'Fence', 'Misc Feature',
       'Sale Type'],
      dtype='object')

DROP: 

Pool QC
Garage Cond or Qual
Exter Qual or Exter Cond
Condition 1
Condition 2
TotRms Abvgrd
Lot Frontage
Garage Cars
Total Bsmt Area

Simplify:
Electrical


In [12]:
df.drop(['PID', 'MS Zoning', 'Street', 'Alley', 'Lot Shape', 'Land Contour',
       'Utilities', 'Lot Config', 'Land Slope', 'Neighborhood', 'Condition 1',
       'Condition 2', 'Bldg Type', 'House Style', 'Roof Style', 'Roof Matl',
       'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type', 'Exter Qual',
       'Exter Cond', 'Foundation', 'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure',
       'BsmtFin Type 1', 'BsmtFin Type 2', 'Heating', 'Heating QC',
       'Central Air', 'Electrical', 'Kitchen Qual', 'Functional',
       'Fireplace Qu', 'Garage Type', 'Garage Finish', 'Garage Qual',
       'Garage Cond', 'Paved Drive', 'Pool QC', 'Fence', 'Misc Feature',
       'Sale Type', 'BsmtFin SF 2', 'Garage Cars', 'Lot Frontage'], axis=1, inplace=True)

In [13]:
kaggle.drop(['PID', 'MS Zoning', 'Street', 'Alley', 'Lot Shape', 'Land Contour',
       'Utilities', 'Lot Config', 'Land Slope', 'Neighborhood', 'Condition 1',
       'Condition 2', 'Bldg Type', 'House Style', 'Roof Style', 'Roof Matl',
       'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type', 'Exter Qual',
       'Exter Cond', 'Foundation', 'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure',
       'BsmtFin Type 1', 'BsmtFin Type 2', 'Heating', 'Heating QC',
       'Central Air', 'Electrical', 'Kitchen Qual', 'Functional',
       'Fireplace Qu', 'Garage Type', 'Garage Finish', 'Garage Qual',
       'Garage Cond', 'Paved Drive', 'Pool QC', 'Fence', 'Misc Feature',
       'Sale Type', 'BsmtFin SF 2'], axis=1, inplace=True)

In [14]:
df.columns

Index(['Id', 'MS SubClass', 'Lot Area', 'Overall Qual', 'Overall Cond',
       'Year Built', 'Year Remod/Add', 'Mas Vnr Area', 'BsmtFin SF 1',
       'Bsmt Unf SF', 'Total Bsmt SF', '1st Flr SF', '2nd Flr SF',
       'Low Qual Fin SF', 'Gr Liv Area', 'Bsmt Full Bath', 'Bsmt Half Bath',
       'Full Bath', 'Half Bath', 'Bedroom AbvGr', 'Kitchen AbvGr',
       'TotRms AbvGrd', 'Fireplaces', 'Garage Yr Blt', 'Garage Area',
       'Wood Deck SF', 'Open Porch SF', 'Enclosed Porch', '3Ssn Porch',
       'Screen Porch', 'Pool Area', 'Misc Val', 'Mo Sold', 'Yr Sold',
       'SalePrice'],
      dtype='object')

In [15]:
df_con = pd.concat((df, df_int_d), axis=1)
df_con.fillna(0, inplace=True)
kaggle_con = pd.concat((kaggle, kaggle_int_d), axis=1)
kaggle_con.fillna(0, inplace=True)

In [16]:
df_con_corr = df_con.corr()['Pool Area'][:-1] 
golden_features_list = df_con_corr[abs(df_con_corr) > 0.5].sort_values(ascending=False)
print(golden_features_list)

Pool Area     1.000000
Pool QC_Gd    0.756112
Name: Pool Area, dtype: float64


In [17]:
df_con_corr = df_con.corr()['SalePrice'][:-1] 
golden_features_list = df_con_corr[abs(df_con_corr) > 0.5].sort_values(ascending=False)
print(golden_features_list)

SalePrice           1.000000
Overall Qual        0.800183
Gr Liv Area         0.696982
Garage Area         0.650278
Total Bsmt SF       0.629433
1st Flr SF          0.618512
Year Built          0.571815
Year Remod/Add      0.550272
Full Bath           0.537850
Foundation_PConc    0.528913
TotRms AbvGrd       0.503963
Mas Vnr Area        0.503552
Kitchen Qual_TA    -0.540736
Exter Qual_TA      -0.600259
Name: SalePrice, dtype: float64


In [18]:
y = df['SalePrice'].values
X = df_con.drop(['SalePrice'], axis=1)

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y)
ss = StandardScaler()
X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)

In [20]:
from sklearn.feature_selection import variance_threshold

# Assign X and y

# Scale the data