In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Read the raw training table from disk and preview its first five rows
df = pd.read_csv('train.csv')
df.head(5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,2,2008,WD,Normal
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,5,2007,WD,Normal
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,9,2008,WD,Normal
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,2,2006,WD,Abnorml
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,12,2008,WD,Normal


## Preprocessing

### Add in y values

In [3]:
# Read the first two spreadsheet columns and inner-join them onto the
# training rows through the shared "Id" key (rows without a match are dropped).
# NOTE: re-running this cell without reloading `df` merges the columns again.
df_y = pd.read_excel('sample_submission.xlsx', usecols=[0, 1])
df = df.merge(df_y, on="Id")

df.head(5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,Actual_SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


### Drop Missing Values

In [4]:
# Largest number of missing entries a column may contain and still be kept
MAX_NA_PER_COLUMN = 250

# Drop every column with MORE than MAX_NA_PER_COLUMN missing values.
# `thresh` is the minimum number of non-missing entries a column needs to survive,
# so a column with exactly MAX_NA_PER_COLUMN missing values is retained.
df = df.dropna(axis=1, thresh=(len(df) - MAX_NA_PER_COLUMN))

# Drop every remaining row that still contains at least one missing value
df = df.dropna(axis=0, how='any')

# Drop the Id column (presumably only a row identifier, not a feature).
# errors='ignore' keeps this cell re-runnable once Id has already been removed.
df = df.drop(columns=['Id'], errors='ignore')
df.shape

(1338, 74)

### Split X, y values

In [5]:
# Separate the regression target column from the predictor columns
df_y = df['Actual_SalePrice']
df_x = df.drop(columns='Actual_SalePrice')
df_x.shape, df_y.shape

((1338, 73), (1338,))

### Feature Scaling

In [6]:
# Normalize all numeric features
from sklearn.preprocessing import StandardScaler

# Find all columns with numeric types
numList = df_x.select_dtypes(include="number").columns
print(numList)

# Standardize every numeric column (zero mean, unit variance) with ONE scaler.
# The fitted scaler is kept in `scaler` so the identical transformation can be
# applied to unseen data later via `scaler.transform(...)`.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# further down, so held-out rows influence the scaling statistics; consider
# fitting on the training split only.
scaler = StandardScaler()
if len(numList) > 0:  # fit_transform rejects an empty column selection
    df_x[numList] = scaler.fit_transform(df_x[numList])

df_x.head()

Index(['MSSubClass', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt',
       'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
       'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
       'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr',
       'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt',
       'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'YrSold'],
      dtype='object')


Unnamed: 0,MSSubClass,MSZoning,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,0.093701,RL,-0.218363,Pave,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,...,0.202294,-0.349612,-0.118688,-0.283203,-0.071771,-0.084535,-1.605319,0.146084,WD,Normal
1,-0.876298,RL,-0.107067,Pave,Reg,Lvl,AllPub,FR2,Gtl,Veenker,...,-0.731311,-0.349612,-0.118688,-0.283203,-0.071771,-0.084535,-0.493561,-0.605686,WD,Normal
2,0.093701,RL,0.05262,Pave,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,...,-0.088501,-0.349612,-0.118688,-0.283203,-0.071771,-0.084535,0.988784,0.146084,WD,Normal
3,0.336201,RL,-0.111906,Pave,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,...,-0.195636,4.122511,-0.118688,-0.283203,-0.071771,-0.084535,-1.605319,-1.357457,WD,Abnorml
4,0.093701,RL,0.343926,Pave,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,...,0.554309,-0.349612,-0.118688,-0.283203,-0.071771,-0.084535,2.100542,0.146084,WD,Normal


### Label Encoding

In [7]:
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

# Find all columns with data type object
objList = df_x.select_dtypes(include="object").columns
print(objList)

# Encode every object column as integer category codes.
# OrdinalEncoder is scikit-learn's encoder for feature matrices (LabelEncoder is
# documented for target vectors); it handles all columns in one call and the
# fitted encoder is kept in `ordinal_encoder` so the same category-to-code
# mapping can be applied to unseen data.
# NOTE(review): codes follow the sorted order of each column's categories, not
# a meaningful ranking, and these columns are not standardized like the numeric
# ones before PCA; confirm this is intended.
ordinal_encoder = OrdinalEncoder(dtype=np.int64)
if len(objList) > 0:  # nothing left to encode when the cell is re-run
    df_x[objList] = ordinal_encoder.fit_transform(df_x[objList].astype(str))

df_x.head()

Index(['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
       'PavedDrive', 'SaleType', 'SaleCondition'],
      dtype='object')


Unnamed: 0,MSSubClass,MSZoning,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,0.093701,3,-0.218363,1,3,3,0,4,0,5,...,0.202294,-0.349612,-0.118688,-0.283203,-0.071771,-0.084535,-1.605319,0.146084,8,4
1,-0.876298,3,-0.107067,1,3,3,0,2,0,24,...,-0.731311,-0.349612,-0.118688,-0.283203,-0.071771,-0.084535,-0.493561,-0.605686,8,4
2,0.093701,3,0.05262,1,0,3,0,4,0,5,...,-0.088501,-0.349612,-0.118688,-0.283203,-0.071771,-0.084535,0.988784,0.146084,8,4
3,0.336201,3,-0.111906,1,0,3,0,0,0,6,...,-0.195636,4.122511,-0.118688,-0.283203,-0.071771,-0.084535,-1.605319,-1.357457,8,0
4,0.093701,3,0.343926,1,0,3,0,2,0,15,...,0.554309,-0.349612,-0.118688,-0.283203,-0.071771,-0.084535,2.100542,0.146084,8,4


### Reduce Dimensionality with PCA

In [8]:
from sklearn.decomposition import PCA

# Target fraction of total variance to keep. Passing a float in (0, 1) as
# n_components asks PCA to choose how many components are needed to reach it.
PRESERVED_VARIANCE = 0.95  # float fraction, not a component count

pca = PCA(n_components=PRESERVED_VARIANCE)
# Fit on the preprocessed features and project them onto the kept components
df_xdim = pca.fit_transform(df_x)
# Share of the total variance explained by each retained component
pca.explained_variance_ratio_

array([0.29241579, 0.16398489, 0.07646106, 0.04232562, 0.03515401,
       0.03003716, 0.02152992, 0.02122624, 0.01939227, 0.01619046,
       0.01378553, 0.01319683, 0.01293808, 0.01215946, 0.01119967,
       0.01046089, 0.00964899, 0.00910672, 0.00875574, 0.00855145,
       0.00833408, 0.00796571, 0.00786241, 0.00767653, 0.00749069,
       0.00725448, 0.007049  , 0.00675446, 0.00663503, 0.00648307,
       0.00632413, 0.00608006, 0.00553959, 0.0053876 , 0.00512292,
       0.00489535, 0.00458651, 0.00437975, 0.00407143, 0.00389466])

### Train / Test Split

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the PCA-projected rows for evaluation; the fixed
# random_state makes the shuffle (and therefore the split) reproducible
X_train, X_test, y_train, y_test = train_test_split(
    df_xdim,
    df_y,
    test_size=0.10,
    random_state=42,
)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((1204, 40), (134, 40), (1204,), (134,))

# Individual Models

### Linear Regression

In [10]:
# Train the Model
from sklearn.linear_model import LinearRegression
# Fit a linear regression baseline on the training split;
# the trailing expression displays the fitted estimator
lin_reg = LinearRegression().fit(X_train, y_train)
lin_reg

LinearRegression()

In [11]:
from sklearn.metrics import mean_squared_error

# Helper RMSE function
def rmse(model, X, y):
    """Return the root-mean-squared error of a fitted model on (X, y).

    Parameters
    ----------
    model : object exposing ``predict(X)``
        Fitted estimator used to generate predictions.
    X : array-like
        Feature rows passed to ``model.predict``.
    y : array-like
        True target values compared against the predictions.

    Returns
    -------
    The square root of ``mean_squared_error(y, model.predict(X))``.
    """
    squared_error = mean_squared_error(y, model.predict(X))
    return np.sqrt(squared_error)

In [13]:
# Held-out RMSE of the linear regression baseline (same units as the target)
rmse(model=lin_reg, X=X_test, y=y_test)

28807.812585332107

### Gaussian Process

### Random Forest

### XGBoost

# Ensemble Model