# Set Up

In [49]:
import pandas as pd
import numpy as np

In [50]:
# Read the data
X_full = pd.read_csv('./input/train.csv', index_col='Id')
X_test_full = pd.read_csv('./input/test.csv', index_col='Id')

# Data Preprocessing

In [51]:
X_full.head(3)

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500


In [52]:
# Remove rows with missing target
X_full.dropna(axis=0, subset=['SalePrice'], inplace=True)

# Separate target from predictors
y = X_full['SalePrice']
X_full.drop(['SalePrice'], axis=1, inplace=True)

In [53]:
# Dependent Variable Distribution
y.describe()

count      1460.000000
mean     180921.195890
std       79442.502883
min       34900.000000
25%      129975.000000
50%      163000.000000
75%      214000.000000
max      755000.000000
Name: SalePrice, dtype: float64

## Missing Data
I prefer to tackle missing data manually rather than include it in my scikit-learn Pipeline. Manually addressing missing data forces one to understand why variables are missing data.

In [54]:
# Get names of columns with missing values
cols_with_missing = [col for col in X_full.columns
                     if X_full[col].isnull().any()]

# Get percent of rows in these columns with missing values 
percent_missing = (X_full[cols_with_missing].isnull().sum()/X_full.shape[0]*100)
print(percent_missing)

LotFrontage     17.739726
Alley           93.767123
MasVnrType       0.547945
MasVnrArea       0.547945
BsmtQual         2.534247
BsmtCond         2.534247
BsmtExposure     2.602740
BsmtFinType1     2.534247
BsmtFinType2     2.602740
Electrical       0.068493
FireplaceQu     47.260274
GarageType       5.547945
GarageYrBlt      5.547945
GarageFinish     5.547945
GarageQual       5.547945
GarageCond       5.547945
PoolQC          99.520548
Fence           80.753425
MiscFeature     96.301370
dtype: float64


In [55]:
# Get data types of each missing value variable
X_full[cols_with_missing].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1460 entries, 1 to 1460
Data columns (total 19 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   LotFrontage   1201 non-null   float64
 1   Alley         91 non-null     object 
 2   MasVnrType    1452 non-null   object 
 3   MasVnrArea    1452 non-null   float64
 4   BsmtQual      1423 non-null   object 
 5   BsmtCond      1423 non-null   object 
 6   BsmtExposure  1422 non-null   object 
 7   BsmtFinType1  1423 non-null   object 
 8   BsmtFinType2  1422 non-null   object 
 9   Electrical    1459 non-null   object 
 10  FireplaceQu   770 non-null    object 
 11  GarageType    1379 non-null   object 
 12  GarageYrBlt   1379 non-null   float64
 13  GarageFinish  1379 non-null   object 
 14  GarageQual    1379 non-null   object 
 15  GarageCond    1379 non-null   object 
 16  PoolQC        7 non-null      object 
 17  Fence         281 non-null    object 
 18  MiscFeature   54 non-null   

In [56]:
X_full_missing = X_full.copy()

### Object Data Types

In [57]:
# Get all columns with object Dtype
s = (X_full[cols_with_missing].dtypes == 'object')
object_cols = list(s[s].index)

# Print unique values of each object column
for col in object_cols:
    print("{}: {}".format(col, X_full[col].unique()))

Alley: [nan 'Grvl' 'Pave']
MasVnrType: ['BrkFace' 'None' 'Stone' 'BrkCmn' nan]
BsmtQual: ['Gd' 'TA' 'Ex' nan 'Fa']
BsmtCond: ['TA' 'Gd' nan 'Fa' 'Po']
BsmtExposure: ['No' 'Gd' 'Mn' 'Av' nan]
BsmtFinType1: ['GLQ' 'ALQ' 'Unf' 'Rec' 'BLQ' nan 'LwQ']
BsmtFinType2: ['Unf' 'BLQ' nan 'ALQ' 'Rec' 'LwQ' 'GLQ']
Electrical: ['SBrkr' 'FuseF' 'FuseA' 'FuseP' 'Mix' nan]
FireplaceQu: [nan 'TA' 'Gd' 'Fa' 'Ex' 'Po']
GarageType: ['Attchd' 'Detchd' 'BuiltIn' 'CarPort' nan 'Basment' '2Types']
GarageFinish: ['RFn' 'Unf' 'Fin' nan]
GarageQual: ['TA' 'Fa' 'Gd' nan 'Ex' 'Po']
GarageCond: ['TA' 'Fa' nan 'Gd' 'Po' 'Ex']
PoolQC: [nan 'Ex' 'Fa' 'Gd']
Fence: [nan 'MnPrv' 'GdWo' 'GdPrv' 'MnWw']
MiscFeature: [nan 'Shed' 'Gar2' 'Othr' 'TenC']


In [58]:
# Replace NaN for each object column
X_full_missing[object_cols] = X_full[object_cols].fillna("None")

# Check all NaN's gone for object columns
cols_with_missing = [col for col in X_full_missing.columns
                     if X_full_missing[col].isnull().any()]
X_full_missing[cols_with_missing].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1460 entries, 1 to 1460
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   LotFrontage  1201 non-null   float64
 1   MasVnrArea   1452 non-null   float64
 2   GarageYrBlt  1379 non-null   float64
dtypes: float64(3)
memory usage: 45.6 KB


### Float Dtype

In [59]:
# Get all columns with float Dtype
s = (X_full[cols_with_missing].dtypes == 'float64')
float_cols = list(s[s].index)

# Print unique values of each object column
for col in float_cols:
    print("{}: {}".format(col, X_full[col].unique()))

LotFrontage: [ 65.  80.  68.  60.  84.  85.  75.  nan  51.  50.  70.  91.  72.  66.
 101.  57.  44. 110.  98.  47. 108. 112.  74. 115.  61.  48.  33.  52.
 100.  24.  89.  63.  76.  81.  95.  69.  21.  32.  78. 121. 122.  40.
 105.  73.  77.  64.  94.  34.  90.  55.  88.  82.  71. 120. 107.  92.
 134.  62.  86. 141.  97.  54.  41.  79. 174.  99.  67.  83.  43. 103.
  93.  30. 129. 140.  35.  37. 118.  87. 116. 150. 111.  49.  96.  59.
  36.  56. 102.  58.  38. 109. 130.  53. 137.  45. 106. 104.  42.  39.
 144. 114. 128. 149. 313. 168. 182. 138. 160. 152. 124. 153.  46.]
MasVnrArea: [1.960e+02 0.000e+00 1.620e+02 3.500e+02 1.860e+02 2.400e+02 2.860e+02
 3.060e+02 2.120e+02 1.800e+02 3.800e+02 2.810e+02 6.400e+02 2.000e+02
 2.460e+02 1.320e+02 6.500e+02 1.010e+02 4.120e+02 2.720e+02 4.560e+02
 1.031e+03 1.780e+02 5.730e+02 3.440e+02 2.870e+02 1.670e+02 1.115e+03
 4.000e+01 1.040e+02 5.760e+02 4.430e+02 4.680e+02 6.600e+01 2.200e+01
 2.840e+02 7.600e+01 2.030e+02 6.800e+01 1.830e+02 4.800

In [60]:
# Replace NaN for each float column
X_full_missing[float_cols] = X_full[float_cols].fillna(0)

# Check all NaN's gone for object columns
cols_with_missing = [col for col in X_full_missing.columns
                     if X_full_missing[col].isnull().any()]
X_full[cols_with_missing].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1460 entries, 1 to 1460
Empty DataFrame

## Check Categorical Data Categories
Before passing data through Pipeline, I will check for errors in categorical data categories (e.g. that 'First' and 'Frist' are not counted as two categories)

In [61]:
# Inspect Categorical Data Unique Values
s = (X_full_missing.dtypes == 'object')
object_cols = list(s[s].index)

for col in object_cols:
    print("{}: {}".format(col, X_full_missing[col].unique()))

MSZoning: ['RL' 'RM' 'C (all)' 'FV' 'RH']
Street: ['Pave' 'Grvl']
Alley: ['None' 'Grvl' 'Pave']
LotShape: ['Reg' 'IR1' 'IR2' 'IR3']
LandContour: ['Lvl' 'Bnk' 'Low' 'HLS']
Utilities: ['AllPub' 'NoSeWa']
LotConfig: ['Inside' 'FR2' 'Corner' 'CulDSac' 'FR3']
LandSlope: ['Gtl' 'Mod' 'Sev']
Neighborhood: ['CollgCr' 'Veenker' 'Crawfor' 'NoRidge' 'Mitchel' 'Somerst' 'NWAmes'
 'OldTown' 'BrkSide' 'Sawyer' 'NridgHt' 'NAmes' 'SawyerW' 'IDOTRR'
 'MeadowV' 'Edwards' 'Timber' 'Gilbert' 'StoneBr' 'ClearCr' 'NPkVill'
 'Blmngtn' 'BrDale' 'SWISU' 'Blueste']
Condition1: ['Norm' 'Feedr' 'PosN' 'Artery' 'RRAe' 'RRNn' 'RRAn' 'PosA' 'RRNe']
Condition2: ['Norm' 'Artery' 'RRNn' 'Feedr' 'PosN' 'PosA' 'RRAn' 'RRAe']
BldgType: ['1Fam' '2fmCon' 'Duplex' 'TwnhsE' 'Twnhs']
HouseStyle: ['2Story' '1Story' '1.5Fin' '1.5Unf' 'SFoyer' 'SLvl' '2.5Unf' '2.5Fin']
RoofStyle: ['Gable' 'Hip' 'Gambrel' 'Mansard' 'Flat' 'Shed']
RoofMatl: ['CompShg' 'WdShngl' 'Metal' 'WdShake' 'Membran' 'Tar&Grv' 'Roll'
 'ClyTile']
Exterior1st: [

## Scaling

In [62]:
from sklearn.preprocessing import StandardScaler

f = (X_full_missing.dtypes == 'float64')
i = (X_full_missing.dtypes == 'int64')
n = f | i
numerical_cols = list(n[n].index)

# Apply scaler to each column 
scaler = StandardScaler()
scaled_cols = pd.DataFrame(scaler.fit_transform(X_full_missing[numerical_cols]), columns=numerical_cols)

# One-hot encoding removed index; put it back
scaled_cols.index = X_full_missing.index

# Remove numerical columns (will replace with scaled)
cat_X = X_full_missing.drop(numerical_cols, axis=1)

# Add scaled columns to numerical features
X_full_scaled = pd.concat([cat_X, scaled_cols], axis=1)
X_full_scaled.head()

Unnamed: 0_level_0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,0.351,-0.752176,0.216503,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688,-1.599111,0.138777
2,RL,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,...,-0.060731,1.626195,-0.704483,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688,-0.48911,-0.614439
3,RL,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,0.631726,-0.752176,-0.070361,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688,0.990891,0.138777
4,RL,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,...,0.790804,-0.752176,-0.176048,4.092524,-0.116339,-0.270208,-0.068692,-0.087688,-1.599111,-1.367655
5,RL,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,...,1.698485,0.780197,0.56376,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688,2.100892,0.138777


## One Hot Encoding

In [63]:
from sklearn.preprocessing import OneHotEncoder

s = (X_full_scaled.dtypes == 'object')
object_cols = list(s[s].index)

# Apply one-hot encoder to each column with numerical data
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols = pd.DataFrame(OH_encoder.fit_transform(X_full_scaled[object_cols]))
OH_cols.columns = OH_encoder.get_feature_names()

# One-hot encoding removed index; put it back
OH_cols.index = X_full_scaled.index

# Remove categorical columns (will replace with one-hot encoding)
num_X = X_full_scaled.drop(object_cols, axis=1)

# Add one-hot encoded columns to numerical features
X_full_OH = pd.concat([num_X, OH_cols], axis=1)
X_full_OH.head()

Unnamed: 0_level_0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,x41_ConLw,x41_New,x41_Oth,x41_WD,x42_Abnorml,x42_AdjLand,x42_Alloca,x42_Family,x42_Normal,x42_Partial
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.073375,0.212877,-0.207142,0.651479,-0.5172,1.050994,0.878668,0.514104,0.575425,-0.288653,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,-0.872563,0.645747,-0.091886,-0.071836,2.179628,0.156734,-0.429577,-0.57075,1.171992,-0.288653,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.073375,0.299451,0.07348,0.651479,-0.5172,0.984752,0.830215,0.325915,0.092907,-0.288653,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.309859,0.068587,-0.096897,0.651479,-0.5172,-1.863632,-0.720298,-0.57075,-0.499274,-0.288653,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
5,0.073375,0.761179,0.375148,1.374795,-0.5172,0.951632,0.733308,1.366489,0.463568,-0.288653,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


## Create Validation Data

In [64]:
# from sklearn.model_selection import train_test_split

# # Break off validation set from training data
# X_train, X_valid, y_train, y_valid = train_test_split(X_full_OH, y, train_size=0.8, test_size=0.2,
#                                                       random_state=0)

# Implement Models
Using cross-validation and grid search

In [65]:
# Classification

#from sklearn.model_selection import GridSearchCV
# from sklearn.metrics import classification_report

# # Find cross-validated MSE of model
# def score_model(clf, param_grid, scores, X, y):
    
#     for score in scores:
#         clf = GridSearchCV(estimator = clf, scoring = '%s_macro' % score, param_grid = param_grid, 
#                           cv = 3, n_jobs = -1)
#         clf.fit(X, y)
    
# #         print("Best parameters set found on development set:")
# #         print()
# #         print(clf.best_params_)
# #         print()
# #         print("Grid scores on development set:")
# #         print()
# #         means = clf.cv_results_['mean_test_score']
# #         stds = clf.cv_results_['std_test_score']
# #         for mean, std, params in zip(means, stds, clf.cv_results_['params']):
# #             print("%0.3f (+/-%0.03f) for %r"
# #                   % (mean, std * 2, params))
# #         print()
        
# #         print("Detailed classification report:")
# #         print()
# #         print("The model is trained on the full development set.")
# #         print("The scores are computed on the full evaluation set.")
# #         print()
# #         y_true, y_pred = y_test, clf.predict(X_test)
# #         print(classification_report(y_true, y_pred))
# #         print()

In [68]:
# Regression

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Find cross-validated MSE of model
def score_model(clf, param_grid, scores, X, y):
    
    for score in scores:
        clf = GridSearchCV(estimator = clf, scoring = score, param_grid = param_grid, 
                          cv = 3, n_jobs = -1)
        clf.fit(X, y)
    
        print("Best parameters set found on development set:")
        print()
        print(clf.best_params_)
        print()
        print("Grid scores on development set:")
        print()
        means = clf.cv_results_['mean_test_score']
        stds = clf.cv_results_['std_test_score']
        for mean, std, params in zip(means, stds, clf.cv_results_['params']):
            print("%0.3f (+/-%0.03f) for %r"
                  % (mean, std * 2, params))
        print()
        
#         print("Detailed classification report:")
#         print()
#         print("The model is trained on the full development set.")
#         print("The scores are computed on the full evaluation set.")
#         print()
#         y_true, y_pred = y_test, clf.predict(X_test)
#         print(classification_report(y_true, y_pred))
#         print()
        return clf.best_estimator_

In [69]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor()

# Create the parameter grid based on the results of random search 
param_grid = {
#     'bootstrap': [True],
#     'max_depth': [80, 90, 100, 110],
#     'max_features': [2, 3],
#     'min_samples_leaf': [3, 4, 5],
#     'min_samples_split': [8, 10, 12],
#     'n_estimators': [100, 200, 300, 1000]
    'n_estimators': [100]
}

scores = ['neg_mean_squared_error']

best_model = score_model(model, param_grid, scores, X_full_OH, y)

Best parameters set found on development set:

{'n_estimators': 100}

Grid scores on development set:

-944571957.830 (+/-351219682.442) for {'n_estimators': 100}



# Generate Test Predictions

In [70]:
# Fit the model to the training data
best_model.fit(X, y)

# Generate test predictions
preds_test = best_model.predict(X_test)

NameError: name 'X' is not defined

In [None]:
preds_test

## Evaluate Test Performance