In [134]:
import numpy as np
import pandas as pd
import os
from sklearn.preprocessing import Imputer
from sklearn.ensemble import RandomForestRegressor
# Work from the directory that holds the input CSV. The location can be
# overridden with the DATA_DIR environment variable; when it is unset the
# original hard-coded path is used, so existing runs behave as before.
DATA_DIR = os.environ.get("DATA_DIR", "/Users/yuchen/Desktop")
os.chdir(DATA_DIR)

In [135]:
# Read the training set and expand its categorical columns into dummy
# (indicator) columns in one step.
df = pd.get_dummies(pd.read_csv("training.csv"))
# Pull the target out as a NumPy array before it is removed from the features.
Y = df['SalePrice'].values
# Remove the row identifier and the target so that only feature columns remain.
df = df.drop(['Id', 'SalePrice'], axis=1)

In [136]:
# Keep the feature column labels (a pandas Index) while df is still a
# DataFrame; they are used later to label the feature importances.
columnNames = df.columns
# Display the labels as this cell's output.
columnNames

Index([u'MSSubClass', u'LotFrontage', u'LotArea', u'OverallQual',
       u'OverallCond', u'YearBuilt', u'YearRemodAdd', u'MasVnrArea',
       u'BsmtFinSF1', u'BsmtFinSF2',
       ...
       u'SaleType_ConLw', u'SaleType_New', u'SaleType_Oth', u'SaleType_WD',
       u'SaleCondition_Abnorml', u'SaleCondition_AdjLand',
       u'SaleCondition_Alloca', u'SaleCondition_Family',
       u'SaleCondition_Normal', u'SaleCondition_Partial'],
      dtype='object', length=288)

In [137]:
# Replace every missing entry with the mean of its column (axis=0).
# fit_transform learns the column means and applies them in a single call;
# the result rebinds df to a plain NumPy array.
# NOTE(review): newer scikit-learn releases replace Imputer with
# sklearn.impute.SimpleImputer -- confirm against the installed version.
imputer = Imputer(strategy='mean', axis=0)
df = imputer.fit_transform(df)
print(df)

[[  6.00000000e+01   6.50000000e+01   8.45000000e+03 ...,   0.00000000e+00
    1.00000000e+00   0.00000000e+00]
 [  2.00000000e+01   8.00000000e+01   9.60000000e+03 ...,   0.00000000e+00
    1.00000000e+00   0.00000000e+00]
 [  6.00000000e+01   6.80000000e+01   1.12500000e+04 ...,   0.00000000e+00
    1.00000000e+00   0.00000000e+00]
 ..., 
 [  7.00000000e+01   6.60000000e+01   9.04200000e+03 ...,   0.00000000e+00
    1.00000000e+00   0.00000000e+00]
 [  2.00000000e+01   6.80000000e+01   9.71700000e+03 ...,   0.00000000e+00
    1.00000000e+00   0.00000000e+00]
 [  2.00000000e+01   7.50000000e+01   9.93700000e+03 ...,   0.00000000e+00
    1.00000000e+00   0.00000000e+00]]


In [138]:
# Sanity check: the target vector and the feature matrix must describe the
# same number of samples (one row each). The column Index displayed earlier
# has length 288, so that is the expected number of feature columns.
# The bare `Y.shape` / `df.shape` statements that used to sit here were
# dropped: mid-cell expressions are never displayed, so they had no effect.
if Y.shape[0] == df.shape[0]:
    print("Rows align, ready!")
else:
    print("Rows do not align- check data structure shape!")

Rows align, ready!


In [139]:
# Guard: every entry of the feature matrix must be finite (no NaN, no +/-inf)
# before it is handed to a model.
if np.isfinite(df).all():
    print("All df missing values have been imputed")
    X = df # feature matrix used by the model cell that follows
else:
    # Count NaN *and* infinite entries per column so the report agrees with
    # the isfinite() guard above (counting only NaN would show 0 for inf).
    bad_per_col = (~np.isfinite(df)).sum(axis=0)
    for col, n_bad in enumerate(bad_per_col):
        print("Column {0}: {1} missing values".format(col, n_bad))
    # X is not bound on this path; stop here so a later cell cannot fail with
    # a NameError or silently reuse an X left over from a previous run.
    raise ValueError("df still contains non-finite values; fix the imputation step before modelling")

All df missing values have been imputed


In [140]:
# Train a 10-tree random forest with a fixed seed so reruns give the same
# importances; fit() returns the estimator itself, so it can be chained.
forest = RandomForestRegressor(n_estimators=10, random_state=0).fit(X, Y)
# One importance score per feature column.
importances = forest.feature_importances_
print("Raw importance array:\n")
print(importances)

Raw importance array:

[  1.10203919e-03   7.27373290e-03   1.13021768e-02   5.74403172e-01
   5.29198010e-03   1.57435370e-02   5.18839404e-03   2.41946325e-03
   2.42789800e-02   3.50285215e-04   4.87414958e-03   3.42813710e-02
   2.53613220e-02   3.68384140e-02   6.97048622e-05   1.26246173e-01
   5.04161701e-04   5.57192597e-05   2.33003296e-03   1.89773150e-03
   4.20040653e-04   8.71716632e-05   1.72217018e-02   5.27944047e-03
   6.70475039e-03   1.04545313e-02   1.54499374e-02   3.13667088e-03
   3.72736525e-03   1.18640031e-03   3.57980205e-04   1.61162615e-04
   1.39529553e-05   2.34696301e-05   3.57887439e-03   2.24500506e-03
   1.96649319e-05   1.27678313e-04   1.68935271e-05   4.75463630e-04
   1.12436730e-03   0.00000000e+00   3.23603321e-06   2.18438703e-04
   7.23815593e-05   1.64032349e-04   1.91454665e-05   7.08170462e-04
   1.99395448e-03   1.27160795e-03   8.46693202e-06   2.69515208e-05
   2.71844202e-04   0.00000000e+00   0.00000000e+00   5.51017963e-05
   1.505651

In [141]:
# There should be exactly one importance score per feature column;
# this evaluates to True when the two counts agree.
importances.shape[0] == X.shape[1]

True

In [142]:
# Build a table of feature importances expressed as percentages (the raw
# scores are multiplied by 100) and labelled with the feature column names.
# Sort on the unrounded values first and round only afterwards: rounding
# before sorting turns near-equal importances into ties whose relative order
# is then decided by the sort algorithm rather than by the real scores.
feature_importance = (
    pd.DataFrame(importances * 100, index=columnNames, columns=["Feature Importance"])
    .sort_values(by=['Feature Importance'], ascending=False)  # most important first
    .round(2)  # two decimals for readability only
)

# Show the ten most important features.
feature_importance.head(n=10)

Unnamed: 0,Feature Importance
OverallQual,57.44
GrLivArea,12.62
2ndFlrSF,3.68
TotalBsmtSF,3.43
1stFlrSF,2.54
BsmtFinSF1,2.43
TotRmsAbvGrd,1.72
YearBuilt,1.57
GarageArea,1.54
LotArea,1.13
