In [95]:
import numpy as np
import pandas as pd
import os
from sklearn.preprocessing import Imputer
from sklearn.ensemble import ExtraTreesClassifier
os.chdir("/Users/yuchen/Desktop")

In [96]:
df = pd.read_csv("training.csv") #import training data set
df =pd.get_dummies(df) #create dummy variables for categoricals
Y = df.loc[:,'SalePrice'].values
df.drop(['Id', 'SalePrice'], axis=1, inplace=True) # drop first column of index IDs and Y column after dummies created

In [97]:
columnNames = df.columns #extract column names
columnNames # save column names as an array, will  use late for feature importance

Index([u'MSSubClass', u'LotFrontage', u'LotArea', u'OverallQual',
       u'OverallCond', u'YearBuilt', u'YearRemodAdd', u'MasVnrArea',
       u'BsmtFinSF1', u'BsmtFinSF2',
       ...
       u'SaleType_ConLw', u'SaleType_New', u'SaleType_Oth', u'SaleType_WD',
       u'SaleCondition_Abnorml', u'SaleCondition_AdjLand',
       u'SaleCondition_Alloca', u'SaleCondition_Family',
       u'SaleCondition_Normal', u'SaleCondition_Partial'],
      dtype='object', length=288)

In [98]:
imputer = Imputer(strategy='mean', axis=0) #constructor that calls Imputer class from sklearn
imputer.fit(df) #fit imputer function to training data
df = imputer.transform(df) #fill in missing values with mean
print(df) 

In [99]:
Y.shape #check Y shape (1460 samples)
df.shape #check X shape (1460 x 289 dimensions)
if Y.shape[0] == df.shape[0]: # perform check of shapes
    print("Rows align, ready!")
else:
    print("Rows do not align- check data structure shape!")

Rows align, ready!


In [100]:
if np.isfinite(df).all():
    print("All df missing values have been imputed")
    X = df # rename df as X, since it is ready to be fed into data science models

else: # if it is not ready
    for col in range(df.shape[1]): #iterate through each column and print how many missing values there are
        print("Column {0}: {1} missing values".format(col, np.isnan(df[:,col]).sum()))

All df missing values have been imputed


In [132]:
forest = ExtraTreesClassifier(n_estimators=250, random_state=0)
forest.fit(X, Y)
importances = forest.feature_importances_ #generate array of feature importances
print("Raw importance array:\n")
print(importances)

Raw importance array:

[  1.07188466e-02   1.88242463e-02   2.03176127e-02   1.46001830e-02
   1.12592201e-02   1.89569571e-02   1.92601379e-02   1.30527354e-02
   1.79876902e-02   4.32012099e-03   2.09887107e-02   2.02073336e-02
   2.04000594e-02   1.28581462e-02   8.96119941e-04   2.17140955e-02
   1.08927107e-02   3.06795087e-03   7.08563072e-03   8.69385155e-03
   1.22593606e-02   1.51578899e-03   1.71412921e-02   1.05741674e-02
   1.84573655e-02   9.72277258e-03   2.00331424e-02   1.60073791e-02
   1.61180023e-02   5.61728174e-03   1.26726435e-03   4.31903773e-03
   2.46546419e-04   1.84442547e-03   2.13682908e-02   1.91968205e-02
   3.30498926e-04   1.50086412e-03   6.94411540e-04   5.02442413e-03
   3.59333862e-03   1.67119184e-04   1.43055564e-04   1.54539201e-03
   1.29468847e-03   8.62347219e-03   1.80879195e-03   4.15608575e-04
   8.64630328e-03   1.80380021e-03   1.69137838e-03   1.21547138e-03
   4.19025629e-03   1.65740175e-05   3.06690135e-05   7.81903252e-03
   3.612338

In [133]:
#check if importance array shape matches number of dimensions
len(importances) == X.shape[1]

True

In [130]:
#construct a new dataframe to hold feature importance data, NOTE: importances are percents out of 100%
feature_importance = pd.DataFrame(importances*100, index=columnNames, columns= ["Feature Importance"])
feature_importance = feature_importance.round(2) # round to 2 decimal places to make it easier to read
feature_importance = feature_importance.sort_values(by=['Feature Importance'],ascending=False) #sort descending

#cut off and print the top 10 most important values
feature_importance.head(n=10)

Unnamed: 0,Feature Importance
GrLivArea,2.17
MoSold,2.14
BsmtUnfSF,2.1
1stFlrSF,2.04
LotArea,2.03
TotalBsmtSF,2.02
GarageArea,2.0
YearRemodAdd,1.93
YrSold,1.92
YearBuilt,1.9
