In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest, RFE, f_regression, SequentialFeatureSelector
import wrangle
import f_engineer

In [2]:
# Acquire the Zillow project data through the project-local `wrangle` helper
# and bind the result to `df` for the preparation step that follows.
df = wrangle.get_zillow_project_data()

Let me get that for you...


In [3]:
# Run the project's preparation step on the acquired frame.
# NOTE(review): this rebinds `df`, so the cell is not idempotent -- re-run the
# acquire cell first before re-running this one.
df = wrangle.zillow_proj_prep(df)

# Keep the coordinates in their own frame so they remain available after they
# are removed from the modelling data (list-based .loc selection returns a copy).
location_data = df.loc[:, ['longitude', 'latitude']]

# Remove the coordinate columns by rebinding `df` to the result of drop()
# instead of using inplace=True: the object returned by the prep step is left
# untouched and the statement can be chained or re-used safely.
df = df.drop(columns=['longitude', 'latitude'])
df.head()

Unnamed: 0,bathroomcnt,bedroomcnt,finishedsquarefeet12,fips,garagecarcnt,lotsizesquarefeet,yearbuilt,taxvaluedollarcnt,assessmentyear,logerror,transactiondate
0,3.0,4.0,2504.0,6037.0,0,6009.0,1979.0,262584.0,2016.0,0.015371,2017-01-10
1,3.0,4.0,2077.0,6037.0,0,11421.0,1979.0,950681.0,2016.0,-0.02091,2017-01-10
2,2.0,4.0,1722.0,6037.0,0,5410.0,1909.0,207351.0,2016.0,0.043068,2017-01-10
3,2.0,3.0,1242.0,6037.0,0,8328.0,1962.0,125553.0,2016.0,0.005903,2017-01-10
4,2.0,5.0,2359.0,6037.0,0,4844.0,1966.0,708879.0,2016.0,-0.056877,2017-01-10


In [4]:
# Partition the prepared frame into train / validate / test splits using the
# project's helper, then report the (rows, columns) shape of each split in
# that order.
train, validate, test = wrangle.split_dataframe(df)
for split in (train, validate, test):
    print(split.shape)

(29026, 11)
(12441, 11)
(10367, 11)


In [5]:
# Candidate predictor columns considered by the feature-selection steps.
candidate_features = [
    'bathroomcnt',
    'bedroomcnt',
    'finishedsquarefeet12',
    'garagecarcnt',
    'lotsizesquarefeet',
]
# Restrict the training split to those columns (all rows are kept).
train_subset = train[candidate_features]

In [6]:
# Scale the candidate features with the project's f_engineer.scale_minmax helper.
# The result is bound to its own name so that `df` (the full prepared dataset
# that was split into train/validate/test) is not overwritten by a
# training-only subset.
# NOTE(review): the feature-selection cells below fit on the unscaled
# `train_subset`; confirm whether they were meant to use this scaled version.
train_subset_scaled = f_engineer.scale_minmax(train_subset)

In [7]:
# Preview the first rows of the training split (all columns, including the
# target taxvaluedollarcnt).
train.head()

Unnamed: 0,bathroomcnt,bedroomcnt,finishedsquarefeet12,fips,garagecarcnt,lotsizesquarefeet,yearbuilt,taxvaluedollarcnt,assessmentyear,logerror,transactiondate
48363,2.0,2.0,1370.0,6037.0,0,7092.0,1951.0,529490.0,2016.0,0.030738,2017-08-25
9545,2.0,3.0,2096.0,6037.0,0,40908.0,1965.0,705000.0,2016.0,0.030276,2017-03-07
6489,3.0,2.0,1809.0,6037.0,0,7987.0,1955.0,335130.0,2016.0,-0.04642,2017-02-07
8697,5.0,6.0,3906.0,6037.0,0,16495.0,1954.0,1387000.0,2016.0,0.059611,2017-02-17
5280,2.0,3.0,1154.0,6059.0,1,3214.0,1976.0,238113.0,2016.0,-0.006345,2017-02-15


In [8]:
# Features and target used by every selector below.
# X_train is simply another name for train_subset (no copy is made here, and
# the values are the unscaled ones); y_train is the taxvaluedollarcnt column
# of the training split.
X_train, y_train = train_subset, train['taxvaluedollarcnt']

In [9]:
# Univariate selection: f_regression scores each feature on its own against
# the target, and SelectKBest keeps the k=2 highest-scoring features.
# The fit call is left as the last expression so the fitted selector is
# displayed as the cell output.
kbest = SelectKBest(score_func=f_regression, k=2)
kbest.fit(X=X_train, y=y_train)

SelectKBest(k=2, score_func=<function f_regression at 0x7fe680b9d430>)

In [10]:
# Tabulate the per-feature p-values ('p') and F-scores ('f') from the fitted
# SelectKBest, one row per candidate feature.
kbest_results = pd.DataFrame(
    {'p': kbest.pvalues_, 'f': kbest.scores_},
    index=X_train.columns,
)
kbest_results

Unnamed: 0,p,f
bathroomcnt,0.0,11604.673078
bedroomcnt,0.0,2462.500517
finishedsquarefeet12,0.0,18490.547467
garagecarcnt,4.561413e-24,102.57196
lotsizesquarefeet,0.0001104743,14.952806


In [11]:
# Boolean mask over the candidate features marking the ones SelectKBest kept;
# indexing the column labels with it yields the selected feature names.
kbest_mask = kbest.get_support()
X_train.columns[kbest_mask]

Index(['bathroomcnt', 'finishedsquarefeet12'], dtype='object')

In [12]:
from sklearn.linear_model import LinearRegression

In [18]:
# Recursive feature elimination: repeatedly fit the estimator and discard the
# least important feature until n_features_to_select=2 remain.
# NOTE(review): with a linear model, importance comes from coefficient
# magnitude, and X_train is unscaled here, so the ranking depends on each
# column's units -- consider confirming the result on scaled features.
model = LinearRegression()
rfe = RFE(estimator=model, n_features_to_select=2)
rfe.fit(X=X_train, y=y_train)

RFE(estimator=LinearRegression(), n_features_to_select=2)

In [14]:
# One-column table of RFE rankings per candidate feature
# (rank 1 marks the features RFE selected).
pd.Series(rfe.ranking_, index=X_train.columns, name='rfe_ranking').to_frame()

Unnamed: 0,rfe_ranking
bathroomcnt,1
bedroomcnt,1
finishedsquarefeet12,3
garagecarcnt,2
lotsizesquarefeet,4


In [15]:
# Boolean mask over the candidate features marking the ones RFE kept;
# indexing the column labels with it yields the selected feature names.
rfe_mask = rfe.get_support()
X_train.columns[rfe_mask]

Index(['bathroomcnt', 'bedroomcnt'], dtype='object')

In [16]:
# Sequential feature selection in the *backward* direction: start from all
# candidate features and progressively remove one at a time, guided by
# cross-validated negative mean absolute error, until 2 features remain.
model = LinearRegression()
sfs = SequentialFeatureSelector(
    estimator=model,
    n_features_to_select=2,
    direction='backward',
    scoring='neg_mean_absolute_error',
)
sfs.fit(X_train, y_train)

SequentialFeatureSelector(direction='backward', estimator=LinearRegression(),
                          n_features_to_select=2,
                          scoring='neg_mean_absolute_error')

In [17]:
# Rebuild a labelled DataFrame from the selector's output: keep only the
# columns the sequential selector retained, re-attach the original row index,
# and name the columns using the selector's support mask.
sfs_columns = X_train.columns[sfs.support_]
sfs_values = sfs.transform(X_train)
X_train_transformed = pd.DataFrame(sfs_values, index=X_train.index, columns=sfs_columns)
X_train_transformed.head()

Unnamed: 0,bedroomcnt,finishedsquarefeet12
48363,2.0,1370.0
9545,3.0,2096.0
6489,2.0,1809.0
8697,6.0,3906.0
5280,3.0,1154.0
