In [2]:
import seaborn as sns
from sklearn.linear_model import LinearRegression
import warnings 
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices from the libraries used below; consider
# filtering specific warning categories instead.
warnings.filterwarnings('ignore')

In [5]:
import acquire
import evaluate
import split_scale
import feature_selection

# ACQUIRE
### Creates a DataFrame from a SELECT query run against the Zillow database

In [6]:
# Pull the Zillow data into df through the project-local acquire module.
df = acquire.wrangle_zillow()

In [7]:
# A selection of the salient-looking variables to use in a regression model (~7,500 rows):
# eight candidate predictors plus the target, taxvaluedollarcnt.
# NOTE(review): regionidcounty and heatingorsystemtypeid look like categorical IDs rather
# than continuous measures — confirm they should enter a linear model as plain numbers.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7548 entries, 0 to 7568
Data columns (total 9 columns):
poolcnt                         7548 non-null float64
fireplacecnt                    7548 non-null float64
fullbathcnt                     7548 non-null float64
garagecarcnt                    7548 non-null float64
regionidcounty                  7548 non-null int64
heatingorsystemtypeid           7548 non-null float64
bedroomcnt                      7548 non-null float64
calculatedfinishedsquarefeet    7548 non-null float64
taxvaluedollarcnt               7548 non-null float64
dtypes: float64(8), int64(1)
memory usage: 589.7 KB


In [8]:
len(df)

7548

## SPLITTING and SCALING

In [9]:
# Break off a training set for model fitting. The test set is stashed away
# until there is a model worth evaluating on it.
# NOTE(review): no seed is passed here — confirm split_my_data fixes its own
# random state so the split is reproducible across runs.
train, test = split_scale.split_my_data(df)

In [10]:
# Only the last expression of a cell is displayed, so a train.head() call placed
# before this line produced no output; the cell reports just the shape.
train.shape

(6038, 9)

In [11]:
test.shape

(1510, 9)

In [12]:
# One of the independent variables has a very different numerical range from the others,
# so the columns are standardised around their means to make their relationships to the
# target easier to compare.
# NOTE(review): presumably the scaler is fit on train only and then applied to test —
# confirm in split_scale.standard_scaler.
scaler, train_scaled, test_scaled = split_scale.standard_scaler(train, test)

In [13]:
# Features come from the standardised frame; the target is read from the
# unscaled frame so it keeps its original units.
X_train = train_scaled.drop(columns=['taxvaluedollarcnt'])
y_train = train[['taxvaluedollarcnt']]

In [14]:
# Same construction for the held-out rows: scaled features, unscaled target.
X_test = test_scaled.drop(columns=['taxvaluedollarcnt'])
y_test = test[['taxvaluedollarcnt']]

## FEATURE SELECTION

In [15]:
# Use RFE to step through candidate feature counts; the count with the lowest
# error score is reported as optimal.
# NOTE(review): X_test / y_test are passed into feature selection here, which
# conflicts with keeping the test set held out until a final model exists.
# If the helper scores candidates on them, test information leaks into the
# model choice — consider carving a validation split out of train instead.
optimal_n_of_features = feature_selection.optimal_number_of_features(X_train, y_train, X_test, y_test)
optimal_n_of_features

3

In [18]:
# Request exactly the feature count computed by optimal_number_of_features
# rather than repeating it as a literal, so the selection follows the data
# if the optimal count changes on a re-run.
best_features = feature_selection.optimal_features(X_train, y_train, optimal_n_of_features)

In [19]:
best_features

Index(['garagecarcnt', 'bedroomcnt', 'calculatedfinishedsquarefeet'], dtype='object')

## MODELING AND EVALUATION

#### Model 1, Using every available feature

In [20]:
# Three linear regressions are fit in this section:
#   model_1 - every feature in X_train
#   model_2 - the 3 features chosen by RFE
#   model_3 - 3 features hand-picked during preliminary exploration

# model_1 is the all-features reference fit.
model_1=LinearRegression()
model_1.fit(X_train,y_train)

# model_1 is now ready to make predictions on the feature matrix it was fit on
# (X_train).

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [21]:
# In-sample predictions: model_1 is scored on the same rows it was fit on.
y_hat_1=model_1.predict(X_train)

In [22]:
model_1.coef_

array([[   6810.25478076,  -34355.68934869,   94014.69357707,
        -134076.69123173, -160565.76054141,   52516.02752835,
        -138530.74402663,  505098.81674049]])

In [None]:
model_1.intercept_

In [23]:
# Error metrics for model_1 on the training data.
# NOTE(review): mse1 displays later as a one-element tuple wrapping a Series, which
# looks like a stray trailing comma inside evaluate.regression_errors — confirm.
mse1, rmse1, r_squared1 = evaluate.regression_errors(y_train, y_hat_1)

#### Model 2, Using the optimally selected features

In [24]:
# Narrow the training matrix to the columns RFE picked; model_2 is fit on these only.
X_train_optimo = X_train.loc[:, best_features]

In [25]:
# Fit model_2 on the RFE-selected columns only.
model_2=LinearRegression()
model_2.fit(X_train_optimo,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [26]:
model_2.coef_

array([[ -35665.84176063, -124788.82618653,  553421.95815295]])

In [27]:
model_2.intercept_

array([543543.31500497])

In [28]:
# In-sample predictions for model_2 (same training rows it was fit on).
y_hat_2=model_2.predict(X_train_optimo)

In [29]:
# Error metrics for model_2 on the training data, for comparison with model_1.
mse2, rmse2, r_squared2 = evaluate.regression_errors(y_train, y_hat_2)

In [30]:
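# These R-squared values feel too low. And why does R-squared go down with the "optimally" selected features?
# (Model 2's features are a subset of model 1's, so its training R-squared cannot exceed model 1's:
# ordinary least squares never fits the training data worse when it is given extra columns.)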

In [31]:
r_squared1

array([[0.42551275]])

In [32]:
r_squared2

array([[0.41526719]])

In [33]:
mse1

(taxvaluedollarcnt    3.161381e+11
 dtype: float64,)

In [34]:
mse2

(taxvaluedollarcnt    3.217762e+11
 dtype: float64,)

#### Model 3, Using three hand-picked features

In [35]:
# model_3 is a comparison fit that uses just the 3 features suggested by the
# preliminary exploration. List the available columns to pick them from.
X_train.columns

Index(['poolcnt', 'fireplacecnt', 'fullbathcnt', 'garagecarcnt',
       'regionidcounty', 'heatingorsystemtypeid', 'bedroomcnt',
       'calculatedfinishedsquarefeet'],
      dtype='object')

In [37]:
# Hand-picked feature subset used to fit model_3.
X_train_baseline = X_train.loc[
    :, ['fullbathcnt', 'bedroomcnt', 'calculatedfinishedsquarefeet']
]

In [39]:
X_train_baseline.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6038 entries, 4525 to 3593
Data columns (total 3 columns):
fullbathcnt                     6038 non-null float64
bedroomcnt                      6038 non-null float64
calculatedfinishedsquarefeet    6038 non-null float64
dtypes: float64(3)
memory usage: 188.7 KB


In [41]:
# Fit model_3 on the three hand-picked columns.
model_3=LinearRegression()
model_3.fit(X_train_baseline,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [42]:
# In-sample predictions for model_3 (same training rows it was fit on).
y_hat_3=model_3.predict(X_train_baseline)

In [44]:
# Error metrics for model_3 on the training data.
mse3, rmse3, r_squared3 = evaluate.regression_errors(y_train, y_hat_3)

In [45]:
r_squared3

array([[0.41715443]])

In [48]:
# Summarise the in-sample R^2 of the three fits.
# Feature counts are read from the design matrices instead of being typed in:
# model_1 was fit on the 8 columns of X_train (the ninth column of df is the
# target), so reporting it as "9 features" was wrong.
# .item() pulls the scalar out of each 1x1 array; calling float() on an array
# with ndim > 0 is deprecated in recent NumPy releases.
print(f"The R^2 when using {X_train.shape[1]} features is {r_squared1.item()}")
print(f"The R^2 when using {X_train_optimo.shape[1]} 'optimal' features is {r_squared2.item()}")
print(f"The R^2 when using {X_train_baseline.shape[1]} somewhat arbitrary features is {r_squared3.item()}")

The R^2 when using 9 features is 0.4255127489540933
The R^2 when using 3 'optimal' features is 0.41526719042236654
The R^2 when using 3 somewhat arbitrary features is 0.41715442614882126
