In [1]:
# Imports
from env import host, user, password
import os

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.feature_selection import SelectKBest, f_regression, RFE
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, QuantileTransformer
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score

import wrangle as w
import explore as e
import model as m

# Seed NumPy's legacy global RNG so NumPy-based randomness repeats across runs.
np.random.seed(42)

In [2]:
# Acquire/prepare the Zillow data, then split it into train / validate / test.
df = w.wrangle_zillow()
train, validate, test = w.split_function(df)
# Columns to rescale. has_pool and the county_* columns are not listed here;
# they show up as 0/1 values in the scaled frame displayed further down.
to_scale = ['bathrooms', 'bedrooms', 'squarefeet', 'lotsize_sqft', 'year']
# NOTE(review): presumably a MinMax scaler fit on train only and applied to all
# three splits -- confirm in wrangle.py.
train_scaled, validate_scaled, test_scaled = w.get_minmax_scaled(train, validate, test, to_scale)

csv file found and read
Prepared df: (47949, 10)

Train: (28769, 10)
Validate: (9590, 10)
Test: (9590, 10)


In [3]:
# Feature columns fed to the models; property_value is the regression target.
X_columns = [
    'bathrooms', 'bedrooms', 'has_pool', 'squarefeet',
    'lotsize_sqft', 'year', 'county_Orange', 'county_Ventura',
]

# Feature frames, one per split.
X_train_scaled = train_scaled[X_columns]
X_validate_scaled = validate_scaled[X_columns]
X_test_scaled = test_scaled[X_columns]

# Matching target series, one per split.
y_train = train_scaled['property_value']
y_validate = validate_scaled['property_value']
y_test = test_scaled['property_value']

In [4]:
# Preview the first rows of the selected training features.
X_train_scaled.head()

Unnamed: 0,bathrooms,bedrooms,has_pool,squarefeet,lotsize_sqft,year,county_Orange,county_Ventura
46866,0.2,0.3,0,0.15097,0.099934,0.562044,0,0
43125,0.25,0.4,1,0.256793,0.124719,0.722628,1,0
38231,0.1,0.2,0,0.081307,0.111912,0.510949,0,0
20644,0.1,0.3,0,0.130133,0.112081,0.547445,0,0
46076,0.3,0.4,0,0.275587,0.390973,0.729927,1,0


In [5]:
# MVP feature set: only squarefeet, bathrooms and bedrooms are sent to the models.
mvp_columns = ['squarefeet', 'bathrooms', 'bedrooms']
# Preview the MVP subset of the training features.
X_train_scaled[mvp_columns].head()

Unnamed: 0,squarefeet,bathrooms,bedrooms
46866,0.15097,0.2,0.3
43125,0.256793,0.25,0.4
38231,0.081307,0.1,0.2
20644,0.130133,0.1,0.3
46076,0.275587,0.3,0.4


In [6]:
def get_reg_model_metrics_df(X_train_scaled, y_train, X_validate_scaled, y_validate,
                             alpha=1, power=2, degrees=2):
    """
    Collect train/validate metrics for several regression models in one table.

    This function will
    - ask the ``model`` module (imported as ``m``) for baseline metrics using
      only y_train and y_validate
    - ask ``m`` for OLS, LassoLars, Polynomial Regression and GLM metrics using
      X_train_scaled, y_train, X_validate_scaled, y_validate
    - accept alpha, power and degrees (defaults 1/2/2), but NOT forward them:
      every helper is called without hyperparameters, so each one runs with
      whatever defaults it defines
        - NOTE(review): intended mapping looks like alpha -> LassoLars,
          degrees -> Polynomial Regression, power -> GLM; confirm in model.py
          before wiring them through
    - return a dataframe with columns model, RMSE_train, R^2_train,
      RMSE_validate, R^2_validate and one row each, in this order, for
        - baseline
        - ols
        - LassoLars
        - Polynomial Regression
        - GLM
    """
    # Row 0: the baseline, which needs the targets only.
    rmse_tr, r2_tr, rmse_val, r2_val = m.get_baseline_train_val_metrics(y_train, y_validate)
    baseline_row = {'model': 'baseline',
                    'RMSE_train': rmse_tr,
                    'R^2_train': r2_tr,
                    'RMSE_validate': rmse_val,
                    'R^2_validate': r2_val}
    results_df = pd.DataFrame(data=[baseline_row])

    # Rows 1..4: every remaining helper takes the same four arguments and
    # returns the same four metrics, so they can share one loop.
    model_helpers = (
        ('ols', m.get_ols_train_val_metrics),
        ('LassoLars', m.get_lassolars_train_val_metrics),
        ('Polynomial Regression', m.get_polynomial_train_val_metrics),
        ('GLM', m.get_glm_train_val_metrics),
    )
    for row_label, (model_name, get_metrics) in enumerate(model_helpers, start=1):
        rmse_tr, r2_tr, rmse_val, r2_val = get_metrics(X_train_scaled,
                                                       y_train,
                                                       X_validate_scaled,
                                                       y_validate)
        results_df.loc[row_label] = [model_name, rmse_tr, r2_tr, rmse_val, r2_val]

    return results_df

In [7]:
# Compare all models on the MVP feature subset (squarefeet, bathrooms, bedrooms).
# NOTE(review): this calls the copy in model.py (m.), not the
# get_reg_model_metrics_df defined in the notebook cell above -- confirm the
# two are in sync, or keep only one definition.
m.get_reg_model_metrics_df(X_train_scaled[mvp_columns], y_train, X_validate_scaled[mvp_columns], y_validate)

Unnamed: 0,model,RMSE_train,R^2_train,RMSE_validate,R^2_validate
0,baseline,298892.655016,0.0,304361.74505,-0.00021
1,ols,250358.676421,0.298392,251742.51915,0.315735
2,LassoLars,250359.030586,0.29839,251748.930442,0.315701
3,Polynomial Regression,249975.827966,0.300536,251443.170249,0.317362
4,GLM,250358.676421,0.298392,251742.519027,0.315735


In [8]:
# Compare all models again, this time on every column in X_columns.
m.get_reg_model_metrics_df(X_train_scaled, y_train, X_validate_scaled, y_validate)

Unnamed: 0,model,RMSE_train,R^2_train,RMSE_validate,R^2_validate
0,baseline,298892.655016,0.0,304361.74505,-0.00021
1,ols,248287.419729,0.309953,249166.780523,0.329666
2,LassoLars,248288.423831,0.309947,249175.88693,0.329617
3,Polynomial Regression,241928.292058,0.344847,243158.906096,0.361602
4,GLM,248287.419729,0.309953,249166.780753,0.329666


In [9]:
# Tried several hyperparameter values for LassoLars, Polynomial Regression, and GLM
# (those runs are not shown in this notebook); the default values performed best.
# Best overall so far: Polynomial Regression with all features.
# Next: try different subsets of features.

In [10]:
# Show the available feature columns again before trying feature subsets.
X_train_scaled.head()

Unnamed: 0,bathrooms,bedrooms,has_pool,squarefeet,lotsize_sqft,year,county_Orange,county_Ventura
46866,0.2,0.3,0,0.15097,0.099934,0.562044,0,0
43125,0.25,0.4,1,0.256793,0.124719,0.722628,1,0
38231,0.1,0.2,0,0.081307,0.111912,0.510949,0,0
20644,0.1,0.3,0,0.130133,0.112081,0.547445,0,0
46076,0.3,0.4,0,0.275587,0.390973,0.729927,1,0


In [11]:
# Feature subset: the MVP columns plus year.
test_columns = ['squarefeet', 'bathrooms', 'bedrooms', 'year']
m.get_reg_model_metrics_df(X_train_scaled[test_columns], y_train, X_validate_scaled[test_columns], y_validate)

Unnamed: 0,model,RMSE_train,R^2_train,RMSE_validate,R^2_validate
0,baseline,298892.655016,0.0,304361.74505,-0.00021
1,ols,250321.156139,0.298602,251739.162512,0.315754
2,LassoLars,250321.688139,0.298599,251743.335184,0.315731
3,Polynomial Regression,248463.286709,0.308975,250360.391443,0.323228
4,GLM,250321.156139,0.298602,251739.162595,0.315754


In [12]:
# Feature subset: squarefeet, bathrooms, bedrooms, year, plus has_pool.
test_columns = ['squarefeet', 'bathrooms', 'bedrooms', 'year', 'has_pool']
m.get_reg_model_metrics_df(X_train_scaled[test_columns], y_train, X_validate_scaled[test_columns], y_validate)

Unnamed: 0,model,RMSE_train,R^2_train,RMSE_validate,R^2_validate
0,baseline,298892.655016,0.0,304361.74505,-0.00021
1,ols,250060.999717,0.300059,251288.448812,0.318202
2,LassoLars,250061.556455,0.300056,251294.515365,0.318169
3,Polynomial Regression,247969.831835,0.311717,249674.936087,0.326929
4,GLM,250060.999717,0.300059,251288.445204,0.318202


In [13]:
# Feature subset: squarefeet, bathrooms, bedrooms, year, has_pool, plus county_Orange.
test_columns = ['squarefeet', 'bathrooms', 'bedrooms', 'year', 'has_pool', 'county_Orange']
m.get_reg_model_metrics_df(X_train_scaled[test_columns], y_train, X_validate_scaled[test_columns], y_validate)

Unnamed: 0,model,RMSE_train,R^2_train,RMSE_validate,R^2_validate
0,baseline,298892.655016,0.0,304361.74505,-0.00021
1,ols,248766.222581,0.307289,249550.194526,0.327601
2,LassoLars,248766.888476,0.307285,249561.782882,0.327539
3,Polynomial Regression,243822.368308,0.334548,244579.217425,0.354123
4,GLM,248766.222581,0.307289,249550.194528,0.327601


In [14]:
# Feature subset: everything except county_Ventura (lotsize_sqft added here).
test_columns = ['squarefeet', 'bathrooms', 'bedrooms', 'year', 'has_pool', 'county_Orange', 'lotsize_sqft']
m.get_reg_model_metrics_df(X_train_scaled[test_columns], y_train, X_validate_scaled[test_columns], y_validate)

Unnamed: 0,model,RMSE_train,R^2_train,RMSE_validate,R^2_validate
0,baseline,298892.655016,0.0,304361.74505,-0.00021
1,ols,248564.0944,0.308414,249454.4369,0.328117
2,LassoLars,248564.901555,0.308409,249463.404223,0.328069
3,Polynomial Regression,243434.485825,0.336664,244554.801864,0.354252
4,GLM,248564.0944,0.308414,249454.437592,0.328117


In [15]:
# Feature subset: all eight columns from X_columns, listed in a different order.
# The ols / LassoLars / GLM rows below match the earlier all-features run; the
# Polynomial Regression train RMSE differs slightly.
# NOTE(review): presumably a numerical effect of column order -- confirm.
test_columns = ['squarefeet', 'bathrooms', 'bedrooms', 'year', 'has_pool', 'county_Orange', 'lotsize_sqft', 'county_Ventura']
m.get_reg_model_metrics_df(X_train_scaled[test_columns], y_train, X_validate_scaled[test_columns], y_validate)

Unnamed: 0,model,RMSE_train,R^2_train,RMSE_validate,R^2_validate
0,baseline,298892.655016,0.0,304361.74505,-0.00021
1,ols,248287.419729,0.309953,249166.780523,0.329666
2,LassoLars,248288.423831,0.309947,249175.88693,0.329617
3,Polynomial Regression,241953.760464,0.344709,243158.652921,0.361604
4,GLM,248287.419729,0.309953,249166.780753,0.329666


In [16]:
# Feature subset: only squarefeet and bathrooms (the MVP set without bedrooms).
test_columns = ['squarefeet', 'bathrooms']
m.get_reg_model_metrics_df(X_train_scaled[test_columns], y_train, X_validate_scaled[test_columns], y_validate)

Unnamed: 0,model,RMSE_train,R^2_train,RMSE_validate,R^2_validate
0,baseline,298892.655016,0.0,304361.74505,-0.00021
1,ols,252884.264252,0.284165,254332.417917,0.301584
2,LassoLars,252884.326864,0.284164,254337.936606,0.301553
3,Polynomial Regression,252446.20535,0.286643,254019.228564,0.303303
4,GLM,252884.264252,0.284165,254332.417918,0.301584


# Summary of Modeling
## The best model was Polynomial Regression (degrees=2) using all feature columns
- RMSE for train: 241,928
- R^2 score for train: .34
- RMSE for validate: 243,159
- R^2 score for validate: .36

## Now let's run it on test

In [17]:
# Final evaluation: the test split is passed in the helper's validate slot, so
# the returned tuple reads (RMSE_train, R2_train, RMSE_test, R2_test).
# NOTE(review): presumably the helper refits on train on every call -- the
# train metrics here match the earlier all-features run; confirm in model.py.
m.get_polynomial_train_val_metrics(X_train_scaled, y_train, X_test_scaled, y_test)

(241928.2920576337, 0.3448468502989358, 242448.3976046919, 0.33275056274418036)

## Results of Polynomial Regression model on test data:

- RMSE for test: 242,448
- R^2 for test: .33

- This model beats the baseline on both RMSE and R^2
- Baseline RMSE: 298,893 (computed on train; the validate baseline was 304,362)
- Baseline R^2: 0 (train)