In [1]:
import pandas as pd
import numpy as np
import wrangle_zillow as wrangle
import evaluate
import matplotlib.pyplot as plt
import seaborn as sns

# modeling methods
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import explained_variance_score
from sklearn.model_selection import train_test_split
from lazypredict.Supervised import LazyClassifier, LazyRegressor
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, QuantileTransformer

# feature selection:
from sklearn.feature_selection import SelectKBest, RFE, f_regression


import warnings
warnings.filterwarnings("ignore")

In [2]:
zillow = pd.read_csv('zillow.csv')

In [3]:
zillow.head()

Unnamed: 0,bedrooms,bathrooms,area,tax_value,year_built,taxamount,fips
0,0.0,0.0,,27516.0,,,6037.0
1,0.0,0.0,,10.0,,,6037.0
2,0.0,0.0,,10.0,,,6037.0
3,0.0,0.0,,2108.0,,174.21,6037.0
4,4.0,2.0,3633.0,296425.0,2005.0,6941.39,6037.0


In [4]:
train, validate, test = wrangle.prepare_zillow(zillow)

In [5]:
train.shape, validate.shape, test.shape

((1038959, 7), (445269, 7), (371057, 7))

In [6]:
train['age'] = 2017 - train.year_built
validate['age'] = 2017 - validate.year_built
test['age'] = 2017 - test.year_built

In [7]:
train = train.sample(1000, random_state=12)
validate = validate.sample(100, random_state=12)
test = test.sample(100, random_state=12)

In [8]:
train.columns

Index(['bedrooms', 'bathrooms', 'area', 'tax_value', 'year_built', 'taxamount',
       'fips', 'age'],
      dtype='object')

In [9]:
train.head()

Unnamed: 0,bedrooms,bathrooms,area,tax_value,year_built,taxamount,fips,age
1158290,3.0,2.0,1612.0,518016.0,1955.0,6567.84,6037.0,62.0
644231,4.0,2.0,1805.0,573616.0,1972.0,6843.24,6059.0,45.0
1459488,2.0,1.0,1218.0,182417.0,1951.0,2680.17,6037.0,66.0
2149989,3.0,2.5,1885.0,358469.0,1965.0,4410.72,6059.0,52.0
185697,3.0,2.0,1477.0,229000.0,1994.0,3575.09,6037.0,23.0


In [10]:
train_scaled, validate_scaled, test_scaled = wrangle.scale_data(train, validate, test, columns_to_scale=['area','age'])

In [11]:
X_train = train.drop(columns=['tax_value'])
y_train = train.tax_value
X_test = test.drop(columns=['tax_value'])
y_test = test.tax_value

In [12]:
X_train.dtypes

bedrooms      float64
bathrooms     float64
area          float64
year_built    float64
taxamount     float64
fips           object
age           float64
dtype: object

In [13]:
train.tax_value

1158290   518016.00
644231    573616.00
1459488   182417.00
2149989   358469.00
185697    229000.00
             ...   
1971825   785000.00
337324     32666.00
796028    400005.00
674210    201361.00
1042373   428769.00
Name: tax_value, Length: 1000, dtype: float64

In [14]:
train['tax_value']

1158290   518016.00
644231    573616.00
1459488   182417.00
2149989   358469.00
185697    229000.00
             ...   
1971825   785000.00
337324     32666.00
796028    400005.00
674210    201361.00
1042373   428769.00
Name: tax_value, Length: 1000, dtype: float64

In [None]:
evaluate.

In [14]:
reg = LazyRegressor(predictions=True)
models, predictions = reg.fit(X_train, X_test, y_train, y_test)

100%|████████████████████████████████████████████████████████████████| 42/42 [00:19<00:00,  2.15it/s]


In [15]:
print(models)

                               Adjusted R-Squared  R-Squared        RMSE  \
Model                                                                      
GradientBoostingRegressor                    0.96       0.96    37551.85   
LGBMRegressor                                0.95       0.96    37923.18   
HistGradientBoostingRegressor                0.95       0.96    38110.74   
XGBRegressor                                 0.95       0.95    39236.42   
ExtraTreesRegressor                          0.95       0.95    39249.84   
RANSACRegressor                              0.95       0.95    39286.50   
HuberRegressor                               0.95       0.95    39870.70   
TransformedTargetRegressor                   0.95       0.95    39979.66   
LinearRegression                             0.95       0.95    39979.66   
BayesianRidge                                0.95       0.95    39979.82   
Lasso                                        0.95       0.95    39979.83   
Ridge       

In [16]:
X_train_scaled = train_scaled.drop(columns=['tax_value'])
y_train_scaled = train_scaled.tax_value
X_test_scaled = test_scaled.drop(columns=['tax_value'])
y_test_scaled = test_scaled.tax_value

In [17]:
reg = LazyRegressor(predictions=True)
models_scaled, predictions_scaled = reg.fit(X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled)

100%|████████████████████████████████████████████████████████████████| 42/42 [00:19<00:00,  2.13it/s]


In [18]:
print(models_scaled)

                               Adjusted R-Squared  R-Squared        RMSE  \
Model                                                                      
GradientBoostingRegressor                    0.96       0.96    37551.85   
LGBMRegressor                                0.95       0.96    37923.18   
HistGradientBoostingRegressor                0.95       0.96    38110.74   
XGBRegressor                                 0.95       0.95    39236.42   
ExtraTreesRegressor                          0.95       0.95    39249.84   
RANSACRegressor                              0.95       0.95    39286.50   
HuberRegressor                               0.95       0.95    39870.70   
LinearRegression                             0.95       0.95    39979.66   
TransformedTargetRegressor                   0.95       0.95    39979.66   
Lars                                         0.95       0.95    39979.66   
BayesianRidge                                0.95       0.95    39979.82   
Lasso       