In [1]:
# Silence library warnings first so that import-time warnings raised by the
# packages below are suppressed as well.
# NOTE(review): this also hides deprecation/convergence warnings for the whole
# session — consider narrowing the filter once the analysis is stable.
import warnings
warnings.filterwarnings("ignore")

# Standard library
import datetime

# Wrangling
import numpy as np
import pandas as pd
from sklearn.model_selection import learning_curve, train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Statistical tests
import scipy.stats as stats
from scipy.stats import norm

# Visualizing
import matplotlib.dates as dates
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import cm

# Render floats right-aligned in 20 characters with thousands separators and
# two decimals. Values smaller than 0.005 in magnitude will display as 0.00.
pd.options.display.float_format = '{:20,.2f}'.format

# Project-local modules, imported last (same relative order as before).
# All wrangle helpers used by this notebook are imported here so the
# notebook's dependencies are visible in one place.
# NOTE(review): env is presumably the local configuration/credentials module
# consumed by wrangle — confirm.
import env
import wrangle
from wrangle import (
    clean_zillow,
    get_zillow_data,
    missing_zero_values_table,
    scale_data,
    seperate_y,
    split,
)


In [2]:
import wrangle

from wrangle import get_zillow_data

In [3]:
# Load the raw Zillow dataset through the project's wrangle helper.
# NOTE(review): where the data comes from (database query vs. cached file)
# is decided inside wrangle.py and is not visible in this notebook.
df = get_zillow_data()

In [4]:
# Preview the first rows of the raw frame to eyeball column names and values.
df.head()

Unnamed: 0,typeconstructiontypeid,storytypeid,heatingorsystemtypeid,buildingclasstypeid,architecturalstyletypeid,airconditioningtypeid,parcelid,id,basementsqft,bathroomcnt,...,taxdelinquencyyear,censustractandblock,logerror,transactiondate,airconditioningdesc,architecturalstyledesc,buildingclassdesc,heatingorsystemdesc,storydesc,typeconstructiondesc
0,,,2.0,,,,10711855,1087254,,2.0,...,,60371132321007.0,-0.01,2017-07-07,,,,Central,,
1,,,2.0,,,1.0,10711877,1072280,,2.0,...,,60371132321007.0,0.02,2017-08-29,Central,,,Central,,
2,,,2.0,,,1.0,10711888,1340933,,2.0,...,,60371132321007.0,0.08,2017-04-04,Central,,,Central,,
3,,,2.0,,,,10711910,1878109,,2.0,...,,60371132321008.0,-0.04,2017-03-17,,,,Central,,
4,,,2.0,,,,10711923,2190858,,2.0,...,,60371132321008.0,-0.01,2017-03-24,,,,Central,,


In [5]:
# Raw dimensions as (rows, columns), recorded as the baseline before cleaning.
df.shape

(77413, 67)

In [6]:
# Per-column dtypes and non-null counts for the raw frame; columns with very
# few non-null entries stand out here.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77413 entries, 0 to 77412
Data columns (total 67 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   typeconstructiontypeid        222 non-null    float64
 1   storytypeid                   50 non-null     float64
 2   heatingorsystemtypeid         49439 non-null  float64
 3   buildingclasstypeid           15 non-null     float64
 4   architecturalstyletypeid      206 non-null    float64
 5   airconditioningtypeid         24953 non-null  float64
 6   parcelid                      77413 non-null  int64  
 7   id                            77413 non-null  int64  
 8   basementsqft                  50 non-null     float64
 9   bathroomcnt                   77380 non-null  float64
 10  bedroomcnt                    77380 non-null  float64
 11  buildingqualitytypeid         49671 non-null  float64
 12  calculatedbathnbr             76771 non-null  float64
 13  d

In [7]:
# Summary statistics (count, mean, std, quartiles, extremes) for the numeric
# columns of the raw frame.
df.describe()

Unnamed: 0,typeconstructiontypeid,storytypeid,heatingorsystemtypeid,buildingclasstypeid,architecturalstyletypeid,airconditioningtypeid,parcelid,id,basementsqft,bathroomcnt,...,numberofstories,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyyear,censustractandblock,logerror
count,222.0,50.0,49439.0,15.0,206.0,24953.0,77413.0,77413.0,50.0,77380.0,...,17560.0,172.0,77268.0,77379.0,77380.0,77378.0,77375.0,2886.0,77136.0,77413.0
mean,6.04,7.0,3.92,3.93,7.39,1.81,13006697.32,1495761.15,679.72,2.3,...,1.43,1.0,189315.98,490137.46,2016.0,301096.94,5995.57,14.09,60496735236339.62,0.02
std,0.56,0.0,3.59,0.26,2.73,2.97,3480712.14,861270.67,689.7,1.0,...,0.54,0.0,230088.56,653447.67,0.0,492599.03,7622.89,2.19,1535251869244.56,0.17
min,4.0,7.0,1.0,3.0,2.0,1.0,10711855.0,349.0,38.0,0.0,...,1.0,1.0,44.0,1000.0,2016.0,161.0,19.92,3.0,60371011101000.0,-4.66
25%,6.0,7.0,2.0,4.0,7.0,1.0,11538336.0,752413.0,273.0,2.0,...,1.0,1.0,84265.0,207000.0,2016.0,85504.0,2715.59,14.0,60373109005001.75,-0.02
50%,6.0,7.0,2.0,4.0,7.0,1.0,12530673.0,1498471.0,515.0,2.0,...,1.0,1.0,136499.5,358976.0,2016.0,203383.5,4450.72,15.0,60376032003008.0,0.01
75%,6.0,7.0,7.0,4.0,7.0,1.0,14211486.0,2241718.0,796.5,3.0,...,2.0,1.0,218787.5,569003.0,2016.0,366802.25,6927.8,15.0,60590423251008.0,0.04
max,13.0,7.0,24.0,4.0,21.0,13.0,167689317.0,2985182.0,3560.0,18.0,...,6.0,1.0,11421790.0,49061236.0,2016.0,48952198.0,586639.3,99.0,483030105084015.06,5.26


In [8]:
# Count missing values per column in the raw frame.
df.isna().sum()

typeconstructiontypeid          77191
storytypeid                     77363
heatingorsystemtypeid           27974
buildingclasstypeid             77398
architecturalstyletypeid        77207
airconditioningtypeid           52460
parcelid                            0
id                                  0
basementsqft                    77363
bathroomcnt                        33
bedroomcnt                         33
buildingqualitytypeid           27742
calculatedbathnbr                 642
decktypeid                      76799
finishedfloor1squarefeet        71390
calculatedfinishedsquarefeet      229
finishedsquarefeet12             3665
finishedsquarefeet13            77372
finishedsquarefeet15            74404
finishedsquarefeet50            71390
finishedsquarefeet6             77027
fips                               33
fireplacecnt                    69137
fullbathcnt                       642
garagecarcnt                    51939
garagetotalsqft                 51939
hashottubors

In [9]:
from wrangle import clean_zillow

In [10]:
# Apply the project's cleaning step from wrangle.py.
# Per the recorded outputs, the result is indexed by parcelid, shrinks from
# (77413, 67) to (73201, 54), and ends with the columns age_of_home,
# bath_pers_qft, la_county, orange_county and ventura_county.
# NOTE(review): rebinding `df` discards the raw frame, so this cell is not
# idempotent — re-running it feeds already-cleaned data back into
# clean_zillow. Consider a stage-suffixed name (e.g. df_clean) and updating
# the cells that follow accordingly.
df = clean_zillow(df)

In [11]:
# Preview the cleaned frame to confirm the new index and derived columns.
df.head()

Unnamed: 0_level_0,typeconstructiontypeid,storytypeid,heatingorsystemtypeid,buildingclasstypeid,architecturalstyletypeid,airconditioningtypeid,basementsqft,bathroomcnt,bedroomcnt,decktypeid,...,architecturalstyledesc,buildingclassdesc,heatingorsystemdesc,storydesc,typeconstructiondesc,age_of_home,bath_pers_qft,la_county,orange_county,ventura_county
parcelid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10711855,,,2.0,,,5.0,,2.0,3.0,,...,,,Central,,,49.0,0.0,1,0,0
10711877,,,2.0,,,1.0,,2.0,4.0,,...,,,Central,,,49.0,0.0,1,0,0
10711888,,,2.0,,,1.0,,2.0,4.0,,...,,,Central,,,49.0,0.0,1,0,0
10711910,,,2.0,,,5.0,,2.0,3.0,,...,,,Central,,,61.0,0.0,1,0,0
10711923,,,2.0,,,5.0,,2.0,4.0,,...,,,Central,,,61.0,0.0,1,0,0


In [12]:
# Confirm that clean_zillow returned a DataFrame (and not, e.g., None).
type(df)

pandas.core.frame.DataFrame

In [13]:
# Dimensions after cleaning, to compare against the raw (rows, columns).
df.shape

(73201, 54)

In [15]:
# Summary statistics for the numeric columns of the cleaned frame.
df.describe()

Unnamed: 0,typeconstructiontypeid,storytypeid,heatingorsystemtypeid,buildingclasstypeid,architecturalstyletypeid,airconditioningtypeid,basementsqft,bathroomcnt,bedroomcnt,decktypeid,...,fireplaceflag,taxvaluedollarcnt,taxamount,taxdelinquencyyear,logerror,age_of_home,bath_pers_qft,la_county,orange_county,ventura_county
count,222.0,50.0,73201.0,0.0,206.0,73201.0,50.0,73201.0,73201.0,597.0,...,73201.0,73201.0,73201.0,2609.0,73201.0,73201.0,73201.0,73201.0,73201.0,73201.0
mean,6.04,7.0,2.9,,7.39,3.94,679.72,2.26,3.01,66.0,...,0.11,450886.36,5533.01,14.1,0.02,51.23,0.0,0.64,0.27,0.08
std,0.56,0.0,3.23,,2.73,2.28,689.7,0.9,0.99,0.0,...,0.32,390297.09,4563.09,2.23,0.16,23.03,0.0,0.48,0.45,0.28
min,4.0,7.0,1.0,,2.0,1.0,38.0,1.0,1.0,66.0,...,0.0,1000.0,19.92,4.0,-4.66,5.0,0.0,0.0,0.0,0.0
25%,6.0,7.0,1.0,,7.0,1.0,273.0,2.0,2.0,66.0,...,0.0,205718.0,2694.27,14.0,-0.02,34.0,0.0,0.0,0.0,0.0
50%,6.0,7.0,2.0,,7.0,5.0,515.0,2.0,3.0,66.0,...,0.0,355700.0,4408.58,15.0,0.01,50.0,0.0,1.0,0.0,0.0
75%,6.0,7.0,2.0,,7.0,5.0,796.5,3.0,4.0,66.0,...,0.0,560000.0,6796.34,15.0,0.04,67.0,0.0,1.0,1.0,0.0
max,13.0,7.0,24.0,,21.0,13.0,3560.0,10.0,11.0,66.0,...,1.0,2999902.0,61992.63,99.0,3.39,143.0,0.01,1.0,1.0,1.0


In [16]:
# Per-column dtypes and non-null counts after cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 73201 entries, 10711855 to 167688532
Data columns (total 54 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   typeconstructiontypeid        222 non-null    float64       
 1   storytypeid                   50 non-null     float64       
 2   heatingorsystemtypeid         73201 non-null  float64       
 3   buildingclasstypeid           0 non-null      float64       
 4   architecturalstyletypeid      206 non-null    float64       
 5   airconditioningtypeid         73201 non-null  float64       
 6   basementsqft                  50 non-null     float64       
 7   bathroomcnt                   73201 non-null  float64       
 8   bedroomcnt                    73201 non-null  float64       
 9   decktypeid                    597 non-null    float64       
 10  finishedfloor1squarefeet      5976 non-null   float64       
 11  calculatedfinishe

In [17]:
# Count the missing values that remain in each column after cleaning.
df.isna().sum()

typeconstructiontypeid          72979
storytypeid                     73151
heatingorsystemtypeid               0
buildingclasstypeid             73201
architecturalstyletypeid        72995
airconditioningtypeid               0
basementsqft                    73151
bathroomcnt                         0
bedroomcnt                          0
decktypeid                      72604
finishedfloor1squarefeet        67225
calculatedfinishedsquarefeet        0
finishedsquarefeet13            73160
finishedsquarefeet15            73190
finishedsquarefeet50            67225
finishedsquarefeet6             73043
fips                                0
fireplacecnt                        0
garagecarcnt                    48099
garagetotalsqft                 48099
hashottuborspa                  71738
latitude                            0
longitude                           0
lotsizesquarefeet                   0
poolcnt                             0
poolsizesum                     72353
pooltypeid10

In [18]:
from wrangle import missing_zero_values_table

In [19]:
# Summarise zero and null counts per column with the wrangle helper; the
# recorded table lists the worst-populated columns first.
# NOTE(review): several columns are still ~100% null after cleaning —
# confirm whether they should be dropped before modelling.
missing_zero_values_table(df)

Your selected dataframe has 54 columns and 73201 Rows.
There are 29 columns that have NULL values.


Unnamed: 0,Zero Values,null_count,% of Total Values,Total Zeroes + Null Values,% Total Zero + Null Values,Data Type
buildingclasstypeid,0,73201,100.0,73201,100.0,float64
buildingclassdesc,0,73201,100.0,73201,100.0,object
finishedsquarefeet15,0,73190,100.0,73190,100.0,float64
finishedsquarefeet13,0,73160,99.9,73160,99.9,float64
storydesc,0,73151,99.9,73151,99.9,object
basementsqft,0,73151,99.9,73151,99.9,float64
storytypeid,0,73151,99.9,73151,99.9,float64
yardbuildingsqft26,0,73134,99.9,73134,99.9,float64
finishedsquarefeet6,0,73043,99.8,73043,99.8,float64
architecturalstyletypeid,0,72995,99.7,72995,99.7,float64


In [20]:
from wrangle import split

In [21]:
# Partition the cleaned frame into train / validate / test sets using the
# wrangle helper.
# NOTE(review): whether split() fixes a random seed is not visible here —
# confirm, otherwise the partitions change on every run.
train, validate, test = split(df)

# Sanity check: the three partitions together must contain as many rows as
# the frame they were drawn from.
assert len(train) + len(validate) + len(test) == len(df), (
    "split() lost or duplicated rows"
)

In [22]:
# Dimensions of the training partition.
train.shape

(40992, 54)

In [23]:
# Dimensions of the validation partition.
validate.shape

(17568, 54)

In [24]:
# Dimensions of the held-out test partition.
test.shape

(14641, 54)

In [25]:
from wrangle import seperate_y

In [26]:
# Separate each partition into features (X_*) and target (y_*).
# NOTE(review): as recorded, this call fails with
#   KeyError: "['tax_value'] not found in axis"
# The cleaned frame names its value column `taxvaluedollarcnt`, so
# seperate_y appears to expect a column called `tax_value` that does not
# exist here. Fix the column name inside wrangle.seperate_y (or rename the
# column upstream) and confirm which column is the intended target before
# relying on X_* / y_*. Until then none of the six names below is defined.
X_train, y_train, X_validate, y_validate, X_test, y_test = seperate_y(train, validate, test)

KeyError: "['tax_value'] not found in axis"

In [27]:
from wrangle import scale_data

In [29]:
# Scale the three partitions with the wrangle helper.
# NOTE(review): as recorded, this call fails with
#   NameError: name 'sklearn' is not defined
# This cell never references `sklearn` itself, so the lookup fails inside
# scale_data; wrangle.py presumably uses `sklearn.<...>` without importing
# the package — add the import there.
# NOTE(review): the un-separated train/validate/test frames are passed, so
# they still include every column (including text *desc columns) — confirm
# that scale_data selects the numeric feature columns itself.
train_scaled, validate_scaled, test_scaled = scale_data(train, validate, test)

NameError: name 'sklearn' is not defined