# Exploratory Data Analysis of Zillow Data Set

The data:

* properties_2017.csv is a sample of all properties from 2017 listed on Zillow through Sept
* train_2017.csv contains dates, propertyids, and logerror for each transaction in 2017 through Sept
* The same files are available for 2016 (entire year)
* Not all properties have transactions
* logerror=log(Zestimate)−log(SalePrice)

In [None]:
import csv
import datetime

import geopy
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from geopy.exc import GeocoderTimedOut
from geopy.geocoders import Nominatim


In [None]:
# Load the 2017 property table and the 2017 transactions, then attach the
# property columns to every transaction (right join keyed on parcelid).
df17 = pd.read_csv('properties_2017.csv', low_memory=False)
df_transactions17 = pd.read_csv('train_2017.csv', low_memory=False)
df_merged17 = df17.merge(df_transactions17, how='right', on='parcelid')
df_merged17.head()

In [None]:
# Load the 2016 property table and the 2016 transactions, then attach the
# property columns to every transaction (right join keyed on parcelid).
df16 = pd.read_csv('properties_2016.csv', low_memory=False)
df_transactions16 = pd.read_csv('train_2016_v2.csv', low_memory=False)
df_merged16 = df16.merge(df_transactions16, how='right', on='parcelid')
df_merged16.head()

In [None]:
df_merged16['latitude'].describe()

In [None]:
df_merged16['longitude'].describe()

The latitudes and longitudes are listed in the data sets with six decimal places but no decimal points, so they need to be corrected.

In [None]:
# Divide both coordinate columns of both yearly frames by 10**6 to restore
# the missing decimal point.
for frame in (df_merged17, df_merged16):
    for coord in ('latitude', 'longitude'):
        frame[coord] = frame[coord] / 1000000

In [None]:
df_merged17['taxdelinquencyyear'].describe()

The tax delinquency years are listed as YY, with the first digit missing if it is a 0. Since some of the years are from the 1990s, we need to fix this so that they will sort in the correct order.

In [None]:
def convertyears(x, pivot=20):
    """Expand a two-digit delinquency year (YY) into a four-digit year.

    Args:
        x: Two-digit year as a number (e.g. ``15`` or ``99.0``). NaN is
            accepted and returned as NaN.
        pivot: First two-digit value that is read as 19YY; everything
            below it is read as 20YY. With the default, 0-19 map to
            2000-2019 and 20-99 map to 1920-1999.

    Returns:
        The four-digit year as a float, or ``np.nan`` when ``x`` is
        missing or lies outside 0-99.
    """
    # NaN fails every comparison, so missing values are rejected here too,
    # along with negatives and values that are not a two-digit year.
    if not 0 <= x <= 99:
        return np.nan
    century = 2000 if x < pivot else 1900
    return float(century + x)
    


# Rewrite the two-digit delinquency years of both yearly frames as
# four-digit years, element by element.
df_merged17['taxdelinquencyyear'] = df_merged17['taxdelinquencyyear'].map(convertyears)
df_merged16['taxdelinquencyyear'] = df_merged16['taxdelinquencyyear'].map(convertyears)

In [None]:
df_merged16['taxdelinquencyyear'].describe()

In [None]:
df_merged16['transactiondate'].describe()

The transaction dates are in a string format, so we need to convert them to datetime.

In [None]:
# NOTE: `format` shadows the builtin of the same name; the name is kept
# because the 2017 conversion cell reads this variable.
format = '%Y-%m-%d'
# pd.to_datetime parses the whole column at once and, unlike a per-row
# strptime, tolerates missing values and re-running the cell on a column
# that has already been converted.
df_merged16['transactiondate'] = pd.to_datetime(df_merged16['transactiondate'], format=format)

In [None]:
# Vectorised parse; tolerates missing values and re-running the cell on a
# column that has already been converted (a per-row strptime raises on both).
df_merged17['transactiondate'] = pd.to_datetime(df_merged17['transactiondate'], format=format)

In [None]:
df_merged17['transactiondate'].describe()

#### For some analyses we will be looking at both data sets together

In [None]:
df_merged16['setyear'] = 2016

In [None]:
df_merged17['setyear'] = 2017

In [None]:
# Stack the 2016 and 2017 frames into one frame with a fresh 0..n-1 index.
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
df_total = pd.concat([df_merged16, df_merged17], ignore_index=True)
df_total.head()

In [None]:
df_total['setyear'].value_counts()

In [None]:
df_total.groupby('setyear')['logerror'].describe()

In [None]:
means = df_total.groupby('transactiondate')['logerror'].mean()

In [None]:
# Every transaction's logerror in blue, with the per-date mean overlaid in red.
plt.figure(figsize=(20, 5))
transaction_dates = df_total['transactiondate'].tolist()
plt.scatter(transaction_dates, df_total['logerror'], s=10, c='blue')
plt.scatter(means.index, means, s=10, c='red')
plt.xlabel('Transaction Date')
plt.ylabel('Logerror')
plt.title('LogError Over Time')
plt.show()

# Missing Data and Outliers

In [None]:
missing_percents = (len(df_total.index) - df_total.count())/len(df_total.index)

In [None]:
missing_percents.sort_values(inplace=True)

In [None]:
missing_percents.plot(kind='barh', figsize=(20,30))
plt.yticks(size=20)
plt.show()

### Let's also look at this by year to make sure the two sets don't have major differences

In [None]:
missing_percents16 = (len(df_merged16.index) - df_merged16.count())/len(df_merged16.index)
missing_percents17 = (len(df_merged17.index) - df_merged17.count())/len(df_merged17.index)

In [None]:
missing_percents16.sort_values(inplace=True)
temp = pd.DataFrame(missing_percents17, columns=['2017'])
missing_combined = pd.DataFrame(missing_percents16, columns=['2016'])
missing_combined = missing_combined.join(temp)

In [None]:
missing_combined.plot.barh(figsize=(20,40))
plt.yticks(size=20)
plt.show()

# Outlier Analysis

In [None]:
# Column-name groups used to pick which columns are plotted together.

# Columns treated as categorical. This list is not referenced by any other
# visible cell.
categorical=['airconditioningtypeid','architecturalstyletypeid','buildingclasstypeid','decktypeid','fips',
             'hashottuborspa','heatingorsystemtypeid','propertycountylandusecode','propertylandusetypeid','propertyzoningdesc',
             'rawcensustractandblock','regionidcity','regionidcounty','regionidneighborhood','regionidzip',
             'storytypeid','typeconstructiontypeid','fireplaceflag','taxdelinquencyflag','censustractandblock',
             'transactiondate']
# Columns treated as numerical, including the target column 'logerror'.
numerical = ['basementsqft','bathroomcnt','bedroomcnt','buildingqualitytypeid','calculatedbathnbr',
             'finishedfloor1squarefeet','calculatedfinishedsquarefeet','finishedsquarefeet12',
             'finishedsquarefeet13','finishedsquarefeet15','finishedsquarefeet50','finishedsquarefeet6',
             'fireplacecnt','fullbathcnt','garagecarcnt','garagetotalsqft','latitude','longitude',
             'lotsizesquarefeet','poolcnt','poolsizesum','pooltypeid10','pooltypeid2','pooltypeid7','roomcnt',
             'threequarterbathnbr','unitcnt','yardbuildingsqft17','yardbuildingsqft26','yearbuilt','numberofstories',
             'structuretaxvaluedollarcnt','taxvaluedollarcnt','assessmentyear','landtaxvaluedollarcnt','taxamount',
             'taxdelinquencyyear','logerror']

# Ignore: parcelid, setyear

# Smaller groups of numerical columns, one group per boxplot figure.
rooms = ['bathroomcnt','bedroomcnt','calculatedbathnbr','fullbathcnt','roomcnt','threequarterbathnbr']
sqft = ['basementsqft','finishedfloor1squarefeet','calculatedfinishedsquarefeet','finishedsquarefeet12',
        'finishedsquarefeet13','finishedsquarefeet15','finishedsquarefeet50','finishedsquarefeet6',
        'garagetotalsqft']
lotsqft=['lotsizesquarefeet']
yard=['yardbuildingsqft17','yardbuildingsqft26']
pools=['poolcnt','pooltypeid10','pooltypeid2','pooltypeid7']
# NOTE(review): a bare string, whereas every other group here is a list.
poolsz = 'poolsizesum'
features = ['buildingqualitytypeid','fireplacecnt','garagecarcnt','numberofstories']
units= ['unitcnt']
taxes=['structuretaxvaluedollarcnt','taxvaluedollarcnt','landtaxvaluedollarcnt','taxamount']
years = ['yearbuilt','assessmentyear','taxdelinquencyyear']

## 2017 Outliers

In [None]:
df_merged17.boxplot(rooms, figsize=(10,  10))
plt.show()

In [None]:
df_merged17.boxplot(sqft, figsize=(20,  10))
plt.show()

In [None]:
df_merged17.boxplot(lotsqft)

plt.show()

In [None]:
df_merged17.boxplot(pools)

plt.show()

In [None]:
df_merged17.boxplot(poolsz)

plt.show()

In [None]:
df_merged17.boxplot(features)

plt.show()

In [None]:
df_merged17.boxplot(yard)

plt.show()

In [None]:
# `misc` is never defined; `units` is the one column group that no other
# boxplot cell draws, and the unit-count outliers examined below come from it.
df_merged17.boxplot(units)

plt.show()

In [None]:
df_merged17.boxplot(taxes)

plt.show()

In [None]:
df_merged17.boxplot(years)

plt.show()

In [None]:
outliers1=df_merged17[df_merged17['calculatedbathnbr'] > 12]

In [None]:
outliers2 = df_merged17[df_merged17['unitcnt'] > 20]
outliers2

In [None]:
outliers3 = df_merged17[df_merged17['lotsizesquarefeet'] > 2000000]

In [None]:
# Gather the three 2017 outlier selections into one frame for inspection.
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
outliers17 = pd.concat([outliers1, outliers2, outliers3])
pd.set_option('display.max_columns', 65)
outliers17

* For the lot size outliers, almost all of them have one of two lot sizes (3589145 or 6971010). This suggests there is something irregular with the values and in our data wrangling we will replace them with the median. 
* The outliers based on bathroom count are consistent in having outliers in most features, and are thus likely mansions in the LA area and accurately reflect the skew of the data. 
* For the unit count outliers, the property land use type IDs include a triplex, a quadruplex, and a mixed-use building. The values are thus most likely data entry errors, and we are replacing them with the median.

## 2016 Outliers

In [None]:
df_merged16.boxplot(rooms, figsize=(10,  10))
plt.show()

In [None]:
df_merged16.boxplot(sqft, figsize=(20,  10))
plt.show()

In [None]:
df_merged16.boxplot(lotsqft)

plt.show()

In [None]:
df_merged16.boxplot(yard)

plt.show()

In [None]:
df_merged16.boxplot(pools)

plt.show()

In [None]:
df_merged16.boxplot(poolsz)

plt.show()

In [None]:
df_merged16.boxplot(features)

plt.show()

In [None]:
# `misc` is never defined; `units` is the one column group that no other
# boxplot cell draws, and the unit-count outliers examined below come from it.
df_merged16.boxplot(units)

plt.show()

In [None]:
df_merged16.boxplot(taxes)

plt.show()

In [None]:
df_merged16.boxplot(years)

plt.show()

In [None]:
outliers4=df_merged16[df_merged16['calculatedbathnbr'] > 10]
outliers4

In [None]:
outliers5 = df_merged16[df_merged16['unitcnt'] > 20]
outliers5

In [None]:
outliers6 = df_merged16[df_merged16['lotsizesquarefeet'] > 2000000]

In [None]:
# Gather the three 2016 outlier selections into one frame for inspection.
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
outliers16 = pd.concat([outliers4, outliers5, outliers6])
pd.set_option('display.max_columns', 65)
outliers16

* For the lot size outliers, they again have the same two values (3589145 or 6971010). In our data wrangling we will replace them with the median. 
* For the outliers based on unit counts, the property landuse type ids are complexes of planned units (269) and condos (266). The data such as room counts appear to be about individual units, so it appears these are sales of individual units and the number of units is for the entire complex. We will replace these with the median.
* For the outliers based on bathroom count, one has irregularities in other columns such as a calculated finished square footage of 66, so we will be removing that one from the data set. 

### Check for other outliers

In [None]:
def lotsclean(x, median=None):
    """Replace the two suspicious repeated lot sizes with a median value.

    Args:
        x: A single lot size in square feet (may be NaN).
        median: Replacement value. When omitted, the median of
            ``df_total['lotsizesquarefeet']`` is looked up; pass a
            precomputed value to avoid recomputing it for every hit.

    Returns:
        ``median`` when ``x`` is 3589145 or 6971010, otherwise ``x``
        unchanged (NaN included).
    """
    if x != 3589145 and x != 6971010:
        return x
    if median is None:
        # Fallback keeps the original one-argument behaviour.
        median = df_total['lotsizesquarefeet'].median()
    return median


# Swap the two suspicious repeated lot sizes for the column median. The median
# is computed once here rather than once per matching row.
lot_sizes = df_total['lotsizesquarefeet']
df_total['lotsizesquarefeet'] = lot_sizes.mask(lot_sizes.isin([3589145, 6971010]), lot_sizes.median())

In [None]:
def unitsclean(x, median=None, max_units=20):
    """Replace an implausibly large unit count with a median value.

    Args:
        x: A single unit count (may be NaN).
        median: Replacement value. When omitted, the median of
            ``df_total['unitcnt']`` is looked up; pass a precomputed
            value to avoid recomputing it for every hit.
        max_units: Largest count that is kept as-is.

    Returns:
        ``median`` when ``x`` is greater than ``max_units``, otherwise
        ``x`` unchanged (NaN included).
    """
    # NaN is not greater than anything, so missing counts pass through.
    if not x > max_units:
        return x
    if median is None:
        # Fallback keeps the original one-argument behaviour.
        median = df_total['unitcnt'].median()
    return median


# Swap unit counts above 20 for the column median. The median is computed
# once here rather than once per matching row.
unit_counts = df_total['unitcnt']
df_total['unitcnt'] = unit_counts.mask(unit_counts > 20, unit_counts.median())

In [None]:
# Drop only rows whose bathroom count is 20 or more. Filtering with `< 20`
# would also discard every row where bathroomcnt is missing, because NaN
# fails the comparison.
df_total = df_total[~(df_total['bathroomcnt'] >= 20)]

In [None]:
df_total['propertylandusetypeid'].value_counts()

In [None]:
df_total.boxplot(rooms, figsize=(10,  10))
plt.show()

# Over and Under Estimating
Since none of the numerical values have a clear correlation with log error, we want to see whether the relationship is non-linear by examining both over- and under-estimates.

(Idea from: https://www.kaggle.com/philippsp/exploratory-analysis-zillow)

In [None]:
numerical = ['bathroomcnt','bedroomcnt','buildingqualitytypeid','calculatedbathnbr',
             'calculatedfinishedsquarefeet','finishedsquarefeet12',
             'fireplacecnt','fullbathcnt','garagecarcnt','garagetotalsqft','latitude','longitude',
             'lotsizesquarefeet','poolcnt','roomcnt','unitcnt','yearbuilt','numberofstories',
              'structuretaxvaluedollarcnt','taxvaluedollarcnt','assessmentyear','landtaxvaluedollarcnt','taxamount',
             'taxdelinquencyyear','logerror']

In [None]:
# Plot logerror against bathroom count via seaborn's regplot.
# NOTE(review): `sns` is not imported in any visible cell — presumably
# `import seaborn as sns` is required before this runs; confirm.
sns.regplot(x=df_total['bathroomcnt'],y=df_total['logerror'])
plt.show()

In [None]:
plt.close()

In [None]:
num = df_total[numerical]

In [None]:
# One regression plot of logerror per numerical column, three per row.
# The target itself is excluded so it is not plotted against itself.
predictors = [col for col in num.columns if col != 'logerror']
# Round up so a partially filled last row still gets its axes.
n_rows = int(np.ceil(len(predictors) / 3))
fig, axes = plt.subplots(n_rows, 3, figsize=(30, 60))

# Pair each axis with a column directly; this cannot index past the column
# list the way comparing the axis index against the row count could.
for ax, col in zip(axes.flat, predictors):
    sns.regplot(x=col, y='logerror', data=num, ax=ax)

# Hide any axes left over in the last row.
for ax in axes.flat[len(predictors):]:
    ax.set_visible(False)

plt.show()

In [None]:
# For every numerical column, scatter the mean logerror at each distinct
# value of that column. The grid height follows the number of columns
# instead of being fixed at 9 rows.
n_cols = 3
n_rows = -(-len(numerical) // n_cols)  # ceiling division
fig = plt.figure(figsize=(30,60))
for i, column in enumerate(numerical, start=1):
    a = df_total.groupby(column)['logerror'].mean()
    sub = fig.add_subplot(n_rows, n_cols, i)
    sub.scatter(a.index, a, c="g")
    sub.set_title(column + " vs logerror")

plt.show()

In [None]:
yr = df_total.groupby('yearbuilt')['logerror'].mean()
#plt.scatter(yr.index, yr, c="g")
sns.regplot(x=df_total['yearbuilt'],y=df_total['logerror'])
plt.show()

In [None]:
barm = df_total.groupby('calculatedbathnbr')['logerror'].mean()
plt.scatter(barm.index, barm, c="r")
plt.show()

In [None]:
berm = df_total.groupby('bedroomcnt')['logerror'].mean()
plt.scatter(berm.index, berm, c="r")
plt.show()

In [None]:
quality = df_total.groupby('buildingqualitytypeid')['logerror'].mean()
plt.scatter(quality.index, quality, c="r")
plt.show()

In [None]:
# Mean logerror at each distinct finished-square-footage value.
# Stored under its own name so the `sqft` column list used by the boxplot
# cells is not overwritten when cells are re-run.
sqft_means = df_total.groupby('calculatedfinishedsquarefeet')['logerror'].mean()
plt.scatter(sqft_means.index, sqft_means, c="b")
plt.show()

In [None]:
fire = df_total.groupby('fireplacecnt')['logerror'].mean()
plt.scatter(fire.index, fire, c="b")
plt.show()

In [None]:
gar = df_total.groupby('garagecarcnt')['logerror'].mean()
plt.scatter(gar.index, gar, c="r")
plt.show()

In [None]:
gars = df_total.groupby('garagetotalsqft')['logerror'].mean()
plt.scatter(gars.index, gars, c="r")
plt.show()

In [None]:
lat = df_total.groupby('latitude')['logerror'].mean()
plt.scatter(lat.index, lat, c="r")
plt.show()

In [None]:
lon = df_total.groupby('longitude')['logerror'].mean()
plt.scatter(lon.index, lon, c="g")
plt.show()
             

In [None]:
lot = df_total.groupby('lotsizesquarefeet')['logerror'].mean()
plt.scatter(lot.index, lot, c="b")
plt.show()        


In [None]:
plt.scatter(df_total['poolcnt'], df_total['logerror'], c="g")
plt.show()        


In [None]:
rm = df_total.groupby('roomcnt')['logerror'].mean()
plt.scatter(rm.index, rm, c="b")
plt.show()        

In [None]:
unit = df_total.groupby('unitcnt')['logerror'].mean()
plt.scatter(unit.index, unit, c="b")
plt.show()        

In [None]:
stories = df_total.groupby('numberofstories')['logerror'].mean()
plt.scatter(stories.index, stories, c="r")
plt.show()       

In [None]:
strut = df_total.groupby('structuretaxvaluedollarcnt')['logerror'].mean()
plt.scatter(strut.index, strut, c="r")
plt.show()       

In [None]:
strut = df_total.groupby('taxvaluedollarcnt')['logerror'].mean()
plt.scatter(strut.index, strut, c="r")
plt.show()       

In [None]:
land = df_total.groupby('landtaxvaluedollarcnt')['logerror'].mean()
plt.scatter(land.index, land, c="r")
plt.show()   

In [None]:
tax = df_total.groupby('taxamount')['logerror'].mean()
plt.scatter(tax.index, tax, c="r")
plt.show()   

In [None]:
td = df_total.groupby('taxdelinquencyyear')['logerror'].mean()
plt.scatter(td.index, td, c="r")
plt.show()   

# Location Analysis

In [None]:

# #### Missing Location Information Analysis
#
# Fields:
# * regionidzip
# * regionidcity
# * regionidcounty
# * regionidneighborhood
# * fips
# * latitude
# * longitude
#
# NOTE(review): `df_merged` is not defined in any visible cell (only
# df_merged16, df_merged17 and df_total are) — confirm which frame this
# cell is meant to read before running it.

df_merged['regionidzip'].describe()


# There appears to be an invalid US zip code for the max. Examine all impossible US zip codes
temp = df_merged[df_merged['regionidzip'] > 100000]

temp['regionidzip']


# All of the entries have the same invalid zip. Look at the county the zip code is associated with.
temp['regionidcounty']


# All have the same county. Get all entries in that county

temp2 = df_merged[df_merged['regionidcounty'] == 3101]
temp2.groupby('regionidzip').count()

temp2['regionidzip'].mode()


# This is not a US zip code. In spot checking, some of these zip codes are from CA, some are from OR, and some don't exist. Look at the other region identifiers:

df_merged.groupby('regionidcounty').count()

df_merged.groupby('regionidcity').count()

df_merged.groupby('regionidneighborhood').count()
df_merged.groupby('fips').count()


# FIPS Codes:
# * 6037: LA County - count is same as county code 3101
# * 6059: Orange County - count is same as county code 1286
# * 6111: Ventura County - count is same as county code 2061
#
# Verify mapping:

pd.crosstab(df_merged['fips'],df_merged['regionidcounty'])


# FIPS and RegionIDCounty contain identical information. For feature selection we will use FIPS since it has real-world meaning.

pd.crosstab(df_merged['regionidneighborhood'],df_merged['fips'])

pd.crosstab(df_merged['regionidcity'],df_merged['fips'])

pd.crosstab(df_merged['regionidzip'],df_merged['fips'])


nbcorr = df_merged[df_merged['fips']==6111]


pd.crosstab(nbcorr['regionidneighborhood'],nbcorr['regionidzip'])


# Even though the zip codes are fake, they do correspond to specific collections of neighborhoods, and it thus seems likely that Zillow did a 1:1 substitution when randomizing them. Since neighborhoods are more granular, they will be more useful for analysis.

# #### Latitude and Longitude

df_merged['latitude'].describe()

df_merged['longitude'].describe()



In [None]:
a = df_total.iloc[0]
a

In [None]:
string = str(a['latitude']) + "," + str(a['longitude'])
string

In [None]:
geolocator = Nominatim()
location = geolocator.reverse(string)
location

In [None]:
location.raw['address']['postcode']

In [None]:
#t = ['latitude','longitude']
#test = df_total[t].head()

import time

def zipcalc(a, b, geolocator=None, retries=3):
    """Reverse-geocode a latitude/longitude pair to its postal code.

    Args:
        a: Latitude in decimal degrees.
        b: Longitude in decimal degrees.
        geolocator: Object exposing ``reverse(query)``. When omitted a new
            ``Nominatim()`` is created; pass one in to reuse it across
            calls.
        retries: Number of extra attempts after a ``GeocoderTimedOut``.

    Returns:
        The ``postcode`` entry of the geocoder's address, or ``np.nan``
        when no location or postcode is returned or every attempt timed
        out.
    """
    if geolocator is None:
        # NOTE(review): newer geopy releases may require an explicit
        # user_agent argument here — confirm against the installed version.
        geolocator = Nominatim()
    query = str(a) + "," + str(b)
    for _ in range(retries + 1):
        # The network call is what times out, so it must sit inside the try.
        try:
            location = geolocator.reverse(query)
        except GeocoderTimedOut:
            time.sleep(2)  # back off briefly before the next attempt
            continue
        if location is None:
            return np.nan
        try:
            return location.raw['address']['postcode']
        except KeyError:
            return np.nan
    # Bounded retries: give up instead of recursing without limit.
    return np.nan

    
#df_total['regionidzip'] = df_total.apply(lambda x: zipcalc(x['latitude'], x['longitude']), axis=1)



# Look up a postal code for every row. The results are collected and assigned
# as a whole column: assigning into the `row` yielded by iterrows() only
# changes a temporary copy and never reaches df_total.
calc_zips = []
for n, (_, row) in enumerate(df_total.iterrows(), start=1):
    if n % 100 == 0:
        time.sleep(1)  # short pause before every 100th lookup
    calc_zips.append(zipcalc(row['latitude'], row['longitude']))
df_total['regionidzip'] = calc_zips



In [None]:
from pyzipcode import ZipCodeDatabase
zcdb = ZipCodeDatabase()
# The stray closing parenthesis that made this cell a SyntaxError is removed.
# NOTE(review): confirm that find_zip() accepts a `latitude` keyword in the
# installed pyzipcode version.
zcdb.find_zip(latitude=a['latitude'])


In [None]:

# Imported here because this cell runs before the later cell that imports it.
from uszipcode import ZipcodeSearchEngine

# Look up the zip code entries for the sample row's coordinates and show the
# first one.
search = ZipcodeSearchEngine()
b = search.by_coordinate(a['latitude'],a['longitude'])
b[0].Zipcode

In [None]:
# Keep only rows that have both coordinates.
df_merged17 = df_merged17.dropna(subset=['latitude', 'longitude'])
# An earlier cell already divides these columns by 10**6, so only rescale when
# the values are still outside the valid range of degrees (|lat| <= 90,
# |lon| <= 180). This keeps the cell safe to run more than once.
if df_merged17['latitude'].abs().max() > 90:
    df_merged17['latitude'] = df_merged17['latitude'] / 1000000
if df_merged17['longitude'].abs().max() > 180:
    df_merged17['longitude'] = df_merged17['longitude'] / 1000000

In [None]:
from uszipcode import ZipcodeSearchEngine

search = ZipcodeSearchEngine()

zips = pd.DataFrame(columns=['parcelid','calczip'])
temp = df_merged17.head()
temp['latitude']

In [None]:
# Look up a zip code for every 2017 row. Rows are gathered in a list and the
# frame is built once; growing a DataFrame with .loc one row at a time copies
# it on every insert.
records = []
for _, row in df_merged17.iterrows():
    matches = search.by_coordinate(row['latitude'], row['longitude'])
    # An empty result would make matches[0] raise IndexError.
    calczip = matches[0].Zipcode if matches else np.nan
    records.append([row['parcelid'], calczip])
zips = pd.DataFrame(records, columns=['parcelid', 'calczip'])

In [None]:
zips['calczip'].max()

In [None]:
# Treat a missing pool count / pool type as 0 and a missing hot-tub flag as
# False. Results are assigned back explicitly: calling fillna(inplace=True) on
# a selected column is chained assignment and is not guaranteed to update
# df_merged17 itself.
pool_flag_cols = ['poolcnt', 'pooltypeid10', 'pooltypeid2', 'pooltypeid7']
df_merged17[pool_flag_cols] = df_merged17[pool_flag_cols].fillna(0)
df_merged17['hashottuborspa'] = df_merged17['hashottuborspa'].fillna(False)



In [None]:
# Three-way frequency table: pooltypeid10 on the rows, pooltypeid2 x
# pooltypeid7 on the columns. A third positional argument would be taken as
# `values`, which pd.crosstab rejects unless an aggfunc is also given.
pd.crosstab(df_merged17['pooltypeid10'], [df_merged17['pooltypeid2'], df_merged17['pooltypeid7']])

In [None]:
pools=['poolcnt','pooltypeid2','pooltypeid7','pooltypeid10','hashottuborspa']
df_temp = df_merged17[pools]

In [None]:
df_temp.head()

In [None]:
# Pool-related columns for rows that have a pool count, with missing type ids
# set to 0 and a missing hot-tub flag set to False. fillna returns a new frame
# here, avoiding in-place edits on columns of a derived frame.
df_temp2 = df_merged17[pools].dropna(subset=['poolcnt']).fillna(
    {'pooltypeid10': 0, 'pooltypeid2': 0, 'pooltypeid7': 0, 'hashottuborspa': False}
)

In [None]:
df_temp2.head()

In [None]:
df_temp2.groupby('pooltypeid7')['pooltypeid2'].value_counts()

In [None]:
df_temp2.groupby('pooltypeid7')['pooltypeid10'].value_counts()

In [None]:
pd.crosstab(df_temp2['pooltypeid10'],df_temp2['pooltypeid2'])

In [None]:
df_temp2['pooltypeid10'].value_counts()

In [None]:
df_merged17['pooltypeid10'].value_counts()

In [None]:
df_merged17.groupby('poolcnt')['pooltypeid10'].value_counts()

In [None]:
# Assign back explicitly; fillna(inplace=True) on a selected column is chained
# assignment and is not guaranteed to update df_merged17.
df_merged17['pooltypeid2'] = df_merged17['pooltypeid2'].fillna(0)

In [None]:
df_merged17.groupby('poolcnt')['pooltypeid2'].value_counts()

In [None]:
df_merged17.groupby('poolcnt')['pooltypeid7'].value_counts()

In [None]:
df_merged17.groupby('pooltypeid7')['pooltypeid2'].value_counts()

In [None]:
df_merged17.groupby('pooltypeid2')['pooltypeid10'].value_counts()

In [None]:
df_merged17.groupby('hashottuborspa')['pooltypeid2'].value_counts()

In [None]:
# Assign back explicitly; fillna(inplace=True) on a selected column is chained
# assignment and is not guaranteed to update df_merged17.
df_merged17['hashottuborspa'] = df_merged17['hashottuborspa'].fillna(False)

In [None]:
# fill poolcnt na, fill hashottuborspa na

# Classify every 2017 row by its pool / hot-tub combination:
#   1 = hot tub and pool, 2 = pool only, 3 = hot tub only, 0 = anything else.
# Computed column-wise; appending one row at a time with .loc copies the
# frame on every insert.
has_spa = df_merged17['hashottuborspa'].astype(bool)
has_pool = df_merged17['poolcnt'] > 0
no_pool = df_merged17['poolcnt'] == 0
pool_type = np.select(
    [has_spa & has_pool, ~has_spa & has_pool, has_spa & no_pool],
    [1, 2, 3],
    default=0,
)
pools = pd.DataFrame({
    'parcelid': df_merged17['parcelid'].to_numpy(),
    'pooltype': pool_type,
})

In [None]:
#for i, row in df_merged17.iterrows():
#    if row['pooltypeid2'] = 1:
#        row['hashottuborspa'] == True
# Assign back explicitly; fillna(inplace=True) on a selected column is chained
# assignment and is not guaranteed to update df_merged17.
df_merged17['pooltypeid2'] = df_merged17['pooltypeid2'].fillna(0)
# Count the hot-tub flag values within each pooltypeid2 value.
df_merged17.groupby('pooltypeid2')['hashottuborspa'].value_counts()

In [None]:
pools['pooltype'].value_counts()

In [None]:
df_merged17.groupby('propertylandusetypeid')['propertycountylandusecode'].value_counts()

In [None]:
df_merged17['propertyzoningdesc'].value_counts()