In [1]:
import pandas as pd
import gmaps
import numpy as np
import re
import requests
import json


from config import gkey

In [2]:
# ZHVI All Homes (SFR, Condo/Co-op) Time Series ($) (by ZIP code)
csv_home_values = "Resources/Zip_Zhvi_AllHomes.csv"

# Median Sale Price - Seasonally Adjusted ($) (by ZIP code)
csv_home_sales = "Resources/Sale_Prices_Zip.csv"

# Monthly Home Sales (Number, Raw) (by ZIP code)
csv_number_sales = "Resources/Sale_Counts_Zip.csv"

# Foreclosure Resales (%) (by ZIP code)
csv_foreclosure_resales = "Resources/SalesPrevForeclosed_Share_Zip.csv"

# Unemployment data
csv_unemployment = "Resources/Cleaned_BLS_Unemployment-Rate-by-State_2008-2019_1-11-2020.csv"

home_values = pd.read_csv(csv_home_values, encoding="ISO-8859-1")
home_sales = pd.read_csv(csv_home_sales, encoding="ISO-8859-1")
number_sales = pd.read_csv(csv_number_sales, encoding="ISO-8859-1")
foreclosure_resales = pd.read_csv(csv_foreclosure_resales, encoding="ISO-8859-1")
unemployment = pd.read_csv(csv_unemployment)

# Suffixes:
# ZVHI = Zillow Home Value Index ($)
# MSR = Median Sales Price - Seasonally Adjusted ($)
# NS = Number of Sales
# FR = Foreclosure Resales (%)
# UNEMP = Unemployment


In [3]:
home_values["2008-Avg_ZHVI"] = home_values.iloc[:, 7:19].mean(axis=1)
home_values["2009-Avg_ZHVI"] = home_values.iloc[:, 19:31].mean(axis=1)
home_values["2010-Avg_ZHVI"] = home_values.iloc[:, 31:43].mean(axis=1)
home_values["2011-Avg_ZHVI"] = home_values.iloc[:, 43:55].mean(axis=1)
home_values["2012-Avg_ZHVI"] = home_values.iloc[:, 55:67].mean(axis=1)
home_values["2013-Avg_ZHVI"] = home_values.iloc[:, 67:79].mean(axis=1)
home_values["2014-Avg_ZHVI"] = home_values.iloc[:, 79:91].mean(axis=1)
home_values["2015-Avg_ZHVI"] = home_values.iloc[:, 91:103].mean(axis=1)
home_values["2016-Avg_ZHVI"] = home_values.iloc[:, 103:115].mean(axis=1)
home_values["2017-Avg_ZHVI"] = home_values.iloc[:, 115:127].mean(axis=1)
home_values["2018-Avg_ZHVI"] = home_values.iloc[:, 127:139].mean(axis=1)
home_values["2019-Avg_ZHVI"] = home_values.iloc[:, 139:150].mean(axis=1)

home_sales["2008-Avg_MSR"] = home_sales.iloc[:, 4:14].mean(axis=1)
home_sales["2009-Avg_MSR"] = home_sales.iloc[:, 14:26].mean(axis=1)
home_sales["2010-Avg_MSR"] = home_sales.iloc[:, 26:38].mean(axis=1)
home_sales["2011-Avg_MSR"] = home_sales.iloc[:, 38:50].mean(axis=1)
home_sales["2012-Avg_MSR"] = home_sales.iloc[:, 50:62].mean(axis=1)
home_sales["2013-Avg_MSR"] = home_sales.iloc[:, 62:74].mean(axis=1)
home_sales["2014-Avg_MSR"] = home_sales.iloc[:, 74:86].mean(axis=1)
home_sales["2015-Avg_MSR"] = home_sales.iloc[:, 86:98].mean(axis=1)
home_sales["2016-Avg_MSR"] = home_sales.iloc[:, 98:110].mean(axis=1)
home_sales["2017-Avg_MSR"] = home_sales.iloc[:, 110:122].mean(axis=1)
home_sales["2018-Avg_MSR"] = home_sales.iloc[:, 122:134].mean(axis=1)
home_sales["2019-Avg_MSR"] = home_sales.iloc[:, 134:145].mean(axis=1)

number_sales["2008-Avg_NS"] = number_sales.iloc[:, 4:14].mean(axis=1)
number_sales["2009-Avg_NS"] = number_sales.iloc[:, 14:26].mean(axis=1)
number_sales["2010-Avg_NS"] = number_sales.iloc[:, 26:38].mean(axis=1)
number_sales["2011-Avg_NS"] = number_sales.iloc[:, 38:50].mean(axis=1)
number_sales["2012-Avg_NS"] = number_sales.iloc[:, 50:62].mean(axis=1)
number_sales["2013-Avg_NS"] = number_sales.iloc[:, 62:74].mean(axis=1)
number_sales["2014-Avg_NS"] = number_sales.iloc[:, 74:86].mean(axis=1)
number_sales["2015-Avg_NS"] = number_sales.iloc[:, 86:98].mean(axis=1)
number_sales["2016-Avg_NS"] = number_sales.iloc[:, 98:110].mean(axis=1)
number_sales["2017-Avg_NS"] = number_sales.iloc[:, 110:122].mean(axis=1)
number_sales["2018-Avg_NS"] = number_sales.iloc[:, 122:134].mean(axis=1)
number_sales["2019-Avg_NS"] = number_sales.iloc[:, 134:145].mean(axis=1)

foreclosure_resales["2008-Avg_FR"] = foreclosure_resales.iloc[:, 4:14].mean(axis=1)
foreclosure_resales["2009-Avg_FR"] = foreclosure_resales.iloc[:, 14:26].mean(axis=1)
foreclosure_resales["2010-Avg_FR"] = foreclosure_resales.iloc[:, 26:38].mean(axis=1)
foreclosure_resales["2011-Avg_FR"] = foreclosure_resales.iloc[:, 38:50].mean(axis=1)
foreclosure_resales["2012-Avg_FR"] = foreclosure_resales.iloc[:, 50:62].mean(axis=1)
foreclosure_resales["2013-Avg_FR"] = foreclosure_resales.iloc[:, 62:74].mean(axis=1)
foreclosure_resales["2014-Avg_FR"] = foreclosure_resales.iloc[:, 74:86].mean(axis=1)
foreclosure_resales["2015-Avg_FR"] = foreclosure_resales.iloc[:, 86:98].mean(axis=1)
foreclosure_resales["2016-Avg_FR"] = foreclosure_resales.iloc[:, 98:110].mean(axis=1)
foreclosure_resales["2017-Avg_FR"] = foreclosure_resales.iloc[:, 110:122].mean(axis=1)
foreclosure_resales["2018-Avg_FR"] = foreclosure_resales.iloc[:, 122:134].mean(axis=1)
foreclosure_resales["2019-Avg_FR"] = foreclosure_resales.iloc[:, 134:145].mean(axis=1)

number_sales.head()

Unnamed: 0,RegionID,RegionName,StateName,SizeRank,2008-03,2008-04,2008-05,2008-06,2008-07,2008-08,...,2010-Avg_NS,2011-Avg_NS,2012-Avg_NS,2013-Avg_NS,2014-Avg_NS,2015-Avg_NS,2016-Avg_NS,2017-Avg_NS,2018-Avg_NS,2019-Avg_NS
0,61639,10025,New York,1,,,,,,,...,,,,,,,59.0,69.5,62.083333,57.363636
1,84654,60657,Illinois,2,134.0,172.0,143.0,184.0,229.0,162.0,...,109.583333,95.5,111.083333,154.75,149.416667,181.583333,190.333333,213.0,179.75,131.090909
2,61637,10023,New York,3,,,,,,,...,,,,,,,75.6,75.333333,77.5,71.909091
3,91982,77494,Texas,4,61.0,77.0,85.0,99.0,122.0,95.0,...,79.833333,98.5,123.416667,165.75,162.416667,161.166667,222.5,257.666667,253.916667,242.4
4,84616,60614,Illinois,5,,,,,,,...,,,,,,,,215.4,197.416667,153.0


In [4]:
# Python Dictionary to translate US States to Two letter codes
# Dictionary obtained from rogerallen (https://gist.github.com/rogerallen/1583593)
us_state_abbrev = {
    'Alabama': 'AL',
    'Alaska': 'AK',
    'Arizona': 'AZ',
    'Arkansas': 'AR',
    'California': 'CA',
    'Colorado': 'CO',
    'Connecticut': 'CT',
    'Delaware': 'DE',
    'District of Columbia': 'DC',
    'Florida': 'FL',
    'Georgia': 'GA',
    'Hawaii': 'HI',
    'Idaho': 'ID',
    'Illinois': 'IL',
    'Indiana': 'IN',
    'Iowa': 'IA',
    'Kansas': 'KS',
    'Kentucky': 'KY',
    'Louisiana': 'LA',
    'Maine': 'ME',
    'Maryland': 'MD',
    'Massachusetts': 'MA',
    'Michigan': 'MI',
    'Minnesota': 'MN',
    'Mississippi': 'MS',
    'Missouri': 'MO',
    'Montana': 'MT',
    'Nebraska': 'NE',
    'Nevada': 'NV',
    'New Hampshire': 'NH',
    'New Jersey': 'NJ',
    'New Mexico': 'NM',
    'New York': 'NY',
    'North Carolina': 'NC',
    'North Dakota': 'ND',
    'Northern Mariana Islands':'MP',
    'Ohio': 'OH',
    'Oklahoma': 'OK',
    'Oregon': 'OR',
    'Palau': 'PW',
    'Pennsylvania': 'PA',
    'Puerto Rico': 'PR',
    'Rhode Island': 'RI',
    'South Carolina': 'SC',
    'South Dakota': 'SD',
    'Tennessee': 'TN',
    'Texas': 'TX',
    'Utah': 'UT',
    'Vermont': 'VT',
    'Virgin Islands': 'VI',
    'Virginia': 'VA',
    'Washington': 'WA',
    'West Virginia': 'WV',
    'Wisconsin': 'WI',
    'Wyoming': 'WY',
}

# Rename StateName columns
home_values = home_values.rename(columns={"RegionName":"ZIP Code",
                                          "2008-01":"2008-01_ZHVI",
                                          "2008-02":"2008-02_ZHVI"})
home_sales = home_sales.rename(columns={"RegionName":"ZIP Code",
                                        "StateName":"State"})
number_sales = number_sales.rename(columns={"RegionName":"ZIP Code",
                                            "StateName":"State",
                                            "seasAdj":"seasAdj_NS"})
foreclosure_resales = foreclosure_resales.rename(columns={"RegionName":"ZIP Code",
                                                          "StateName":"State"})
unemployment = unemployment.rename(columns={"State Abbreviation":"State"})

# Drop columns
unemployment.drop(columns=["Data", "Series Id", "Adjusted?", "Area",
                           "Area Type", "State/Region/Division", "Measure", "Years"])

# Translate states into two letter codes
try:
    for index, row in home_sales.iterrows():
        state = home_sales.loc[index, "State"]
        abbr = us_state_abbrev[state]
        home_sales.loc[index, "State"] = abbr

    
    for index, row in number_sales.iterrows():
        state = number_sales.loc[index, "State"]
        abbr = us_state_abbrev[state]
        number_sales.loc[index, "State"] = abbr

    for index, row in foreclosure_resales.iterrows():
        state = foreclosure_resales.loc[index, "State"]
        abbr = us_state_abbrev[state]
        foreclosure_resales.loc[index, "State"] = abbr
except:
    pass

home_values.head()

Unnamed: 0,RegionID,ZIP Code,City,State,Metro,CountyName,SizeRank,2008-01_ZHVI,2008-02_ZHVI,2008-03,...,2010-Avg_ZHVI,2011-Avg_ZHVI,2012-Avg_ZHVI,2013-Avg_ZHVI,2014-Avg_ZHVI,2015-Avg_ZHVI,2016-Avg_ZHVI,2017-Avg_ZHVI,2018-Avg_ZHVI,2019-Avg_ZHVI
0,84654,60657,Chicago,IL,Chicago-Naperville-Elgin,Cook County,1,,,,...,445775.916667,423604.722222,412033.055556,421352.583333,443598.7,452858.7,468631.8,481483.2,488620.0,483509.8
1,61637,10023,New York,NY,New York-Newark-Jersey City,New York County,2,,,,...,836375.277778,858051.166667,888688.861111,949063.444444,1068758.0,1138190.0,1197406.0,1201872.0,1216848.0,1176796.0
2,91982,77494,Katy,TX,Houston-The Woodlands-Sugar Land,Harris County,3,269106.0,268326.0,267699.333333,...,274168.222222,266745.861111,271493.027778,290686.722222,315669.9,334629.9,331949.8,329479.2,332919.0,334093.0
3,84616,60614,Chicago,IL,Chicago-Naperville-Elgin,Cook County,4,,,,...,565302.194444,534765.361111,523258.083333,537427.972222,570576.1,591605.6,613205.4,628530.3,636121.4,629736.5
4,91940,77449,Katy,TX,Houston-The Woodlands-Sugar Land,Harris County,5,128064.0,127776.0,127537.666667,...,119721.722222,112142.083333,110622.833333,123323.277778,139524.0,154121.5,163667.7,169384.8,176053.0,183003.3


In [5]:
#Merge the home_values, home_sales, number_sales and foreclosure_resales
merge1 = pd.merge(home_values, home_sales, on=["ZIP Code","State","RegionID"], how="outer", suffixes=("_ZHVI", "_MSR"))
merge2 = pd.merge(number_sales, foreclosure_resales, on=["ZIP Code","State","RegionID"], how="outer", suffixes=("_NS", "_FR"))
main_data = pd.merge(merge1, merge2, on=["ZIP Code","State","RegionID"], how="outer")

In [6]:
for index, row in main_data.iterrows():
    zip_code = main_data.loc[index, "ZIP Code"]
    if len(str(zip_code)) == 4:
        zip_code = str(0) + str(zip_code)
    
    elif len(str(zip_code)) == 3:
        zip_code = str(0) + str(0) + str(zip_code)

In [7]:
main_data.head()

Unnamed: 0,RegionID,ZIP Code,City,State,Metro,CountyName,SizeRank_ZHVI,2008-01_ZHVI,2008-02_ZHVI,2008-03_ZHVI,...,2010-Avg_FR,2011-Avg_FR,2012-Avg_FR,2013-Avg_FR,2014-Avg_FR,2015-Avg_FR,2016-Avg_FR,2017-Avg_FR,2018-Avg_FR,2019-Avg_FR
0,84654,60657,Chicago,IL,Chicago-Naperville-Elgin,Cook County,1.0,,,,...,0.000633,0.00055,0.001392,0.025442,0.01695,0.003683,0.005017,0.004208,0.005367,0.0067
1,61637,10023,New York,NY,New York-Newark-Jersey City,New York County,2.0,,,,...,,,,,,,,,,
2,91982,77494,Katy,TX,Houston-The Woodlands-Sugar Land,Harris County,3.0,269106.0,268326.0,267699.333333,...,0.038658,0.029625,0.029517,0.010633,0.004033,0.002458,0.003492,0.002967,0.001367,0.007025
3,84616,60614,Chicago,IL,Chicago-Naperville-Elgin,Cook County,4.0,,,,...,,,,,,,,0.00243,0.001183,0.00675
4,91940,77449,Katy,TX,Houston-The Woodlands-Sugar Land,Harris County,5.0,128064.0,127776.0,127537.666667,...,0.235167,0.206533,0.18935,0.055933,0.031283,0.013625,0.013442,0.012083,0.003492,0.01145


In [8]:
#Function to get mean of the columns from minimum month and year to maximum month and year with different suffixes
#Used regular expressions
def findavg(data,year,suffix):
    x = []
    for i in data.columns:
        y = (re.findall(f'^{year}-([0-9][0-9])_{suffix}',i))
        [x.append(int(i)) for i in y]
    min_col = data.columns.get_loc(f"{year}-{min(x):02d}_{suffix}")
    max_col = data.columns.get_loc(f"{year}-{max(x):02d}_{suffix}")
    data[f"{year}_Avg_{suffix}"] = data.iloc[:,min_col:max_col+1].mean(axis=1)
    return data[f"{year}_Avg_{suffix}"]

In [9]:
# Grabbed the 'RegionID', 'ZIP Code', 'City', 'State', 'CountyName' columns from main_data
new_main_data = main_data[['RegionID', 'ZIP Code', 'City', 'State', 'CountyName']]

new_main_data.head()

Unnamed: 0,RegionID,ZIP Code,City,State,CountyName
0,84654,60657,Chicago,IL,Cook County
1,61637,10023,New York,NY,New York County
2,91982,77494,Katy,TX,Harris County
3,84616,60614,Chicago,IL,Cook County
4,91940,77449,Katy,TX,Harris County


In [10]:
#Join all indexes across different years to new_main_data
years = [i for i in range(2008,2020,1)]
suffixes = ["ZHVI","MSR","NS","FR"]

for year in years:
    for suffix in suffixes:
        new_main_data = new_main_data.join(findavg(main_data,year,suffix))

In [11]:
new_main_data.head()
new_main_data.to_csv("Output/main_data.csv", index=False, header=True)

In [12]:
#Group the data by State and get mean acroos different zipcodes
state_data =new_main_data[
                     ['State', '2008_Avg_ZHVI','2009_Avg_ZHVI', '2010_Avg_ZHVI', '2011_Avg_ZHVI',
                      '2012_Avg_ZHVI', '2013_Avg_ZHVI', '2014_Avg_ZHVI', '2015_Avg_ZHVI', '2016_Avg_ZHVI',
                      '2017_Avg_ZHVI', '2018_Avg_ZHVI', '2019_Avg_ZHVI', '2008_Avg_MSR','2009_Avg_MSR', 
                      '2010_Avg_MSR', '2011_Avg_MSR', '2012_Avg_MSR','2013_Avg_MSR', '2014_Avg_MSR', 
                      '2015_Avg_MSR', '2016_Avg_MSR','2017_Avg_MSR', '2018_Avg_MSR', '2019_Avg_MSR', 
                      '2008_Avg_NS','2009_Avg_NS', '2010_Avg_NS', '2011_Avg_NS', '2012_Avg_NS','2013_Avg_NS', 
                      '2014_Avg_NS', '2015_Avg_NS', '2016_Avg_NS','2017_Avg_NS', '2018_Avg_NS', '2019_Avg_NS', 
                      '2008_Avg_FR', '2009_Avg_FR', '2010_Avg_FR', '2011_Avg_FR', '2012_Avg_FR','2013_Avg_FR', 
                      '2014_Avg_FR', '2015_Avg_FR', '2016_Avg_FR','2017_Avg_FR', '2018_Avg_FR', '2019_Avg_FR']
                    ].groupby("State").mean()
state_data.head()

Unnamed: 0_level_0,2008_Avg_ZHVI,2009_Avg_ZHVI,2010_Avg_ZHVI,2011_Avg_ZHVI,2012_Avg_ZHVI,2013_Avg_ZHVI,2014_Avg_ZHVI,2015_Avg_ZHVI,2016_Avg_ZHVI,2017_Avg_ZHVI,...,2010_Avg_FR,2011_Avg_FR,2012_Avg_FR,2013_Avg_FR,2014_Avg_FR,2015_Avg_FR,2016_Avg_FR,2017_Avg_FR,2018_Avg_FR,2019_Avg_FR
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AK,257757.731674,255277.256173,255265.845679,252652.662037,257154.490616,261483.093559,260587.859272,263911.006225,260905.36142,265698.318409,...,0.017953,0.015444,0.01325,0.015368,0.010808,0.014436,0.007723,0.00811,0.008051,0.015248
AL,142287.102771,137912.212703,132560.954521,127438.048756,124757.877755,124216.831304,123857.45657,126152.310004,126710.281543,129905.84331,...,0.02862,0.024541,0.022358,0.036341,0.038174,0.036396,0.029826,0.024794,0.021197,0.01858
AR,109428.506791,105079.607528,102444.345622,100620.50496,100541.646575,101893.995841,102221.952317,102801.30943,104403.715475,106864.871102,...,0.034028,0.024479,0.011072,0.017768,0.024253,0.012681,0.012304,0.011055,0.011784,0.012021
AZ,257791.880246,222270.168886,208525.831265,192496.057547,192090.256004,210370.328371,219667.363194,224545.767161,235508.485779,245929.407627,...,0.138657,0.132389,0.072265,0.059775,0.046469,0.031105,0.021606,0.016913,0.011268,0.009412
CA,468233.71145,423004.034608,422985.651603,401367.360069,406446.894985,446805.162324,476875.116913,503387.102992,536483.483562,569431.051039,...,0.212939,0.205428,0.145055,0.065362,0.03688,0.028936,0.02203,0.013541,0.009642,0.0066


In [13]:
base_url = "https://maps.googleapis.com/maps/api/geocode/json"

params = {
    "key": gkey,
    "address": 27606
}

response = requests.get(base_url, params).json()
print(json.dumps(response, indent=4, sort_keys=True))

lat = response['results'][0]['geometry']['location']['lat']
lng = response['results'][0]['geometry']['location']['lng']
lat
lng

{
    "results": [
        {
            "address_components": [
                {
                    "long_name": "27606",
                    "short_name": "27606",
                    "types": [
                        "postal_code"
                    ]
                },
                {
                    "long_name": "Raleigh",
                    "short_name": "Raleigh",
                    "types": [
                        "locality",
                        "political"
                    ]
                },
                {
                    "long_name": "Wake County",
                    "short_name": "Wake County",
                    "types": [
                        "administrative_area_level_2",
                        "political"
                    ]
                },
                {
                    "long_name": "North Carolina",
                    "short_name": "NC",
                    "types": [
                        "administrative_area_level_1"

-78.713608

In [16]:
base_url = "https://maps.googleapis.com/maps/api/geocode/json"

zip_codes = new_main_data["ZIP Code"]

coordinates = []
params = {
    "key": gkey
}

for zip_code in zip_codes:
    try:
        params["address"] = zip_code
        geo_data = requests.get(base_url, params).json()
        new_main_data["Lat"] = geo_data['results'][0]['geometry']['location']['lat']
        new_main_data["Lng"] = geo_data['results'][0]['geometry']['location']['lng']
        #coordinates.append(f'({lat},{lng})')
    except IndexError:
        pass

ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response',))

In [None]:
fig = gmaps.figure()

heat_layer = gmaps.heatmap_layer(coordinates)

fig.add_layer(heat_layer)

fig