In [1]:
# Import modules
import pandas as pd
import json
import requests

import matplotlib.pyplot as plt
import numpy as np

# Google API Key
from config import gkey

In [2]:
# Open csv file for reading City Survey Master Data 1996-2017 csv file downloaded from http://sfgov.org/citysurvey/file/95.
# Open csv file from https://www.zillow.com/research/data/
# Save path to data set in a variable
data_file = "Resources/City Survey MASTER Data 2015-2017 Copy.csv"
data_file2 = "Resources/Zip_Zhvi_SingleFamilyResidence.csv"

In [3]:
# Use Pandas to read sfgov.org data and zillow medium single family home price
city_survey_pd = pd.read_csv(data_file, dtype={'zipcode':'str', 'deduc' : 'int'})
city_survey_pd.head()

home_pd = pd.read_csv(data_file2)
home_pd.head()

Unnamed: 0,RegionID,RegionName,City,State,Metro,CountyName,SizeRank,1996-04,1996-05,1996-06,...,2017-05,2017-06,2017-07,2017-08,2017-09,2017-10,2017-11,2017-12,2018-01,2018-02
0,84654,60657,Chicago,IL,Chicago,Cook,1,420800.0,423500.0,426200.0,...,1072300,1062400,1051500,1047200,1047300,1047300,1049600,1050900,1049400,1049600
1,90668,75070,McKinney,TX,Dallas-Fort Worth,Collin,2,,,,...,317500,318400,318500,319400,322400,324800,325400,325000,325100,325000
2,84616,60614,Chicago,IL,Chicago,Cook,3,542400.0,546700.0,551700.0,...,1522500,1514200,1511300,1513400,1513300,1508000,1503700,1495500,1487500,1487000
3,93144,79936,El Paso,TX,El Paso,El Paso,4,70900.0,71200.0,71100.0,...,114000,114100,113800,113700,114200,114300,114000,114000,114300,114200
4,91733,77084,Houston,TX,Houston,Harris,5,76700.0,76500.0,76000.0,...,158800,160700,161400,162600,164100,164400,164200,163800,164000,164400


In [4]:
# Select specific columns
sf_2017_pd = city_survey_pd[city_survey_pd.year == 2017]
sf_pd = sf_2017_pd[['year', 'zipcode', 'dage', 'dethnic', 'dlivedsf', 'deduc', 'dincome', 'movesf', 'ownrenhm']].copy()
sf_pd['zipcode'] = sf_pd['zipcode'].astype(int) 
sf_pd.head()

sf_pd.dtypes

year        int64
zipcode     int32
dage        int64
dethnic     int64
dlivedsf    int64
deduc       int32
dincome     int64
movesf      int64
ownrenhm    int64
dtype: object

In [5]:
# Count rows to see if there's any missing values.
sf_pd.count()

year        2166
zipcode     2166
dage        2166
dethnic     2166
dlivedsf    2166
deduc       2166
dincome     2166
movesf      2166
ownrenhm    2166
dtype: int64

In [6]:
# Select San Francisco single family home prices from Zillow file
# Get aveage home price for 2017

sfhome_pd = home_pd[home_pd.City == 'San Francisco']

AvePrice = sfhome_pd[['2017-01', '2017-02', '2017-03', '2017-04', '2017-05', '2017-06', '2017-07', '2017-08', '2017-09', '2017-10', '2017-11', '2017-12']].mean(axis=1)
sfhome_pd['AvePrice'] = AvePrice

homeprice_pd = sfhome_pd[['RegionName', 'City', 'State', 'AvePrice']].copy()

homeprice_pd = homeprice_pd.rename(columns={'RegionName': 'zipcode'})

homeprice_pd.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


Unnamed: 0,zipcode,City,State,AvePrice
8,94109,San Francisco,CA,3942708.0
63,94110,San Francisco,CA,1428692.0
313,94122,San Francisco,CA,1301808.0
477,94112,San Francisco,CA,937533.3
715,94115,San Francisco,CA,4077917.0


In [7]:
homeprice_pd.dtypes

zipcode       int64
City         object
State        object
AvePrice    float64
dtype: object

In [None]:
# Add columns for lat and lng
homeprice_pd["Lat"] = ""
homeprice_pd["Lng"] = ""
homeprice_pd.head()

Unnamed: 0,zipcode,City,State,AvePrice,Lat,Lng
8,94109,San Francisco,CA,3942708.0,,
63,94110,San Francisco,CA,1428692.0,,
313,94122,San Francisco,CA,1301808.0,,
477,94112,San Francisco,CA,937533.3,,
715,94115,San Francisco,CA,4077917.0,,


In [None]:
# create a params dict that will be updated with new zip code iteration
params = {"key": gkey}

# Loop through the homeprice_pd and run a lat/long search for each zip code
for index, row in homeprice_pd.iterrows():
    base_url = "https://maps.googleapis.com/maps/api/geocode/json"

    zipcode = row['zipcode']

    # update address key value
    params['address'] = f"{zipcode}"

    # make request, print url
    cities_lat_lng = requests.get(base_url, params=params)

    # convert to json
    cities_lat_lng = cities_lat_lng.json()

    homeprice_pd.set_value(
        index, "Lat", cities_lat_lng["results"][0]["geometry"]["location"]["lat"])
    homeprice_pd.set_value(
        index, "Lng", cities_lat_lng["results"][0]["geometry"]["location"]["lng"])

# Visualize to confirm lat lng appear
homeprice_pd.head()



In [None]:
# Include single family home average price and latitude and longitude.
combined_pd = pd.merge(sf_pd, homeprice_pd, on='zipcode')

combined_pd.head()

In [None]:
# Select other cities to make single family home price comparison.
list_of_cities = ["San Francisco", "Los Angeles", "San Diego", "Sacramento", "Portland", "Seattle"]

home_pd= home_pd[home_pd['City'].isin(list_of_cities)]

home_pd

In [None]:
home_price = home_pd.groupby(["City"]).mean()
home_price

In [None]:
# Save both sets of data to csv files.
combined_pd.to_csv('sfdata.csv', index=False, header=True)
home_price.to_csv('homeprice.csv', index=True, header=True)