In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests
from xml.etree import ElementTree
from config import z_key

# MTA info

In [2]:
mta_data = pd.read_csv('http://web.mta.info/developers/data/nyct/subway/Stations.csv')
mta_data.head()

Unnamed: 0,Station ID,Complex ID,GTFS Stop ID,Division,Line,Stop Name,Borough,Daytime Routes,Structure,GTFS Latitude,GTFS Longitude
0,1,1,R01,BMT,Astoria,Astoria - Ditmars Blvd,Q,N W,Elevated,40.775036,-73.912034
1,2,2,R03,BMT,Astoria,Astoria Blvd,Q,N W,Elevated,40.770258,-73.917843
2,3,3,R04,BMT,Astoria,30 Av,Q,N W,Elevated,40.766779,-73.921479
3,4,4,R05,BMT,Astoria,Broadway,Q,N W,Elevated,40.76182,-73.925508
4,5,5,R06,BMT,Astoria,36 Av,Q,N W,Elevated,40.756804,-73.929575


## Package to get address
https://pypi.org/project/geopy/

In [3]:
from uszipcode import Zipcode
from uszipcode import SearchEngine
search = SearchEngine(simple_zipcode=True) 

from geopy.geocoders import Nominatim

geolocator = Nominatim(user_agent="this is a test app")
zipcode = []
address = []
for index, row in mta_data.iterrows():
    # Search zipcode within 2 miles, ordered from closest to farthest
    tmp = search.by_coordinates(row['GTFS Latitude'], row['GTFS Longitude'], radius=2, returns=1)
    if tmp:
        zipcode.append(tmp[0].zipcode)
        
    # Search address based on lan&lon
    location = geolocator.reverse(f"{row['GTFS Latitude']}, {row['GTFS Longitude']}")
    if location.address:
        address.append(location.address)
        
mta_data['Zip Code'] = zipcode
mta_data['Address'] = address
                                  
pd.options.display.max_rows
pd.set_option('display.max_colwidth', -1)
from operator import methodcaller
new_add_list = list(map(methodcaller("split", ", "), mta_data["Address"]))

mta_data["Zip Code2"] = [elem[-2] for elem in new_add_list]
mta_data["Address2"] = [f"{elem[0]}, {elem[1]}" for elem in new_add_list]
                                  
# mta_data.to_csv("Resources/mta_with_zip.csv")
mta_data.head()


Unnamed: 0,Station ID,Complex ID,GTFS Stop ID,Division,Line,Stop Name,Borough,Daytime Routes,Structure,GTFS Latitude,GTFS Longitude,Zip Code,Address,Zip Code2,Address2
0,1,1,R01,BMT,Astoria,Astoria - Ditmars Blvd,Q,N W,Elevated,40.775036,-73.912034,11105,"Astoria-Ditmars Boulevard, 31st Street, Steinway, Queens County, NYC, New York, 11101, USA",11101,"Astoria-Ditmars Boulevard, 31st Street"
1,2,2,R03,BMT,Astoria,Astoria Blvd,Q,N W,Elevated,40.770258,-73.917843,11102,"Astoria Boulevard, Hoyt Avenue South, Astoria, Queens County, NYC, New York, NY 11106, USA",NY 11106,"Astoria Boulevard, Hoyt Avenue South"
2,3,3,R04,BMT,Astoria,30 Av,Q,N W,Elevated,40.766779,-73.921479,11102,"30th Avenue, 31st Street, Astoria, Queens County, NYC, New York, 11101, USA",11101,"30th Avenue, 31st Street"
3,4,4,R05,BMT,Astoria,Broadway,Q,N W,Elevated,40.76182,-73.925508,11102,"Broadway, 31st Street, Astoria, Queens County, NYC, New York, 11101, USA",11101,"Broadway, 31st Street"
4,5,5,R06,BMT,Astoria,36 Av,Q,N W,Elevated,40.756804,-73.929575,11106,"35-53, 31st Street, Sunnyside Gardens, Queens County, NYC, New York, 11106, USA",11106,"35-53, 31st Street"


In [4]:
mta_data.to_csv("Resources/mta_data_with_zip_address.csv")

# Zillow api

Over 1,000 calls in a day, API calls will be blocked

run up to 20 API calls on one page at one time

## GetDeepSearch-Results API
* http://www.zillow.com/webservice/GetDeepSearchResults.htm

Example:
Below is an example of calling the API for the address for the exact address match "2114 Bigelow Ave", "Seattle, WA":
http://www.zillow.com/webservice/GetDeepSearchResults.htm?zws-id=ZWSID&address=2114+Bigelow+Ave&citystatezip=Seattle%2C+WA
    

In [58]:
mta_data["Address3"] = [elem[0] for elem in new_add_list]
mta_data["Address4"] = [elem[1] for elem in new_add_list]
mta_data.head(100)

Unnamed: 0.1,Unnamed: 0,Station ID,Complex ID,GTFS Stop ID,Division,Line,Stop Name,Borough,Daytime Routes,Structure,GTFS Latitude,GTFS Longitude,Zip Code,Address,Zip Code2,Address2,Address3,Address4
0,0,1,1,R01,BMT,Astoria,Astoria - Ditmars Blvd,Q,N W,Elevated,40.775036,-73.912034,11105,"Astoria-Ditmars Boulevard, 31st Street, Steinway, Queens County, NYC, New York, 11101, USA",11101,"Astoria-Ditmars Boulevard, 31st Street",Astoria-Ditmars Boulevard,31st Street
1,1,2,2,R03,BMT,Astoria,Astoria Blvd,Q,N W,Elevated,40.770258,-73.917843,11102,"Astoria Boulevard, Hoyt Avenue South, Astoria, Queens County, NYC, New York, NY 11106, USA",NY 11106,"Astoria Boulevard, Hoyt Avenue South",Astoria Boulevard,Hoyt Avenue South
2,2,3,3,R04,BMT,Astoria,30 Av,Q,N W,Elevated,40.766779,-73.921479,11102,"30th Avenue, 31st Street, Astoria, Queens County, NYC, New York, 11101, USA",11101,"30th Avenue, 31st Street",30th Avenue,31st Street
3,3,4,4,R05,BMT,Astoria,Broadway,Q,N W,Elevated,40.761820,-73.925508,11102,"Broadway, 31st Street, Astoria, Queens County, NYC, New York, 11101, USA",11101,"Broadway, 31st Street",Broadway,31st Street
4,4,5,5,R06,BMT,Astoria,36 Av,Q,N W,Elevated,40.756804,-73.929575,11106,"35-53, 31st Street, Sunnyside Gardens, Queens County, NYC, New York, 11106, USA",11106,"35-53, 31st Street",35-53,31st Street
5,5,6,6,R08,BMT,Astoria,39 Av,Q,N W,Elevated,40.752882,-73.932755,11106,"38-34, 31st Street, Sunnyside, Queens County, NYC, New York, 11101, USA",11101,"38-34, 31st Street",38-34,31st Street
6,6,7,613,R11,BMT,Astoria,Lexington Av/59 St,M,N W R,Subway,40.762660,-73.967258,10065,"149, East 60th Street, Upper East Side, Lenox Hill, Manhattan, Manhattan Community Board 8, New York County, NYC, New York, 10065, USA",10065,"149, East 60th Street",149,East 60th Street
7,7,8,8,R13,BMT,Astoria,5 Av/59 St,M,N W R,Subway,40.764811,-73.973347,10153,"Grand Army Plaza, Upper East Side, Manhattan, New York County, NYC, New York, 10153, USA",10153,"Grand Army Plaza, Upper East Side",Grand Army Plaza,Upper East Side
8,8,9,9,R14,BMT,Broadway - Brighton,57 St - 7 Av,M,N Q R W,Subway,40.764664,-73.980658,10103,"Park Central Hotel, 860, 7th Avenue, Diamond District, Midtown, Manhattan, Manhattan Community Board 5, New York County, NYC, New York, 10019, USA",10019,"Park Central Hotel, 860",Park Central Hotel,860
9,9,10,10,R15,BMT,Broadway - Brighton,49 St,M,N R W,Subway,40.759901,-73.984139,10020,"721, 7th Avenue, Times Square, Manhattan, Manhattan Community Board 5, New York County, NYC, New York, 10019, USA",10019,"721, 7th Avenue",721,7th Avenue


In [74]:
from xmljson import badgerfish as bf
import json
from json import dumps

zillow_url = "http://www.zillow.com/webservice/GetDeepSearchResults.htm"
params = { 'zws-id': z_key }
results_fetch_key = "{http://www.zillow.com/static/xsd/SearchResults.xsd}searchresults"

zpid = []
latitude = []
longitude = []
bathrooms = []
bedrooms = []
zipcode = []
price = []
lotSizeSqFt = []
usecode = []
for index, row in mta_data.iterrows():
    params['address'] = row['Stop Name']
    params['citystatezip'] = row['Zip Code2']
    
    response = requests.get(zillow_url, params=params)
    
    json_data = bf.data(fromstring(response.content))    
    if json_data[results_fetch_key]["message"]["code"]["$"] == 0:
        results = json_data[results_fetch_key]['response']['results']['result']
        if not isinstance(results, list):
            results = [results]
            
        for house in results:
            zpid.append(house['zpid']['$'])
            latitude.append(house['address']['latitude']['$'])
            longitude.append(house['address']['longitude']['$'])
            usecode.append(house['useCode']['$'])
            
            if 'zipcode' in house['address'] and '$' in house['address']['zipcode']:
                zipcode.append(house['address']['zipcode']['$'])
            else:
                zipcode.append(0)
            
            if 'bathrooms' in house:
                bathrooms.append(house['bathrooms']['$'])
            else:
                bathrooms.append(0)

            if 'bedrooms' in house:
                bedrooms.append(house['bedrooms']['$'])
            else:
                bedrooms.append(0)

            if 'amount' in house['zestimate'] and '$' in house['zestimate']['amount']:
                price.append(house['zestimate']['amount']['$'])
            else:
                price.append(0)

            if 'lotSizeSqFt' in house:
                lotSizeSqFt.append(house['lotSizeSqFt']['$'])
            else:
                lotSizeSqFt.append(0)           
                    
#     print(dumps(json_data, indent=4, sort_keys=True) )
#     break
    
df = pd.DataFrame(columns=['zpid'])
df['zpid'] = zpid
df['latitude'] = latitude
df['longitude'] = longitude
df['usecode'] = usecode
df['bathrooms'] = bathrooms
df['bedrooms'] = bedrooms
df['zipcode'] = zipcode
df['price'] = price
df['lotSizeSqFt'] = lotSizeSqFt
df.head()
    


Unnamed: 0,zpid,latitude,longitude,usecode,bathrooms,bedrooms,zipcode,price,lotSizeSqFt
0,31947881,40.780026,-73.916672,SingleFamily,1.0,3,11105,1297999,2500
1,94721483,40.77083,-73.90318,MultiFamily2To4,1.0,2,11105,687975,0
2,2095007549,40.769199,-73.901496,MultiFamily2To4,4.0,5,11105,1141827,0
3,2099075333,40.77951,-73.91547,MultiFamily2To4,2.0,5,11105,1177104,2500
4,2086385859,40.781942,-73.918467,MultiFamily2To4,1.0,2,11105,593844,0


In [75]:
df.to_csv("Resources/Mar_3rd_stopname_zipcode2_df.csv")
df.shape

(5574, 9)

In [8]:
df = pd.read_csv("Resources/sample_df.csv")

# Combine all house info file

In [142]:
sn_zp = pd.read_csv("Resources/Mar_3rd_stopname_zipcode_df.csv")
ad2_zp = pd.read_csv("Resources/Mar_3rd_address2_zipcode_df.csv")
ad3_zp = pd.read_csv("Resources/Mar_3rd_address3_zipcode_df.csv")
ad4_zp = pd.read_csv("Resources/Mar_3rd_address4_zipcode_df.csv")
sn_zp2 = pd.read_csv("Resources/Mar_3rd_stopname_zipcode2_df.csv")
ad2_zp2 = pd.read_csv("Resources/Mar_3rd_address2_zipcode2_df.csv")
ad3_zp2 = pd.read_csv("Resources/Mar_3rd_address3_zipcode2_df.csv")
ad4_zp2 = pd.read_csv("Resources/Mar_3rd_address4_zipcode2_df.csv")

In [143]:
print(sn_zp.shape)
print(ad2_zp.shape)
print(ad3_zp.shape)
print(ad4_zp.shape)
print(sn_zp2.shape)
print(ad2_zp2.shape)
print(ad3_zp2.shape)
print(ad4_zp2.shape)

(5208, 10)
(989, 10)
(4026, 10)
(6187, 10)
(5574, 10)
(849, 10)
(3916, 10)
(6238, 10)


In [184]:
combo_df = sn_zp
combo_df = combo_df.append(ad2_zp, sort=False)
combo_df = combo_df.append(ad3_zp, sort=False)
combo_df = combo_df.append(ad4_zp, sort=False)
combo_df = combo_df.append(sn_zp2, sort=False)
combo_df = combo_df.append(ad2_zp2, sort=False)
combo_df = combo_df.append(ad3_zp2, sort=False)
combo_df = combo_df.append(ad4_zp2, sort=False)
combo_df.shape

(32987, 10)

In [185]:
unique_df = combo_df.drop_duplicates(subset='zpid')
unique_df.shape

(15282, 10)

In [186]:
unique_df = unique_df.reset_index().drop(['index', 'Unnamed: 0'], axis=1)
unique_df.head()

Unnamed: 0,zpid,latitude,longitude,usecode,bathrooms,bedrooms,zipcode,price,lotSizeSqFt
0,31947881,40.780026,-73.916672,SingleFamily,1.0,3,11105,1297999,2500
1,94721614,40.77083,-73.90318,MultiFamily2To4,1.0,1,11105,543865,0
2,31947885,40.779837,-73.916406,SingleFamily,1.0,3,11105,1402414,2500
3,31943243,40.768852,-73.900892,Triplex,0.0,0,11105,1290066,2200
4,31943244,40.768754,-73.900791,Duplex,0.0,0,11105,1150027,2180


In [187]:
unique_df['zpid'].nunique()

15282

In [188]:
unique_df = unique_df[unique_df['price'] != 0]
print(unique_df.shape)
unique_df.to_csv("Resources/Combine_home_data.csv")

(11469, 9)


In [189]:
unique_df.head()

Unnamed: 0,zpid,latitude,longitude,usecode,bathrooms,bedrooms,zipcode,price,lotSizeSqFt
0,31947881,40.780026,-73.916672,SingleFamily,1.0,3,11105,1297999,2500
1,94721614,40.77083,-73.90318,MultiFamily2To4,1.0,1,11105,543865,0
2,31947885,40.779837,-73.916406,SingleFamily,1.0,3,11105,1402414,2500
3,31943243,40.768852,-73.900892,Triplex,0.0,0,11105,1290066,2200
4,31943244,40.768754,-73.900791,Duplex,0.0,0,11105,1150027,2180


In [190]:
unique_df['usecode'].unique()

array(['SingleFamily', 'MultiFamily2To4', 'Triplex', 'Duplex',
       'Condominium', 'Quadruplex', 'Unknown', 'Townhouse', 'Apartment',
       'Cooperative', 'Mobile', 'Miscellaneous', 'VacantResidentialLand',
       'MultiFamily5Plus'], dtype=object)

### Calculate neareat Station

In [194]:
from scipy.spatial import distance
mat = distance.cdist(
        unique_df[['latitude', 'longitude']],
        mta_data[['GTFS Latitude', 'GTFS Longitude']],
        metric = 'euclidean')
distance_df = pd.DataFrame(mat, index=unique_df['zpid'], columns=mta_data['Station ID'])

arr = distance_df.values
unique_df['Nearest Station Index']  = [list(i).index(i.min()) for i in arr]
unique_df['Nearest Station Lat'] = [mta_data.loc[i, 'GTFS Latitude'] for i in unique_df['Nearest Station Index']]
unique_df['Nearest Station Lon'] = [mta_data.loc[i, 'GTFS Longitude'] for i in unique_df['Nearest Station Index']]

from math import sin, cos, sqrt, atan2, radians

def convert_latlon(row):
    R = 6373.0
    
    stoplat = radians(row['Nearest Station Lat'])
    stoplon = radians(row['Nearest Station Lon'])
    
    homelat = radians(row['latitude'])
    homelon = radians(row['longitude'])
    
    dlon = homelon - stoplon
    dlat = homelat - stoplat
    
    a = sin(dlat / 2)**2 + cos(stoplat) * cos(homelat) * sin(dlon / 2)**2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    
    distance_km = R * c
    distance_m = distance_km * 0.621371
    
    return distance_m

unique_df['distance_miles'] = unique_df.apply(convert_latlon, axis=1)

In [196]:
unique_df.to_csv("Resources/final_house_data_with_nearest_station.csv")

In [197]:
unique_df.head()

Unnamed: 0,zpid,latitude,longitude,usecode,bathrooms,bedrooms,zipcode,price,lotSizeSqFt,Nearest Station Index,Nearest Station Lat,Nearest Station Lon,distance_miles
0,31947881,40.780026,-73.916672,SingleFamily,1.0,3,11105,1297999,2500,0,40.775036,-73.912034,0.421744
1,94721614,40.77083,-73.90318,MultiFamily2To4,1.0,1,11105,543865,0,0,40.775036,-73.912034,0.547056
2,31947885,40.779837,-73.916406,SingleFamily,1.0,3,11105,1402414,2500,0,40.775036,-73.912034,0.403068
3,31943243,40.768852,-73.900892,Triplex,0.0,0,11105,1290066,2200,0,40.775036,-73.912034,0.723042
4,31943244,40.768754,-73.900791,Duplex,0.0,0,11105,1150027,2180,0,40.775036,-73.912034,0.731314


### Google location api feed

In [122]:
import gmaps

# Google developer API key
from config import gkey

# Access maps with unique API key
gmaps.configure(api_key=gkey)


In [123]:
unique_df.head()

Unnamed: 0,zpid,latitude,longitude,usecode,bathrooms,bedrooms,zipcode,price,lotSizeSqFt
0,31947881,40.780026,-73.916672,SingleFamily,1.0,3,11105,1297999,2500
1,94721614,40.77083,-73.90318,MultiFamily2To4,1.0,1,11105,543865,0
2,31947885,40.779837,-73.916406,SingleFamily,1.0,3,11105,1402414,2500
3,31943243,40.768852,-73.900892,Triplex,0.0,0,11105,1290066,2200
4,31943244,40.768754,-73.900791,Duplex,0.0,0,11105,1150027,2180


In [141]:
# Store latitude and longitude in locations
locations = unique_df[["latitude", "longitude"]]

# Fill NaN values and convert to float
price = unique_df["price"].astype(float)

# Plot Heatmap
fig = gmaps.figure()

# Create heat layer
heat_layer = gmaps.heatmap_layer(locations, weights=price, 
                                 dissipating=False, max_intensity=10,
                                 point_radius=0.001)

# Add layer
fig.add_layer(heat_layer)
fig


Figure(layout=FigureLayout(height='420px'))