## Setup

In [9]:
import os
import pandas as pd
import requests as re
import googlemaps

from IPython.display import JSON

In [12]:
%run -i ../notebooks/functions.py

In [3]:
#and a dictionary of the parameters to define
def API(root, search_term, param, header=None):
    """Send a GET request to an API endpoint and return the decoded JSON body.

    Parameters
    ----------
    root : str
        Base URL of the API (or the complete URL when ``search_term`` is None).
    search_term : str or None
        Path and/or query string appended verbatim to ``root``; must be a
        valid endpoint query as specified by the relevant API documentation.
    param : dict or None
        Query parameters forwarded to the request as ``params``.
    header : dict or None, optional
        HTTP headers forwarded to the request as ``headers``.

    Returns
    -------
    The response body parsed from JSON. It is returned even when the status
    code is not 200, so the caller can still inspect an error payload.

    Raises
    ------
    Whatever ``response.json()`` raises when the body is not valid JSON.
    """
    #define endpoint
    endpoint = root if search_term is None else root + search_term

    #GET (`re` is this notebook's alias for the requests package)
    response = re.get(endpoint, params=param, headers=header)

    #Let's make sure it worked. Report a failure *before* decoding the body:
    #if decoding raises on a non-JSON error reply, the status code has still
    #been shown.
    status_code = response.status_code
    if status_code != 200:
        print('Something went wrong!')
        print(status_code)

    return response.json()

In [4]:
# query parameters for the two smoke-test searches (same point, same radius)
test_param = {
    'latitude': 45.6387,
    'longitude': -122.6615,
    'radius': 1000,
}
test_param2 = {
    'll': '45.6387,-122.6615',
    'radius': 1000,
}

# request headers; both API keys are read from environment variables
yelp_head = {"Authorization": "Bearer " + os.environ['YELP_API']}
four_head = {
    "Accept": "application/json",
    "Authorization": os.environ['FOURSQUARE_API_KEY'],
}
             

In [5]:
#smoke test: a non-empty JSON reply from each service is truthy
print(bool(API(root="https://api.yelp.com/",
               search_term='v3/businesses/search',
               param=test_param,
               header=yelp_head)))
bool(API(root="https://api.foursquare.com/",
         search_term='v3/places/search',
         param=test_param2,
         header=four_head))

True


True

## JSON to DataFrame

See MiniProject 4 Google Doc for more details

In [6]:
# Endpoint roots of the NYC Open Data (data.cityofnewyork.us) datasets used below

# -- Housing --
construction_root = "https://data.cityofnewyork.us/resource/hg8x-zxpr.json"
hbd_root = "https://data.cityofnewyork.us/resource/kj4p-ruqc.json"

# -- Areas of Interest --
facilities_root = "https://data.cityofnewyork.us/resource/ji82-xba5.json"
places_root = "https://data.cityofnewyork.us/resource/ssdk-4qjy.json"
parks_root = "https://data.cityofnewyork.us/resource/enfh-gkve.json"
hotels_root = "https://data.cityofnewyork.us/resource/tjus-cn27.json"
library_root = "https://data.cityofnewyork.us/resource/feuq-due4.json"

# -- Business --
business_root = "https://data.cityofnewyork.us/resource/w7w3-xahh.json"

In [7]:
#function to retrieve data from SODA datasets within inputted time frame
def range_SODA(root, column, time_range, params=None):
    """Fetch the rows of a SODA dataset whose ``column`` lies in a time range.

    Parameters
    ----------
    root : str
        Dataset endpoint (the ``.../resource/<id>.json`` URL).
    column : str
        Name of the date column the ``$where ... between`` filter is applied to.
    time_range : sequence
        Two items: the start and the end of the range, used verbatim inside
        the quoted bounds of the ``between`` clause.
    params : dict or None, optional
        Extra query parameters passed through to ``API``.

    Returns
    -------
    Whatever ``API`` returns for the filtered endpoint.
    """
    #set endpoint; filter on the requested column (it used to be hard-coded
    #to project_start_date, which silently ignored the `column` argument)
    start, end = time_range[0], time_range[1]
    range_endpoint = f"?$where={column} between '{start}' and '{end}'"

    return API(root + range_endpoint, None, params)

In [8]:
def JSON_to_DF(JSON, desired_features, missing='NaaN'):
    """Build a DataFrame with one column per requested feature.

    Parameters
    ----------
    JSON : iterable
        Decoded JSON records; each one is looked up with ``element[feature]``.
    desired_features : iterable
        Names of the features to pull out; each becomes one column.
    missing : object, optional
        Placeholder stored when a record has no value for a feature.
        Defaults to the string ``'NaaN'``.

    Returns
    -------
    pandas.DataFrame
        One row per record, one column per feature, in the given order.
    """
    columns = {}

    #for each desired column
    for feat in desired_features:
        #empty value list
        values = []
        #for each project
        for element in JSON:
            #try to append value
            try:
                values.append(element[feat])
            #a failed lookup means no value: append the placeholder.
            #Only lookup errors are caught, so unrelated failures (and
            #KeyboardInterrupt) are no longer silently swallowed.
            except (KeyError, IndexError, TypeError):
                values.append(missing)

        columns[feat] = values

    #make dict into pandas dataframe
    return pd.DataFrame(columns)

In [9]:
def coord_from_geom(dataframe):
    """Replace the ``the_geom`` column with ``latitude`` / ``longitude`` columns.

    Each ``the_geom`` value is expected to be subscriptable with
    ``['coordinates']``; element 0 of the coordinates is stored as the
    longitude and element 1 as the latitude.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing a ``the_geom`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``the_geom`` and with ``latitude`` and
        ``longitude`` columns, keeping the original row order and index.
    """
    # iterate over the values themselves (not over range(n) used as labels),
    # so the function also works when the index is not 0..n-1
    coordinates = [geom['coordinates'] for geom in dataframe['the_geom']]

    # drop + assign build a new frame, leaving the caller's frame untouched
    return dataframe.drop('the_geom', axis=1).assign(
        latitude=[pair[1] for pair in coordinates],
        longitude=[pair[0] for pair in coordinates],
    )

### Libraries, Parks & Hotels

In [11]:
#-- API calls --
# libraries: small dataset, fetched without any filter
library_JSON = API(root=library_root, search_term=None, param=None)

# parks: only rows with retired=false, limit raised to 3000
parks_JSON = API(
    root=parks_root,
    search_term='?retired=false&$limit=3000',
    param=None,
)

# hotels: tax year 2021 only, limit raised to 6000
hotels_JSON = API(
    root=hotels_root,
    search_term='?taxyear=2021&$limit=6000',
    param=None,
)

#### Libraries

In [None]:
# only the geometry field is needed from the library records
desired_feat = ['the_geom']
library_geom = JSON_to_DF(JSON=library_JSON, desired_features=desired_feat)

# turn the geometry values into latitude / longitude columns
library_df = coord_from_geom(dataframe=library_geom)
print(library_df.shape)
library_df.head()

#### Parks

In [None]:
# only the outline geometry is needed from the park records
desired_feat = ['multipolygon']
parks_street = JSON_to_DF(JSON=parks_JSON, desired_features=desired_feat)
print(parks_street.shape)
parks_street.head()

In [None]:
#function to find centroid from list of coordinates
def centroid(vertexes):
    """Return the mean point of a sequence of vertices.

    Each vertex must be indexable; element 0 is averaged into the first
    component of the result and element 1 into the second.

    Returns a 2-tuple ``(mean of element 0, mean of element 1)``.
    """
    first_total = sum(point[0] for point in vertexes)
    second_total = sum(point[1] for point in vertexes)
    count = len(vertexes)
    return (first_total / count, second_total / count)

In [None]:
# centre of each park's outline, in row order
# (['coordinates'][0][0] is presumably the outer ring of the first polygon - TODO confirm)
centers = [
    centroid(outline['coordinates'][0][0])
    for outline in parks_street.multipolygon
]

# each centre is stored as (longitude, latitude), so index 1 is the latitude
latitude = [point[1] for point in centers]
longitude = [point[0] for point in centers]

parks_street['latitude'] = latitude
parks_street['longitude'] = longitude

# the outline itself is no longer needed once the centre is known
parks_df = parks_street.drop('multipolygon', axis=1)
parks_df.head()

In [None]:
# sanity check: size of the parks frame and the dtype of each column
print(parks_df.shape)
parks_df.dtypes

#### Hotels

In [None]:
# pull the latitude / longitude fields of each hotel record into a frame
desired_feats = ['latitude', 'longitude']
hotels_df = JSON_to_DF(JSON=hotels_JSON, desired_features=desired_feats)
print(hotels_df.shape)
hotels_df.head()

In [None]:
# convert every column to a numeric dtype; values that cannot be parsed
# (including the 'NaaN' placeholder written by JSON_to_DF) are coerced to NaN
hotels_df = hotels_df.apply(pd.to_numeric, errors='coerce')

#### Facilities

The facilities dataset is much bigger, so we pull only specific categories to get a clearer picture.

See `facilities_datadictionary.xlsx` for the possible facility categories, subcategories, and factypes.

In [None]:
#--- ORIGINAL SUBGROUPS --
# facility subgroups to download, one per line
facility_subgroup = [
    'BUS DEPOTS AND TERMINALS',
    'PARKING LOTS AND GARAGES',
    'MUSEUMS',
    'STREETSCAPES, PLAZAS, AND MALLS',
    'NON-PUBLIC K-12 SCHOOLS',
    'PUBLIC K-12 SCHOOLS',
    'COLLEGES OR UNIVERSITIES',
    'HOSPITALS AND CLINICS',
]

In [None]:
#--- EXPERIMENT SUBGROUPS --
# NOTE: this rebinds facility_subgroup, replacing the list from the previous cell
facility_subgroup = [
    'PUBLIC K-12 SCHOOLS',
    'COLLEGES OR UNIVERSITIES',
    'HOSPITALS AND CLINICS',
    'DAY CARE',
]

In [None]:
#-- API call--

# Socrata documents the X-App-Token request header (or the $$app_token query
# parameter) for application tokens; the token used to be sent as form data
# in the body of the GET request instead.
nyc_header = {'X-App-Token': os.environ['NYC_TOKEN']}

#make dict to store JSON returns
facility_JSONs = {}

#loop through each subgroup and call JSON
for subgroup in facility_subgroup:
    # Pass the subgroup filter and the higher row limit as query parameters so
    # the spaces and commas in the subgroup names get URL-encoded, and go
    # through API() so a non-200 status is reported instead of being ignored.
    facility_JSONs[subgroup] = API(
        facilities_root,
        None,
        {'facsubgrp': subgroup, '$limit': 3000},
        nyc_header,
    )

In [None]:
#number of return values for each subgroup
# (the loop variable is no longer called JSON, which rebound the JSON name
#  imported from IPython.display at the top of the notebook)
for subgroup_name, records in facility_JSONs.items():
    print(subgroup_name, len(records))

In [None]:
#name lat, long columns
facility_features = ['latitude', 'longitude']

#translate every subgroup's JSON to a pandas DF, stored under the subgroup name
# (a comprehension keeps its variables local, so nothing rebinds the JSON name
#  imported from IPython.display, as the old `for JSON in ...` loop did)
facility_df = {
    subgroup_name: JSON_to_DF(records, facility_features)
    for subgroup_name, records in facility_JSONs.items()
}

In [None]:
#preview the frame of the first subgroup listed in facility_subgroup
facility_df[facility_subgroup[0]].head()

In [None]:
#concatenate all facilities into one dataframe; the dict keys (subgroup names)
#end up in 'level_0' after reset_index and are kept as the factype column,
#while 'level_1' (the row number inside each subgroup) is discarded
facility_all = (
    pd.concat(facility_df)
    .reset_index()
    .drop('level_1', axis=1)
    .rename(columns={'level_0': 'factype'})
)
print(facility_all.shape)
facility_all.head()

In [None]:
# Drop the rows whose latitude is zero. The coordinates are still object/text
# values here (they are only converted to numbers in the next cell), and text
# compared with the number 0 never matches, so compare a numeric view instead.
# .copy() makes facility_clean an independent frame rather than a slice.
facility_clean = facility_all.loc[
    pd.to_numeric(facility_all['latitude'], errors='coerce') != 0
].copy()

In [None]:
#replace object lat/long with floats (unparseable values become NaN)
# assign() builds a new frame instead of writing columns into facility_clean,
# which is a filtered selection of facility_all; that write is what triggers
# pandas' SettingWithCopyWarning
numeric_coords = facility_clean[['latitude', 'longitude']].apply(pd.to_numeric, errors='coerce')
facility_clean = facility_clean.assign(
    latitude=numeric_coords['latitude'],
    longitude=numeric_coords['longitude'],
)
facility_clean.dtypes

### To csv

In [None]:
#save final dataframes to csv
library_df.to_csv('../Mid-term/Mid-Term-Project/processed_data/libraries.csv', index=False)
parks_df.to_csv('../Mid-term/Mid-Term-Project/processed_data/parks.csv', index=False)
hotels_df.to_csv('../Mid-term/Mid-Term-Project/processed_data/hotels.csv', index=False)
facility_clean.to_csv('../Mid-term/Mid-Term-Project/processed_data/facilities.csv', index=False)

# house_start_df is only created in the "Housing" section further down, so on a
# fresh top-to-bottom run it does not exist yet when this cell executes.
# Skip it with a message instead of aborting the cell with a NameError.
if 'house_start_df' in globals():
    house_start_df.to_csv('../Mid-term/Mid-Term-Project/processed_data/house_start.csv', index=False)
else:
    print('house_start_df is not defined yet: run the Housing section, '
          'then re-run this cell to save house_start.csv')


### Housing

#### House construction

In [None]:
#-- API call --

# both queries use the same time window
housing_window = ['2021-01-01T00:00:00', '2021-12-30T00:00:00']

# projects by start date, then by completion date
housing_start_JSON = range_SODA(construction_root, 'project_start_date', housing_window)
housing_complete_JSON = range_SODA(construction_root, 'project_completion_date', housing_window)

In [None]:
#features to keep from each housing record: unit count plus coordinates
desired_feat = ['total_units', 'latitude', 'longitude']

In [None]:
# build the frame of the "start date" query results, one column per desired feature
house_start_df = JSON_to_DF(housing_start_JSON, desired_feat)
print(house_start_df.shape)

In [None]:
# convert every column to a numeric dtype; values that cannot be parsed become NaN
house_start_df = house_start_df.apply(pd.to_numeric, errors='coerce')
house_start_df.head()

In [None]:
# build the frame of the "completion date" query results (same features);
# note that, unlike house_start_df, it is not converted to numeric dtypes here
house_end_df = JSON_to_DF(housing_complete_JSON, desired_feat)
house_end_df.shape

#### HPD buildings

In [None]:
# show the endpoint of the HPD buildings dataset that the next cell queries
print(hbd_root)

In [12]:
#-- API call --
# fetch the records with recordstatus=Active and lifecycle=Building,
# with the row limit raised to 50000
hbd_JSON = API(
    root=hbd_root,
    search_term='?recordstatus=Active&lifecycle=Building&$limit=50000',
    param=None,
)

In [13]:
# keep the zip code and the legal number of stories of each building record
desired_feat = ['zip', 'legalstories']
hbd_df = JSON_to_DF(JSON=hbd_JSON, desired_features=desired_feat)

In [14]:
# preview the first rows of the HPD buildings frame
hbd_df.head()

Unnamed: 0,zip,legalstories
0,10466,1
1,11216,3
2,10314,2
3,11375,2
4,11226,2


*Note*: the next part is commented out because generating this data with the Google API costs money. We can't rerun this part, but we already got the data we needed from it. See `building_count.csv` and `building_stories.csv`.

In [10]:
#gmaps = googlemaps.Client(key=os.environ['TEMP_GOOGLE'])

In [None]:
#lat = []
#long = []

#-- DO NOT RUN -- 

#for zip_code in hbd_df.zip:

    #geocode_result = gmaps.geocode(f'{zip_code}, NY')
    #try:
        #lat.append(geocode_result[0]['geometry']['location']['lat'])
        #long.append(geocode_result[0]['geometry']['location']['lng'])
    #except:
        #lat.append('NaN')
        #long.append('NaN')

In [None]:
#append lat and long
#hbd_df['latitude'] = lat
#hbd_df['longitude'] = long

#drop zip codes
#hbd_df = hbd_df.drop('zip', axis=1)
#hbd_df = hbd_df.apply(pd.to_numeric, errors='coerce')

In [None]:
#hbd_df.info()

In [None]:
#NTA = pd.read_csv('../Mid-Term/Mid-Term-Project/processed_data/geocoded_population.csv').loc[:, ['latitude', 'longitude']]
#print(NTA.shape)
#NTA.head()

In [None]:
#bin within the facilities
#building_count = bin_data(hbd_df,'count', 'buildings', 70)
#building_level = bin_data(hbd_df, 'mean', 'buildings', 1)

In [None]:
#building_level.head()

In [None]:
#building_count.loc[building_count['buildings'] != 0].describe()

In [None]:
# -- DO NOT OVERWRITE SAVED BUILDING DATA --

#building_count.to_csv('../Mid-term/Mid-Term-Project/processed_data/building_count.csv', index=False)
#building_level.to_csv('../Mid-term/Mid-Term-Project/processed_data/building_stories.csv', index=False)