## Setup

In [1]:
import os
import pandas as pd
import requests as re
from IPython.display import JSON

In [3]:
#and a dictionary of the parameters to define
def API(root, search_term, param, header=None):
    """search_term must be a string of valid end point queries 
            - as specified by the relevent documentation
            
            returns JSON of results
   """
 
    #define endpoint
    if search_term is not None:
        endpoint = root + search_term
    else:
        endpoint = root
    
    #GET
    response = re.get(endpoint, params = param, headers=header)
   
    #return status code and results
    status_code, results = response.status_code, response.json()
    
    #Let's make sure it worked
    if status_code != 200:
        print('Something went wrong!')
        print(status_code)
        
    return results

In [12]:
test_param = {'latitude': 45.6387, 'longitude':-122.6615, 'radius': 1000}
test_param2 = {'ll': '45.6387,-122.6615', 'radius': 1000}
yelp_head = {"Authorization": f"Bearer {os.environ['YELP_API']}"}
four_head = {"Accept": "application/json",
                  "Authorization": os.environ['FOURSQUARE_API_KEY']}
             

In [13]:
#test it's working
print(bool(API("https://api.yelp.com/", 'v3/businesses/search', test_param, yelp_head)))
bool(API("https://api.foursquare.com/", 'v3/places/search', test_param2, four_head))

True


True

## JSON to DataFrame

See MiniProject 4 Google Doc for more details

In [4]:
#set standard variables
range_start = '2021-02-01T00:00:00'
range_end = '2021-02-28T00:00:00'
range_21 = [range_start, range_end]

#roots of the different datasets
#Housing
housing_root = 'https://data.cityofnewyork.us/resource/hg8x-zxpr.json'

#Areas of Interest
facilities_root = 'https://data.cityofnewyork.us/resource/ji82-xba5.json'
places_root = 'https://data.cityofnewyork.us/resource/ssdk-4qjy.json'
parks_root = 'https://data.cityofnewyork.us/resource/enfh-gkve.json'
hotels_root = 'https://data.cityofnewyork.us/resource/tjus-cn27.json'
library_root = 'https://data.cityofnewyork.us/resource/feuq-due4.json'

#Business
business_root = 'https://data.cityofnewyork.us/resource/w7w3-xahh.json'

In [65]:
#function to retrieve data from SODA datasets within inputted time frame
def range_SODA(root, column, time_range, params=None):
  
    range_endpoint = f"?$where=project_start_date between '{time_range[0]}' and '{time_range[1]}'"
    
    endpoint = root + range_endpoint
    
    results = API(endpoint, None, params)
    
    return results

In [28]:
def JSON_to_DF(JSON, desired_features):
 
    columns = {}
    
    #for each desired column
    for feat in desired_features: 
        #empty value list
        values = []
        #for each project
        for element in JSON:
            #try to append value
            try:
                values.append(element[feat])
            #if error means no value append NaaN
            except:
                values.append('NaaN')
            
        columns[feat] = values

    #make dict into pandas dataframe        
    return pd.DataFrame(columns)                    

In [308]:
def coord_from_geom(dataframe):
    latitude = []
    longitude = []

    for row in range(dataframe.shape[0]):
        #latitude
        latitude.append(dataframe.the_geom[row]['coordinates'][1])
        
        #longitude
        longitude.append(dataframe.the_geom[row]['coordinates'][0])

    dataframe['latitude'] = latitude
    dataframe['longitude'] = longitude
    
    dataframe = dataframe.drop('the_geom', axis = 1)
    
    return dataframe

### House construction

In [66]:
#-- API call -- 

#call housing data within range
housing_start_JSON = range_SODA(housing_root, 'project_start_date', range_21)
housing_complete_JSON = range_SODA(housing_root, 'project_completion_date', ['2020-11-01T00:00:00', '2021-03-01T00:00:00'])

In [276]:
#id desired features
desired_feat = ['all_counted_units', 'total_units', 'latitude', 'longitude']

In [277]:
house_start_df = JSON_to_DF(housing_start_JSON, desired_feat)
house_start_df.head()

Unnamed: 0,all_counted_units,total_units,latitude,longitude
0,9,28,40.844171,-73.892197
1,15,48,40.648856,-73.964536
2,3,10,40.690236,-73.928624
3,3,10,40.690252,-73.928495
4,5,16,40.653856,-73.954709


In [69]:
house_end_df = JSON_to_DF(housing_complete_JSON, desired_feat)
house_end_df.head()

Unnamed: 0,latitude,longitude,all_counted_units,total_units
0,40.766618,-73.925719,10,31
1,40.858099,-73.905388,8,25
2,40.680762,-73.966791,34,113
3,40.639152,-73.984225,3,8
4,40.747525,-73.893776,47,154


### Libraries, Parks & Hotels

In [224]:
#-- API calls-- 
#for the smaller datasets, no filter required
library_JSON = API(library_root, None, None)
#only pull active parks
parks_JSON = API(parks_root, '?retired=false&$limit=3000', None)
hotels_JSON = API(hotels_root, '?taxyear=2021&$limit=6000', None)

#### Libraries

In [309]:
#pull out desired feats from library
desired_feat = ['the_geom']
library_geom = JSON_to_DF(library_JSON, desired_feat)

#convert geometry point data to coordinates
library_df = coord_from_geom(library_geom)
print(library_df.shape)
library_df.head()

(216, 2)


Unnamed: 0,latitude,longitude
0,40.80298,-73.953531
1,40.803018,-73.934848
2,40.760807,-73.977363
3,40.762186,-73.969382
4,40.764915,-73.95955


#### Parks

In [186]:
#pull out desired feats from parks
desired_feat = ['multipolygon', 'acres']
parks_street = JSON_to_DF(parks_JSON, desired_feat)
print(parks_street.shape)
parks_street.head()

(2025, 2)


Unnamed: 0,multipolygon,acres
0,"{'type': 'MultiPolygon', 'coordinates': [[[[-7...",21.10936646
1,"{'type': 'MultiPolygon', 'coordinates': [[[[-7...",0.89
2,"{'type': 'MultiPolygon', 'coordinates': [[[[-7...",5.739
3,"{'type': 'MultiPolygon', 'coordinates': [[[[-7...",0.001
4,"{'type': 'MultiPolygon', 'coordinates': [[[[-7...",0.032


In [187]:
#function to find centroid from list of coordinates
def centroid(vertexes):
    _x_list = [vertex [0] for vertex in vertexes]
    _y_list = [vertex [1] for vertex in vertexes]
    _len = len(vertexes)
    _x = sum(_x_list) / _len
    _y = sum(_y_list) / _len
    return(_x, _y)

In [188]:
#define empty lat, long lists
latitude = []
longitude = []

#for each row 
for row in range(parks_street.shape[0]):
    #pull out polygon coordinates
    polygon = parks_street.multipolygon[row]['coordinates'][0][0]
    #find the center
    center = centroid(polygon)
    #append to lat and long (coordinate order is swapped)
    #latitude
    latitude.append(center[1])
    #longitude
    longitude.append(center[0])

parks_street['latitude'] = latitude
parks_street['longitude'] = longitude
    
parks_df = parks_street.drop('multipolygon', axis = 1)
parks_df.head()

Unnamed: 0,acres,latitude,longitude
0,21.10936646,40.796328,-73.89752
1,0.89,40.682928,-73.930628
2,5.739,40.69034,-73.999386
3,0.001,40.730382,-73.87496
4,0.032,40.621921,-74.022569


#### Hotels

In [281]:
#transform from JSON to DF
desired_feats = ['latitude', 'longitude']
hotels_df = JSON_to_DF(hotels_JSON, desired_feats)
print(hotels_df.shape)
hotels_df.head()

(2731, 2)


Unnamed: 0,latitude,longitude
0,40.703235,-74.012421
1,40.702744,-74.012201
2,40.704025,-74.012638
3,40.704039,-74.012317
4,40.714812,-74.016153


In [298]:
hotels_df = hotels_df.apply(pd.to_numeric, errors='coerce')

#### Facilities

Bigger sets, want to pull just based on specific categories so we're getting a clearer picture

See `facilities_datadictionary.xlsx` for possible facility category, subcatory, and factype

In [48]:
#-- API call--

#make dict to store JSON returns
facility_JSONs = {}

#define subgroups we want
facility_subgroup = ['PARKING LOTS AND GARAGES', 'STREETSCAPES, PLAZAS, AND MALLS', 'MUSEUMS',
                     'NON-PUBLIC K-12 SCHOOLS', 'PUBLIC K-12 SCHOOLS', 'COLLEGES OR UNIVERSITIES', 
                    'HOSPITALS AND CLINICS', 'BUS DEPOTS AND TERMINALS'] 

#loop through each subgroup and call JSON
for subgroup in facility_subgroup:
    #define endpoint with higher limit
    endpoint = facilities_root + f"?facsubgrp={subgroup}&$limit=3000"
    #GET
    response = re.get(endpoint, 
                      data={'app_token': os.environ['NYC_TOKEN']})
   
    #return status code and results
    status_code, facility_JSONs[subgroup] = response.status_code, response.json()

In [49]:
#number of return values for each JSON
for JSON in facility_JSONs:
    print(JSON, len(facility_JSONs[JSON]))

PARKING LOTS AND GARAGES 2559
STREETSCAPES, PLAZAS, AND MALLS 560
MUSEUMS 138
NON-PUBLIC K-12 SCHOOLS 1113
PUBLIC K-12 SCHOOLS 1515
COLLEGES OR UNIVERSITIES 132
HOSPITALS AND CLINICS 1191
BUS DEPOTS AND TERMINALS 144


In [222]:
#name lat, long columns
facility_features = ['latitude', 'longitude']
#create empty dict
facility_df = {}
#for every JSON, translate to pandas DF and store into dict under its name
for JSON in facility_JSONs:
    facility_df[JSON] = JSON_to_DF(facility_JSONs[JSON], facility_features)

In [196]:
#test print a facility dataframe
facility_df[facility_subgroup[0]].head()

Unnamed: 0,latitude,longitude
0,40.6033903449,-73.9967793904
1,40.7125522199,-73.9533410191
2,40.7018273113,-73.8173146273
3,40.7687161237,-73.9572008687
4,40.8245918496,-73.8433819236


In [205]:
#concatinate all facilities into one dataframe, using title as factype column
facility_all = pd.concat(facility_df).reset_index().drop('level_1', axis = 1)
facility_all = facility_all.rename(columns={'level_0': 'factype'})
print(facility_all.shape)
facility_all.head()

(7352, 3)


Unnamed: 0,factype,latitude,longitude
0,PARKING LOTS AND GARAGES,40.6033903449,-73.9967793904
1,PARKING LOTS AND GARAGES,40.7125522199,-73.9533410191
2,PARKING LOTS AND GARAGES,40.7018273113,-73.8173146273
3,PARKING LOTS AND GARAGES,40.7687161237,-73.9572008687
4,PARKING LOTS AND GARAGES,40.8245918496,-73.8433819236


In [306]:
#replace object lat_long with floata
facility_all[['latitude', 'longitude']] = facility_all[['latitude', 'longitude']].apply(pd.to_numeric, errors='coerce')
facility_all.dtypes

factype       object
latitude     float64
longitude    float64
dtype: object

### Binning work

In [235]:
#get NTA centroid list
NTA = pd.read_csv('../Mid-term/Mid-Term-Project/processed_data/geocoded_population.csv').loc[:, ['latitude', 'longitude']]
NTA.head()

Unnamed: 0,latitude,longitude
0,40.768352,-73.809546
1,40.763352,-73.868396
2,40.734894,-73.783716
3,40.755734,-73.983503
4,40.818055,-73.856188


In [249]:
def bin_test(dataframe):
    
    for i in range(dataframe.shape[0]):
        df_lat, df_long = dataframe.latitude[i], dataframe.longitude[i]
    
        difference = []
        
        for lat, long in NTA.values:
            print(abs(df_lat - lat))

### To csv

In [310]:
#save final dataframes to csv
library_df.to_csv('../Mid-term/Mid-Term-Project/processed_data/libraries.csv')
parks_df.to_csv('../Mid-term/Mid-Term-Project/processed_data/parks.csv')
hotels_df.to_csv('../Mid-term/Mid-Term-Project/processed_data/hotels.csv')
facility_all.to_csv('../Mid-term/Mid-Term-Project/processed_data/facilities.csv')

## Discontinued

#### Business

In [159]:
#within range
business_started = range_SODA(business_root, 'license_creation_date', [range_start,range_end])
business_ended = range_SODA(business_root, 'lic_expir_dd', [range_start,range_end])
#all businesses with active license [may have limit?]
business_active = API('https://data.cityofnewyork.us/resource/w7w3-xahh.json', None, {'license_status': 'Active'})

Something went wrong!
400
Something went wrong!
400


#### Places

In [None]:
#-- API Call --
places_JSON = API(places_root, None, None)

In [101]:
#pull out desired feats from places
desired_feat = ['annoline2a', 'the_geom']
places_geom = JSON_to_DF(places_JSON, desired_feat)

#convert geometry point data to coordinates
places_df = coord_from_geom(places_geom)
print(places_df.shape)
places_df.head()

(96, 3)


Unnamed: 0,annoline2a,latitude,longitude
0,Cortlandt,-73.888958,40.89621
1,Cemetery,-73.871651,40.889879
2,Falls,-73.838642,40.886965
3,Bay,-73.809802,40.877986
4,Garden,-73.878308,40.864424
