In [None]:
import os
import warnings
from urllib.parse import quote

import numpy as np
import pandas as pd

# API access
import requests

# Progress bar and timing
from tqdm import tqdm
# Enable the progress_apply methods on pandas objects, with a labelled progress bar
tqdm.pandas(desc="Progress")

# Silence FutureWarning messages for the rest of the notebook
warnings.simplefilter(action='ignore', category=FutureWarning)

### Function definitions

In [None]:
def session_request(url_list, timeout=30):
    '''
    Retrieves json-formatted web server responses for a list of urls
    Accepts: list of str (falsy entries such as None or '' are not requested);
             timeout in seconds applied to every request
    Returns: list of dict (json objects), one per input url and in the same order;
             a skipped entry yields an empty dict
    Raises: requests exceptions on connection failure or timeout, and a
            ValueError (or subclass) when a response body is not valid json
    '''
    results = []
    # The context manager releases the pooled connections even if a request fails
    with requests.Session() as session:
        for url in tqdm(url_list):
            if not url:
                # Keep positions aligned with the input without calling the server
                results.append({})
                continue
            # Without a timeout a stalled connection would block the loop indefinitely
            response = session.request('GET', url, timeout=timeout)
            results.append(response.json())
    return results

In [None]:
def select_column(rank1, rank2):
    '''
    Merges two candidate entries into a single value
    Accepts: str or null (one entry from each column)
    Returns: the shared entry when both are equal, the only non-null entry when
             one is missing, a two-item list [rank1, rank2] when both are present
             but different, and rank2 (null) when both are missing
    '''
    # Equal entries need no merging (this also covers None == None)
    if rank1 == rank2:
        return rank1
    # With the first entry missing, the second one is the answer, null or not
    if pd.isnull(rank1):
        return rank2
    if pd.isnull(rank2):
        return rank1
    # Two different, non-null entries: keep both
    return [rank1, rank2]

### Merge school-level source data with errors from Part 1

In [None]:
# Load selected columns of the school directory file, reading every value as text
school = pd.read_csv(
    './files/ccd_1819_directory_rev.csv',
    usecols=[
        'STATENAME', 'NCESSCH', 'SCH_NAME', 'LSTREET1', 'LCITY', 'ST', 'LZIP', 'WEBSITE',
        'SCH_TYPE_TEXT',
    ],
    dtype=str,
    low_memory=False,
)

In [None]:
# Keep regular schools only and renumber them from 0; the second reset_index()
# turns that numbering into an 'index' column
place_id = (
    school.loc[school['SCH_TYPE_TEXT'] == 'Regular School']
    .copy()
    .reset_index(drop=True)
    .reset_index()
)

In [None]:
# Attach school details to each logged response by matching its 'row' value to the
# 'index' column of place_id, then keep only responses whose status is not 'OK'
errors = (
    pd.read_csv('./files/pid_response_errors.csv')
    .merge(place_id, how='left', left_on='row', right_on='index')
    .loc[lambda frame: frame['status'] != 'OK']
)

In [None]:
# Save the non-OK responses together with their school details (row index not written)
errors.to_csv('./files/pid_error_details.csv', index=False)

### Retry schools that returned errors with expanded Google Places API query

In [None]:
# API key obtained via Google Cloud Console under project gcp-gu-ppalab
# The key file location can be overridden with the PLACES_API_KEY_FILE environment
# variable, so collaborators do not have to edit this cell; the path below is the fallback.
local_file = os.environ.get('PLACES_API_KEY_FILE', '/Users/nb775/auth/brodnax_places_auth.txt')
with open(local_file) as txtfile:
    # strip() also removes spaces and the '\r' of Windows line endings, which would
    # otherwise become part of the key in every request URL
    my_key = txtfile.read().strip()
# Do not print the key: cell outputs are saved with the notebook

In [None]:
# Build one free-text search term per row: school name, street, city and state joined by '%20'
# NOTE(review): presumably a row with any of these fields missing yields NaN rather than
# a string (the isinstance check in the following cell suggests so) — confirm
g_place = list(errors['SCH_NAME']+'%20'+errors['LSTREET1']+'%20'+errors['LCITY']+'%20'+errors['ST'])

In [None]:
# Percent-encode each search term so it is safe inside the query string.
# Replacing only spaces left characters such as '&', '#' and '+' untouched, and those
# change the meaning of the URL (an '&' in a school name ends the input parameter).
# '%' is kept in the safe set because the terms already contain '%20' separators.
# Entries that are not strings become an empty search term.
place = [quote(term, safe='%') if isinstance(term, str) else '' for term in g_place]

In [None]:
# Setting the urls for the API: the search text is appended to pid_url, followed by
# pid_param, which asks for the place_id field only and carries the API key
pid_url = "https://maps.googleapis.com/maps/api/place/findplacefromtext/json?input="
pid_param = '&inputtype=textquery&fields=place_id&key=' + my_key

In [None]:
# One complete request URL per search term, in the same order as place
pid_api = [pid_url + loc + pid_param for loc in place]

In [None]:
# Number of request URLs built (one per entry of place)
len(pid_api)

In [None]:
# Send every request; the json responses come back in the same order as pid_api
pid_list = session_request(pid_api)

In [None]:
# Preview the first few responses instead of rendering the whole list in the output
pid_list[:5]

In [None]:
# Get place IDs, keeping the same structure as the original dataframe:
# one entry per response, in response order
only_pid = []

for result in pid_list:
    # A response without a 'candidates' key would make .get() return None and len() fail,
    # so treat it as "no candidates"
    result_list = result.get('candidates') or []
    if len(result_list) == 1:
        only_pid.append(result_list[0].get('place_id'))
    elif len(result_list) == 2:
        # Two matches for one school are kept together as (place_id 1, place_id 2)
        only_pid.append((result_list[0].get('place_id'), result_list[1].get('place_id')))
    else:
        # Zero matches, or more than two, are recorded as missing
        only_pid.append(np.nan)

In [None]:
# One entry per response; this must equal the number of rows in errors for the
# column assignment that follows
len(only_pid)

In [None]:
# Adding the place_id to the original dataframe; the list is assigned by position,
# so entry i belongs to row i of errors
errors['g_pid'] = only_pid

In [None]:
# Give every place ID its own row (a pair of IDs becomes two rows that repeat the
# other columns), then store missing IDs as the string 'None'
errors = errors.explode('g_pid').assign(
    g_pid=lambda frame: frame['g_pid'].fillna('None')
)

# Refresh only_pid so it lines up with the exploded rows
only_pid = errors['g_pid'].tolist()

# The row count should have grown only by the number of schools with two place IDs
errors.shape

In [None]:
# Preview the first rows, now including the g_pid column
errors.head()

### Query websites for place IDs

In [None]:
# Setting the urls for the second part of the API: the place ID goes between web_url
# and web_param, which asks for the name and website fields and carries the API key
web_url = 'https://maps.googleapis.com/maps/api/place/details/json?place_id='
web_param = '&fields=name%2Cwebsite&key=' + my_key

In [None]:
# Build one request URL per place ID, in the row order of errors; an empty ID maps to None.
# NOTE(review): missing IDs were filled with the string 'None' when g_pid was cleaned,
# and that string is truthy, so a URL is built for those rows too — confirm this is intended
only_pid = errors['g_pid'].tolist()
web_api = [web_url + pid + web_param if pid else None for pid in only_pid]

In [None]:
# Send every request; the json responses come back in the same order as web_api
web_list = session_request(web_api)

In [None]:
# Collect the name (to check the match) and the website from every response;
# a response whose status is not 'OK' contributes None to both lists
places_names = []
places_websites = []

for details in tqdm(web_list):
    if details.get('status') != 'OK':
        places_names.append(None)
        places_websites.append(None)
        continue
    places_names.append(details.get('result').get('name'))
    places_websites.append(details.get('result').get('website'))

In [None]:
# Add the returned names and websites as new columns (assigned by position, one entry
# per row of errors), then preview the result
errors['places_sch'] = places_names
errors['places_website'] = places_websites
errors.head()

In [None]:
# Row by row, combine places_website with the WEBSITE column via select_column:
# equal or single values are kept as they are, two different values become a list
errors['final_website'] = errors.progress_apply(lambda x: select_column(x.places_website, x.WEBSITE), axis=1)

### Exporting data for use in Part 3

In [None]:
# Write the enriched error rows to disk (row index not written)
errors.to_csv('./files/school_websites_from_errors.csv', index=False)