In [None]:
import os
import warnings

import numpy as np
import pandas as pd

# API access
import requests

# Progress bar and timing
from tqdm import tqdm
# Register tqdm with pandas so DataFrames gain progress_apply (used further down)
tqdm.pandas(desc="Progress")

# Silence FutureWarning messages for the rest of the session
warnings.simplefilter(action='ignore', category=FutureWarning)

### Function definitions

In [None]:
def url_request(url, timeout=30):
    '''
    Retrieves json-formatted web server response for a single url
    Accepts: str (url); optional timeout in seconds (number, default 30)
    Returns: dict (json object)
    Raises: requests.exceptions.Timeout if the server does not answer in time
    '''
    # Without an explicit timeout a stalled server would block this call forever
    response = requests.get(url, timeout=timeout)
    return response.json()

In [None]:
def session_request(url_list, timeout=30):
    '''
    Retrieves json-formatted web server responses for a list of urls,
    reusing one HTTP session for all of them
    Accepts: list of str (entries may be None/missing); optional timeout in
             seconds applied to each request (number, default 30)
    Returns: list of dict (json objects), one per input entry and in the same
             order; a missing url yields an empty dict
    Raises: requests.exceptions.Timeout if a server does not answer in time
    '''
    results = []
    # The context manager closes the session when the loop finishes or a
    # request raises part-way through
    with requests.Session() as session:
        for url in tqdm(url_list):
            if pd.isnull(url) or not url:
                # Placeholder keeps results aligned with url_list; it has no
                # 'status' key, so callers checking for 'OK' treat it as a miss
                results.append({})
                continue
            response = session.get(url, timeout=timeout)
            results.append(response.json())
    return results

In [None]:
def select_column(rank1, rank2):
    '''
    Picks the value to keep from two candidate entries
    Accepts: str or null (for each of rank1, rank2)
    Returns: rank1 when both entries compare equal;
             rank2 when rank1 is null, rank1 when only rank2 is null;
             [rank1, rank2] when both are present but different
    '''
    # Identical entries: nothing to choose between
    if rank1 == rank2:
        return rank1
    # At most one usable entry: fall back to the other one (which may itself be null)
    if pd.isnull(rank1):
        return rank2
    if pd.isnull(rank2):
        return rank1
    # Two different, non-null entries: keep both
    return [rank1, rank2]

### Place ID source data, required for Google Places API queries

In [None]:
# Load the place ID table, reading every column as text
place_id = pd.read_csv(
    filepath_or_buffer='./files/place_ids.csv',
    dtype=str,
)
place_id.shape

In [None]:
# Count missing values in every column (includes the website column of interest)
place_id.isna().sum()

### Gathering websites from Google Places API

In [None]:
# API key obtained via Google Cloud Console under project gcp-gu-ppalab.
# The key file lives outside the repository and its location differs per machine:
# point the PLACES_AUTH_FILE environment variable at your own copy. When the
# variable is unset, the original default location is used.
local_file = os.environ.get('PLACES_AUTH_FILE', '/Users/nb775/auth/brodnax_places_auth.txt')
with open(local_file) as txtfile:
    # strip() rather than strip('\n'): stray spaces or Windows '\r\n' line endings
    # would otherwise end up inside every request url
    my_key = txtfile.read().strip()

In [None]:
# The two fixed pieces of each Place Details request url; a place ID goes between them
web_url = 'https://maps.googleapis.com/maps/api/place/details/json?place_id='
web_param = f'&fields=name%2Cwebsite&key={my_key}'

In [None]:
# Generating a unique url for each place_id in order to feed that into the API.
# Missing IDs are NaN (a truthy float) because the csv was read with dtype=str,
# so a plain `if pid` never filters them out and the string concatenation fails;
# pd.notnull is the reliable check. Rows without an ID get None so that web_api
# stays aligned with the rows of place_id.
only_pid = list(place_id['g_pid'])
web_api = [
    web_url + pid + web_param if pd.notnull(pid) and pid else None
    for pid in only_pid
]

### Speed comparison for different approaches

In [None]:
# First approach: call url_request separately for each of the first ten urls
results = [url_request(link) for link in tqdm(web_api[:10])]
print(results)

In [None]:
# Second approach: the same first ten urls, sent through session_request
results = session_request(web_api[:10])
print(results)

### API requests and formatting

In [None]:
# Query the API for every generated url; web_list holds one parsed response per url, in order
web_list = session_request(web_api)

In [None]:
# Grabbing school names (to ensure match) and websites and adding them to the dataframe
places_names = []
places_websites = []

for escuela in tqdm(web_list):
    # Only responses with status 'OK' are read; anything else, or an 'OK'
    # response without a 'result' payload, contributes None to both lists
    # so they stay the same length and order as web_list
    if escuela.get('status') == 'OK':
        details = escuela.get('result') or {}
    else:
        details = {}
    places_names.append(details.get('name'))
    places_websites.append(details.get('website'))

In [None]:
# Attach the API-returned name and website to each row; both lists were built
# in the same order as the place IDs they were requested for
place_id['places_sch'] = places_names
place_id['places_website'] = places_websites
place_id.head()

In [None]:
# Missing values per column after adding the API name and website columns
place_id.isna().sum()

In [None]:
# Combine the API website with the WEBSITE column row by row via select_column
place_id['final_website'] = place_id.progress_apply(
    lambda row: select_column(row['places_website'], row['WEBSITE']),
    axis=1,
)

In [None]:
# Missing values per column, now including final_website
place_id.isna().sum()

In [None]:
# Preview the first rows, now including the final_website column
place_id.head()

### Merging in place IDs for schools that initially returned errors

In [None]:
# Import only the columns that match the current dataset.
# dtype=str mirrors how place_ids.csv was loaded, so the same value has the same
# type in both frames (no lost leading zeros, no int-vs-str mismatches) when they
# are concatenated and de-duplicated.
place_id_errors = pd.read_csv('./files/school_websites_from_errors.csv',
                              usecols=list(place_id.columns),
                              dtype=str)
place_id_errors.head()

In [None]:
# Stack both tables, give each website in a list-valued final_website its own row,
# then drop exact duplicate rows and renumber the index
school_websites = (
    pd.concat([place_id, place_id_errors], axis=0)
    .explode('final_website')
    .drop_duplicates(ignore_index=True)
)
school_websites.shape

### Exporting the final dataset of websites

In [None]:
# Write the combined table to disk, leaving out the index column
school_websites.to_csv('./files/school_websites.csv', index=False)

In [None]:
# Number of distinct entries in final_website (any missing entries are part of the set too)
len({*school_websites['final_website']})