## **Build the function that extracts zip codes from the API response dictionaries**

In [8]:
import os
import time

import pandas as pd
import requests

# Read the address table straight from its raw GitHub URL

url = (
    'https://raw.githubusercontent.com/frankrgamboa/'
    'miami-inspections-cleaning/refs/heads/main/solo_address.csv'
)
df = pd.read_csv(url)

# Create this function to extract zip codes through the API

def get_zip(address, api_key, timeout=10):
    """Look up the postal code of a Florida address with the Google Geocoding API.

    Parameters
    ----------
    address : str
        Free-form street address to geocode.
    api_key : str
        Google Maps Platform API key sent with the request.
    timeout : float, optional
        Seconds to wait for the HTTP response (default 10). Without a timeout
        a stalled connection would block the calling loop indefinitely.

    Returns
    -------
    str or None
        The ``long_name`` of the first ``postal_code`` component found in the
        first result, or ``None`` when the API reports no match, the first
        result carries no postal code, or the request/parsing fails.

    Notes
    -----
    Request and parsing errors are caught and printed rather than raised, so
    a single bad address does not abort a long batch.
    """
    # Geocoding endpoint in JSON format
    base_url = "https://maps.googleapis.com/maps/api/geocode/json"

    params = {
        "address": address,
        "key": api_key,
        "components": "country:US|administrative_area:FL",  # Just Florida!
    }

    try:
        # Send the request; fail fast on a hung connection or an HTTP error
        response = requests.get(base_url, params=params, timeout=timeout)
        response.raise_for_status()

        # Convert the JSON response into a Python dictionary object
        response_dict = response.json()
        status = response_dict["status"]

        if status == "OK":
            # Only the first (best) result is inspected: scan its
            # "address_components" for the entry typed as "postal_code".
            for component in response_dict["results"][0]["address_components"]:
                if "postal_code" in component["types"]:
                    return component["long_name"]
        elif status != "ZERO_RESULTS":
            # Statuses such as REQUEST_DENIED or OVER_QUERY_LIMIT signal a
            # problem with the key or quota, not with the address, so make
            # them visible instead of silently returning None.
            detail = response_dict.get("error_message", "")
            print(f"Error with the {address}: API status {status} {detail}".rstrip())

        return None

    except Exception as exc:
        print(f"Error with the {address}: {exc}")
        return None




## **Use the function built above to look up the zip code of each unique address**

In [9]:
# Step 1: Unique addresses to avoid repeated queries to the API

unique_addresses = df["address"].dropna().unique()

# Read the API key from the environment so it never has to be pasted into the
# notebook. The placeholder keeps the cell runnable without a key; every
# lookup will then come back without a zip code.
GOOGLE_MAPS_API_KEY = os.environ.get("GOOGLE_MAPS_API_KEY", 'Enter your API key here!')

# Keep the results of an earlier (possibly interrupted) run of this cell so
# that re-running it only queries the addresses still lacking a zip code.
# A fresh kernel always starts from an empty mapping.
if "address_zip_map" not in globals():
    address_zip_map = {}

# The pause between requests keeps the query rate under the project quota.
# NOTE(review): the original comment assumed 3000 requests per second; Google's
# published default Geocoding quota appears to be 3000 requests per MINUTE.
# Confirm on the Cloud console quota page. The pause below honours the
# per-minute figure (60 / 3000 = 0.02 s between requests).

block_size = 100
pause_seconds = 60 / 3000

# Step 2: These queries will be made 100 at a time to control the process flow
# and report progress after every block

for start in range(0, len(unique_addresses), block_size):
    end = start + block_size # End of `block_size` jump size
    block = unique_addresses[start:end]

    for addr in block:

        # Already resolved in a previous run: no need to query again
        if address_zip_map.get(addr) is not None:
            continue

        # Getting the zip code through our function 'get_zip' and mapping it
        # to its address in the dictionary
        address_zip_map[addr] = get_zip(address=addr, api_key=GOOGLE_MAPS_API_KEY)

        # Pause between requests to respect the rate limit
        time.sleep(pause_seconds)

    print(
        f"Success: {min(end, len(unique_addresses))} of {len(unique_addresses)} addresses processed"
     )


# The zip codes were retrieved for unique addresses only, so they cannot be
# attached to the original dataframe as a plain column (different lengths).
# Build an address -> zip table and left-join it on the address instead.
# Dropping any existing 'zip' column first keeps the cell safe to re-run:
# otherwise a second merge would produce 'zip_x' / 'zip_y' columns.

zip_df = pd.DataFrame( list(address_zip_map.items()) , columns=["address", "zip"] )
df = df.drop(columns="zip", errors="ignore").merge(zip_df, on="address", how="left")

Success: 100 of 2667 addresses processed
Success: 200 of 2667 addresses processed
Success: 300 of 2667 addresses processed
Success: 400 of 2667 addresses processed
Success: 500 of 2667 addresses processed
Success: 600 of 2667 addresses processed
Success: 700 of 2667 addresses processed
Success: 800 of 2667 addresses processed
Success: 900 of 2667 addresses processed
Success: 1000 of 2667 addresses processed
Success: 1100 of 2667 addresses processed
Success: 1200 of 2667 addresses processed
Success: 1300 of 2667 addresses processed
Success: 1400 of 2667 addresses processed
Success: 1500 of 2667 addresses processed
Success: 1600 of 2667 addresses processed
Success: 1700 of 2667 addresses processed
Success: 1800 of 2667 addresses processed
Success: 1900 of 2667 addresses processed
Success: 2000 of 2667 addresses processed
Success: 2100 of 2667 addresses processed
Success: 2200 of 2667 addresses processed
Success: 2300 of 2667 addresses processed
Success: 2400 of 2667 addresses processed
S

## **Obstacle detection in the geocoding process**

In [10]:
# For every non-null address that still has no zip code after geocoding,
# count how many inspection rows it affects

df.loc[df['zip'].isna(), 'address'].value_counts()

Unnamed: 0_level_0,count
address,Unnamed: 1_level_1
1 MIA NORTH TERM D FOOD COURT,27
1 MIAD CC E SAT 4 FLOOR,26
"1 MIAD LWR E ,SAT E",22
1 MIAD TERMINAL E-2ND LEVEL,13
1 MIA BLDG 874,9
1 MIA BLDG 863,7
1 MIA PARK 6 BLDG 3109,7
1 MIA BLDG 704,6
14728 JEFFERSON ST,5
450 NW 1 AVE 100,5


In [13]:
# 52 distinct addresses came back from geocoding without a zip code. The 12
# most frequent of them (listed in `relevant_address` below) cover 135 of the
# 190 inspection rows with a missing zip: only 23% of the failing addresses,
# but 71% of the missing values. These 12 are therefore filled in by hand.
#
# After this step, 40 of the 2,667 distinct addresses (about 1.5%) still have
# no zip code, and 57 of the 7,844 inspections (about 0.7%) have a missing
# zip: the 55 rows left over from geocoding plus the 2 rows whose address
# is null.


# Every address inside Miami International Airport is assigned zip code 33142,
# even though several zip codes converge in that area

mia_airport_addresses = [
    '1 MIA NORTH TERM D FOOD COURT',
    '1 MIA BLDG 874',
    '1 MIAD CC E SAT 4 FLOOR',
    '1 MIAD LWR E ,SAT E',
    '1 MIAD TERMINAL E-2ND LEVEL',
    '1 MIA BLDG 863',
    '1 MIA PARK 6 BLDG 3109',
    '1 MIA BLDG 704',
]

relevant_address = {
    **dict.fromkeys(mia_airport_addresses, '33142'),
    '14728 JEFFERSON ST': '33176',
    '450 NW 1 AVE 100': '33128',
    '7829 MILLER RD B': '33146',
    '18925 NE 21 AVE': '33179',
}


# Overwrite the zip code of every row whose address has a manual entry

has_manual_zip = df['address'].isin(list(relevant_address))
df.loc[has_manual_zip, 'zip'] = df.loc[has_manual_zip, 'address'].map(relevant_address)


# Saving the modified file with zip codes included

df.to_csv('address_ZIP.csv', index=False)