## **00 Preprocess Brewery List**
The list of breweries from Open Brewery DB is missing latitude and longitude values for many entries, and those coordinates are required for mapping. This script takes the address field, strips additional information (Suite, Bldg, etc.) that confuses geocoders, then searches for the missing lat / lon values using the cleaned address.

### **Notebook Objectives**
1. Clean address fields using regular expressions
2. Search for missing lat / lon info using the cleaned address and geocoder API
3. Export a cleaned CSV containing the updated address and lat / lon info

In [None]:
from pathlib import Path
from dotenv import dotenv_values
import requests
import time
import pandas as pd
import re
import logging

# Send DEBUG-and-above records to address.log, truncating the file on each run;
# force=True replaces any handlers already attached to the root logger.
logging.basicConfig(
    format='%(asctime)s %(levelname)s:%(message)s',
    level=logging.DEBUG,
    filename='address.log',
    filemode='w',
    force=True,
)

# Settings loaded from the .env file one directory up (geocode reads USER_AGENT).
config = dotenv_values(dotenv_path=Path('../.env'))

In [None]:
def geocode(address, timeout=10):
    '''Get lat/lon from address

    Sends a free-form search request to the Nominatim (OpenStreetMap)
    search endpoint and returns the raw HTTP response.

    Parameters
    ----------
    address : str
        Free-form address, sent as the ``q`` query parameter.
    timeout : float, optional
        Seconds to wait for the server before requests gives up. Without a
        timeout a stalled connection would block the caller indefinitely.

    Returns
    -------
    requests.Response
        The unparsed response; the caller is responsible for checking the
        status and decoding the JSON body.
    '''
    params = {
        'format': 'json',
        'addressdetails': 1,
        'q': address,
    }
    # The user agent string is read from the loaded .env settings.
    headers = {'user-agent': config['USER_AGENT']}
    return requests.get(
        'https://nominatim.openstreetmap.org/search',
        params=params,
        headers=headers,
        timeout=timeout,
    )

# Secondary-address designators that confuse geocoders; removed in this order.
_UNIT_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        r"Ste [A-Za-z0-9\-]+",
        r"Unit [A-Za-z0-9\-]+",
        r"# [A-Za-z0-9\-]+",
        r"Bldg [A-Za-z0-9\-]+",
        r"\(Route [A-Za-z0-9\-]+\)",
        r"Suite [A-Za-z0-9\-]+",
    )
)


def _strip_unit_info(street):
    '''Return street without suite/unit/building designators ('' if missing).'''
    if pd.isnull(street):
        # str(NaN) would yield the literal text "nan", which is not an address.
        return ''
    cleaned = str(street)
    for pattern in _UNIT_PATTERNS:
        cleaned = pattern.sub('', cleaned)
    # Removing a designator leaves stray separators behind; tidy them up.
    return re.sub(r"\s{2,}", ' ', cleaned).strip(' ,')


def _build_query(row):
    '''Build "street, city state" from whichever of those parts are present.'''
    locality = ' '.join(
        str(row[col]) for col in ('city', 'state') if pd.notnull(row[col])
    )
    return ', '.join(part for part in (row['street_clean'], locality) if part)


def _lookup_lat_lon(query, timeout_errors=(OSError, ValueError)):
    '''Return (lat, lon) floats for the first geocoder match, or None.'''
    try:
        response = geocode(query)
        response.raise_for_status()
        matches = response.json()
    except timeout_errors as err:
        # requests' exceptions derive from OSError and JSON decode errors from
        # ValueError; one failed lookup should not abort the whole run.
        logging.warning(f"Geocoding failed for {query!r}: {err}")
        return None
    if not isinstance(matches, list) or not matches:
        return None
    best = matches[0]
    # The service returns coordinates as strings; store them as numbers.
    return float(best['lat']), float(best['lon'])


def clean_address(inputpath, outputpath, start_index=0, delay=4):
    '''Clean street address, then search for missing lat/lon

    Reads the CSV at ``inputpath``, adds a ``street_clean`` column with
    suite/unit/building/route designators removed from ``street``, looks up
    coordinates for rows whose ``longitude`` is missing, and writes the
    result to ``outputpath``.

    Parameters
    ----------
    inputpath : path-like
        CSV with ``name``, ``street``, ``city``, ``state``, ``latitude`` and
        ``longitude`` columns.
    outputpath : path-like
        Destination for the cleaned CSV (written without the index).
    start_index : int, optional
        Position of the first row to geocode; earlier rows are cleaned but
        not looked up.
    delay : float, optional
        Seconds to pause after each lookup to throttle requests.

    Returns
    -------
    pandas.DataFrame
        The cleaned table, including any coordinates that were found.
    '''
    df = pd.read_csv(inputpath)
    df['street_clean'] = df['street'].apply(_strip_unit_info)

    # Clean missing lat/lon
    for index, row in df[start_index:].iterrows():
        if not pd.isnull(row['longitude']):
            continue
        query = _build_query(row)
        if not query:
            # Nothing to search for; skip without contacting the service.
            continue
        logging.info(f"Searching address for {index} {row['name']}...")
        coords = _lookup_lat_lon(query)
        if coords is not None:
            logging.info(f"Lat lon found for {index} {row['name']}")
            df.at[index, 'latitude'], df.at[index, 'longitude'] = coords
        time.sleep(delay)

    # Export cleaned table
    df.to_csv(outputpath, index=False)
    return df

In [None]:
# Fill in missing coordinates for the raw brewery list and save the cleaned copy.
# NOTE(review): `input` shadows the builtin of the same name; the names are
# kept as-is because other cells may refer to them.
input = Path('../assets/breweries.csv')
output = Path('../assets/breweries_clean_address.csv')
clean_address(inputpath=input, outputpath=output)