# Install and import key libraries

In [None]:
# Uncomment to install the dependencies into the kernel's environment:
# %pip install geopandas
# %pip install geopy
# %pip install folium

In [166]:
import pandas as pd
#import geopandas as gpd
import geopy
#from geopy.geocoders import Nominatim
from geopy.geocoders import GoogleV3
from geopy.extra.rate_limiter import RateLimiter
import matplotlib.pyplot as plt
import folium
from folium.plugins import FastMarkerCluster
#!/usr/bin/env python3
import csv
import time
from csv import Dialect

from geopy.exc import (
    GeocoderQueryError,
    GeocoderQuotaExceeded,
    ConfigurationError,
    GeocoderParseError,
    GeocoderTimedOut
)

# Geocode input real-estate listings through Google API

In [135]:
# Directory that holds the input CSV; the geocoded CSV is written to it as well.
# Replace the placeholder with a real directory before running.
path = "ADD DIRECTORY PATH"

In [136]:
import os

# CSV columns whose values are joined with commas, in this order, to build the
# address string sent to the Google geocoder.
#ADDRESS_COLUMNS_NAME = ["name", "addressline1", "town"]
ADDRESS_COLUMNS_NAME = ["street", "city", "state", "zip_code"]
# Optional component restrictions for Google geocoding, given as
# {google_component_name: csv_column_name}; leave empty to disable.
COMPONENT_RESTRICTIONS_COLUMNS_NAME = {}

# names of the columns appended to every row of the processed csv
NEW_COLUMNS_NAME = ["Lat", "Long", "Error", "formatted_address", "location_type"]

# field delimiter of the csv files (used by the csv dialect and for the header)
DELIMITER = ","

# Automatically retry X times when GeocoderErrors occur (sometimes the API Service return intermittent failures).
RETRY_COUNTER_CONST = 5

# name for input csv file
INPUT_CSV_FILE = path + "/NewYork.csv"

# name for output csv file
OUTPUT_CSV_FILE = path + "/NewYork-geocoded.csv"

# google keys - see https://dev.to/gaelsimon/bulk-geocode-addresses-using-google-maps-and-geopy-5bmg for more details
# The key is taken from the GOOGLE_API_KEY environment variable when it is set,
# so it never has to be saved inside the notebook; the placeholder is only a
# fallback for editing the key in by hand.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "ADD API KEY")  # it's the new mandatory parameter

# dialect to manage different format of CSV
class CustomDialect(csv.Dialect):
    """CSV dialect used for both reading the input and writing the output.

    Fields are separated by DELIMITER. On output every field is wrapped in
    double quotes (QUOTE_ALL) and a quote inside a field is written doubled.
    When reading, blanks directly after a delimiter are skipped. Rows are
    terminated with a bare newline.
    """
    quoting = csv.QUOTE_ALL
    quotechar = '"'
    doublequote = True
    delimiter = DELIMITER
    skipinitialspace = True
    lineterminator = '\n'


# Expose the dialect to csv readers / writers under the name 'ga'.
csv.register_dialect('ga', CustomDialect)


In [137]:
def process_addresses_from_csv():
    """Geocode every row of INPUT_CSV_FILE and write the result to OUTPUT_CSV_FILE.

    Each input row is copied to the output followed by the NEW_COLUMNS_NAME
    columns, which are filled from the dict returned by geocode_address().
    The address sent to the geocoder is the comma-joined values of the
    ADDRESS_COLUMNS_NAME columns; COMPONENT_RESTRICTIONS_COLUMNS_NAME, when
    not empty, maps Google component names to the CSV columns supplying them.

    An input file with no header line yields an empty output file and a
    notice on stdout. Rows that cannot be written are reported on stdout and
    skipped so that one bad row does not abort the whole job.
    """
    geo_locator = GoogleV3(api_key=GOOGLE_API_KEY)

    # newline='' leaves newline handling to the csv module, as the csv
    # documentation asks for files handed to csv.reader / csv.writer.
    with open(INPUT_CSV_FILE, 'r', newline='') as csvinput, \
            open(OUTPUT_CSV_FILE, 'w', newline='') as csvoutput:

        # new csv based on same dialect as input csv
        writer = csv.writer(csvoutput, dialect="ga")

        # Parse the header with the csv module instead of str.split so that
        # quoted column names (including the last one on the line, and names
        # containing the delimiter) come out clean.
        raw_header = next(csv.reader(csvinput, dialect="ga"), None)
        if raw_header is None:
            print("{0} is empty, nothing to geocode".format(INPUT_CSV_FILE))
            return
        header = [h.strip() for h in raw_header]

        # Read the remaining lines as dicts keyed by the stripped header.
        # restval='' keeps short rows from injecting the text "None" into
        # the address line.
        reader = csv.DictReader(csvinput, dialect="ga", fieldnames=header, restval='')

        # header of the new CSV: the input columns followed by the geocoded ones
        writer.writerow(header + [name.strip() for name in NEW_COLUMNS_NAME])

        # iterate through each row of input CSV
        for record in reader:
            # build a line address based on the merge of multiple field values to pass to Google Geocoder
            line_address = ','.join(str(record[column_name]) for column_name in ADDRESS_COLUMNS_NAME)

            # if you want to use componentRestrictions feature,
            # build a matching dict {'googleComponentRestrictionField' : 'yourCSVFieldValue'}
            # to pass to Google Geocoder
            component_restrictions = {
                key: record[value] for key, value in COMPONENT_RESTRICTIONS_COLUMNS_NAME.items()
            }

            # geocode the built line_address and passing optional componentRestrictions
            location = geocode_address(geo_locator, line_address, component_restrictions)

            # existing field values first, then the geocoded values
            temp_row = [record[column_name] for column_name in header]
            for column_name in NEW_COLUMNS_NAME:
                try:
                    temp_row.append(location[column_name])
                except (KeyError, TypeError) as error:
                    # geocoder result lacks this column: report it and leave the cell empty
                    print(error)
                    temp_row.append('')

            # Best effort: report a row that cannot be written and carry on.
            # Exception (not BaseException) so Ctrl-C still stops the job.
            try:
                writer.writerow(temp_row)
            except Exception as error:
                print(error)
                print(temp_row)



In [138]:
def geocode_address(geo_locator, line_address, component_restrictions=None, retry_counter=1):
    """Geocode one address line and return the values for the output CSV.

    geo_locator            -- object with a geocode(query, components=...) method
                              (the notebook passes a geopy GoogleV3 instance)
    line_address           -- the address text to look up
    component_restrictions -- optional dict forwarded as the `components` argument
    retry_counter          -- number of the current attempt, starting at 1;
                              incremented by the recursive retries

    Returns a dict with the keys "Lat", "Long", "Error", "formatted_address"
    and "location_type". When nothing is found or the lookup fails, Lat and
    Long are 0, the two text fields are empty and "Error" holds the reason as
    text.

    Sleeps one second before every attempt. Timeouts and unexpected errors
    are retried until RETRY_COUNTER_CONST attempts have been made (with an
    extra five second pause for unexpected errors). KeyboardInterrupt and
    SystemExit are not caught, so the job can be interrupted.
    """

    def failed(reason):
        # Placeholder result written to the CSV when there are no coordinates.
        return {"Lat": 0, "Long": 0, "Error": reason, "formatted_address": "", "location_type": ""}

    def as_text(exc):
        # Prefer a `message` attribute when the exception has one; otherwise
        # use its string form so the "Error" value is always text.
        return getattr(exc, 'message', None) or str(exc)

    time.sleep(1)
    try:
        # the geopy GoogleV3 geocoding call
        location = geo_locator.geocode(line_address, components=component_restrictions)

        if location is not None:
            # build a dict to append to output CSV
            location_result = {"Lat": location.latitude, "Long": location.longitude, "Error": "",
                               "formatted_address": location.raw['formatted_address'],
                               "location_type": location.raw['geometry']['location_type']}
        else:
            location_result = failed("None location found, please verify your address line")

    # Errors that a retry will not fix: report them straight away.
    except (ValueError, GeocoderQuotaExceeded, ConfigurationError, GeocoderParseError) as error:
        location_result = failed(as_text(error))

    # To retry because intermittent failures and timeout sometimes occurs
    # NOTE(review): geopy's GeocoderQueryError appears to derive from ValueError,
    # in which case the clause above handles it and it is never retried here --
    # confirm against the installed geopy version.
    except (GeocoderTimedOut, GeocoderQueryError) as geocodingerror:
        if retry_counter < RETRY_COUNTER_CONST:
            return geocode_address(geo_locator, line_address, component_restrictions, retry_counter + 1)
        location_result = failed(as_text(geocodingerror))

    # Anything else unexpected: pause a little longer, then retry.
    # Exception rather than BaseException, so Ctrl-C is not turned into a retry.
    except Exception as error:
        if retry_counter < RETRY_COUNTER_CONST:
            time.sleep(5)
            return geocode_address(geo_locator, line_address, component_restrictions, retry_counter + 1)
        location_result = failed(as_text(error))

    print("address line     : {0}".format(line_address))
    print("geocoded address : {0}".format(location_result["formatted_address"]))
    print("location type    : {0}".format(location_result["location_type"]))
    print("Lat/Long         : [{0},{1}]".format(location_result["Lat"], location_result["Long"]))
    print("-------------------")

    return location_result


# Run the whole geocoding job. Inside a notebook kernel __name__ is '__main__',
# so this fires as soon as the cell is executed.
if __name__ == '__main__':
    process_addresses_from_csv()

address line     : 350 E 81st St Apt 3,New York City,New York,10028
geocoded address : 350 E 81st St #3, New York, NY 10028, USA
location type    : ROOFTOP
Lat/Long         : [40.7737326,-73.9524453]
-------------------
address line     : 25 N Moore St Unit 16ABC,New York City,New York,10013
geocoded address : 25 N Moore St, New York, NY 10013, USA
location type    : ROOFTOP
Lat/Long         : [40.7199482,-74.00710579999999]
-------------------
address line     : 5900 Arlington Ave Apt 2M,New York City,New York,10471
geocoded address : 5900 Arlington Ave #2m, The Bronx, NY 10471, USA
location type    : ROOFTOP
Lat/Long         : [40.9090953,-73.90564359999999]
-------------------
address line     : 50 Sutton Pl S Apt 8K,New York City,New York,10022
geocoded address : 50 Sutton Pl S #8k, New York, NY 10022, USA
location type    : ROOFTOP
Lat/Long         : [40.75595029999999,-73.9621113]
-------------------
address line     : 215 E 96th St Unit 37F,New York City,New York,10128
geocoded 

In [155]:
# Standalone geocoding call, independent of the CSV job -- presumably a quick
# check that the API key is accepted (TODO confirm intent).
geo_locator = GoogleV3(api_key=GOOGLE_API_KEY)
geo_locator.geocode("New York City, USA")

Location(New York, NY, USA, (40.7127753, -74.0059728, 0.0))

In [159]:
# Load the geocoded listings. OUTPUT_CSV_FILE is the file the geocoding job
# writes, so the path is not spelled out a second time here.
df = pd.read_csv(OUTPUT_CSV_FILE)
df.head()

Unnamed: 0,status,price,bed,bath,acre_lot,full_address,street,city,state,zip_code,house_size,sold_date,Lat,Long,formatted_address,location_type
0,for_sale,3600000,3.0,3.0,,"350 E 81st St Apt 3, New York City, NY, 10028",350 E 81st St Apt 3,New York City,New York,10028,2338.0,,40.773733,-73.952445,"350 E 81st St #3, New York, NY 10028, USA",ROOFTOP
1,for_sale,22000000,5.0,6.0,,"25 N Moore St Unit 16ABC, New York City, NY, 1...",25 N Moore St Unit 16ABC,New York City,New York,10013,7020.0,10/6/2015,40.719948,-74.007106,"25 N Moore St, New York, NY 10013, USA",ROOFTOP
2,for_sale,210000,1.0,1.0,,"5900 Arlington Ave Apt 2M, New York City, NY, ...",5900 Arlington Ave Apt 2M,New York City,New York,10471,,,40.909095,-73.905644,"5900 Arlington Ave #2m, The Bronx, NY 10471, USA",ROOFTOP
3,for_sale,775000,1.0,2.0,,"50 Sutton Pl S Apt 8K, New York City, NY, 10022",50 Sutton Pl S Apt 8K,New York City,New York,10022,,,40.75595,-73.962111,"50 Sutton Pl S #8k, New York, NY 10022, USA",ROOFTOP
4,for_sale,749000,1.0,1.0,,"215 E 96th St Unit 37F, New York City, NY, 10128",215 E 96th St Unit 37F,New York City,New York,10128,,,40.785188,-73.947968,"One Carnegie Hill, 215 E 96th St #37f, New Yor...",ROOFTOP


In [160]:
# Distinct values found in the state column.
df.state.unique()

array(['New York'], dtype=object)

In [164]:
# Number of rows without usable coordinates. geocode_address() stores
# Lat=0 / Long=0 (not NaN) when a lookup fails, so the zero sentinel has to be
# counted together with real nulls.
(df["Lat"].isnull() | ((df["Lat"] == 0) & (df["Long"] == 0))).sum()

0

In [162]:
#df = df[pd.notnull(df["Lat"])]

# Create clustering map

In [167]:
# Only rows with usable coordinates are plotted: NaN is not a valid marker
# position, and geocode_address() stores Lat=0 / Long=0 for failed lookups,
# which would otherwise appear as markers at (0, 0).
has_coords = df["Lat"].notnull() & df["Long"].notnull()
is_failed_lookup = (df["Lat"] == 0) & (df["Long"] == 0)
plottable = df[has_coords & ~is_failed_lookup]

# Base map centred on New York City.
folium_map = folium.Map(location=[40.7127281, -74.0060152],
                        zoom_start=12,
                        tiles='cartodbpositron')  # alternative: tiles='CartoDB dark_matter'

# One clustered marker per geocoded listing.
FastMarkerCluster(data=list(zip(plottable['Lat'].values, plottable['Long'].values))).add_to(folium_map)

folium.LayerControl().add_to(folium_map)

folium_map


In [168]:
# Raw string: '\K' and '\R' are not valid escape sequences in a normal string
# literal; the resulting path is the same as before.
folium_map.save(r'C:\Kaggle\Real Estate/mymap.html')