In [695]:
import requests
import pandas as pd
import re
import numpy as np
import time
from datetime import date
import googlemaps
from string import printable
import re
import os

### Purpose: Clean the scraped bagel data and look up each shop's latitude and longitude with the Google Geocoding API

In [669]:
# Read the pickled scrape results from disk.
# NOTE: unpickling can execute arbitrary code -- only load a file you produced yourself.
bagel_data = pd.read_pickle('20210205_bagel_nyc_data_10.pkl')

# All cleaning is done on a deep copy so the raw scrape stays available unchanged.
bagel_data_clean = bagel_data.copy(deep=True)

In [671]:
# --- Phone number ------------------------------------------------------------
# Keep the first 14 characters of the scraped phone field (presumably the
# "(xxx) xxx-xxxx" part -- confirm against the scraper) and strip every
# non-digit character from them.
#
# `regex=True` is passed explicitly: pandas 2.0 changed the default of
# `Series.str.replace` to a literal match, which would turn '[^0-9]' into a
# no-op and make every phone number fail the 10-digit check below.
bagel_data_clean['phone'] = (
    bagel_data_clean['phone']
    .str.slice(start=0, stop=14)
    .str.replace('[^0-9]', "", regex=True)
)

# Number of digits that survived. Stored as a column because a later cell
# drops 'phone_count' by name.
bagel_data_clean['phone_count'] = bagel_data_clean['phone'].str.len()

# A phone number that is not exactly 10 digits long (or is missing) is treated
# as invalid and replaced with NaN.
bagel_data_clean['phone'] = bagel_data_clean['phone'].where(
    bagel_data_clean['phone_count'] == 10
)

# --- Food type ---------------------------------------------------------------
# Remove the '$' characters from the food_type field. '$' is a regex
# end-of-string anchor, so request a literal match explicitly instead of
# relying on the version-dependent default.
bagel_data_clean['food_type'] = (
    bagel_data_clean['food_type'].str.replace('$', '', regex=False)
)

# --- Review count ------------------------------------------------------------
# Convert to a numeric dtype; raises if a value cannot be parsed as a number.
bagel_data_clean['review_count'] = pd.to_numeric(bagel_data_clean['review_count'])

In [672]:
# Flag advertisement rows and strip the rank prefix from the shop name.
#
# Ranked results are presumably named "<rank>. <shop name>" (confirm against
# the scraper output); advertisement rows carry no rank prefix.
#
# name_filter holds the parsed rank, or NaN when the name has no leading
# "<digits>." prefix. A later cell keeps only rows where name_filter is not
# null and then drops the column. Matching the whole prefix (instead of only
# checking that the first character is a digit) stops an unranked name that
# merely starts with a number from being mistaken for a ranked result.
bagel_data_clean['name_filter'] = pd.to_numeric(
    bagel_data_clean['name'].str.extract(r'^(\d+)\.', expand=False),
    errors='coerce',
)

# Remove the rank prefix and the whitespace after it.
#   * raw string     -> no invalid-escape warning for '\d'
#   * regex=True     -> pandas >= 2.0 would otherwise match the pattern literally
#   * '^' anchor     -> digits followed by a dot elsewhere in the name are kept
bagel_data_clean['name'] = bagel_data_clean['name'].str.replace(
    r'^\d+\.\s*', '', regex=True
)

In [673]:
# Round-trip the shop name through ASCII, silently discarding anything that
# cannot be encoded. This removes invisible characters, but note that it also
# drops every other non-ASCII character (accented letters, etc.).
bagel_data_clean['name'] = (
    bagel_data_clean['name']
    .str.encode('ascii', errors='ignore')
    .str.decode('ascii')
)

In [674]:
# Build the de-duplicated shop table:
#   1. keep only rows flagged as ranked results (name_filter is not null),
#   2. drop the helper columns that were only needed during cleaning,
#   3. for shops that appear more than once (same name AND address) keep the
#      row with the highest review_count.
#
# Step 3 works by sorting ascending and keeping the last duplicate. Rows with
# a missing review_count must therefore sort FIRST (na_position='first');
# with the default they sort last and would win over a duplicate that has a
# real count. A stable sort keeps the outcome deterministic on ties.
final_bagel_data = (
    bagel_data_clean[bagel_data_clean['name_filter'].notnull()]
    .drop(columns=['zip_search', 'name_filter', 'phone_count'])
    .sort_values('review_count', na_position='first', kind='mergesort')
    .drop_duplicates(subset=['name', 'address'], keep='last')
    .reset_index(drop=True)
)

In [675]:
# Persist the cleaned, de-duplicated table as CSV (row index omitted).
final_bagel_data.to_csv(
    path_or_buf="final_bagel_data.csv",
    index=False,
)

### Get Google Geocoding information (formatted address, latitude, longitude) for each observation

In [676]:
# Create the Google Maps client used for the geocoding requests.
# The API key is read from the GOOGLE_API environment variable so that it is
# never stored in the notebook; a KeyError is raised if the variable is unset.
gmaps = googlemaps.Client(
    key=os.environ['GOOGLE_API'],
)

In [677]:
# Build the free-text query sent to the geocoder: "<name> <address> <town>".
# Missing pieces are replaced by an empty string so that one missing field
# does not turn the whole query into NaN.
final_bagel_data['search'] = (
    final_bagel_data['name']
    .fillna("")
    .str.cat(
        [
            final_bagel_data['address'].fillna(""),
            final_bagel_data['town'].fillna(""),
        ],
        sep=" ",
    )
)

In [678]:
# Geocode every search string. Each call is a network request to the Google
# Maps API, so this cell is slow. The raw response for each row is stored
# as-is in the 'loc' column.
final_bagel_data["loc"] = final_bagel_data["search"].map(gmaps.geocode)

In [679]:
# Checkpoint: store the table together with the raw geocoding responses so the
# API calls above do not have to be repeated.
final_bagel_data.to_pickle(path="final_bagel_data_gmaps_api.pkl")

In [689]:
# Unpack the raw geocoding response in 'loc' into flat columns.
def _gmap_field(results, *keys):
    """Return a (nested) field of the first geocoding result.

    results: the value stored in the 'loc' column for one row.
    keys:    the chain of keys to follow inside the first result.

    Returns None when `results` is empty/falsy, i.e. nothing was found.
    """
    if not results:
        return None
    value = results[0]
    for key in keys:
        value = value[key]
    return value


final_bagel_data['formatted_address_gmap'] = final_bagel_data["loc"].apply(
    _gmap_field, args=('formatted_address',)
)
final_bagel_data['lat_gmap'] = final_bagel_data["loc"].apply(
    _gmap_field, args=('geometry', 'location', 'lat')
)
final_bagel_data['lng_gmap'] = final_bagel_data["loc"].apply(
    _gmap_field, args=('geometry', 'location', 'lng')
)

In [690]:
# Final geocoded table:
#   1. keep only rows whose Google-formatted address contains 'NY'; rows the
#      geocoder found nothing for have no formatted address and are dropped
#      here as well (na=False),
#   2. drop remaining duplicates on name + Google-formatted address (Yelp
#      sometimes lists the same store with slightly different addresses),
#      keeping the row with the highest review_count.
#
# Step 2 sorts ascending and keeps the last duplicate, so rows with a missing
# review_count must sort FIRST (na_position='first'); with the default they
# sort last and would win over a duplicate that has a real count. A stable
# sort keeps the outcome deterministic on ties.
final_bagel_data_geo = (
    final_bagel_data[
        final_bagel_data['formatted_address_gmap'].str.contains('NY', na=False)
    ]
    .sort_values('review_count', na_position='first', kind='mergesort')
    .drop_duplicates(subset=['name', 'formatted_address_gmap'], keep='last')
    .reset_index(drop=True)
)

In [691]:
# Write the final geocoded table twice: as a pickle (keeps the raw 'loc'
# objects intact) and as a CSV without the row index.
final_bagel_data_geo.to_pickle(path="final_bagel_data_geo.pkl")
final_bagel_data_geo.to_csv(
    path_or_buf="final_bagel_data_geo.csv",
    index=False,
)