In [1]:
#import dependencies
import pandas as pd
import numpy as np
import sqlalchemy as db
import config
import requests
import csv
import googlemaps


from pathlib import Path
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func, inspect, Integer, Table, Column, MetaData
from sqlalchemy.orm import Session
from geopy.extra.rate_limiter import RateLimiter

In [2]:
# Create SQLite db
restaurants_db = "restaurants.sqlite"
connector = (f"sqlite:///{restaurants_db}")

# Create engine
engine = create_engine(connector)

# Declare a Base using `automap_base()`
Base = automap_base()

# Use the Base class to reflect the database tables
Base.prepare(engine, reflect=True)

# Print all of the classes (tables) mapped to the Base
Base.classes.keys()

# Create the inspector and connect it to the engine
inspector = inspect(engine)

In [3]:
def verify_inserts(tablename):
    # Using the inspector to print the column names within the 'restaurants' table and its types
    columns = inspector.get_columns(tablename)
    for column in columns:
        print(column["name"], column["type"])

    # Create metadata access to the object
    meta_data = db.MetaData(bind=engine)
    db.MetaData.reflect(meta_data)

    # Get the table from the metadata object
    restaurants = meta_data.tables['Restaurants']

    # Select and print rowcount
    result = db.select([db.func.count()]).select_from(restaurants).scalar()
    print("Row Count:", result)

In [4]:
def geocode(add):
    g = gmaps_key.geocode(add)
    lat = g[0]["geometry"]["location"]["lat"]
    lng = g[0]["geometry"]["location"]["lng"]
    return (lat, lng)

## Load and clean yelp_business.csv

In [5]:
def etl_restaurants():
    # Load business into df
    business_path = Path('../data_zipped/yelp_business.csv.zip', low_memory=False)
    business_df = pd.read_csv(business_path)

    # Filter businesses to restaurants only
    business_df = business_df[business_df["categories"].str.contains("Restaurant")]
    
    # Drop rows where is_open = 0
    business_df = business_df[business_df.is_open == 0]

    # Add flag "is_mexican_restaurant"
    business_df["is_mexican_restaurant"] = np.where(business_df.categories.str.contains("Mex"),1,0)

    # Remove is_open due, neighborhood, and categories
    business_df = business_df.drop(['is_open', 'neighborhood', 'categories'], axis=1)

    # Fill missing values with empty string
    business_df = business_df.fillna('')

    # Write to SQLite db
    business_df.to_sql('Restaurants', engine, if_exists='replace')
    
    return business_df


## Load and clean Reviews

In [6]:
def etl_reviews(df):
# Load comments into df
    reviews_path = Path('../data_zipped/yelp_tip.csv.zip', low_memory=False)
    reviews_df = pd.read_csv(reviews_path)

    # Drop rows with missing values
    reviews_df = reviews_df.dropna(axis=0,how="any")

    # Merge with business_df to filter data set to relavent rows
    reviews_df = business_df.set_index('business_id').join(reviews_df.set_index('business_id'), rsuffix='_review')

    # Eliminate unneeded columns
    reviews_df = reviews_df.loc[:,"text":"user_id"]

    # Write to database
    reviews_df.to_sql('Reviews', engine, if_exists='replace')

    # All done, release memory
    del reviews_df

## Load and clean yelp_business_attributes.csv

In [7]:
def etl_restaurant_attributes(df):
    # Load business attributes into df
    attributes_path = Path('../data_zipped/yelp_business_attributes.csv.zip', low_memory=False)
    attributes_df = pd.read_csv(attributes_path)

    # Merge with business_df to filter data set to relavent rows
    business_df = df.set_index('business_id').join(attributes_df.set_index('business_id'), rsuffix='_attr')

    # Release memory
    del attributes_df

    # Eliminate columns with only one distinct value
    for col in business_df:
        if(business_df.loc[:,col].nunique()) == 1:
            business_df.pop(col)

    # Remove HairSpecializesIn_coloring column as it doesn't apply
    business_df = business_df.drop(['HairSpecializesIn_coloring'], axis=1)

    # Send entire business_df to csv for ML; ML workflow will take care of encoding and imputation from here
    business_df.to_csv('../data_zipped/restaurants.csv')

    # Eliminate business_df elements
    attribute_df = business_df.loc[:,"BusinessAcceptsCreditCards":"AgesAllowed"]

    # Drop rows with no attributes
    attribute_df = attribute_df.dropna(axis=0,how="any")

    # Write to SQLite db
    attribute_df.to_sql('RestaurantAttributes', engine, if_exists='replace')

## Clean and load business data for mexican_restaurants.csv

In [8]:
def etl_mex_restaurants(df, gmaps_key, dbg):
    # Create a formatted address string for geocoding
    mex_restaurants_df['full_address'] = mex_restaurants_df[['address', 'city', "state",'postal_code']].agg(', '.join, axis=1).str.replace('"','')

    # Run the address through the geocoder function and apply the results to a new column called "geocoded"
    mex_restaurants_df['geocoded'] = mex_restaurants_df['full_address'].apply(geocode)

    # # Apply the new geocodes to the restaurants dataframe
    mex_restaurants_df[['latitude', 'longitude']] = pd.DataFrame(mex_restaurants_df['geocoded'].tolist(), index=df.index)

    # If not a debug run save the results to a csv file for BI
    if dbg == "N": 
        mex_restaurants_df.to_csv("mexican_restaurants.csv")
        
    return mex_restaurants_df.head(100)

In [9]:
# Run Restaurants ETL
business_df = etl_restaurants()

# Validate success
verify_inserts('Restaurants')

index BIGINT
business_id TEXT
name TEXT
address TEXT
city TEXT
state TEXT
postal_code TEXT
latitude FLOAT
longitude FLOAT
stars FLOAT
review_count BIGINT
is_mexican_restaurant INTEGER
Row Count: 14225


In [10]:
# Run Reviews ETL
etl_reviews(business_df)

# Validate success
verify_inserts('Reviews')

business_id TEXT
text TEXT
date TEXT
likes FLOAT
user_id TEXT
Row Count: 14225


In [11]:
# Run RestaurantAttributes ETL
etl_restaurant_attributes(business_df)

# Validate success
verify_inserts('RestaurantAttributes')

business_id TEXT
BusinessAcceptsCreditCards TEXT
BusinessParking_garage TEXT
BusinessParking_street TEXT
BusinessParking_validated TEXT
BusinessParking_lot TEXT
BusinessParking_valet TEXT
GoodForKids TEXT
WheelchairAccessible TEXT
BikeParking TEXT
Alcohol TEXT
HasTV TEXT
NoiseLevel TEXT
RestaurantsAttire TEXT
Music_dj TEXT
Music_background_music TEXT
Music_no_music TEXT
Music_karaoke TEXT
Music_live TEXT
Music_video TEXT
Music_jukebox TEXT
Ambience_romantic TEXT
Ambience_intimate TEXT
Ambience_classy TEXT
Ambience_hipster TEXT
Ambience_divey TEXT
Ambience_touristy TEXT
Ambience_trendy TEXT
Ambience_upscale TEXT
Ambience_casual TEXT
RestaurantsGoodForGroups TEXT
Caters TEXT
WiFi TEXT
RestaurantsReservations TEXT
RestaurantsTakeOut TEXT
HappyHour TEXT
GoodForDancing TEXT
RestaurantsTableService TEXT
OutdoorSeating TEXT
RestaurantsDelivery TEXT
BestNights_monday TEXT
BestNights_tuesday TEXT
BestNights_friday TEXT
BestNights_wednesday TEXT
BestNights_thursday TEXT
BestNights_sunday TEXT
Be

In [14]:
# Dbg flag indicates if a debug execution or not, and will conditionally skip certain tasks both in this cell and within the geocode function
# This is so that the code can be executed in "demo" mode and will reduce Google API calls to avoid charges
dbg = "Y"

# Set API key from config file
gmaps_key = googlemaps.Client(key=config.API_KEY)

# Filter input df to only geocode Mexican Restaurants to save on API calls
mex_restaurants_df = business_df[business_df["is_mexican_restaurant"] == 1]

# For "demo" mode only
if dbg == 'Y':
    mex_restaurants_df = mex_restaurants_df.head()

etl_mex_restaurants(mex_restaurants_df, gmaps_key, dbg)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mex_restaurants_df['full_address'] = mex_restaurants_df[['address', 'city', "state",'postal_code']].agg(', '.join, axis=1).str.replace('"','')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mex_restaurants_df['geocoded'] = mex_restaurants_df['full_address'].apply(geocode)


Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_mexican_restaurant,full_address,geocoded
313,WEouNHHxfqGa8gYAnyiyBA,"""El Palenque""","""5945 Andrews Rd""",Mentor-on-the-Lake,OH,44060,41.709322,-81.359461,3.5,3,1,"5945 Andrews Rd, Mentor-on-the-Lake, OH, 44060","(41.7093216, -81.35946129999999)"
478,to2cGMKdaw7ZHbXMzpfhlA,"""Filiberto's Mexican Rest""","""3218 E Mcdowell Rd""",Phoenix,AZ,85008,33.466004,-112.012286,3.0,7,1,"3218 E Mcdowell Rd, Phoenix, AZ, 85008","(33.46600430000001, -112.0122861)"
868,kiU9C58n7dgygdrpjOQXMQ,"""Los Jarochos Restaurant""","""4811 S Rainbow Blvd""",Las Vegas,NV,89147,36.101020,-115.244312,4.0,62,1,"4811 S Rainbow Blvd, Las Vegas, NV, 89147","(36.1010205, -115.2443122)"
951,Ka3z0iudeviL2nEKZ4BeYg,"""Tijuana Flats""","""1608 E Blvd""",Charlotte,NC,28203,35.199318,-80.841358,4.0,50,1,"1608 E Blvd, Charlotte, NC, 28203","(35.1993181, -80.8413576)"
962,v0KawuAO7clhXCDKobFs0g,"""Currito - Burritos Without Borders""","""Pittsburgh International Airport""",Pittsburgh,PA,15231,40.495309,-80.235291,3.0,71,1,"Pittsburgh International Airport, Pittsburgh, ...","(40.495309, -80.2352913)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
16224,Q28MO9hEMyG0xrAnOF-dSw,"""Stand Up Scottsdale Comedy Club""","""5101 N Scottsdale Rd""",Scottsdale,AZ,85251,33.510801,-111.925514,4.5,190,1,"5101 N Scottsdale Rd, Scottsdale, AZ, 85251","(33.5108012, -111.9255139)"
16228,5UaVi_cwnoAvIBp0g4aF2Q,"""Birria Estilo Michoacan""","""906 N 15th Ave""",Phoenix,AZ,85007,33.457929,-112.091487,4.0,5,1,"906 N 15th Ave, Phoenix, AZ, 85007","(33.4579287, -112.0914873)"
16330,5KWJZHhG88echvquDBu4iw,"""Tronco Sonoran Grill""","""10155 E Via Linda, Unit 135""",Scottsdale,AZ,85258,33.574430,-111.862215,3.0,45,1,"10155 E Via Linda, Unit 135, Scottsdale, AZ, 8...","(33.5744296, -111.862215)"
16377,zY_YAwo4On0aqOABS-1cBA,"""Rockin' Taco Mexican Grill""","""3717 S Las Vegas Blvd, Ste 260""",Las Vegas,NV,89109,36.108842,-115.172149,3.5,63,1,"3717 S Las Vegas Blvd, Ste 260, Las Vegas, NV,...","(36.1088418, -115.1721488)"
