In [18]:
import os
import re
from collections import Counter
from collections.abc import Generator
from datetime import time
from heapq import nlargest
from itertools import chain
from pathlib import Path
from string import punctuation

import numpy as np
import pandas as pd
import spacy
from bs4 import BeautifulSoup
from googlemaps import Client
from spacy.lang.en.stop_words import STOP_WORDS

# Step 1:
Given static HTML files under /data/, scrape and process the necessary data and create a single DataFrame.

In [19]:
data = Path('data')
num_rgx = re.compile(r'\d+\.\s') # Standardize names: '4. Miller Union' -> 'Miller Union'

In [20]:
def build_atl_mag_restaurants(
    soup: BeautifulSoup,
) -> Generator[dict[str, str], None, None]:
    """Extract restaurant information from https://www.atlantamagazine.com/50bestrestaurants/.
    
    Price levels are accurately listed on this website, so the field is filled.
    
    Args:
        soup (bs4.BeautifulSoup): Soup of ATL magazine's website
    
    Returns:
        Generator producing dicts representing a single restaurant. Each dict has the following fields:
            'name', 'description', 'price', 'search_keywords', 'website', 'multiple_mentions'.
    """
    for r in soup.select("div.restLRContainer"):
        price = r.select_one("div.price")
        span = price.select_one("span")
        if span:
            span.decompose()
        
        website = r.select_one("em > a")
        address = r.select_one("div.address").text
        city = r.select_one("div.neighbor").text

        yield {
            "name": re.sub(num_rgx, '', r.select_one("h2.restName").text),
            "description": summarize(r.select_one("div.restLeft > p").text.strip()),
            "price": price.text.count('$'),
            "search_keywords": f"{address}, {city} restaurant",
            "website": website['href'] if website else np.nan,
            "multiple_mentions": False,
        }

In [21]:
def build_atl_eater_restaurants(
    soup: BeautifulSoup,
) -> Generator[dict[str, str], None, None]:
    """Extract resaurant information from https://atlanta.eater.com/maps/38-best-restaurants-in-atlanta.
    
    Price levels are by default filled with NaN, to be filled by get_places.
    Same args and return as build_atl_mag_restaurants.
    """
    ignore = ', '.join(f"[data-slug='{ignore}']" for ignore in ('intro', 'newsletter', 'related-links'))
    for r in soup.select(f"main > section.c-mapstack__card:not({ignore})"):
        website = r.select("div.c-mapstack__info > div.info")
        if website:
            website = website[2].select_one('div:not([class]) > a').get('href', np.nan) if len(website) == 3 else np.nan
        
        yield {
            "name": r.select_one("div.c-mapstack__card-hed > div > h1").text,
            "description": summarize(r.select_one("div.c-entry-content > p").text.strip()),
            "price": np.nan,
            "search_keywords": f'{r.select_one("div.c-mapstack__address > a").text} restaurant',
            "website": website,
            "multiple_mentions": False,
        }

In [22]:
def build_midtown_restaurants(
    soup: BeautifulSoup,
) -> Generator[dict[str, str], None, None]:
    """Extract restaurant information from https://www.atlantaeats.com/blog/midtown-atlanta-restaurant-bucket-list/.
    
    Price levels are by default filled with NaN, to be filled by get_places.
    Same args and return as build_atl_mag_restaurants.
    """
    content = midtown.select_one("div.entry-content")

    titles = content.select("h2 > strong")[:-1]
    descriptions = content.select("p")[2:-2]
    if len(titles) != 10 and len(descriptions) != 10:
        exit("Incorrect scraping on midtown website.")

    for title, description in zip(titles, descriptions):
        website = title.select_one('a')
        yield {
            "name": re.sub(num_rgx, '', title.text),
            "description": summarize(description.text),
            "price": np.nan,
            "search_keywords": f"{title.text[3:]} midtown restaurant",
            "website": website['href'] if website else np.nan,
            "multiple_mentions": False,
        }

Text summarization algorithm inspired from [medium](https://medium.com/analytics-vidhya/text-summarization-using-spacy-ca4867c6b744)

Selects the 3 most important sentences from `text` based on word frequency.

In [23]:
def summarize(text: str, sentence_count: int = 3) -> str:
    """Return n most important sentences from given text.
    
    The sentences are selected based on word frequency with the overall
    text, and is meant to be used for summarizing restaurant descriptions.
    
    Args:
        text (str): Full text to be shortened.
        sentence_count (int) = 3: Number of sentences in the resulting summary.
        
    Returns:
        Summarized text.
    """
    doc = spacy.load('en_core_web_sm')(text)

    word_frequencies = Counter()
    for word in doc:
        word = word.text.lower()
        if word not in STOP_WORDS and word not in punctuation:
            word_frequencies[word] += 1

    max_frequency = word_frequencies.most_common(1)[0][1]

    for word in word_frequencies.values():
        word = word / max_frequency

    sentence_scores = Counter()
    for sent in doc.sents:
        for word in sent:
            word = word.text.lower()
            if word in word_frequencies:
                sentence_scores[sent] += word_frequencies[word]

    summary = nlargest(sentence_count, sentence_scores, key=sentence_scores.get)
    return ' '.join(word.text for word in summary)

In [24]:
with open(data / 'atl_mag.html') as f:
    mag = BeautifulSoup(f, 'html.parser')
with open(data / 'atl_eater.html') as f:
    eater = BeautifulSoup(f, 'html.parser')
with open(data / 'midtown.html') as f:
    midtown = BeautifulSoup(f, 'html.parser')

In [25]:
# DataFrame from 3 chained generator iterables
df = pd.DataFrame(
    chain(
        build_atl_mag_restaurants(mag),
        build_atl_eater_restaurants(eater),
        build_midtown_restaurants(midtown),
    )
)

In [26]:
df.dtypes

name                  object
description           object
price                float64
search_keywords       object
website               object
multiple_mentions       bool
dtype: object

In [27]:
# EDGE CASE: Slightly different names from atl_mag and atl_eater
df.loc[df['name'] == 'Spring Restaurant', 'name'] = 'Spring'

In [28]:
# Extract duplicate rows and flip value of 'multiple_mentions'
dups = df[df.duplicated('name', keep='last')].replace(False, True)
# Drop all duplicates from original dataframe
filtered = df.drop_duplicates(subset='name', keep=False)
# Append modified duplicates back to main dataframe
filtered = pd.concat([dups, filtered], axis=0, join='outer')

In [29]:
filtered['multiple_mentions']

1       True
2       True
3       True
5       True
7       True
       ...  
115    False
116    False
117    False
118    False
120    False
Name: multiple_mentions, Length: 103, dtype: bool

# Step 2:

Given a DataFrame of filtered restaurant information, augment the data with Google Maps and Places information.

Please see `README.md` for the meaning of all the columns.

In [30]:
gmaps = Client(key=os.environ["GOOGLE_API_KEY"])

In [31]:
def get_places(df: pd.DataFrame, gmaps: Client) -> pd.DataFrame:
    """Creates dataframe with place information.

    Given search keywords from scraping the website, this function hones into
    an exact location and produces restaurant information straight from Google.
    
    Args:
        df (pd.DataFrame): Restaurant information. Must have column 'search_keywords'.
        gmaps (googlemaps.Client): Google API client object.
        
    Returns:
        DataFrame with columns "place_id", "price_level", "rating",
            "user_ratings_total", and "formatted_address".
    """
    # Dataframe from dict of lists
    columns = {
        "place_id": [],
        "price_level": [],
        "rating": [],
        "user_ratings_total": [],
        "formatted_address": [],
    }

    for search_term in df["search_keywords"]:
        places = gmaps.places(search_term, location="33.773521, -84.391311")["results"][0]
        for category in columns:
            # Restaurants without price_level are almost always in the $-$$ range
            columns[category].append(places.get(category, 1.5))

    return pd.DataFrame(columns)

In [33]:
def get_distance(
    df: pd.DataFrame,
    gmaps: Client,
    origin: str = "112 Bobby Dodd Way NW, Atlanta, GA 30332",
) -> pd.DataFrame:
    """Returns time between two points by a given mode of transport.

    After filling the 'place_id' field, this function gives travel time for
    both driving and biking from Towers Residence Hall to each restaurant.
    API calls are made with batch size of 25 to limit number of calls.
    
    Args:
        df (pd.DataFrame): Restaurant information. Must contain column 'place_id'.
            Use get_places() to fill in the 'place_id' data.
        gmaps (googlemaps.Client): Google API client object.
    
    Returns:
        DataFrame with columns: 'driving', 'bicycling',
            'worth_driving', 'worth_bicycling'.
    """
    # Dataframe from dict of lists
    distances = {
        "driving": [],
        "bicycling": [],
        "worth_driving": [],
        "worth_bicycling": [],
    }
    
    def time_from_response(response: dict) -> time:
        """Returns travel time from a distance object."""
        tokens = response["duration"]["text"].split()
        if "hour" in tokens:
            return time(int(tokens[0]), int(tokens[2]))
        else:
            return time(0, int(tokens[0]))
        
    # Iterate over batches of max 25 rows
    for batch in np.array_split(df, np.ceil(len(df) / 25)):
        place_ids = list("place_id:" + batch["place_id"])
        
        for mode in ("driving", "bicycling"):
            response = gmaps.distance_matrix(origin, place_ids, mode=mode)["rows"][0]["elements"]
            for t in response:
                dist = time_from_response(t)
                distances[mode].append(dist)
                distances['worth_' + mode].append(dist <= time(0, 20))
        
    return pd.DataFrame(distances)

# Step 3:

Create and process restaurant, Places, and Maps information DataFrames, and concatenate them together.

In [34]:
places = get_places(filtered, gmaps)

In [35]:
filtered.reset_index(drop=True, inplace=True)
df = pd.concat([filtered, places], axis=1)

In [36]:
# If website didn't originally have price level, fill it with Google data
df['price'] = df['price'].fillna(df['price_level'])

In [37]:
# Then remove the column of Google price level
df.drop('price_level', axis=1, inplace=True)

In [38]:
distance = get_distance(df, gmaps)

In [39]:
final = pd.concat([df, distance], axis=1)

In [50]:
final.columns

Index(['name', 'description', 'price', 'search_keywords', 'website',
       'multiple_mentions', 'place_id', 'rating', 'user_ratings_total',
       'formatted_address', 'driving', 'bicycling', 'worth_driving',
       'worth_bicycling'],
      dtype='object')

In [51]:
# Reorder the columns and sort by driving time
reordered = final[[
    'name', 'price', 'rating',
    'user_ratings_total', 'driving', 'bicycling',
    'worth_driving', 'worth_bicycling', 'multiple_mentions',
    'description', 'website', 'formatted_address',
    'search_keywords', 'place_id'
]].sort_values('driving', axis=0).reset_index(drop=True)

In [52]:
reordered

Unnamed: 0,name,price,rating,user_ratings_total,driving,bicycling,worth_driving,worth_bicycling,multiple_mentions,description,website,formatted_address,search_keywords,place_id
0,The Original J.R. Cricket’s,2.0,4.0,3118,00:04:00,00:07:00,True,True,False,Fun fact: J.R. Cricket’s was name-dropped in D...,https://www.atlantaeats.com/restaurants/jr-cri...,"129 North Avenue NE, Atlanta, GA 30308, United...",The Original J.R. Cricket’s midtown restaurant,ChIJn8LbIm0E9YgREMWxAog8G60
1,Lyla Lila,2.0,4.6,255,00:05:00,00:07:00,True,True,False,Pasta is a craft Richards has been working at ...,https://www.lylalilaatl.com/,"693 Peachtree St NE UNIT 118, Atlanta, GA 3030...","693 Peachtree St NE UNIT 118, Atlanta, GA 3030...",ChIJz_3PrTsF9YgR4nwTmdhoXsE
2,Cypress Street Pint & Plate,2.0,4.5,2791,00:05:00,00:07:00,True,True,False,"If you’re looking for a deal, you’ll love thei...",https://www.facebook.com/CypressBarATL/,"817 W Peachtree St NW, Atlanta, GA 30308, Unit...",Cypress Street Pint & Plate midtown restaurant,ChIJweczQWYE9YgRvtwgBTj9cUU
3,Bon Ton,2.0,4.4,1199,00:05:00,00:10:00,True,True,False,Bon Ton is no doubt as cool as its predecessor...,http://www.bontonatl.com/,"674 Myrtle St NE, Atlanta, GA 30308, United St...","674 Myrtle Street, Midtown restaurant",ChIJIa9sJ2wE9YgRG9lUqCTz5_Y
4,Mary Mac’s Tearoom,2.0,4.4,11184,00:05:00,00:09:00,True,True,False,"According to their website, Mary Mac’s first o...",https://www.atlantaeats.com/archives/mary-mac/,"224 Ponce De Leon Ave NE, Atlanta, GA 30308, U...",Mary Mac’s Tearoom midtown restaurant,ChIJgUolJWwE9YgRnxg5IduXOsg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98,Osteria Mattone,2.0,4.5,686,00:30:00,00:02:00,False,True,False,Dividing the casual barroom from the white-tab...,http://www.osteriamattone.com/,"1095 Canton St, Roswell, GA 30075, United States","1095 Canton Street, Roswell restaurant",ChIJ5VbDwFNz9YgRtW1rqFc7ksY
99,Sun City Caribbean & American Restaurant,1.0,4.1,511,00:30:00,01:44:00,False,False,False,Come for the weekly soup specials like a thick...,https://www.facebook.com/suncitycaribbeanameri...,"6201 Memorial Dr, Stone Mountain, GA 30083, Un...","6201 Memorial Dr, Stone Mountain, GA 30083 res...",ChIJ-1cxRc2o9YgRXj7m83gAjA4
100,La Mixteca Tamale House,1.0,4.4,574,00:31:00,00:03:00,False,True,True,The restaurant’s specialties include all kinds...,https://www.facebook.com/lamixtecatamalehouse/,"1185 Old Peachtree Rd NW, Suwanee, GA 30024, U...","1185 Old Peachtree Road, Suwanee, Suwanee rest...",ChIJWznzYC699YgRHeuXJy4nTiI
101,Masterpiece,2.0,4.6,265,00:32:00,00:02:00,False,True,False,The first bite: Masterpiece’s dry-fried eggpla...,http://www.masterpiece-chinese.com/,"3940 Buford Hwy NE A103, Duluth, GA 30096, Uni...","3940 Buford Highway, Duluth restaurant",ChIJZQYNdEmi9YgRtaRgO6e7v54


In [53]:
with open("compendium.csv", "w") as f:
    reordered.to_csv(f)