In [43]:
import os
import re
from datetime import time
from pathlib import Path
from collections import Counter
from collections.abc import Generator
from heapq import nlargest
from itertools import chain
from string import punctuation

import numpy as np
import pandas as pd
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from googlemaps import Client
from bs4 import BeautifulSoup

### Extract data from static files

In [44]:
# Directory (relative to the working directory) used for the notebook's data files.
data = Path('data')
# Matches a leading list number such as "12. " so it can be stripped from a
# scraped heading. It is anchored to the start of the string so that a
# digit-period-space sequence inside a name (e.g. "Studio 54. Annex") is kept.
num_rgx = re.compile(r'^\s*\d+\.\s+')

In [93]:
# Step 1: Parse the soup and assemble information about each restaurant
def build_atl_mag_restaurants(
    soup: BeautifulSoup,
) -> Generator[dict[str, object], None, None]:
    """Yield one record per ``div.restLRContainer`` element found in ``soup``.

    Each record is a dict with the keys ``name``, ``description``, ``price``,
    ``search_term``, ``website`` and ``multiple_mentions``:

    * ``name`` is the ``h2.restName`` text with ``num_rgx`` matches removed.
    * ``price`` is the number of ``$`` characters left in ``div.price`` after
      its nested ``<span>`` has been removed, or NaN when the card has no
      ``div.price`` at all.
    * ``website`` is the ``href`` of the first ``em > a`` link, or NaN when
      there is no such link (or it has no ``href``).

    Side effect: the ``<span>`` inside each ``div.price`` is removed from
    ``soup`` itself (``decompose``), so the parsed document is modified.
    """
    for card in soup.select("div.restLRContainer"):
        price_div = card.select_one("div.price")
        if price_div is None:
            # No price block on this card: record the price as missing rather
            # than failing on ``None.select_one``.
            price = np.nan
        else:
            # Drop the nested span first so its "$" characters are not counted.
            nested_span = price_div.select_one("span")
            if nested_span is not None:
                nested_span.decompose()
            price = price_div.text.count('$')

        link = card.select_one("em > a")
        address = card.select_one("div.address").text
        city = card.select_one("div.neighbor").text

        yield {
            "name": num_rgx.sub('', card.select_one("h2.restName").text),
            "description": summarize(card.select_one("div.restLeft > p").text.strip()),
            "price": price,
            "search_term": f"{address}, {city} restaurant",
            "website": link.get('href', np.nan) if link is not None else np.nan,
            "multiple_mentions": False,
        }

In [94]:
def build_atl_eater_restaurants(
    soup: BeautifulSoup,
) -> Generator[dict[str, object], None, None]:
    """Yield one record per map-stack card found in ``soup``.

    Cards are the ``main > section.c-mapstack__card`` elements whose
    ``data-slug`` is not ``intro``, ``newsletter`` or ``related-links``.

    Each record is a dict with the keys ``name``, ``description``, ``price``
    (always NaN here), ``search_term``, ``website`` and ``multiple_mentions``.
    ``website`` is only filled in when the card has exactly three
    ``div.info`` blocks and the third one contains a link; in every other
    case it is NaN.
    """
    skipped = ', '.join(
        f"[data-slug='{slug}']" for slug in ('intro', 'newsletter', 'related-links')
    )
    for card in soup.select(f"main > section.c-mapstack__card:not({skipped})"):
        # Default to "missing" so an empty selection never leaks out as ``[]``.
        website = np.nan
        info_blocks = card.select("div.c-mapstack__info > div.info")
        if len(info_blocks) == 3:
            link = info_blocks[2].select_one('div:not([class]) > a')
            if link is not None:
                website = link.get('href', np.nan)

        yield {
            "name": card.select_one("div.c-mapstack__card-hed > div > h1").text,
            "description": summarize(card.select_one("div.c-entry-content > p").text.strip()),
            "price": np.nan,
            "search_term": f'{card.select_one("div.c-mapstack__address > a").text} restaurant',
            "website": website,
            "multiple_mentions": False,
        }

In [95]:
def build_midtown_restaurants(
    soup: BeautifulSoup,
    expected_count: int = 10,
) -> Generator[dict[str, object], None, None]:
    """Yield one record per restaurant listed in the ``div.entry-content`` of ``soup``.

    Titles are the ``h2 > strong`` elements (the last one is skipped) and
    descriptions are the ``p`` elements (the first two and last two are
    skipped); they are paired up in document order.

    Args:
        soup: Parsed page to read from.
        expected_count: Number of titles and of descriptions the page must
            yield; acts as a sanity check on the selectors.

    Yields:
        Dicts with the keys ``name``, ``description``, ``price`` (always NaN
        here), ``search_term``, ``website`` and ``multiple_mentions``.

    Raises:
        ValueError: If the content container is missing or either count
            differs from ``expected_count``. Because this is a generator, the
            check runs when the first item is requested.
    """
    # Read from the argument, not from a notebook-level variable.
    content = soup.select_one("div.entry-content")
    if content is None:
        raise ValueError("Incorrect scraping on midtown website.")

    titles = content.select("h2 > strong")[:-1]
    descriptions = content.select("p")[2:-2]
    # Either list being the wrong length means the pairing below is unreliable.
    if len(titles) != expected_count or len(descriptions) != expected_count:
        raise ValueError("Incorrect scraping on midtown website.")

    for title, description in zip(titles, descriptions):
        link = title.select_one('a')
        # Strip the list number with the regex so two-digit numbers ("10. ")
        # are handled too; a fixed three-character slice is not enough.
        name = num_rgx.sub('', title.text)
        yield {
            "name": name,
            "description": summarize(description.text),
            "price": np.nan,
            "search_term": f"{name} midtown restaurant",
            "website": link.get('href', np.nan) if link is not None else np.nan,
            "multiple_mentions": False,
        }

In [87]:
# spaCy pipelines already loaded by ``_get_nlp``, keyed by model name.
_NLP_MODELS = {}


def _get_nlp(model_name: str = 'en_core_web_sm'):
    """Return the spaCy pipeline ``model_name``, loading it only on first use."""
    if model_name not in _NLP_MODELS:
        _NLP_MODELS[model_name] = spacy.load(model_name)
    return _NLP_MODELS[model_name]


def summarize(text: str, sentence_count: int = 3) -> str:
    """Return an extractive summary made of the highest-scoring sentences.

    Words are counted case-insensitively, ignoring stop words, punctuation
    and whitespace tokens. A sentence's score is the sum of the counts of its
    words. The ``sentence_count`` best sentences are kept and joined with
    single spaces in the order they appear in ``text``.

    Args:
        text: Text to summarize.
        sentence_count: Maximum number of sentences to keep.

    Returns:
        The summary, or an empty string when no sentence contains a counted
        word (for example when ``text`` is empty).
    """
    doc = _get_nlp()(text)

    word_frequencies = Counter(
        token.text.lower()
        for token in doc
        if not (token.is_punct or token.is_space)
        and token.text.lower() not in STOP_WORDS
    )
    # Dividing every count by the maximum would scale all sentence scores by
    # the same factor and cannot change the ranking, so raw counts are used.

    scored_sentences = []
    for position, sent in enumerate(doc.sents):
        # Counter returns 0 for words that were filtered out above.
        score = sum(word_frequencies[token.text.lower()] for token in sent)
        if score:
            scored_sentences.append((score, position, sent))

    best = nlargest(sentence_count, scored_sentences, key=lambda item: item[0])
    # Restore document order so the summary reads naturally.
    best.sort(key=lambda item: item[1])
    return ' '.join(sent.text for _, _, sent in best)

In [88]:
def summarize(text: str, sentence_count: int = 3) -> str:
    """Placeholder that skips summarization and always returns an empty string.

    This cell redefines ``summarize`` after the spaCy-based implementation in
    the previous cell, so when the notebook is run top to bottom this is the
    version the scraping functions end up calling. ``sentence_count`` is
    accepted and ignored so that both definitions can be called the same way.
    Remove or skip this cell to get real summaries.
    """
    return ''

In [55]:
# Open the saved pages in binary mode so BeautifulSoup works out each
# document's encoding itself instead of depending on the platform's default
# text encoding.
with open(data / 'atl_mag.html', 'rb') as f:
    mag = BeautifulSoup(f, 'html.parser')
with open(data / 'atl_eater.html', 'rb') as f:
    eater = BeautifulSoup(f, 'html.parser')
with open(data / 'midtown.html', 'rb') as f:
    midtown = BeautifulSoup(f, 'html.parser')

In [152]:
# Build a single frame from the records produced by the three scrapers,
# in source order: magazine, eater, midtown.
df = pd.DataFrame(
    record
    for restaurants in (
        build_atl_mag_restaurants(mag),
        build_atl_eater_restaurants(eater),
        build_midtown_restaurants(midtown),
    )
    for record in restaurants
)

In [153]:
# Show the dtype pandas inferred for each column of the combined frame.
df.dtypes

name                  object
description           object
price                float64
search_term           object
website               object
multiple_mentions       bool
dtype: object

In [160]:
# EDGE CASES
# Rows named exactly 'Spring Restaurant' are renamed to 'Spring' (in place on df).
df.loc[df['name'].eq('Spring Restaurant'), 'name'] = 'Spring'

In [166]:
# Take every row whose name occurs more than once, keep only the first
# occurrence of each such name, and set its 'multiple_mentions' flag.
# Only that one column is changed; other columns are left as they are.
dups = (
    df[df.duplicated('name', keep=False)]
    .drop_duplicates(subset='name', keep='first')
    .assign(multiple_mentions=True)
)
# Drop *all* duplicates from main dataframe
filtered = df.drop_duplicates(subset='name', keep=False)
# Append modified duplicates back to main dataframe (flagged rows first);
# each name now appears exactly once, however many times it was mentioned.
filtered = pd.concat([dups, filtered])

In [167]:
# Display the 'multiple_mentions' flag column of the de-duplicated frame.
filtered['multiple_mentions']

1       True
2       True
3       True
5       True
7       True
       ...  
115    False
116    False
117    False
118    False
120    False
Name: multiple_mentions, Length: 103, dtype: bool

In [114]:
# Display the frame (pandas truncates long output on its own).
df

Unnamed: 0,name,description,price,search_term,website,multiple_mentions
0,Masterpiece,,2.0,"3940 Buford Highway, Duluth restaurant",http://www.masterpiece-chinese.com/,False
1,Spring,,3.0,"36 Mill Street, Marietta restaurant",https://www.springmarietta.com/,False
2,Sushi Hayakawa,,4.0,"5979 Buford Highway, Doraville restaurant",https://sushihayakawa.com/,False
3,Bacchanalia,,4.0,"1460 Ellsworth Industrial Boulevard, Westside ...",http://www.starprovisions.com/bacchanalia,False
4,Boccalupo,,2.0,"753 Edgewood Avenue, Inman Park restaurant",http://boccalupoatl.com/,False
...,...,...,...,...,...,...
116,The Vortex Bar & Grill,,,The Vortex Bar & Grill midtown restaurant,,False
117,Tuk Tuk,,,Tuk Tuk midtown restaurant,https://www.atlantaeats.com/restaurants/tuk-tu...,False
118,The Establishment,,,The Establishment midtown restaurant,https://www.atlantaeats.com/restaurants/establ...,False
119,The Optimist,,,The Optimist midtown restaurant,https://www.atlantaeats.com/restaurants/optimi...,False


In [12]:
# Google Maps client. The key is read from the GOOGLE_API_KEY environment
# variable; a KeyError is raised here if that variable is not set.
gmaps = Client(key=os.environ["GOOGLE_API_KEY"])

In [34]:
def get_place(
    df: pd.DataFrame,
    gmaps: Client,
    location: str = "33.773521, -84.391311",
) -> pd.DataFrame:
    """Creates dataframe with place information.

    Expects a dataframe with column "search_term". Returns columns "place_id", "price_level",
    "rating", "user_ratings_total", and "formatted_address".
    Uses Google Places API.

    For every search term the first result of ``gmaps.places`` is used. A
    field missing from that result is recorded as NaN, except "price_level",
    which falls back to 1.5. A search with no results at all is treated as a
    result with every field missing.

    Args:
        df: Frame providing the "search_term" column.
        gmaps: Client object exposing ``places(query, location=...)``.
        location: "lat, lng" string passed to every search as ``location``.

    Returns:
        A frame with one row per row of ``df``, carrying ``df``'s index so it
        can be joined back column-wise.
    """
    defaults = {
        "place_id": np.nan,
        # Restaurants without price_level are almost always in the $-$$ range
        "price_level": 1.5,
        "rating": np.nan,
        "user_ratings_total": np.nan,
        "formatted_address": np.nan,
    }

    records = []
    for search_term in df["search_term"]:
        results = gmaps.places(search_term, location=location)["results"]
        # An empty result list must not abort the whole run.
        best_match = results[0] if results else {}
        records.append(
            {column: best_match.get(column, default) for column, default in defaults.items()}
        )

    return pd.DataFrame(records, columns=list(defaults), index=df.index)

In [216]:
def time_from_response(response: dict) -> time:
    """Returns travel time from a distance object.

    Reads ``response["duration"]["text"]`` and parses amounts followed by a
    unit starting with "day", "hour" or "min", in singular or plural form
    (e.g. "15 mins", "1 hour", "2 hours 5 mins").

    Args:
        response: Mapping with a "duration" entry that holds a "text" string.

    Returns:
        The duration as a ``datetime.time`` (hours and minutes).

    Raises:
        KeyError: If "duration" or its "text" entry is missing.
        ValueError: If the text contains no recognisable amount, or the
            duration is 24 hours or longer (``datetime.time`` cannot hold it).
    """
    text = response["duration"]["text"]
    # Match on the unit prefix so "hour"/"hours" and "min"/"mins" both work.
    amounts = {
        unit: int(amount)
        for amount, unit in re.findall(r"(\d+)\s*(day|hour|min)", text)
    }
    if not amounts:
        raise ValueError(f"Unrecognised duration text: {text!r}")

    hours = amounts.get("day", 0) * 24 + amounts.get("hour", 0)
    minutes = amounts.get("min", 0)
    if hours > 23:
        raise ValueError(f"Duration does not fit in a time of day: {text!r}")
    return time(hours, minutes)

In [241]:
def get_distance(
    df: pd.DataFrame,
    gmaps: Client,
    origin: str = "112 Bobby Dodd Way NW, Atlanta, GA 30332",
    batch_size: int = 25,
    max_travel_time: time = time(0, 20),
) -> pd.DataFrame:
    """Returns time between two points by a given mode of transport.

    Expects a dataframe with column "place_id". Returns columns "driving",
    "bicycling", "worth_driving", and "worth_bicycling".
    Uses Google Distance Matrix API.

    Args:
        df: Frame providing the "place_id" column.
        gmaps: Client object exposing
            ``distance_matrix(origin, destinations, mode=...)``.
        origin: Starting point passed to every request.
        batch_size: Maximum number of destinations sent per request.
        max_travel_time: A trip is "worth" a mode when its travel time is at
            most this long.

    Returns:
        A frame with one row per row of ``df`` (carrying ``df``'s index).
        "driving" / "bicycling" hold the travel time as ``datetime.time``, or
        None when the response element has no "duration"; the "worth_*"
        columns hold booleans and are False when the time is missing.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Dataframe from dict of lists
    distances = {
        "driving": [],
        "bicycling": [],
        "worth_driving": [],
        "worth_bicycling": [],
    }
    # Walk the frame in consecutive slices; an empty frame yields no slices,
    # so no request is made and an empty result is returned.
    for start in range(0, len(df), batch_size):
        batch = df.iloc[start:start + batch_size]
        place_ids = ["place_id:" + place_id for place_id in batch["place_id"]]

        for mode in ("driving", "bicycling"):
            elements = gmaps.distance_matrix(origin, place_ids, mode=mode)["rows"][0]["elements"]
            for element in elements:
                if "duration" in element:
                    travel_time = time_from_response(element)
                    worthwhile = travel_time <= max_travel_time
                else:
                    # No duration in this element: keep the row, mark the
                    # time as missing instead of failing the whole batch.
                    travel_time = None
                    worthwhile = False
                distances[mode].append(travel_time)
                distances['worth_' + mode].append(worthwhile)

    return pd.DataFrame(distances, index=df.index)

In [None]:
# Each lookup returns one row per input row, so attach its output as new
# *columns*. Re-labelling with df's index makes the join purely positional.
# (DataFrame.append stacked rows and no longer exists in pandas 2.)
# Not idempotent: re-running this cell after df already has these columns
# raises an overlapping-columns error.
# NOTE(review): the de-duplicated `filtered` frame built earlier is not used
# here, so rows with repeated names are looked up and exported too - confirm
# whether `filtered` was meant to be the input.
df = df.join(get_place(df, gmaps).set_axis(df.index))
df = df.join(get_distance(df, gmaps).set_axis(df.index))

# Let pandas open the file itself so newline and encoding handling are its own.
df.to_csv(data / "output.csv", index=False)