In [287]:
import os
import re
from datetime import time
from pathlib import Path
from collections import Counter
from collections.abc import Generator
from heapq import nlargest
from itertools import chain
from string import punctuation

import numpy as np
import pandas as pd
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from googlemaps import Client
from bs4 import BeautifulSoup

### Extract data from static files

In [288]:
# Directory that holds the saved HTML pages parsed later in this notebook.
data = Path("data")
# Rank marker such as "7. ": one or more digits, a literal dot, then a single
# whitespace character.  The pattern is not anchored.
num_rgx = re.compile(r"\d+\.\s")

In [289]:
# Step 1: Parse the soup and assemble information about each restaurant
def build_atl_mag_restaurants(
    soup: BeautifulSoup,
) -> Generator[dict[str, str], None, None]:
    """Yield one record per ``div.restLRContainer`` element in ``soup``.

    Every record has the keys ``name``, ``description``, ``price``,
    ``search_term``, ``website`` and ``multiple_mentions``:

    * ``name`` is the ``h2.restName`` text with ``num_rgx`` matches removed.
    * ``description`` is ``summarize`` applied to the stripped first
      ``div.restLeft > p`` text.
    * ``price`` is the number of ``$`` characters left in ``div.price``.
    * ``website`` is the ``href`` of the first ``em > a`` link, or NaN.
    * ``multiple_mentions`` always starts out as ``False``.

    Note that the ``<span>`` inside ``div.price`` (when present) is removed
    from the parsed tree as a side effect.
    """
    for card in soup.select("div.restLRContainer"):
        price_div = card.select_one("div.price")
        # Drop the nested <span> first so its text does not take part in the
        # "$" count below.
        nested_span = price_div.select_one("span")
        if nested_span:
            nested_span.decompose()

        link = card.select_one("em > a")
        street = card.select_one("div.address").text
        neighborhood = card.select_one("div.neighbor").text
        title = card.select_one("h2.restName").text
        blurb = card.select_one("div.restLeft > p").text.strip()

        record = {
            "name": num_rgx.sub('', title),
            "description": summarize(blurb),
            "price": price_div.text.count('$'),
            "search_term": f"{street}, {neighborhood} restaurant",
            "website": link['href'] if link else np.nan,
            "multiple_mentions": False,
        }
        yield record

In [290]:
def build_atl_eater_restaurants(
    soup: BeautifulSoup,
) -> Generator[dict[str, str], None, None]:
    """Yield one record per restaurant card in the Eater Atlanta map page.

    Cards are the ``main > section.c-mapstack__card`` elements whose
    ``data-slug`` is not ``intro``, ``newsletter`` or ``related-links``.

    Every record has the keys ``name``, ``description``, ``price``,
    ``search_term``, ``website`` and ``multiple_mentions``.  ``price`` is
    always NaN for this source and ``multiple_mentions`` starts as ``False``.
    ``website`` is NaN whenever no link can be located, so the column never
    contains an empty list.

    Raises:
        AttributeError: if a card lacks the heading, the first content
            paragraph or the address link.
    """
    skipped_slugs = ('intro', 'newsletter', 'related-links')
    ignore = ', '.join(f"[data-slug='{slug}']" for slug in skipped_slugs)
    for r in soup.select(f"main > section.c-mapstack__card:not({ignore})"):
        info_blocks = r.select("div.c-mapstack__info > div.info")

        # The link is only read when exactly three info blocks exist, and then
        # from the third one.
        # NOTE(review): presumably the third block is the website entry --
        # confirm against the saved page.
        website = np.nan
        if len(info_blocks) == 3:
            link = info_blocks[2].select_one('div:not([class]) > a')
            if link is not None:
                website = link.get('href', np.nan)

        yield {
            "name": r.select_one("div.c-mapstack__card-hed > div > h1").text,
            "description": summarize(r.select_one("div.c-entry-content > p").text.strip()),
            "price": np.nan,
            "search_term": f'{r.select_one("div.c-mapstack__address > a").text} restaurant',
            "website": website,
            "multiple_mentions": False,
        }

In [291]:
def build_midtown_restaurants(
    soup: BeautifulSoup,
) -> Generator[dict[str, str], None, None]:
    """Yield one record per restaurant in the Midtown top-10 article.

    Titles come from ``h2 > strong`` inside ``div.entry-content`` (the last
    match is dropped) and descriptions from the ``p`` elements (the first two
    and last two are dropped).
    NOTE(review): the dropped elements are presumably non-restaurant
    boilerplate -- confirm against the saved page.

    Every record has the keys ``name``, ``description``, ``price``,
    ``search_term``, ``website`` and ``multiple_mentions``.  ``price`` is
    always NaN for this source and ``multiple_mentions`` starts as ``False``.

    Raises:
        ValueError: if the page does not yield exactly 10 titles and 10
            descriptions.  Because this is a generator, the error surfaces
            when iteration starts, not when the function is called.
    """
    # Use the soup that was passed in rather than a notebook-level global.
    content = soup.select_one("div.entry-content")

    titles = content.select("h2 > strong")[:-1]
    descriptions = content.select("p")[2:-2]
    # Either list being off means titles and descriptions would be mispaired.
    if len(titles) != 10 or len(descriptions) != 10:
        # Raise instead of calling exit(): exit() is an interactive-session
        # helper and does not report a scraping failure as an error.
        raise ValueError("Incorrect scraping on midtown website.")

    for title, description in zip(titles, descriptions):
        website = title.select_one('a')
        # Strip the rank marker with the regex so two-digit ranks ("10. ") are
        # handled as well; a fixed three-character slice leaves a stray space.
        name = re.sub(num_rgx, '', title.text)
        yield {
            "name": name,
            "description": summarize(description.text),
            "price": np.nan,
            "search_term": f"{name} midtown restaurant",
            "website": website['href'] if website else np.nan,
            "multiple_mentions": False,
        }

In [270]:
def summarize(text: str, sentence_count: int = 3) -> str:
    doc = spacy.load('en_core_web_sm')(text)

    word_frequencies = Counter()
    for word in doc:
        word = word.text.lower()
        if word not in STOP_WORDS and word not in punctuation:
            word_frequencies[word] += 1

    max_frequency = word_frequencies.most_common(1)[0][1]

    for word in word_frequencies.values():
        word = word / max_frequency

    sentence_scores = Counter()
    for sent in doc.sents:
        for word in sent:
            word = word.text.lower()
            if word in word_frequencies:
                sentence_scores[sent] += word_frequencies[word]

    summary = nlargest(sentence_count, sentence_scores, key=sentence_scores.get)
    return ' '.join(word.text for word in summary)

In [271]:
def summarize(text: str, sentence_count: int = 3) -> str:
    """Placeholder that skips summarization and returns an empty string.

    NOTE(review): this redefinition shadows the spaCy-based ``summarize``
    defined in the previous cell, so every scraped ``description`` ends up
    as ``''``.  Skip or delete this cell to get real summaries.

    ``sentence_count`` is accepted (and ignored) so that this stub stays
    call-compatible with the full implementation.
    """
    return ''

In [272]:
# Parse the three saved pages.  The encoding is given explicitly because
# open() otherwise falls back to the platform default, which can garble
# non-ASCII characters on some systems.
# NOTE(review): assumes the snapshots were saved as UTF-8 -- confirm.
with open(data / 'atl_mag.html', encoding='utf-8') as f:
    mag = BeautifulSoup(f, 'html.parser')
with open(data / 'atl_eater.html', encoding='utf-8') as f:
    eater = BeautifulSoup(f, 'html.parser')
with open(data / 'midtown.html', encoding='utf-8') as f:
    midtown = BeautifulSoup(f, 'html.parser')

In [292]:
# Stack the records of all three sources into a single frame, in this order:
# Atlanta Magazine, Eater Atlanta, Midtown.
df = pd.DataFrame(
    chain.from_iterable(
        build(page)
        for build, page in (
            (build_atl_mag_restaurants, mag),
            (build_atl_eater_restaurants, eater),
            (build_midtown_restaurants, midtown),
        )
    )
)

In [293]:
# Inspect the dtypes pandas inferred for the combined frame.
df.dtypes

name                  object
description           object
price                float64
search_term           object
website               object
multiple_mentions       bool
dtype: object

In [294]:
# EDGE CASES
# Rename the 'Spring Restaurant' listing to 'Spring'.
# NOTE(review): presumably so it matches the same restaurant's name in another
# source before duplicates are detected -- confirm.
df.loc[df['name'].eq('Spring Restaurant'), 'name'] = 'Spring'

In [295]:
# Rows whose name occurs more than once across the sources.
is_repeated = df.duplicated('name', keep=False)
# Keep only the first occurrence of each repeated name and flag it.  Setting
# the column directly (rather than replacing every False in the frame) leaves
# all other columns untouched, and a name listed three or more times still
# produces a single row.
dups = (
    df[is_repeated]
    .drop_duplicates(subset='name', keep='first')
    .assign(multiple_mentions=True)
)
# Flagged rows first, followed by the restaurants mentioned only once.
filtered = pd.concat([dups, df[~is_repeated]], axis=0, join='outer')

In [296]:
# Display the flag column to check that the repeated names were marked.
filtered['multiple_mentions']

1       True
2       True
3       True
5       True
7       True
       ...  
115    False
116    False
117    False
118    False
120    False
Name: multiple_mentions, Length: 103, dtype: bool

In [297]:
# The API key is read from the GOOGLE_API_KEY environment variable; a KeyError
# is raised here if it is not set.
gmaps = Client(key=os.environ["GOOGLE_API_KEY"])

In [259]:
def get_places(
    df: pd.DataFrame,
    gmaps: "Client",
    location: str = "33.773521, -84.391311",
) -> pd.DataFrame:
    """Creates dataframe with place information.

    Expects a dataframe with column "search_term". Returns columns "place_id",
    "price_level", "rating", "user_ratings_total", and "formatted_address",
    one row per search term, in the same order, with a fresh 0..n-1 index.
    Uses Google Places API; only the first result of each search is used.

    Args:
        df: Frame providing the "search_term" column.
        gmaps: Client whose ``places(query, location=...)`` method is called.
        location: "lat, lng" string passed to every search as ``location``.

    Raises:
        IndexError: if a search returns no results.
    """
    # Value stored when the top result lacks a field.  Only price_level gets
    # a numeric guess: restaurants without price_level are almost always in
    # the $-$$ range.  Every other field is recorded as missing rather than
    # being filled with that guess.
    missing_value = {
        "place_id": None,
        "price_level": 1.5,
        "rating": np.nan,
        "user_ratings_total": np.nan,
        "formatted_address": None,
    }
    # Dataframe from dict of lists
    columns = {category: [] for category in missing_value}

    for search_term in df["search_term"]:
        results = gmaps.places(search_term, location=location)["results"]
        if not results:
            # Fail with the offending search term instead of a bare
            # "list index out of range".
            raise IndexError(
                f"Google Places returned no results for {search_term!r}"
            )
        top_hit = results[0]
        for category, fallback in missing_value.items():
            columns[category].append(top_hit.get(category, fallback))

    return pd.DataFrame(columns)

In [260]:
def time_from_response(response: dict) -> time:
    """Returns travel time from a distance object.

    Parses ``response["duration"]["text"]``, which is expected to be a
    sequence of "<amount> <unit>" pairs such as "7 mins", "1 hour",
    "2 hours 10 mins" or "1 day 3 hours".  Singular and plural unit names
    are both accepted.

    Returns:
        The duration as a ``datetime.time`` with minute resolution.  Because
        ``time`` cannot represent 24 hours or more, longer durations are
        capped at 23:59.

    Raises:
        KeyError: if ``response`` has no "duration"/"text" entry.
        ValueError: if the text is not made of "<integer> <unit>" pairs with
            units of days, hours or minutes.
    """
    text = response["duration"]["text"]
    tokens = text.split()
    if not tokens or len(tokens) % 2:
        raise ValueError(f"Unrecognized duration text: {text!r}")

    minutes_per_unit = (("day", 24 * 60), ("hour", 60), ("min", 1))

    total_minutes = 0
    # Walk the (amount, unit) pairs, e.g. ("2", "hours"), ("10", "mins").
    for amount, unit in zip(tokens[::2], tokens[1::2]):
        for prefix, factor in minutes_per_unit:
            # Prefix match so "hour"/"hours" and "min"/"mins" both work.
            if unit.startswith(prefix):
                total_minutes += int(amount) * factor
                break
        else:
            raise ValueError(f"Unrecognized duration unit {unit!r} in {text!r}")

    if total_minutes >= 24 * 60:
        return time(23, 59)
    return time(*divmod(total_minutes, 60))

In [261]:
def get_distance(
    df: pd.DataFrame,
    gmaps: "Client",
    origin: str = "112 Bobby Dodd Way NW, Atlanta, GA 30332",
    batch_size: int = 25,
    worth_limit: time = time(0, 20),
) -> pd.DataFrame:
    """Returns travel times from ``origin`` to every place, by car and by bike.

    Expects a dataframe with column "place_id". Returns columns "driving",
    "bicycling", "worth_driving", and "worth_bicycling", one row per input
    row, in the same order, with a fresh 0..n-1 index.  An empty input gives
    an empty frame with those four columns.
    Uses Google Distance Matrix API.

    Args:
        df: Frame providing the "place_id" column.
        gmaps: Client whose ``distance_matrix(origin, destinations, mode=...)``
            method is called.
        origin: Address every travel time is measured from.
        batch_size: Maximum number of destinations sent per request.
        worth_limit: A trip is flagged "worth_<mode>" when its travel time is
            at most this value.
    """
    modes = ("driving", "bicycling")
    # Dataframe from dict of lists
    distances = {mode: [] for mode in modes}
    distances.update({"worth_" + mode: [] for mode in modes})

    # Iterate over consecutive slices of at most `batch_size` rows.  Plain
    # positional slicing also copes with an empty frame (no requests made).
    for start in range(0, len(df), batch_size):
        batch = df.iloc[start:start + batch_size]
        place_ids = list("place_id:" + batch["place_id"])

        for mode in modes:
            # One origin is sent, so only the first row of the matrix is read.
            elements = gmaps.distance_matrix(origin, place_ids, mode=mode)["rows"][0]["elements"]
            for element in elements:
                travel_time = time_from_response(element)
                distances[mode].append(travel_time)
                distances["worth_" + mode].append(travel_time <= worth_limit)

    return pd.DataFrame(distances)

In [262]:
# One Places lookup per row of `filtered`; the result has a fresh 0..n-1 index.
places = get_places(filtered, gmaps)

In [298]:
# Renumber `filtered` from 0 so that its rows pair up with the rows of
# `places` when the two frames are joined side by side.
filtered = filtered.reset_index(drop=True)
df = pd.concat([filtered, places], axis="columns")

In [300]:
# Only the Atlanta Magazine scraper yields a price; the other two sources
# yield NaN, so fall back to the price_level obtained from get_places.
df['price'] = df['price'].fillna(df['price_level'])

In [304]:
# price_level has been folded into `price`, so the helper column can go.
df = df.drop(columns='price_level')

In [305]:
# Travel times from the default origin of get_distance to every place_id.
distance = get_distance(df, gmaps)

In [308]:
# Attach the travel-time columns next to the restaurant columns.
final = pd.concat([df, distance], axis=1)

In [310]:
# List the available columns before choosing the final ordering.
final.columns

Index(['name', 'description', 'price', 'search_term', 'website',
       'multiple_mentions', 'place_id', 'rating', 'user_ratings_total',
       'formatted_address', 'driving', 'bicycling', 'worth_driving',
       'worth_bicycling'],
      dtype='object')

In [322]:
# Put the columns in reading order and list the shortest drives first.
reordered = (
    final
    .loc[:, [
        'name',
        'price',
        'rating',
        'user_ratings_total',
        'driving',
        'bicycling',
        'worth_driving',
        'worth_bicycling',
        'multiple_mentions',
        'description',
        'website',
        'formatted_address',
        'search_term',
        'place_id',
    ]]
    .sort_values(by='driving')
)

In [323]:
# Display the final table (pandas truncates the middle rows when rendering).
reordered

Unnamed: 0,name,price,rating,user_ratings_total,driving,bicycling,worth_driving,worth_bicycling,multiple_mentions,description,website,formatted_address,search_term,place_id
102,The Original J.R. Cricket’s,2.0,4.0,3118,00:04:00,00:07:00,True,True,False,,https://www.atlantaeats.com/restaurants/jr-cri...,"129 North Avenue NE, Atlanta, GA 30308, United...",The Original J.R. Cricket’s midtown restaurant,ChIJn8LbIm0E9YgREMWxAog8G60
94,Mary Mac’s Tearoom,2.0,4.4,11183,00:05:00,00:09:00,True,True,False,,https://www.atlantaeats.com/archives/mary-mac/,"224 Ponce De Leon Ave NE, Atlanta, GA 30308, U...",Mary Mac’s Tearoom midtown restaurant,ChIJgUolJWwE9YgRnxg5IduXOsg
34,Bon Ton,2.0,4.4,1199,00:05:00,00:10:00,True,True,False,,http://www.bontonatl.com/,"674 Myrtle St NE, Atlanta, GA 30308, United St...","674 Myrtle Street, Midtown restaurant",ChIJIa9sJ2wE9YgRG9lUqCTz5_Y
98,Cypress Street Pint & Plate,2.0,4.5,2791,00:05:00,00:07:00,True,True,False,,https://www.facebook.com/CypressBarATL/,"817 W Peachtree St NW, Atlanta, GA 30308, Unit...",Cypress Street Pint & Plate midtown restaurant,ChIJweczQWYE9YgRvtwgBTj9cUU
80,Lyla Lila,2.0,4.6,255,00:05:00,00:07:00,True,True,False,,https://www.lylalilaatl.com/,"693 Peachtree St NE UNIT 118, Atlanta, GA 3030...","693 Peachtree St NE UNIT 118, Atlanta, GA 3030...",ChIJz_3PrTsF9YgR4nwTmdhoXsE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68,Osteria Mattone,2.0,4.5,686,00:30:00,00:02:00,False,True,False,,http://www.osteriamattone.com/,"1095 Canton St, Roswell, GA 30075, United States","1095 Canton Street, Roswell restaurant",ChIJ5VbDwFNz9YgRtW1rqFc7ksY
93,Sun City Caribbean & American Restaurant,1.0,4.1,511,00:30:00,01:44:00,False,False,False,,https://www.facebook.com/suncitycaribbeanameri...,"6201 Memorial Dr, Stone Mountain, GA 30083, Un...","6201 Memorial Dr, Stone Mountain, GA 30083 res...",ChIJ-1cxRc2o9YgRXj7m83gAjA4
17,La Mixteca Tamale House,1.0,4.4,574,00:31:00,00:03:00,False,True,True,,https://www.facebook.com/lamixtecatamalehouse/,"1185 Old Peachtree Rd NW, Suwanee, GA 30024, U...","1185 Old Peachtree Road, Suwanee, Suwanee rest...",ChIJWznzYC699YgRHeuXJy4nTiI
18,Masterpiece,2.0,4.4,781,00:32:00,00:02:00,False,True,False,,http://www.masterpiece-chinese.com/,"3940 Buford Hwy NE b103, Duluth, GA 30096, Uni...","3940 Buford Highway, Duluth restaurant",ChIJZQYNdEmi9YgRRtuX9ZzsRQ4


In [324]:
# Let pandas open the file itself: it then writes UTF-8 and controls the line
# endings, instead of inheriting the platform defaults of a handle opened
# without an explicit encoding or newline setting.
reordered.to_csv("compendium.csv")