In [1]:
library(tidyverse)
library(rvest)
library(stringr)

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.1 ──

[32m✔[39m [34mggplot2[39m 3.3.5     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.5     [32m✔[39m [34mdplyr  [39m 1.0.7
[32m✔[39m [34mtidyr  [39m 1.1.4     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 2.0.2     [32m✔[39m [34mforcats[39m 0.5.1

── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()


Attaching package: ‘rvest’


The following object is masked from ‘package:readr’:

    guess_encoding




In [60]:
# Get department links: fetch the departments landing page and collect the
# href attribute of every element matching the ".widget-item" selector.
department_links <- html_attr(
    html_nodes(
        read_html("https://centralsquare.dailytable.org/departments"),
        ".widget-item"
    ),
    "href"
)

In [61]:
# Get category links
#
# base_url is defined once, outside any loop: it is constant, and later cells
# rely on it being defined even when no department links were found.
base_url <- "https://centralsquare.dailytable.org"

# For every department page, collect the hrefs of all anchors that point to a
# category of this store. The result is a character vector of site-relative
# paths; it is de-duplicated across ALL departments (not just within one) so
# that a category linked from several department pages is scraped only once.
category_links <- department_links %>%
    # An element without an href yields NA, which would otherwise be pasted
    # into a bogus ".../NA" URL.
    discard(is.na) %>%
    map(function(department_link) {
        read_html(paste0(base_url, department_link)) %>%
            html_nodes("a") %>%
            html_attr("href") %>%
            str_subset("/store/daily-table-central-square/category/")
    }) %>%
    unlist(use.names = FALSE) %>%
    # unlist() of an empty list is NULL; normalise to character(0).
    as.character() %>%
    unique()

In [85]:
# Extract one row per product from a parsed category page.
#
# Arguments:
#   page  Parsed HTML document (as returned by read_html()).
#
# Returns:
#   A tibble with columns
#     Item  (chr)  text of each ".product-title-card" node
#     Qty   (dbl)  package size parsed from the description; 1 when the
#                  description carries no "<number> <unit>" size
#     Units (chr)  unit of the package size (oz, lb, gal, ct, fl oz), falling
#                  back to the unit named after the "/" in a per-unit price
#     Price (dbl)  number following "$ " in the description, NA if absent
#
# Stops with an error when the page has a different number of product titles
# and product descriptions, because rows could not be paired reliably.
get_prices <- function(page) {
    descriptions <- page %>%
        html_nodes("a.product-card-description") %>%
        html_text() %>%
        str_squish()

    items <- page %>%
        html_nodes(".product-title-card") %>%
        html_text()

    if (length(items) != length(descriptions)) {
        stop(
            "Found ", length(items), " product titles but ",
            length(descriptions), " product descriptions on the page"
        )
    }

    # The decimal point is escaped (a bare "." matches any character) and the
    # fractional part is optional so whole-dollar prices are parsed too.
    price_text <- str_match(descriptions, "\\$ (\\d+(?:\\.\\d+)?)")[, 2]

    # A single match yields both the size (group 1) and its unit (group 2).
    size_match <- str_match(descriptions, "(\\d+[.\\d]*) (oz|lb|gal|ct|fl oz)")
    qty_text <- size_match[, 2]
    size_units <- size_match[, 3]

    # Fallback for descriptions without an explicit size: take the unit that
    # follows the "/" (e.g. "... 9 / lb ...").
    per_units <- str_match(descriptions, "\\d \\/ ([a-z]+) ")[, 2]

    tibble(
        Item = items,
        Qty = if_else(is.na(qty_text), 1, as.numeric(qty_text)),
        Units = if_else(is.na(size_units), per_units, size_units),
        Price = as.numeric(price_text)
    )
}

In [87]:
# Scrape every page of every category and stack the results into price_list.

# Number of products per listing page used to derive the page count
# (presumably the site's page size -- confirm if the layout changes).
items_per_page <- 24

# Collect per-page tibbles in a list and bind once at the end instead of
# re-copying a growing tibble on every iteration.
price_pages <- list()

for (category_link in category_links) {
    category_url <- paste0(base_url, category_link)

    # Get first page
    page1 <- read_html(paste0(category_url, "?page=", 1))

    # Determine total number of items from the "Found ..." counter text
    category_item_count <- page1 %>%
        html_nodes(".text-grey") %>%
        html_text() %>%
        str_subset("Found") %>%
        str_extract("[0-9]+") %>%
        as.numeric()

    # If the counter is missing or unparsable, fall back to a single page
    # rather than failing on a zero-length / NA page count.
    if (length(category_item_count) == 0 || is.na(category_item_count[1])) {
        page_count <- 1
    } else {
        page_count <- ceiling(category_item_count[1] / items_per_page)
    }

    # Get items from the first page
    price_pages[[length(price_pages) + 1]] <- get_prices(page1)

    # Get items from subsequent pages. seq_len(n)[-1] is empty for n <= 1,
    # whereas 2:n would count DOWN (2, 1 or 2, 1, 0) and re-scrape pages.
    for (iter_page in seq_len(page_count)[-1]) {
        page <- read_html(paste0(category_url, "?page=", iter_page))
        price_pages[[length(price_pages) + 1]] <- get_prices(page)
    }
}

price_list <- bind_rows(price_pages)


In [88]:
# Save the scraped prices to a CSV file in the working directory, then show
# the table as the cell's output.
write_csv(price_list, "daily_table_prices.csv")

price_list

Item,Qty,Units,Price
<chr>,<dbl>,<chr>,<dbl>
Baby Carrots,1,lb,1.29
Baby Spinach,1,lb,1.99
Beets,1,lb,0.69
Broccoli,1,lb,1.29
Brussels Sprouts,1,lb,1.49
Butternut Squash,1,lb,0.79
Carrots,1,lb,0.69
Cauliflower,1,each,3.29
Celery Bunch,1,each,1.49
Collard Greens,1,lb,1.89
