In [1]:
library(tidyverse)
library(rvest)
library(stringr)

── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

✔ ggplot2 3.3.5     ✔ purrr   0.3.4
✔ tibble  3.1.5     ✔ dplyr   1.0.7
✔ tidyr   1.1.4     ✔ stringr 1.4.0
✔ readr   2.0.2     ✔ forcats 0.5.1

── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()


Attaching package: ‘rvest’


The following object is masked from ‘package:readr’:

    guess_encoding




In [45]:
# Fetch the departments index page and pull the href attribute from every
# element matching the ".widget-item" selector.
departments_page <- read_html("https://centralsquare.dailytable.org/departments")
department_nodes <- html_nodes(departments_page, ".widget-item")
department_links <- html_attr(department_nodes, "href")

In [47]:
# Get category links
#
# Site root. Department and category hrefs are appended to it to build the
# URLs that get fetched. Defined once here (not inside a loop) because the
# price-scraping cell further down reads it as well.
base_url <- "https://centralsquare.dailytable.org"

# Substring that marks an href as a category link among all the anchors found
# on a department page.
category_pattern <- "/store/daily-table-central-square/category/"

# Fetch each department page once and keep only its category hrefs.
# str_subset() also drops NA hrefs (anchors without an href attribute).
links_by_department <- lapply(department_links, function(department_link) {
    read_html(paste0(base_url, department_link)) %>%
        html_nodes("a") %>%
        html_attr("href") %>%
        str_subset(category_pattern)
})

# De-duplicate across ALL departments rather than per page, so a category that
# is linked from several department pages is scraped only once later on.
# The result is kept as a list of single strings, which is the type the
# original append loop produced.
category_links <- as.list(unique(unlist(links_by_department)))

In [54]:
# Extract product names and prices from one parsed category page.
#
# page: a parsed HTML document, as returned by read_html().
#
# Returns a tibble with character columns Item, Price and Units. Item comes
# from the "h3.product-title-card" nodes; Price and Units come from splitting
# the text of the "p.h5" nodes at " / ".
get_prices <- function(page) {
    # Whitespace-normalised text of every node matching a CSS selector.
    squished_text <- function(selector) {
        str_squish(html_text(html_nodes(page, selector)))
    }

    listing <- tibble(
        Item = squished_text("h3.product-title-card"),
        Price = squished_text("p.h5")
    )

    # The price text has the form "<amount> / <unit>"; split it into two columns.
    separate(listing, Price, into = c("Price", "Units"), sep = " / ")
}

# Scrape every page of every category into one tibble, `price_list`.
#
# Relies on `base_url`, `category_links` and `get_prices()` defined in the
# cells above.

# Number of products per listing page; used to turn a category's item count
# into a page count.
items_per_page <- 24

# One tibble per fetched page. They are bound together once at the end
# instead of re-binding the growing result after every request.
page_tables <- list()

for (category_link in category_links) {
    category_url <- paste0(base_url, category_link)

    # Get first page
    page1 <- read_html(paste0(category_url, "?page=", 1))
    page_tables[[length(page_tables) + 1]] <- get_prices(page1)

    # Determine total number of items from the "Found <N> ..." text
    category_item_count <- page1 %>%
        html_nodes(".text-grey") %>%
        html_text() %>%
        str_subset("Found") %>%
        str_extract("[0-9]+") %>%
        as.numeric()
    category_item_count <- category_item_count[!is.na(category_item_count)]

    # Without a usable count the number of pages is unknown: keep what the
    # first page gave us and move on rather than failing the whole scrape.
    if (length(category_item_count) == 0) {
        warning(
            "No item count found for ", category_link,
            "; only the first page was scraped.",
            call. = FALSE
        )
        next
    }

    # If several nodes matched, use the first count.
    page_count <- ceiling(category_item_count[[1]] / items_per_page)

    # Get items from subsequent pages.
    # seq_len(n)[-1] is 2, 3, ..., n and is empty when n <= 1. The previous
    # `2:page_count` counted DOWN (2, 1) for single-page categories, which
    # re-fetched page 1 and duplicated its rows.
    for (iter_page in seq_len(page_count)[-1]) {
        page <- read_html(paste0(category_url, "?page=", iter_page))
        page_tables[[length(page_tables) + 1]] <- get_prices(page)
    }
}

price_list <- bind_rows(page_tables)


In [56]:
# Save the scraped prices as CSV in the working directory.
write_csv(price_list, "daily_table_prices.csv")

# Show the table as the cell's output.
price_list

Item,Price,Units
<chr>,<chr>,<chr>
Baby Carrots,$ 1.29,each
Baby Spinach,$ 1.99,lb
Beets,$ 0.69,lb
Broccoli,$ 1.29,lb
Brussels Sprouts,$ 1.49,lb
Butternut Squash,$ 0.79,lb
Carrots,$ 0.69,lb
Cauliflower,$ 3.29,each
Celery Bunch,$ 1.49,each
Collard Greens,$ 1.89,lb
