jerbs

The goal of jerbs is to politely scrape occupation data from the Department of Employment's Job Outlook website.

Installation

You can install the released version of jerbs from CRAN with:

install.packages("jerbs")

Code for creating the function

library(polite)
library(tidyverse)
library(rvest)

# Read the A-Z index page that lists every occupation
joboutlook <- read_html("https://joboutlook.gov.au/A-Z.aspx")

# Links to the individual occupation pages
links_occupation <- html_nodes(joboutlook, "div#occupations a")

# Relative URLs for each occupation page
links_occupation_clean <- html_attr(links_occupation, "href")

# Just the occupation names from the site
links_occupation_clean_text <- html_text(links_occupation)

## Scraping individual sites

# Build the full URLs and read the first one as a test
all_links_complete <- paste0("https://joboutlook.gov.au/", links_occupation_clean)
job <- read_html(all_links_complete[1])

Then we create a function so we can scrape each occupation page politely:

library(purrr)

get_occupation <- function(url, occupation, bow) {
  # Politely agree the new path with the host, then scrape it
  bow <- nod(bow, url)
  job <- scrape(bow)
  # Pull the "fast facts" values and their titles
  value <- html_text(html_nodes(job, "section.fast-facts li span.snapshot-data"))
  title <- html_text(html_nodes(job, "section.fast-facts li span.snapshot-title"))
  # Keep only the first part of each title, then strip whitespace characters
  title <- strsplit(title, "\\r")
  title <- map_chr(title, 1)
  title <- gsub("\\r|\\n|\\t", "", title)
  value <- gsub("\\r|\\n|\\t", "", value)
  value <- trimws(value)
  title <- trimws(title)
  out <- tibble(occupation = occupation, title = title, value = value)
  return(out)
}


# Introduce ourselves to the host once, then scrape every occupation page
bow_run <- bow("https://joboutlook.gov.au/", delay = 5)
out_jobs <- purrr::map2_dfr(links_occupation_clean, links_occupation_clean_text, get_occupation, bow_run)
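
A quick way to spot-check the scraper is to run it on a single occupation and then peek at the combined long-format result (a sketch using the objects defined above):

# Scrape one occupation to check the output shape
get_occupation(links_occupation_clean[1], links_occupation_clean_text[1], bow_run)

# Peek at the combined result
head(out_jobs)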

Cleaning the data (mostly tedious code)

# Reshape from long to wide: one row per occupation, one column per fast fact
out_test_spread <- out_jobs %>% spread(title, value)
out_test_spread <- out_test_spread %>% dplyr::rename_all(make.names)

# Strip the units and labels from the values
out_test_spread$Average.age <- gsub(" years", "", out_test_spread$Average.age)
out_test_spread$Employment.Size <- gsub("workers", "", out_test_spread$Employment.Size)
out_test_spread$Full.Time.Share <- gsub("% Full-Time", "", out_test_spread$Full.Time.Share)
out_test_spread$Gender.Share <- gsub("% female", "", out_test_spread$Gender.Share)

out_test_spread <- out_test_spread %>%
  mutate(Weekly.Pay = recode(Weekly.Pay, "Unavailable" = NA_character_))

out_test_spread$Employment.Size <- trimws(out_test_spread$Employment.Size)

# "$" is a regex metacharacter, so escape it when stripping the dollar sign
out_test_spread$Weekly.Pay <- gsub("\\$ ", "", out_test_spread$Weekly.Pay)

# Convert everything that should be numeric
out_test_spread_clean <- out_test_spread %>%
  mutate(Gender.Share = as.numeric(Gender.Share),
         Full.Time.Share = as.numeric(Full.Time.Share),
         Average.age = as.numeric(Average.age),
         Average.full.time = as.numeric(Average.full.time),
         Employment.Size = as.numeric(Employment.Size),
         Weekly.Pay = as.numeric(Weekly.Pay))
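
Coercing to numeric turns anything that did not clean up properly into NA, so it is worth a quick check (a sketch; the counts depend on what the site returned):

# How many occupations have no usable weekly pay figure?
sum(is.na(out_test_spread_clean$Weekly.Pay))

# Quick look at the cleaned table
dplyr::glimpse(out_test_spread_clean)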

Bringing the ANZSCO codes back in (because of course)

# The occupation code is the run of digits in each occupation's URL
d_new <- regmatches(links_occupation_clean, regexpr("(\\d+)", links_occupation_clean))

# This assumes the spread data and the links are in the same (alphabetical) order
with_occupation_code_new <- bind_cols(out_test_spread_clean, as.data.frame(d_new))
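
As a final sanity check of the combined data, you could look at the highest-paying occupations (a sketch; it assumes Weekly.Pay cleaned up as intended above):

with_occupation_code_new %>%
  dplyr::arrange(dplyr::desc(Weekly.Pay)) %>%
  dplyr::select(occupation, d_new, Weekly.Pay) %>%
  head(10)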