Skip to content

Commit

Permalink
Refactored parse_cjsg()
Browse files Browse the repository at this point in the history
  • Loading branch information
clente committed Feb 14, 2018
1 parent 3c5271c commit 4facbd3
Show file tree
Hide file tree
Showing 5 changed files with 168 additions and 109 deletions.
98 changes: 0 additions & 98 deletions R/parse-cjsg.R

This file was deleted.

121 changes: 121 additions & 0 deletions R/parse_cjsg.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
#' Parse lawsuits extracted from CJSG query
#'
#' Reads every file in `file`, parses each result page and stacks the
#' lawsuits found into a single table. With `cores = 1` the files are
#' processed sequentially and a progress bar is shown; with any other value
#' the work is handed to [parallel::mclapply()] and no progress bar is shown.
#'
#' @param file Character vector with the paths to one or more files
#' @param cores Number of cores to use when parsing
#'
#' @return A tibble with the columns
#' \itemize{
#'   \item `file` Path of the file the row was parsed from
#'   \item `id_page` ID found in the page
#'   \item `id_decision` Unique ID of the ruling
#'   \item `id_lawsuit` Number of the lawsuit (doesn't have to be unique)
#'   \item `class_subject` Class/subject, separated by slashes
#'   \item `district` Name of the district
#'   \item `court` Body responsible for the appeal
#'   \item `date_decision` Date of the judgement (\%d/\%m/\%Y)
#'   \item `date_publication` Date of the publication (\%d/\%m/\%Y)
#'   \item `date_registration` Date of registration in the system (\%d/\%m/\%Y)
#'   \item `rapporteur` Name of the rapporteur
#'   \item `summary` Summary of the ruling
#'   \item `txt_summary` Text of the summary with no formatting
#' }
#' @export
parse_cjsg <- function(file, cores = 1) {

  # Name every path after itself so that `.id` can record the origin file
  names(file) <- file

  # Parallel path: no progress bar, results are stacked afterwards
  if (cores != 1) {
    parsed <- parallel::mclapply(file, parse_cjsg_, mc.cores = cores)
    return(dplyr::bind_rows(parsed, .id = "file"))
  }

  # Sequential path: tick a progress bar once per parsed file
  pb <- progress::progress_bar$new(total = length(file))
  purrr::map_dfr(file, parse_cjsg_, pb, .id = "file")
}

#' Parse a page of CJSG results
#'
#' Reads one HTML file as UTF-8, locates every `.fundocinza1` node in it and
#' parses each of them with [parse_cjsg_lawsuit()]. A node that fails to parse
#' contributes an empty tibble (the error is reported, not raised).
#'
#' @param file The path to the file to be parsed
#' @param pb Progress bar created by [parse_cjsg()]; ticked once when the
#'   page has been parsed. Use `NULL` to skip progress reporting.
#' @return A tibble with the parsed information
parse_cjsg_ <- function(file, pb = NULL) {

  # Wrapper that reports a failure and falls back to an empty tibble
  safe_parse <- purrr::possibly(
    parse_cjsg_lawsuit,
    otherwise = tibble::tibble(),
    quiet = FALSE
  )

  # Each `.fundocinza1` node is parsed on its own and the rows are stacked
  page <- xml2::read_html(file, encoding = "UTF-8")
  nodes <- rvest::html_nodes(page, ".fundocinza1")
  lawsuits <- purrr::map_dfr(nodes, safe_parse)

  if (!is.null(pb)) {
    pb$tick()
  }

  lawsuits
}

#' Parse one lawsuit from a CJSG page
#'
#' Collects the lawsuit number and decision ID from the `.downloadEmenta`
#' element, the page ID from `.ementaClass`, the class/subject line from
#' `.assuntoClasse`, the plain-text summary from the `textarea`, and every
#' `key: value` field found in the `.ementaClass2` elements. Keys are
#' normalised (accents removed, lower case, spaces turned into underscores,
#' `_de_`/`_do_` collapsed) and spread into columns.
#'
#' The fields `ementa` and `data_publicacao` are optional and filled with
#' `NA` when absent. Any other field selected at the end (`comarca`,
#' `orgao_julgador`, `data_julgamento`, `data_registro`, `relatora`) must be
#' present in the node, otherwise the final selection raises an error.
#'
#' @param node A `.fundocinza1` node extracted from the page
#' @return One row with the data concerning the lawsuit, with columns
#'   `id_page`, `id_decision`, `id_lawsuit`, `class_subject`, `district`,
#'   `court`, `date_decision`, `date_publication`, `date_registration`,
#'   `rapporteur`, `summary` and `txt_summary`
parse_cjsg_lawsuit <- function(node) {

  # Auxiliary function to fill in optional columns missing from the table
  fill_in_columns <- function(data) {
    if (!tibble::has_name(data, "ementa")) {
      data <- dplyr::mutate(data, ementa = NA_character_)
    }
    if (!tibble::has_name(data, "data_publicacao")) {
      data <- dplyr::mutate(data, data_publicacao = NA_character_)
    }
    data
  }

  # Lawsuit number (element text) and decision ID (`cdacordao` attribute)
  tmp <- rvest::html_node(node, ".downloadEmenta")
  infos <- tibble::tibble(
    id_lawsuit = stringr::str_trim(rvest::html_text(tmp)),
    id_decision = rvest::html_attr(tmp, "cdacordao")
  )

  # Page ID: keep only the digits of the `.ementaClass` text
  id <- node %>%
    rvest::html_node(".ementaClass") %>%
    rvest::html_text() %>%
    stringr::str_trim() %>%
    stringr::str_replace_all("[^0-9]", "")

  # Class/subject line
  cs <- node %>%
    rvest::html_node(".assuntoClasse") %>%
    rvest::html_text() %>%
    stringr::str_trim()

  # Summary text as found in the textarea
  ts <- node %>%
    rvest::html_node("textarea") %>%
    rvest::html_text()

  # Split every field at the first colon only, so values may contain colons
  fields <- node %>%
    rvest::html_nodes(".ementaClass2") %>%
    rvest::html_text() %>%
    stringr::str_split_fixed(":", 2)

  # Name the columns before converting: as_tibble() on a matrix without
  # column names is deprecated since tibble 2.0.0 and emits a warning
  colnames(fields) <- c("key", "val")

  # Create final table
  fields %>%
    tibble::as_tibble() %>%
    dplyr::mutate(
      key = key %>%
        stringr::str_trim() %>%
        abjutils::rm_accent() %>%
        stringr::str_to_lower() %>%
        stringr::str_replace_all(" +", "_") %>%
        stringr::str_replace_all("[^a-z_]", "") %>%
        stringr::str_replace_all("_d[eo]_", "_"),
      val = stringr::str_trim(val)
    ) %>%
    tidyr::spread(key, val) %>%
    dplyr::bind_cols(infos) %>%
    fill_in_columns() %>%
    dplyr::mutate(id_page = id, class_subject = cs, txt_summary = ts) %>%
    dplyr::select(
      id_page, id_decision, id_lawsuit, class_subject,
      district = comarca, court = orgao_julgador,
      date_decision = data_julgamento, date_publication = data_publicacao,
      date_registration = data_registro, rapporteur = relatora,
      summary = ementa, txt_summary
    )
}
22 changes: 11 additions & 11 deletions man/parse_cjsg.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

19 changes: 19 additions & 0 deletions man/parse_cjsg_.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

17 changes: 17 additions & 0 deletions man/parse_cjsg_lawsuit.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 4facbd3

Please sign in to comment.