From 020e47b0fe91f1db8cb1c6b32bcdbbe6093d57a1 Mon Sep 17 00:00:00 2001
From: "Chris R. Albon"
Date: Sat, 2 Nov 2013 14:57:07 -0400
Subject: [PATCH] Additional snippets

---
 cleaning-gender.r      | 23 +++++++++++++++++++++++
 cleaning-strings.r     | 39 +++++++++++++++++++++++++++++++++++++++
 managing-data-frames.r |  2 ++
 scraping-webpages.r    | 36 ++++++++++++++++++++++++++++++++++++
 strings-to-logical.r   | 16 ++++++++++++++++
 5 files changed, 116 insertions(+)
 create mode 100644 cleaning-gender.r
 create mode 100644 cleaning-strings.r
 create mode 100644 managing-data-frames.r
 create mode 100644 strings-to-logical.r

diff --git a/cleaning-gender.r b/cleaning-gender.r
new file mode 100644
index 0000000..bc8f209
--- /dev/null
+++ b/cleaning-gender.r
@@ -0,0 +1,23 @@
+# Cleaning Up Gender
+# Original source: Learning R
+
+# Load the stringr library, which provides str_replace() and regex()
+library(stringr)
+
+# create some messy fake gender data
+gender <- c("MALE", "Male", "male", "M", "FEMALE", "Female", "female", "f", NA)
+
+# find strings that start with m and optionally end in ale, ignoring case.
+# regex(..., ignore_case = TRUE) replaces the deprecated ignore.case() modifier.
+clean_gender <- str_replace(
+  gender,
+  regex("^m(ale)?$", ignore_case = TRUE),
+  "Male"
+)
+
+# find strings that start with f and optionally end in emale, ignoring case
+clean_gender <- str_replace(
+  clean_gender,
+  regex("^f(emale)?$", ignore_case = TRUE),
+  "Female"
+)
\ No newline at end of file
diff --git a/cleaning-strings.r b/cleaning-strings.r
new file mode 100644
index 0000000..e989c8b
--- /dev/null
+++ b/cleaning-strings.r
@@ -0,0 +1,39 @@
+# Cleaning Strings
+# Original source: Learning R
+
+# R includes grep, grepl, regexpr, sub, and gsub to handle strings. However,
+# they can be clunky, so the stringr package provides a consistent interface
+# to these functions that makes working with them easier.
+
+# Load LearningR package
+library(learningr)
+
+# Load english_monarchs data from the LearningR package
+data(english_monarchs, package = "learningr")
+
+# Load Stringr Library
+library(stringr)
+
+# detect commas in the domain variable, meaning that during that time a
+# monarch had multiple territories (domains)
+multiple_kingdoms <- str_detect(english_monarchs$domain, fixed(","))
+multiple_kingdoms
+
+# index domains where multiple_kingdoms is true. Show name and domain columns
+# for those rows where it is true.
+english_monarchs[multiple_kingdoms, c("name", "domain")]
+
+# detect either a comma or a standalone " and " in the name variable, meaning
+# that a domain had multiple rulers. Requiring the spaces around "and" keeps
+# names that merely contain the letters "and" from being flagged.
+multiple_rulers <- str_detect(english_monarchs$name, ",| and ")
+
+# index names where multiple_rulers is true and the data isn't missing
+english_monarchs$name[multiple_rulers & !is.na(multiple_rulers)]
+
+# since individual rulers are separated by a comma or an "and", we can split
+# them up. The output is a list.
+individual_rulers <- str_split(english_monarchs$name, ", | and ")
+
+# take a look at the entries that hold more than one ruler
+head(individual_rulers[lengths(individual_rulers) > 1])
\ No newline at end of file
diff --git a/managing-data-frames.r b/managing-data-frames.r
new file mode 100644
index 0000000..04a8162
--- /dev/null
+++ b/managing-data-frames.r
@@ -0,0 +1,2 @@
+# Managing Data Frames
+
diff --git a/scraping-webpages.r b/scraping-webpages.r
index 82501de..1fa2362 100644
--- a/scraping-webpages.r
+++ b/scraping-webpages.r
@@ -1,3 +1,39 @@
 # Scraping Web Pages
 # Original source: Learning R
 
+# Load RCurl Library
+library(RCurl)
+
+# Create a string with the URL to the website
+time_url <- "http://tycho.usno.navy.mil/cgi-bin/timer.pl"
+
+# Download the HTML
+time_page <- getURL(time_url)
+
+# Use concatenate and view the html in a pretty way
+cat(time_page)
+
+# load XML library
+library(XML)
+
+# parse the HTML
+time_doc <- htmlParse(time_page)
+time_doc
+
+# extract every "pre" node. The // means we search the entire document.
+pre_nodes <- xpathSApply(time_doc, "//pre")
+
+# stop with a clear message if the page has no "pre" tag, rather than
+# failing with "subscript out of bounds" on the [[1]] below
+if (length(pre_nodes) == 0) {
+  stop("No <pre> element found in ", time_url, call. = FALSE)
+}
+
+# [[1]] stores the first node itself in pre, not a list containing it
+pre <- pre_nodes[[1]]
+
+# split along newline \n to get one element per time; [-1] drops the first
+values <- strsplit(xmlValue(pre), "\n")[[1]][-1]
+
+# split along runs of tabs \t+ to divide each entry into time and timezone
+times <- strsplit(values, "\t+")
\ No newline at end of file
diff --git a/strings-to-logical.r b/strings-to-logical.r
new file mode 100644
index 0000000..472d4a1
--- /dev/null
+++ b/strings-to-logical.r
@@ -0,0 +1,16 @@
+# Converting Strings To Logical
+
+# Create a character vector with Y and N string elements
+answers <- c("Y", "Y", "N", "Y", "N")
+
+# write a function that converts "Y" to TRUE and "N" to FALSE.
+# Any other value, including NA, becomes NA.
+yn_to_logical <- function(x) {
+  y <- rep(NA, length(x))
+  y[x %in% "Y"] <- TRUE
+  y[x %in% "N"] <- FALSE
+  y
+}
+
+# run the function on the data
+yn_to_logical(answers)
\ No newline at end of file