Additional snippets

chrisalbon · Nov 2, 2013 · 020e47b · 020e47b
1 parent 905eeaf
commit 020e47b
Show file tree

Hide file tree

Showing 5 changed files with 93 additions and 0 deletions.
diff --git a/cleaning-gender.r b/cleaning-gender.r
@@ -0,0 +1,19 @@
+# Cleaning Up Gender
+# Original source: Learning R
+
+# create some messy fake gender data
+gender <- c("MALE", "Male", "male", "M", "FEMALE", "Female", "female", "f", NA)
+
+# find strings that start with m and optionally end in ale (case-insensitive); stringr:: is spelled out because this file never loads stringr
+clean_gender <- stringr::str_replace(
+  gender,
+  stringr::regex("^m(ale)?$", ignore_case = TRUE),  # replaces the deprecated ignore.case()
+  "Male"
+)
+
+# find strings that start with f and optionally end in emale (case-insensitive)
+clean_gender <- stringr::str_replace(
+  clean_gender,
+  stringr::regex("^f(emale)?$", ignore_case = TRUE),
+  "Female"
+)
diff --git a/cleaning-strings.r b/cleaning-strings.r
@@ -0,0 +1,31 @@
+# Cleaning Strings
+# Original source: Learning R
+
+# R includes grep, grepl, regexpr, sub, and gsub to handle strings. However, they can be clunky, so the stringr package provides a "UI" for these functions to make working with them easier.
+
+# Load the learningr package
+library(learningr)
+
+# Load the english_monarchs data from the learningr package
+data(english_monarchs, package = "learningr")
+
+# Load the stringr library
+library(stringr)
+
+# detect commas in the domain variable, meaning that during that time a monarch had multiple territories (domains); the trailing expression prints the result
+multiple_kingdoms <- str_detect(english_monarchs$domain, fixed(",")); multiple_kingdoms
+
+# show the name and domain columns for the rows where multiple_kingdoms is TRUE
+english_monarchs[multiple_kingdoms, c("name", "domain")]
+
+# detect either a comma or an "and" in the name variable, meaning that a domain had multiple rulers. NOTE(review): the pattern has no surrounding spaces, so it also matches "and" inside a longer word -- confirm this is intended
+multiple_rulers <- str_detect(english_monarchs$name, ",|and")
+
+# show the names where multiple_rulers is TRUE, skipping entries where it is NA
+english_monarchs$name[multiple_rulers & !is.na(multiple_rulers)]
+
+# since individual rulers are separated by ", " or " and ", we can split on those separators. The output is a list.
+individual_rulers <- str_split(english_monarchs$name, ", | and ")
+
+# take a look at the first entries that contain more than one ruler
+head(individual_rulers[sapply(individual_rulers, length) > 1])
diff --git a/managing-data-frames.r b/managing-data-frames.r
@@ -0,0 +1,2 @@
+# Managing Data Frames
+
diff --git a/scraping-webpages.r b/scraping-webpages.r
@@ -1,3 +1,29 @@
 # Scraping Web Pages
 # Original source: Learning R
 
+# Load the RCurl library
+library(RCurl)
+
+# Create a string with the URL of the web page
+time_url <- "http://tycho.usno.navy.mil/cgi-bin/timer.pl"
+
+# Download the HTML
+time_page <- getURL(time_url)
+
+# Use cat to print the raw HTML in a readable way
+cat(time_page)
+
+# Load the XML library
+library(XML)
+
+# parse the HTML; the trailing expression prints the parsed document
+time_doc <- htmlParse(time_page); time_doc
+
+# extract the first "pre" element. The // means the entire document is searched. The [[1]] takes the first element of the result itself rather than the list that holds it.
+pre <- xpathSApply(time_doc, "//pre")[[1]]
+
+# split the text of pre along newlines (\n); [[1]] unwraps the list strsplit returns and [-1] drops the first piece
+values <- strsplit(xmlValue(pre), "\n")[[1]][-1]
+
+# split along runs of tabs (\t+), dividing each entry into the time and the timezone
+times <- strsplit(values, "\t+")
diff --git a/strings-to-logical.r b/strings-to-logical.r
@@ -0,0 +1,15 @@
+# Converting Strings To Logical
+
+# Create a character vector of "Y" and "N" elements
+answers <- c("Y", "Y", "N", "Y", "N")
+
+# write a function that converts "Y" to TRUE and "N" to FALSE
+yn_to_logical <- function(x) {
+  flags <- rep.int(NA, length(x))  # anything that is neither "Y" nor "N" stays NA
+  flags[x == "N"] <- FALSE
+  flags[x == "Y"] <- TRUE
+  flags  # logical vector the same length as x
+}
+
+# run the function on the data
+yn_to_logical(answers)