# Reading Local Data 
## Set Your Working Directory

In [None]:
# Make ./data (relative to the current directory) the working directory.
# NOTE(review): setwd() errors if ./data does not exist, and every relative
# path used afterwards (including "./data/...") is resolved from this new
# working directory -- confirm this is intended.
setwd("./data")

In [None]:
# Check for the data directory and create it when it is missing.
# dir.exists("directoryName") is TRUE only for an existing directory, whereas
# file.exists() is also TRUE for a regular file that happens to have that name.
if (!dir.exists("data")) {
    dir.create("data")
}

## Getting Data From The Internet 

In [None]:
# Download a file from the web.
fileUrl <- "https://data.baltimorecity.gov/api/views/dz54-2aru/rows.csv?accessType=DOWNLOAD"
# Add method = "curl" if you're using macOS.
download.file(fileUrl, destfile = "./data/cameras.csv")

# It's important to record when you downloaded the file,
# since files on the internet may be updated.
# date() returns the current date and time as a character string; data(),
# which was called here before, lists the available datasets instead.
dateDownload <- date()

## Reading Local Files

In [None]:
# read.table() needs several parameters spelled out for a CSV file:
# sep = "," for the field separator and header = TRUE for the column names.
# The parameter "quote" is the set of quoting characters used in the file;
# quote = "" disables quoting altogether. A wrong setting here is a common
# source of trouble.
cameraData <- read.table("./data/cameras.csv", sep = ",", header = TRUE, quote = "")
# Or use read.csv(), whose defaults are sep = "," and header = TRUE.
# This second assignment replaces the result of the read.table() call above.
cameraData <- read.csv("./data/cameras.csv")

head(cameraData)    # peek at first six rows

## Reading Excel Files

In [None]:
# Download the Excel version of the camera data.
fileUrl <- "https://data.baltimorecity.gov/api/views/dz54-2aru/rows.xlsx?accessType=DOWNLOAD"
# mode = "wb" writes the file in binary mode. Windows distinguishes text from
# binary files, so a binary .xlsx written in text mode can end up corrupted.
download.file(fileUrl, destfile = "./data/cameras.xlsx", mode = "wb")
# Record when the file was downloaded.
dateDownload <- date()

In [None]:
library(xlsx)    # or the XLConnect package
# Read only part of the first sheet: columns 2-3 of rows 1-4.
# header = TRUE is spelled out because T is an ordinary variable that can be
# reassigned.
cameraData <- read.xlsx("./data/cameras.xlsx", sheetIndex = 1,
                        header = TRUE, colIndex = 2:3, rowIndex = 1:4)

---
# Reading From MySQL
(to be continued)

---
# [data.table package](https://github.com/Rdatatable/data.table/wiki/Getting-started)
Written in C, so it is much faster at subsetting, grouping, and updating.

All functions that accept a data.frame also work on a data.table.

In [29]:
# A data.table is built with the same call shape as a data.frame.
# First the plain data frame: two random numeric columns and a label column.
DF <- data.frame(
    x = rnorm(9),
    y = rep(c("a", "b", "c"), each = 3),
    z = rnorm(9)
)
head(DF, 3)

library(data.table)
# Then the data.table counterpart with the same column layout.
DT <- data.table(
    x = rnorm(9),
    y = rep(c("a", "b", "c"), each = 3),
    z = rnorm(9)
)
head(DT, 3)

x,y,z
-0.97561181,a,-1.1324814
-0.07298295,a,-0.336479
-1.43020118,a,0.7429737


x,y,z
-0.5206083,a,-0.3558505
-0.2176224,a,-1.9773772
-0.181149,a,1.0182433


In [38]:
# Row subsetting
DT[2, ]           # the second row
DT[3:4, ]         # rows 3 and 4
DT[y == "a", ]    # every row whose y column equals "a"

x,y,z
-0.2176224,a,-1.977377


x,y,z
-0.18114895,a,1.018243
0.01633433,b,-1.334388


x,y,z
-0.5206083,a,-0.3558505
-0.2176224,a,-1.9773772
-0.181149,a,1.0182433


In [39]:
# subsetting columns
# Select the 2nd and 3rd columns by position.
DT[ , c(2, 3)]

y,z
a,-0.3558505
a,-1.9773772
a,1.0182433
b,-1.3343884
b,-0.582134
b,0.8413259
c,-0.1059262
c,-0.4242396
c,-0.3842031


In [42]:
# adding new columns
# := adds (or overwrites) a column of DT in place; w is the square of z.
DT[ , w := z^2]
# y was created as a character column, so assigning the number 2 gets coerced
# to character and data.table reports that coercion as a warning.
DT[ , y := 2]
DT

"Coerced 'double' RHS to 'character' to match the column's type; may have truncated precision. Either change the target column to 'double' first (by creating a new 'double' vector length 9 (nrows of entire table) and assign that; i.e. 'replace' column), or coerce RHS to 'character' (e.g. 1L, NA_[real|integer]_, as.*, etc) to make your intent clear and for speed. Or, set the column type correctly up front when you create the table and stick to it, please."

x,y,z,w
-0.52060827,2,-0.3558505,0.12662958
-0.21762236,2,-1.9773772,3.91002067
-0.18114895,2,1.0182433,1.03681934
0.01633433,2,-1.3343884,1.7805924
0.23565964,2,-0.582134,0.33888002
0.73513963,2,0.8413259,0.70782927
-0.08616578,2,-0.1059262,0.01122037
1.78127433,2,-0.4242396,0.17997922
-0.48811455,2,-0.3842031,0.14761203


In [44]:
# Several statements in j: wrap them in { }. The value of the last
# expression is what gets assigned to the new column m.
DT[, m := {
    total <- (x + z)
    log2(total + 5)
}]
DT

x,y,z,w,m
-0.52060827,2,-0.3558505,0.12662958,2.043884
-0.21762236,2,-1.9773772,3.91002067,1.488001
-0.18114895,2,1.0182433,1.03681934,2.54525
0.01633433,2,-1.3343884,1.7805924,1.880468
0.23565964,2,-0.582134,0.33888002,2.218324
0.73513963,2,0.8413259,0.70782927,2.717312
-0.08616578,2,-0.1059262,0.01122037,2.265409
1.78127433,2,-0.4242396,0.17997922,2.668354
-0.48811455,2,-0.3842031,0.14761203,2.045332


In [47]:
# plyr-like (split-apply-combine) operations
# Flag the rows whose x is positive ...
DT[, a := x > 0]
# ... then, within each group of that flag, store the group mean of x + m.
DT[, b := mean(x + m), by = "a"]

DT

x,y,z,w,m,a,b
-0.52060827,2,-0.3558505,0.12662958,2.043884,False,1.778843
-0.21762236,2,-1.9773772,3.91002067,1.488001,False,1.778843
-0.18114895,2,1.0182433,1.03681934,2.54525,False,1.778843
0.01633433,2,-1.3343884,1.7805924,1.880468,True,3.063217
0.23565964,2,-0.582134,0.33888002,2.218324,True,3.063217
0.73513963,2,0.8413259,0.70782927,2.717312,True,3.063217
-0.08616578,2,-0.1059262,0.01122037,2.265409,False,1.778843
1.78127433,2,-0.4242396,0.17997922,2.668354,True,3.063217
-0.48811455,2,-0.3842031,0.14761203,2.045332,False,1.778843


In [54]:
# special variables
set.seed(123)
DT <- data.table(x = sample(letters[1:3], size = 1E5, replace = TRUE))
head(DT)
## .N: an integer, length 1, containing the number of rows in each group
## (i.e. it counts how many times each value occurs in the variable)
### faster than table(DT$x)
DT[ , .N, by = x]
table(DT$x)

Your code contains a unicode char which cannot be displayed in your
current locale and R will silently convert it to an escaped form when the
R kernel executes this code. This can lead to subtle errors if you use
such chars to do comparisons. For more information, please see
https://github.com/IRkernel/repr/wiki/Problems-with-unicode-on-windows

x
a
c
b
c
c
a


x,N
a,33387
c,33201
b,33412



    a     b     c 
33387 33412 33201 

In [57]:
# keys
## ref: https://bit.ly/2GY2wVX
DT <- data.table(x = rep(c("a", "b", "c"), each = 100), y = rnorm(300))
## make x the key of DT so rows can be looked up by key value
setkey(DT, x)
## rows whose key x equals 'a'
DT['a']

## joins
DT1 <- data.table(x = c('a', 'b', 'c', 'dt1'), y = 1:4)
DT2 <- data.table(x = c('a', 'b', 'dt2'), z = 5:7)

## key both small tables on x
setkey(DT1, x); setkey(DT2, x)
## NOTE(review): this merges DT (the 300-row table), not the DT1 created
## above, with DT2 -- confirm whether merge(DT1, DT2) was intended
merge(DT, DT2)

x,y
a,0.17175499
a,-1.41402649
a,0.10359113
a,1.86508165
a,-0.78971334
a,-0.02787478
a,-0.12485269
a,0.40100076
a,1.23186934
a,-1.46367988


x,y,z
a,0.17175499,5
a,-1.41402649,5
a,0.10359113,5
a,1.86508165,5
a,-0.78971334,5
a,-0.02787478,5
a,-0.12485269,5
a,0.40100076,5
a,1.23186934,5
a,-1.46367988,5


In [61]:
# fast reading: time fread() against read.table() on the same file
big_df <- data.frame(x = rnorm(1E6), y = rnorm(1E6))
file <- tempfile()
# Write a tab-separated file with a header row, no row names and no quoting.
# TRUE/FALSE are spelled out because T and F are ordinary variables that can
# be reassigned.
write.table(big_df, file = file, row.names = FALSE, col.names = TRUE,
            sep = "\t", quote = FALSE)

# timing for data.table's fread()
system.time(fread(file))

# timing for base R's read.table() on the same file
system.time(read.table(file, header = TRUE, sep = "\t"))

Read 26.0% of 1000000 rowsRead 42.0% of 1000000 rowsRead 57.0% of 1000000 rowsRead 72.0% of 1000000 rowsRead 87.0% of 1000000 rowsRead 1000000 rows and 2 (of 2) columns from 0.035 GB file in 00:00:08


   user  system elapsed 
   6.87    0.08    7.71 

   user  system elapsed 
  17.96    0.54   21.92 

---
# Webscraping
(not familiar enough)
## Reading XML

In [9]:
library(XML)
library(RCurl)
fileUrl <- "https://www.w3schools.com/xml/simple.xml"
# Fetch the document text with getURL() and parse it into an internal-node
# tree. The argument name is written in full (useInternalNodes) rather than
# relying on partial matching of "useInternal", and TRUE replaces the
# reassignable shorthand T.
doc <- xmlTreeParse(getURL(fileUrl), useInternalNodes = TRUE)
rootNode <- xmlRoot(doc)

# get the name of the node
xmlName(rootNode)
# get the name of every element in this node
names(rootNode)

In [4]:
# directly access parts of the XML document
# [[1]] extracts the first child element of the root node
rootNode[[1]]

<food>
  <name>Belgian Waffles</name>
  <price>$5.95</price>
  <description>Two of our famous Belgian Waffles with plenty of real maple syrup</description>
  <calories>650</calories>
</food> 

In [5]:
# chain [[ ]] to go one level deeper: first child of the root's first child
rootNode[[1]][[1]]

<name>Belgian Waffles</name> 

In [6]:
# programmatically extract parts of the file
## The first argument is the XML object to work on; the second argument
## says which function to call on it
## xmlValue() can get the XML value of each element
xmlSApply(rootNode, xmlValue)

In [8]:
# XPath (yet another web-scraping rabbit hole)
## /node: Top level node
## //node: Node at any level
## node[@attr-name]: Node with an attribute name
## node[@attr-name='bob']: Node with attribute name attr-name='bob'

## for example, get the items on the menu and prices
xpathSApply(rootNode, "//name", xmlValue)
xpathSApply(rootNode, "//price", xmlValue)

Your code contains a unicode char which cannot be displayed in your
current locale and R will silently convert it to an escaped form when the
R kernel executes this code. This can lead to subtle errors if you use
such chars to do comparisons. For more information, please see
https://github.com/IRkernel/repr/wiki/Problems-with-unicode-on-windows

In [15]:
# Another example: scrape values out of an HTML page.
fileUrl <- "http://www.espn.com/nfl/team/_/name/bal/baltimore-ravens"
# Parse the fetched HTML into an internal-node tree. useInternalNodes is
# written in full instead of the partially matched "useInternal", and TRUE
# replaces the reassignable shorthand T.
doc <- htmlTreeParse(getURL(fileUrl), useInternalNodes = TRUE)
# Text of every <li> element whose class attribute is 'record' / 'team-name'.
records <- xpathSApply(doc, "//li[@class='record']", xmlValue)
teams <- xpathSApply(doc, "//li[@class='team-name']", xmlValue)

print(records)
print(teams)

[1] "9-7" "9-7"
[1] "Baltimore RavensRavens" "Baltimore RavensRavens"


## Reading JSON
JSON = JavaScript Object Notation

Often used with APIs.

In [16]:
library(jsonlite)
# fromJSON() is given a URL here; it parses the JSON found there
jsonData <- fromJSON("https://api.github.com/users/jtleek/repos")

# names of the top-level fields of the parsed result
names(jsonData)

In [19]:
# Nested JSON objects are reached by chaining field extractions.
# Field names of the nested "owner" object:
names(jsonData[["owner"]])

# The "login" field inside "owner":
jsonData[["owner"]][["login"]]

In [20]:
# writing data frames to JSON
## pretty = TRUE produces indented, human-readable output
## cat() to print out the .json
myjson <- toJSON(iris, pretty = TRUE)
cat(myjson)

[
  {
    "Sepal.Length": 5.1,
    "Sepal.Width": 3.5,
    "Petal.Length": 1.4,
    "Petal.Width": 0.2,
    "Species": "setosa"
  },
  {
    "Sepal.Length": 4.9,
    "Sepal.Width": 3,
    "Petal.Length": 1.4,
    "Petal.Width": 0.2,
    "Species": "setosa"
  },
  {
    "Sepal.Length": 4.7,
    "Sepal.Width": 3.2,
    "Petal.Length": 1.3,
    "Petal.Width": 0.2,
    "Species": "setosa"
  },
  {
    "Sepal.Length": 4.6,
    "Sepal.Width": 3.1,
    "Petal.Length": 1.5,
    "Petal.Width": 0.2,
    "Species": "setosa"
  },
  {
    "Sepal.Length": 5,
    "Sepal.Width": 3.6,
    "Petal.Length": 1.4,
    "Petal.Width": 0.2,
    "Species": "setosa"
  },
  {
    "Sepal.Length": 5.4,
    "Sepal.Width": 3.9,
    "Petal.Length": 1.7,
    "Petal.Width": 0.4,
    "Species": "setosa"
  },
  {
    "Sepal.Length": 4.6,
    "Sepal.Width": 3.4,
    "Petal.Length": 1.4,
    "Petal.Width": 0.3,
    "Species": "setosa"
  },
  {
    "Sepal.Length": 5,
    "Sepal.Width": 3.4,
    "Petal.Length": 1.5,
    "Peta

In [22]:
# convert back to data frame
# fromJSON() is given the JSON text held in myjson rather than a URL
iris2 <- fromJSON(myjson)
head(iris2)

Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
5.1,3.5,1.4,0.2,setosa
4.9,3.0,1.4,0.2,setosa
4.7,3.2,1.3,0.2,setosa
4.6,3.1,1.5,0.2,setosa
5.0,3.6,1.4,0.2,setosa
5.4,3.9,1.7,0.4,setosa


## API
(to be continued)