-
Notifications
You must be signed in to change notification settings - Fork 21
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
6e272e9
commit 9fa4a6f
Showing
7 changed files
with
837 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,397 @@ | ||
\documentclass{beamer} | ||
|
||
% load packages | ||
\usepackage{tikz} | ||
\usepackage{graphicx} | ||
\usepackage{upquote} | ||
\usepackage{listings} | ||
\usepackage{hyperref} | ||
\usepackage{color} | ||
\usepackage{lmodern} | ||
|
||
\input{../header.tex} | ||
|
||
\title[Getting data from the web with R]{\LARGE Getting Data from the Web with R} | ||
\subtitle[Web Data in R]{\large Part 1: Introduction} | ||
\author[gastonsanchez.com]{ | ||
\textcolor{gray}{\textbf{G}aston \textbf{S}anchez} | ||
} | ||
\institute[]{\scriptsize \textcolor{lightgray}{April-May 2014}} | ||
\date[CC BY-SA-NC 4.0]{ | ||
\textcolor{lightgrey}{\tiny{Content licensed under | ||
\href{http://creativecommons.org/licenses/by-nc-sa/4.0/}{CC BY-NC-SA 4.0}}} | ||
} | ||
|
||
|
||
\begin{document} | ||
|
||
<<setup, include=FALSE>>= | ||
# smaller font size for chunks | ||
opts_chunk$set(size = 'tiny') | ||
thm <- knit_theme$get("bclear") | ||
knit_theme$set(thm) | ||
options(width=78) | ||
@ | ||
|
||
|
||
%--- the titlepage frame -------------------------% | ||
|
||
\begin{frame}[plain] | ||
\titlepage | ||
\end{frame} | ||
|
||
%------------------------------------------------ | ||
|
||
{ % all template changes are local to this group. | ||
\setbeamertemplate{navigation symbols}{} | ||
\begin{frame}[plain] | ||
\begin{tikzpicture}[remember picture,overlay] | ||
\node[at=(current page.center)] { | ||
\includegraphics[height=\paperheight]{images/jaume_nit.jpg} | ||
}; | ||
\node[fill=black, opacity=0, text opacity=1] at (5.5,-3.8) {\large{ \color{white} Getting Data from the Web with R}}; | ||
\end{tikzpicture} | ||
\end{frame} | ||
} | ||
|
||
%------------------------------------------------ | ||
|
||
\begin{frame}[fragile] | ||
\frametitle{Readme} | ||
|
||
\begin{block}{\scriptsize License:} | ||
\tiny | ||
\begin{itemize} | ||
\item[] Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License \\ | ||
\url{http://creativecommons.org/licenses/by-nc-sa/4.0/}{} | ||
\end{itemize} | ||
\end{block} | ||
|
||
\begin{block}{\scriptsize You are free to:} | ||
\tiny | ||
\begin{itemize} | ||
\item[] \textcolor{darkgray}{\textbf{Share}} --- \textcolor{gray}{copy and redistribute the material} | ||
\item[] \textcolor{darkgray}{\textbf{Adapt}} --- \textcolor{gray}{rebuild and transform the material} | ||
\end{itemize} | ||
\end{block} | ||
|
||
\vspace{2mm} | ||
\begin{block}{\scriptsize Under the following conditions:} | ||
\tiny | ||
\begin{itemize} | ||
\item[] \textcolor{darkgray}{\textbf{Attribution}} --- \textcolor{gray}{You must give appropriate credit, provide a link to the license, and indicate if changes were made.} | ||
\item[] \textcolor{darkgray}{\textbf{NonCommercial}} --- \textcolor{gray}{You may not use this work for commercial purposes.} | ||
\item[] \textcolor{darkgray}{\textbf{Share Alike}} --- \textcolor{gray}{If you remix, transform, or build upon this | ||
work, you must distribute your contributions under the same license to this one.} | ||
\end{itemize} | ||
\end{block} | ||
|
||
\end{frame} | ||
|
||
%------------------------------------------------ | ||
|
||
\begin{frame} | ||
\frametitle{Lectures Menu} | ||
|
||
\begin{columns}[t] | ||
\begin{column}{0.1\textwidth} | ||
%--- empty space ---% | ||
\end{column} | ||
\begin{column}{0.8\textwidth} | ||
\begin{block}{Slide Decks} | ||
\begin{enumerate} | ||
\item \textbf{Introduction} | ||
\item \textcolor{lightgray}{Reading files from the Web} | ||
\item \textcolor{lightgray}{Basics of XML and HTML} | ||
\item \textcolor{lightgray}{Parsing XML / HTML content} | ||
\item \textcolor{lightgray}{Handling JSON data} | ||
\item \textcolor{lightgray}{HTTP Basics and the RCurl Package} | ||
\item \textcolor{lightgray}{Getting data via Web Forms} | ||
\item \textcolor{lightgray}{Getting data via Web APIs} | ||
\end{enumerate} | ||
\end{block} | ||
\end{column} | ||
\begin{column}{0.1\textwidth} | ||
%--- empty space ---% | ||
\end{column} | ||
\end{columns} | ||
|
||
\end{frame} | ||
|
||
%------------------------------------------------ | ||
|
||
\begin{frame} | ||
\frametitle{About these lectures} | ||
|
||
\begin{columns}[t] | ||
\begin{column}{0.1\textwidth} | ||
%--- empty space ---% | ||
\end{column} | ||
\begin{column}{0.8\textwidth} | ||
\begin{block}{Goal} | ||
My goal is \textbf{to give you an introduction} to some of the tools in R for getting data from the Web. | ||
|
||
\bigskip | ||
I don't pretend to cover everything nor going very deep. I just want to show you an overview of various Web Data scenarios you can handle with R. | ||
\end{block} | ||
\end{column} | ||
\begin{column}{0.1\textwidth} | ||
%--- empty space ---% | ||
\end{column} | ||
\end{columns} | ||
|
||
\end{frame} | ||
|
||
%------------------------------------------------ | ||
|
||
\begin{frame} | ||
\begin{center} | ||
\Huge{\textcolor{mandarina}{Preliminaries}} | ||
\end{center} | ||
\end{frame} | ||
|
||
%------------------------------------------------ | ||
|
||
\begin{frame} | ||
\frametitle{Requirements} | ||
|
||
\begin{block}{Must have:} | ||
\begin{itemize} | ||
\item Some experience working with R | ||
\item Some knowledge of HTML | ||
\item An insatiable curiosity for learning new things | ||
\end{itemize} | ||
\end{block} | ||
|
||
\begin{block}{Nice to have:} | ||
\begin{itemize} | ||
\item Knowledge about data storage formats | ||
\item Some programming experience | ||
\item Knowledge on how the Web works | ||
\end{itemize} | ||
\end{block} | ||
|
||
\end{frame} | ||
|
||
%------------------------------------------------ | ||
|
||
\begin{frame} | ||
\frametitle{Software} | ||
|
||
\begin{block}{You'll need:} | ||
\begin{itemize} | ||
\item R \textcolor{lightgray}{(preferably the last version)} \\ \url{http://cran.r-project.org/} | ||
\item RStudio \textcolor{lightgray}{(highly recommended)} \\ \url{https://www.rstudio.com/} | ||
\item Text Editor \\ \textcolor{lightgray}{(eg vim, emacs, TextWrangler, notepad, sublime text)} | ||
\item Web Browser \\ \textcolor{lightgray}{(eg Chrome, Safari, Firefox, Internet Explorer, Opera)} | ||
\item and a good internet connection! | ||
\end{itemize} | ||
\end{block} | ||
|
||
\end{frame} | ||
|
||
%------------------------------------------------ | ||
|
||
\begin{frame} | ||
\frametitle{In my case ...} | ||
|
||
\begin{block}{Software I used for these slides:} | ||
\begin{itemize} | ||
\item R version 3.1.0 (2014-04-10) -- "Spring Dance" | ||
\item Platform: x86\_64-apple-darwin10.8.0 (64-bit) | ||
\item IDE: RStudio Version 0.98.501 | ||
\item Text Editor: TextWrangler | ||
\item Web Browser: Google Chrome Version 34.0.1847.131 | ||
\item Operating System: OS-X Version 10.8.5 | ||
\end{itemize} | ||
\end{block} | ||
|
||
\end{frame} | ||
|
||
%------------------------------------------------ | ||
|
||
\begin{frame} | ||
\frametitle{Resources} | ||
|
||
\begin{block}{Some R Books} | ||
\begin{itemize} | ||
\item XML and Web Technologies for Data Sciences with R \\ | ||
\low{by Deb Nolan and Duncan Temple Lang} | ||
\item Introduction to Data Technologies \\ | ||
\low{by Duncan Murdoch} | ||
\item Data Manipulation with R \\ | ||
\low{by Phil Spector} | ||
\item more references in each slide deck | ||
\end{itemize} | ||
\end{block} | ||
|
||
\end{frame} | ||
|
||
%------------------------------------------------ | ||
|
||
\begin{frame} | ||
\frametitle{Resources} | ||
|
||
\begin{block}{Web Scraping with R} | ||
\begin{itemize} | ||
\item Web scraping for the humanities and social sciences \\ | ||
\low{(by Rolf Fredheim and Aiora Zabala)} \\ | ||
{\tiny \url{http://quantifyingmemory.blogspot.co.uk/2014/02/web-scraping-basics.html}} | ||
\item Web Scraping with R \low{(by Xian Nan)} \\ | ||
{\tiny \url{http://cos.name/wp-content/uploads/2013/05/Web-Scraping-with-R-XiaoNan.pdf}} | ||
\item R-bloggers posts on \textit{Web Scraping} \\ | ||
{\tiny \url{http://www.r-bloggers.com/?s=web+scraping}} | ||
\end{itemize} | ||
\end{block} | ||
|
||
\end{frame} | ||
|
||
%------------------------------------------------ | ||
|
||
\begin{frame} | ||
\frametitle{Some R Packages} | ||
|
||
\begin{center} | ||
\begin{tabular}{l l} | ||
\hline | ||
Package & Description \\ | ||
\hline | ||
\highcode{RCurl} & R interface to the \code{libcurl} library \\ | ||
& for making general HTTP requests \\ | ||
\highcode{RHTMLForms} & Tools to process Web/HTML forms \\ | ||
\highcode{XML} & Tools for parsing XML and HTML documents \\ | ||
& and working with structured data from the Web \\ | ||
\highcode{RJSONIO} & Functions for handling JSON data \\ | ||
\highcode{jsonlite} & Functions for handling JSON data \\ | ||
\highcode{rjson} & Functions for handling JSON data \\ | ||
\highcode{ROAuth} & Interface for authentication via OAuth 1.0 \\ | ||
\highcode{SSOAP} & Use SOAP protocol to retrieve data \\ | ||
\hline | ||
\end{tabular} | ||
\end{center} | ||
|
||
CRAN Task View: \textit{Web Technologies and Services} \\ | ||
{\scriptsize \url{http://cran.r-project.org/web/views/WebTechnologies.html}} | ||
|
||
\end{frame} | ||
|
||
%------------------------------------------------ | ||
|
||
{ % all template changes are local to this group. | ||
\setbeamertemplate{navigation symbols}{} | ||
\begin{frame}[plain] | ||
\begin{tikzpicture}[remember picture,overlay] | ||
\node[at=(current page.center)] { | ||
\includegraphics[width=\paperwidth]{images/spider_web.jpg} | ||
}; | ||
\node[fill=black, opacity=0, text opacity=1] at (7.5,-2.8) {\Huge{ \color{white} The Web}}; | ||
\end{tikzpicture} | ||
\end{frame} | ||
} | ||
|
||
%------------------------------------------------ | ||
|
||
\begin{frame} | ||
\frametitle{VIP Questions} | ||
|
||
\begin{columns}[t] | ||
\begin{column}{0.1\textwidth} | ||
%--- empty space ---% | ||
\end{column} | ||
\begin{column}{0.8\textwidth} | ||
\begin{block}{Very Important Preliminary Questions} | ||
The Data that you want: | ||
\begin{enumerate} | ||
\item Where is it located? | ||
\item How accessible is it? | ||
\item What is its structure / format? | ||
\end{enumerate} | ||
\end{block} | ||
\end{column} | ||
\begin{column}{0.1\textwidth} | ||
%--- empty space ---% | ||
\end{column} | ||
\end{columns} | ||
|
||
\end{frame} | ||
|
||
%------------------------------------------------ | ||
|
||
\begin{frame} | ||
\frametitle{VIP Questions} | ||
|
||
\begin{block}{Location of Data} | ||
\begin{itemize} | ||
\item Do you know the location (URL) beforehand? | ||
\item[] Or do you have to figure it out? \\ | ||
\item Is it in one single specific place? \\ | ||
\low{(eg one HTML table, one file in the Web)} | ||
\item Is it in one website but spread across several pages? \\ | ||
\low{(eg several HTML tables at different pages)} | ||
\item Is it spread across several websites? \\ | ||
\low{(eg multiple pieces of information in various sites)} | ||
\item Is it in one or several databases? | ||
\end{itemize} | ||
\end{block} | ||
|
||
\end{frame} | ||
|
||
%------------------------------------------------ | ||
|
||
\begin{frame} | ||
\frametitle{VIP Questions} | ||
|
||
\begin{block}{Accessibility of Data} | ||
\begin{itemize} | ||
\item Do you have free direct immediate access to data? | ||
\item Do you need to fill a Web Form? | ||
\item Do you need to use a Web API? | ||
\item Do you require username, password, authentication? | ||
\item Do you need to use a specifc transfer protocol? | ||
\item Do you need to use a specifc type/method of request? | ||
\end{itemize} | ||
\end{block} | ||
|
||
\end{frame} | ||
|
||
|
||
%------------------------------------------------ | ||
|
||
\begin{frame} | ||
\frametitle{VIP Questions} | ||
|
||
\begin{block}{Format / Structure of Data} | ||
\begin{itemize} | ||
\item Is it plain text? \\ | ||
\item Is it in tabular \low{(spreadsheet-like)} form? \\ | ||
\item Is it in HTML? \\ | ||
\item Is it in some XML-dialect? | ||
\item Is it in JSON format? | ||
\item Other formats: binary, images, maps, etc? | ||
\end{itemize} | ||
\end{block} | ||
|
||
\end{frame} | ||
|
||
%------------------------------------------------ | ||
|
||
\begin{frame} | ||
\frametitle{Glossary} | ||
|
||
\begin{block}{Some Acronyms} | ||
\begin{itemize} | ||
\item \textbf{WWW} World Wide Web | ||
\item \textbf{W3C} World Wide Web Consortium | ||
\item \textbf{URL} Uniform Resource Locator | ||
\item \textbf{HTTP} HyperText Transfer Protocol | ||
\item \textbf{XML} Extensible Markup Language | ||
\item \textbf{HTML} HyperText Markup Language | ||
\item \textbf{JSON} JavaScript Object Notation | ||
\end{itemize} | ||
\end{block} | ||
|
||
\end{frame} | ||
|
||
%------------------------------------------------ | ||
|
||
\end{document} |
Binary file not shown.
Oops, something went wrong.