# Introduction to dplyr and tbls

## Section 1 - Introduction to dplyr

* dplyr: a grammar of data manipulation
* tbl: a data structure
* %>%: the pipe operator, for chaining operations into a pipeline

In [4]:
# Attach dplyr (the data manipulation grammar used throughout)
library(dplyr)

# Attach hflights, the package that supplies the hflights data set
library(hflights)

# Preview the first six rows, then print per-column summary statistics
head(hflights, n = 6L)
summary(hflights)

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,ArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,...,ArrDelay,DepDelay,Origin,Dest,Distance,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted
5424,2011,1,1,6,1400,1500,AA,428,N576AA,60,...,-10,0,IAH,DFW,224,7,13,0,,0
5425,2011,1,2,7,1401,1501,AA,428,N557AA,60,...,-9,1,IAH,DFW,224,6,9,0,,0
5426,2011,1,3,1,1352,1502,AA,428,N541AA,70,...,-8,-8,IAH,DFW,224,5,17,0,,0
5427,2011,1,4,2,1403,1513,AA,428,N403AA,70,...,3,3,IAH,DFW,224,9,22,0,,0
5428,2011,1,5,3,1405,1507,AA,428,N492AA,62,...,-3,5,IAH,DFW,224,9,9,0,,0
5429,2011,1,6,4,1359,1503,AA,428,N262AA,64,...,-7,-1,IAH,DFW,224,6,13,0,,0


      Year          Month          DayofMonth      DayOfWeek        DepTime    
 Min.   :2011   Min.   : 1.000   Min.   : 1.00   Min.   :1.000   Min.   :   1  
 1st Qu.:2011   1st Qu.: 4.000   1st Qu.: 8.00   1st Qu.:2.000   1st Qu.:1021  
 Median :2011   Median : 7.000   Median :16.00   Median :4.000   Median :1416  
 Mean   :2011   Mean   : 6.514   Mean   :15.74   Mean   :3.948   Mean   :1396  
 3rd Qu.:2011   3rd Qu.: 9.000   3rd Qu.:23.00   3rd Qu.:6.000   3rd Qu.:1801  
 Max.   :2011   Max.   :12.000   Max.   :31.00   Max.   :7.000   Max.   :2400  
                                                                 NA's   :2905  
    ArrTime     UniqueCarrier        FlightNum      TailNum         
 Min.   :   1   Length:227496      Min.   :   1   Length:227496     
 1st Qu.:1215   Class :character   1st Qu.: 855   Class :character  
 Median :1617   Mode  :character   Median :1696   Mode  :character  
 Mean   :1578                      Mean   :1962                     
 3rd Qu.:1953  

## Section 2 - tbl, a special type of data.frame

### tbl

* tbl: A special type of data frame
* glimpse: A summarizing function similar to summary

### Convert data.frame to tibble

In [6]:
# Both the dplyr and hflights packages are loaded

# Convert the hflights data.frame into a tibble.
# as_tibble() (re-exported by dplyr) replaces tbl_df(), which is deprecated
# since dplyr 1.0.0.
hflights <- as_tibble(hflights)

# Display the hflights tibble
hflights

# Create the object carriers: `[[` extracts the UniqueCarrier column itself
# rather than a one-column table
carriers <- hflights[["UniqueCarrier"]]

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,ArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,...,ArrDelay,DepDelay,Origin,Dest,Distance,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted
5424,2011,1,1,6,1400,1500,AA,428,N576AA,60,...,-10,0,IAH,DFW,224,7,13,0,,0
5425,2011,1,2,7,1401,1501,AA,428,N557AA,60,...,-9,1,IAH,DFW,224,6,9,0,,0
5426,2011,1,3,1,1352,1502,AA,428,N541AA,70,...,-8,-8,IAH,DFW,224,5,17,0,,0
5427,2011,1,4,2,1403,1513,AA,428,N403AA,70,...,3,3,IAH,DFW,224,9,22,0,,0
5428,2011,1,5,3,1405,1507,AA,428,N492AA,62,...,-3,5,IAH,DFW,224,9,9,0,,0
5429,2011,1,6,4,1359,1503,AA,428,N262AA,64,...,-7,-1,IAH,DFW,224,6,13,0,,0
5430,2011,1,7,5,1359,1509,AA,428,N493AA,70,...,-1,-1,IAH,DFW,224,12,15,0,,0
5431,2011,1,8,6,1355,1454,AA,428,N477AA,59,...,-16,-5,IAH,DFW,224,7,12,0,,0
5432,2011,1,9,7,1443,1554,AA,428,N476AA,71,...,44,43,IAH,DFW,224,8,22,0,,0
5433,2011,1,10,1,1443,1553,AA,428,N504AA,70,...,43,43,IAH,DFW,224,6,19,0,,0


### Changing labels of hflights, part 1 of 2

In [7]:
# dplyr and hflights are already attached in the workspace.
# Lookup table: carrier code (name) -> readable carrier label (value)
lut <- c(
  AA = "American",
  AS = "Alaska",
  B6 = "JetBlue",
  CO = "Continental",
  DL = "Delta",
  OO = "SkyWest",
  UA = "United",
  US = "US_Airways",
  WN = "Southwest",
  EV = "Atlantic_Southeast",
  F9 = "Frontier",
  FL = "AirTran",
  MQ = "American_Eagle",
  XE = "ExpressJet",
  YV = "Mesa"
)

# Translate every UniqueCarrier code through the lookup table and store the
# result as the new Carrier column
hflights[["Carrier"]] <- lut[hflights[["UniqueCarrier"]]]

# Inspect the structure of hflights, including the new column
glimpse(hflights)

Observations: 227,496
Variables: 22
$ Year              <int> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2...
$ Month             <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
$ DayofMonth        <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15...
$ DayOfWeek         <int> 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 1...
$ DepTime           <int> 1400, 1401, 1352, 1403, 1405, 1359, 1359, 1355, 1...
$ ArrTime           <int> 1500, 1501, 1502, 1513, 1507, 1503, 1509, 1454, 1...
$ UniqueCarrier     <chr> "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA", "...
$ FlightNum         <int> 428, 428, 428, 428, 428, 428, 428, 428, 428, 428,...
$ TailNum           <chr> "N576AA", "N557AA", "N541AA", "N403AA", "N492AA",...
$ ActualElapsedTime <int> 60, 60, 70, 70, 62, 64, 70, 59, 71, 70, 70, 56, 6...
$ AirTime           <int> 40, 45, 48, 39, 44, 45, 43, 40, 41, 45, 42, 41, 4...
$ ArrDelay          <int> -10, -9, -8, 3, -3, -7, -1, -16, 44, 43, 29, 5, -...
$ DepDelay      

### Changing labels of hflights, part 2 of 2

In [8]:
# The hflights tbl you built in the previous exercise is available in the workspace.

# The lookup table: cancellation code (name) -> readable reason (value)
# NOTE(review): "FFA" is presumably a typo for "FAA"; left unchanged so that
# any later code matching on this label keeps working -- confirm and fix.
lut <- c("A" = "carrier", "B" = "weather", "C" = "FFA", "D" = "security", "E" = "not cancelled")

# Flights without a cancellation code carry a blank (or missing) value, which
# matches no name in lut and would therefore yield NA. Map those to "E" first
# so they are labelled "not cancelled", as the lookup table intends.
cancel_codes <- hflights$CancellationCode
cancel_codes[is.na(cancel_codes) | cancel_codes == ""] <- "E"

# Add the Code column
hflights$Code <- lut[cancel_codes]

# Glimpse at hflights
glimpse(hflights)

Observations: 227,496
Variables: 23
$ Year              <int> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2...
$ Month             <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
$ DayofMonth        <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15...
$ DayOfWeek         <int> 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 1...
$ DepTime           <int> 1400, 1401, 1352, 1403, 1405, 1359, 1359, 1355, 1...
$ ArrTime           <int> 1500, 1501, 1502, 1513, 1507, 1503, 1509, 1454, 1...
$ UniqueCarrier     <chr> "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA", "...
$ FlightNum         <int> 428, 428, 428, 428, 428, 428, 428, 428, 428, 428,...
$ TailNum           <chr> "N576AA", "N557AA", "N541AA", "N403AA", "N492AA",...
$ ActualElapsedTime <int> 60, 60, 70, 70, 62, 64, 70, 59, 71, 70, 70, 56, 6...
$ AirTime           <int> 40, 45, 48, 39, 44, 45, 43, 40, 41, 45, 42, 41, 4...
$ ArrDelay          <int> -10, -9, -8, 3, -3, -7, -1, -16, 44, 43, 29, 5, -...
$ DepDelay      