# Section 5 - The third of five verbs: filter

## filter

* filter(tbl, logical operation)

### Logical operators

* x < y, TRUE if x is less than y
* x <= y, TRUE if x is less than or equal to y
* x == y, TRUE if x equals y
* x != y, TRUE if x does not equal y
* x >= y, TRUE if x is greater than or equal to y
* x > y, TRUE if x is greater than y
* x %in% c(a, b, c), TRUE if x is in the vector c(a, b, c)

In [4]:
library(dplyr)
library(hflights)

# hflights is at your disposal as a tbl, with clean carrier names

# All flights that traveled 3000 miles or more
filter(hflights, Distance > 3000)

# All flights flown by one of JetBlue, Southwest, or Delta
filter(hflights, UniqueCarrier %in% c("JetBlue", "Southwest", "Delta"))

# All flights where taxiing took longer than flying
filter(hflights, AirTime < (TaxiIn + TaxiOut))

Year,Month,DayofMonth,DayOfWeek,DepTime,ArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,...,ArrDelay,DepDelay,Origin,Dest,Distance,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted
2011,1,31,1,924,1413,CO,1,N69063,529,...,23,-1,IAH,HNL,3904,6,31,0,,0
2011,1,30,7,925,1410,CO,1,N76064,525,...,20,0,IAH,HNL,3904,13,19,0,,0
2011,1,29,6,1045,1445,CO,1,N69063,480,...,55,80,IAH,HNL,3904,4,17,0,,0
2011,1,28,5,1516,1916,CO,1,N77066,480,...,326,351,IAH,HNL,3904,7,10,0,,0
2011,1,27,4,950,1344,CO,1,N76055,474,...,-6,25,IAH,HNL,3904,4,15,0,,0
2011,1,26,3,944,1350,CO,1,N76065,486,...,0,19,IAH,HNL,3904,5,10,0,,0
2011,1,25,2,924,1337,CO,1,N68061,493,...,-13,-1,IAH,HNL,3904,5,15,0,,0
2011,1,24,1,1144,1605,CO,1,N76064,501,...,135,139,IAH,HNL,3904,7,30,0,,0
2011,1,23,7,926,1335,CO,1,N76065,489,...,-15,1,IAH,HNL,3904,6,17,0,,0
2011,1,22,6,942,1340,CO,1,N69063,478,...,-10,17,IAH,HNL,3904,3,10,0,,0


"number of rows of result is not a multiple of vector length (arg 2)"

Year,Month,DayofMonth,DayOfWeek,DepTime,ArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,...,ArrDelay,DepDelay,Origin,Dest,Distance,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted


Year,Month,DayofMonth,DayOfWeek,DepTime,ArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,...,ArrDelay,DepDelay,Origin,Dest,Distance,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted
2011,1,24,1,731,904,AA,460,N545AA,93,...,29,11,IAH,DFW,224,14,37,0,,0
2011,1,30,7,1959,2132,AA,533,N455AA,93,...,12,-6,IAH,DFW,224,10,40,0,,0
2011,1,24,1,1621,1749,AA,1121,N484AA,88,...,4,-9,IAH,DFW,224,10,35,0,,0
2011,1,10,1,941,1113,AA,1436,N591AA,92,...,48,31,IAH,DFW,224,27,20,0,,0
2011,1,31,1,1301,1356,CO,241,N14629,55,...,-2,-4,IAH,AUS,140,5,23,0,,0
2011,1,31,1,2113,2215,CO,1533,N72405,62,...,20,13,IAH,AUS,140,7,25,0,,0
2011,1,31,1,1434,1539,CO,1541,N16646,65,...,15,4,IAH,AUS,140,5,30,0,,0
2011,1,31,1,900,1006,CO,1583,N36207,66,...,10,0,IAH,AUS,140,5,29,0,,0
2011,1,30,7,1304,1408,CO,241,N14645,64,...,10,-1,IAH,AUS,140,6,27,0,,0
2011,1,30,7,2004,2128,CO,423,N16632,84,...,54,39,IAH,MSY,305,10,34,0,,0


### Combining tests using boolean operators

In [5]:
# hflights is at your service as a tbl!

# All flights that departed before 5am or arrived after 10pm
filter(hflights, DepTime < 500 | ArrTime > 2200)

# All flights that departed late but arrived ahead of schedule
filter(hflights, DepDelay > 0 & ArrDelay < 0)

# All flights that were cancelled after being delayed
filter(hflights, Cancelled & DepDelay > 0)

Year,Month,DayofMonth,DayOfWeek,DepTime,ArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,...,ArrDelay,DepDelay,Origin,Dest,Distance,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted
2011,1,4,2,2100,2207,AA,533,N4XGAA,67,...,47,55,IAH,DFW,224,3,22,0,,0
2011,1,14,5,2119,2229,AA,533,N549AA,70,...,69,74,IAH,DFW,224,5,20,0,,0
2011,1,10,1,1934,2235,AA,1294,N3BXAA,121,...,80,99,IAH,MIA,964,3,11,0,,0
2011,1,26,3,1905,2211,AA,1294,N3BXAA,126,...,56,70,IAH,MIA,964,5,10,0,,0
2011,1,30,7,1856,2209,AA,1294,N3CPAA,133,...,54,61,IAH,MIA,964,7,18,0,,0
2011,1,9,7,1938,2228,AS,731,N609AS,290,...,78,73,IAH,SEA,1874,5,32,0,,0
2011,1,31,1,1919,2231,CO,190,N35260,132,...,-12,-1,IAH,MIA,964,5,20,0,,0
2011,1,31,1,2116,2344,CO,209,N24715,268,...,-15,-7,IAH,PDX,1825,4,8,0,,0
2011,1,31,1,1850,2211,CO,250,N59630,141,...,-18,0,IAH,RDU,1043,5,15,0,,0
2011,1,31,1,2102,2216,CO,299,N17244,134,...,-10,8,IAH,DEN,862,6,9,0,,0


Year,Month,DayofMonth,DayOfWeek,DepTime,ArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,...,ArrDelay,DepDelay,Origin,Dest,Distance,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted
2011,1,2,7,1401,1501,AA,428,N557AA,60,...,-9,1,IAH,DFW,224,6,9,0,,0
2011,1,5,3,1405,1507,AA,428,N492AA,62,...,-3,5,IAH,DFW,224,9,9,0,,0
2011,1,18,2,1408,1508,AA,428,N507AA,60,...,-2,8,IAH,DFW,224,7,11,0,,0
2011,1,18,2,721,827,AA,460,N558AA,66,...,-8,1,IAH,DFW,224,7,13,0,,0
2011,1,12,3,2015,2113,AA,533,N555AA,58,...,-7,10,IAH,DFW,224,9,10,0,,0
2011,1,13,4,2020,2116,AA,533,N4XCAA,56,...,-4,15,IAH,DFW,224,4,8,0,,0
2011,1,26,3,2009,2103,AA,533,N403AA,54,...,-17,4,IAH,DFW,224,9,6,0,,0
2011,1,1,6,1631,1736,AA,1121,N4WVAA,65,...,-9,1,IAH,DFW,224,16,12,0,,0
2011,1,10,1,1639,1740,AA,1121,N531AA,61,...,-5,9,IAH,DFW,224,8,12,0,,0
2011,1,12,3,1631,1739,AA,1121,N468AA,68,...,-6,1,IAH,DFW,224,5,19,0,,0


Year,Month,DayofMonth,DayOfWeek,DepTime,ArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,...,ArrDelay,DepDelay,Origin,Dest,Distance,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted
2011,1,26,3,1926,,CO,310,N77865,,...,,26,IAH,EWR,1400,,,1,B,0
2011,1,11,2,1100,,US,944,N452UW,,...,,135,IAH,CLT,913,,,1,B,0
2011,1,19,3,1811,,XE,2376,N15932,,...,,6,IAH,ICT,542,,,1,B,0
2011,1,7,5,2028,,XE,3050,N15912,,...,,73,IAH,JAX,817,,19.0,1,A,0
2011,2,4,5,1638,,AA,1121,N537AA,,...,,8,IAH,DFW,224,,19.0,1,A,0
2011,2,8,2,1057,,CO,408,N11641,,...,,187,IAH,EWR,1400,,,1,A,0
2011,2,2,3,802,,XE,2189,N17928,,...,,2,IAH,DAL,217,,,1,B,0
2011,2,9,3,904,,XE,2605,N15941,,...,,4,IAH,DAL,217,,,1,B,0
2011,2,1,2,1508,,OO,5812,N959SW,,...,,28,IAH,ATL,689,,19.0,1,A,0
2011,3,31,4,1016,,CO,586,N19136,,...,,156,IAH,MCO,853,,,1,B,0


### Blend together what you've learned

In [6]:
# hflights is already available in the workspace

# Select the flights that had JFK as their destination: c1
c1 <- filter(hflights, Dest == "JFK")

# Combine the Year, Month and DayofMonth variables to create a Date column: c2
c2 <- mutate(c1, Date = paste(Year, Month, DayofMonth, sep = "-"))

# Print out a selection of columns of c2
select(c2, Date, DepTime, ArrTime, TailNum)

Date,DepTime,ArrTime,TailNum
2011-1-1,654,1124,N324JB
2011-1-1,1639,2110,N324JB
2011-1-2,703,1113,N324JB
2011-1-2,1604,2040,N324JB
2011-1-3,659,1100,N229JB
2011-1-3,1801,2200,N206JB
2011-1-4,654,1103,N267JB
2011-1-4,1608,2034,N267JB
2011-1-5,700,1103,N708JB
2011-1-5,1544,1954,N644JB


### Recap on select, mutate, and filter

* How many weekend flights flew a distance of more than 1000 miles but had a total taxiing time below 15 minutes?

In [20]:
names(hflights)

filter(hflights, DayOfWeek > 5 & Distance > 1000 & TaxiIn + TaxiOut < 15)

Year,Month,DayofMonth,DayOfWeek,DepTime,ArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,...,ArrDelay,DepDelay,Origin,Dest,Distance,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted
2011,1,23,7,1535,1933,B6,624,N599JB,178,...,-27,0,HOU,JFK,1428,6,8,0,,0
2011,1,30,7,851,1230,CO,1058,N39726,159,...,-13,-2,IAH,DCA,1208,3,11,0,,0
2011,1,30,7,2234,2,CO,1717,N38417,208,...,89,94,IAH,SAN,1303,3,10,0,,0
2011,1,29,6,1220,1353,CO,1620,N87512,153,...,19,45,IAH,PHX,1009,5,9,0,,0
2011,1,23,7,847,1213,CO,1058,N16709,146,...,-30,-6,IAH,DCA,1208,4,8,0,,0
2011,1,23,7,1224,1345,CO,1629,N39728,201,...,-27,-1,IAH,SNA,1347,4,9,0,,0
2011,1,23,7,931,1045,CO,1723,N37277,194,...,-28,-5,IAH,ONT,1334,3,10,0,,0
2011,1,22,6,942,1340,CO,1,N69063,478,...,-10,17,IAH,HNL,3904,3,10,0,,0
2011,1,16,7,848,1136,CO,309,N77510,288,...,12,-2,IAH,PDX,1825,5,8,0,,0
2011,1,16,7,1030,1406,CO,358,N76522,156,...,-14,-5,IAH,DCA,1208,3,10,0,,0


## Section 6 - Almost there: the arrange verb

* Reorders the rows of a dataset based on their contents
* arrange(tbl, column name to reorder by)

### Arranging your data

In [21]:
# dplyr and the hflights tbl are available

# Definition of dtc
dtc <- filter(hflights, Cancelled == 1, !is.na(DepDelay))

# Arrange dtc by departure delays
arrange(dtc, DepDelay)

# Arrange dtc so that cancellation reasons are grouped
arrange(dtc, CancellationCode)

# Arrange dtc according to carrier and departure delays
arrange(dtc, UniqueCarrier, DepDelay)

Year,Month,DayofMonth,DayOfWeek,DepTime,ArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,...,ArrDelay,DepDelay,Origin,Dest,Distance,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted
2011,7,23,6,605,,F9,225,N912FR,,...,,-10,HOU,DEN,883,,10,1,A,0
2011,1,17,1,916,,XE,3068,N13936,,...,,-9,IAH,HRL,295,,,1,B,0
2011,12,1,4,541,,US,282,N840AW,,...,,-9,IAH,PHX,1009,,,1,A,0
2011,10,12,3,2022,,MQ,3724,N539MQ,,...,,-8,IAH,LAX,1379,,,1,A,0
2011,7,29,5,1424,,CO,1079,N14628,,...,,-6,IAH,ORD,925,,13,1,A,0
2011,9,29,4,1639,,OO,2062,N724SK,,...,,-6,IAH,ATL,689,,,1,B,0
2011,2,9,3,555,,MQ,3265,N613MQ,,...,,-5,HOU,DFW,247,,11,1,A,0
2011,5,9,1,715,,OO,1177,N758SK,,...,,-5,IAH,DTW,1076,,17,1,A,0
2011,1,20,4,1413,,UA,552,N509UA,,...,,-4,IAH,IAD,1190,,,1,A,0
2011,1,17,1,831,,WN,1,N714CB,,...,,-4,HOU,HRL,276,,8,1,B,0


Year,Month,DayofMonth,DayOfWeek,DepTime,ArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,...,ArrDelay,DepDelay,Origin,Dest,Distance,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted
2011,1,20,4,1413,,UA,552,N509UA,,...,,-4,IAH,IAD,1190,,,1,A,0
2011,1,7,5,2028,,XE,3050,N15912,,...,,73,IAH,JAX,817,,19,1,A,0
2011,2,4,5,1638,,AA,1121,N537AA,,...,,8,IAH,DFW,224,,19,1,A,0
2011,2,8,2,1057,,CO,408,N11641,,...,,187,IAH,EWR,1400,,,1,A,0
2011,2,1,2,1508,,OO,5812,N959SW,,...,,28,IAH,ATL,689,,19,1,A,0
2011,2,21,1,2257,,OO,1111,N778SK,,...,,-3,IAH,AUS,140,,,1,A,0
2011,2,9,3,555,,MQ,3265,N613MQ,,...,,-5,HOU,DFW,247,,11,1,A,0
2011,3,18,5,727,,UA,109,N469UA,,...,,-3,IAH,DEN,862,,,1,A,0
2011,4,4,1,1632,,DL,8,N600TR,,...,,42,IAH,ATL,689,,,1,A,0
2011,4,8,5,1608,,WN,4,N365SW,,...,,548,HOU,DAL,239,,,1,A,0


Year,Month,DayofMonth,DayOfWeek,DepTime,ArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,...,ArrDelay,DepDelay,Origin,Dest,Distance,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted
2011,8,18,4,1808,,AA,1294,N3FLAA,,...,,3,IAH,MIA,964,,,1,A,0
2011,2,4,5,1638,,AA,1121,N537AA,,...,,8,IAH,DFW,224,,19,1,A,0
2011,7,29,5,1424,,CO,1079,N14628,,...,,-6,IAH,ORD,925,,13,1,A,0
2011,1,26,3,1703,,CO,410,N77296,,...,,0,IAH,IAD,1190,,13,1,B,0
2011,8,11,4,1320,,CO,1669,N73275,,...,,0,IAH,MIA,964,,,1,A,0
2011,7,25,1,1654,,CO,1422,N58606,,...,,24,IAH,ATL,689,,,1,C,0
2011,1,26,3,1926,,CO,310,N77865,,...,,26,IAH,EWR,1400,,,1,B,0
2011,3,31,4,1016,,CO,586,N19136,,...,,156,IAH,MCO,853,,,1,B,0
2011,2,8,2,1057,,CO,408,N11641,,...,,187,IAH,EWR,1400,,,1,A,0
2011,4,4,1,1632,,DL,8,N600TR,,...,,42,IAH,ATL,689,,,1,A,0


### Reverse the order of arranging

By default, arrange() arranges the rows from smallest to largest. Rows with the smallest value of the variable will appear at the top of the data set. You can reverse this behavior with the desc() function. arrange() will reorder the rows from largest to smallest values of a variable if you wrap the variable name in desc() before passing it to arrange().

In [22]:
# dplyr and the hflights tbl are available

# Arrange according to carrier and decreasing departure delays
arrange(hflights, UniqueCarrier, desc(DepDelay))

# Arrange flights by total delay (normal order).
arrange(hflights, DepDelay + ArrDelay)

Year,Month,DayofMonth,DayOfWeek,DepTime,ArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,...,ArrDelay,DepDelay,Origin,Dest,Distance,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted
2011,12,12,1,650,808,AA,1740,N473AA,78,...,978,970,IAH,DFW,224,14,15,0,,0
2011,11,19,6,1752,1910,AA,1903,N495AA,78,...,685,677,IAH,DFW,224,7,31,0,,0
2011,12,22,4,1728,1848,AA,1903,N580AA,80,...,663,653,IAH,DFW,224,8,32,0,,0
2011,10,23,7,2305,2,AA,742,N548AA,57,...,507,525,IAH,DFW,224,5,13,0,,0
2011,9,27,2,1206,1300,AA,1948,N4YUAA,54,...,265,286,IAH,DFW,224,10,7,0,,0
2011,3,17,4,1647,1747,AA,1505,N584AA,60,...,262,277,IAH,DFW,224,7,12,0,,0
2011,6,21,2,955,1315,AA,466,N3FTAA,140,...,230,235,IAH,MIA,964,9,11,0,,0
2011,5,20,5,2359,130,AA,426,N565AA,91,...,255,234,IAH,DFW,224,8,13,0,,0
2011,4,19,2,2023,2142,AA,1925,N467AA,79,...,242,233,IAH,DFW,224,11,18,0,,0
2011,5,12,4,2133,53,AA,1294,N3AYAA,140,...,223,228,IAH,MIA,964,5,14,0,,0


Year,Month,DayofMonth,DayOfWeek,DepTime,ArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,...,ArrDelay,DepDelay,Origin,Dest,Distance,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted
2011,7,3,7,1914,2039,XE,2804,N12157,85,...,-70,-1,IAH,MEM,468,4,15,0,,0
2011,8,31,3,934,1039,OO,2040,N783SK,185,...,-56,-11,IAH,BFL,1428,3,10,0,,0
2011,8,21,7,935,1039,OO,2001,N767SK,184,...,-56,-10,IAH,BFL,1428,3,10,0,,0
2011,8,28,7,2059,2206,OO,2003,N783SK,187,...,-54,-11,IAH,BFL,1428,5,11,0,,0
2011,8,29,1,935,1041,OO,2040,N767SK,186,...,-54,-10,IAH,BFL,1428,4,13,0,,0
2011,12,25,7,741,926,OO,4591,N814SK,165,...,-57,-4,IAH,SLC,1195,4,14,0,,0
2011,1,30,7,620,812,OO,4461,N804SK,172,...,-49,-10,IAH,SLC,1195,5,11,0,,0
2011,8,3,3,1741,1810,XE,2603,N11107,89,...,-40,-19,IAH,HOB,501,5,11,0,,0
2011,8,4,4,930,1041,OO,1171,N715SK,191,...,-49,-10,IAH,BFL,1428,4,10,0,,0
2011,8,18,4,939,1043,OO,2001,N783SK,184,...,-52,-6,IAH,BFL,1428,4,8,0,,0


### Recap on select, mutate, filter and arrange

Four down, one more to go! As you might have noticed, your data analysis possibilities expand with every dplyr verb you learn. Can you find the appropriate strategy for the following problem?

* What steps do you take to print only the TailNum of hflights, only for flights that departed too late, sorted by total taxiing time?

    - A: First filter(), then mutate(), arrange() and finally select().