# Introduction to R

## Up to you 1: Loading our first package

Load the tidyverse package

In [10]:
library(tidyverse)

"package 'tidyverse' was built under R version 3.4.4"-- Attaching packages --------------------------------------- tidyverse 1.2.1 --
v ggplot2 3.1.1       v purrr   0.3.2  
v tibble  2.1.1       v dplyr   0.8.0.1
v tidyr   0.8.3       v stringr 1.4.0  
v readr   1.3.1       v forcats 0.4.0  
"package 'forcats' was built under R version 3.4.4"-- Conflicts ------------------------------------------ tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag()    masks stats::lag()


## Vectors

In [10]:
c(1.2, 1.3, 1.4) #numericVector

c(TRUE, FALSE) #logicalVector

In [11]:
rep(10,4)

In [12]:
seq(1,100,10)

In [15]:
vec = c(1:10, 1:5)

In [16]:
length(vec)
summary(vec)
head(vec, 3)
tail(vec, 3)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  1.000   2.500   4.000   4.667   6.500  10.000 

In [17]:
min(vec)
max(vec)
sum(vec)
sum(vec[vec>=5])
mean(vec)
quantile(vec)

## More Vector Manipulations

In [None]:
x <- 1:5;

In [21]:
x <- 1:5;
sample(x, 4)
sample(x, 6, replace = TRUE)

In [23]:
paste(LETTERS[1:3], 1:6, sep="")
paste(rep("A", times=6), 1:6, sep="")

## Up to you 2: Throw the dice

### Create a vector to mimick throws with a standard dice with 100, 1000 and 10000 throws. Save these simulations as individual vectors (named sim100, sim1000 and sim10000)

In [3]:
sim100 = sample(1:6, 100, replace = T)
sim1000 = sample(1:6, 1000, replace = T)
sim10000 = sample(1:6, 10000, replace = T)

### For each simulation calculate the mean and the standard deviation of the throws

In [4]:
mean(sim100)
mean(sim1000)
mean(sim10000)
sd(sim100)
sd(sim1000)
sd(sim10000)

### Tabularize and summarize the 10000 throws data


In [6]:
table(sim10000)
summary(sim10000)

sim10000
   1    2    3    4    5    6 
1599 1642 1709 1681 1762 1607 

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  1.000   2.000   4.000   3.519   5.000   6.000 

## Data Frames

In [25]:
mtcars

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2
Valiant,18.1,6,225.0,105,2.76,3.46,20.22,1,0,3,1
Duster 360,14.3,8,360.0,245,3.21,3.57,15.84,0,0,3,4
Merc 240D,24.4,4,146.7,62,3.69,3.19,20.0,1,0,4,2
Merc 230,22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2
Merc 280,19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4


In [26]:
mtcars$mpg

In [27]:
df1 = data.frame(numbers=c(1,2,3), letters=c("a","b","c"))
df2 = data.frame(numbers=c(4), letters=c("d"))
rbind(df1,df2)

numbers,letters
1,a
2,b
3,c
4,d


In [28]:
df1 = data.frame(numbers=c(1,2,3,4))
df2 = data.frame(letters=c("a","b","c","d"))
cbind(df1,df2)

numbers,letters
1,a
2,b
3,c
4,d


## Up to you 4: Purchasing Data

### Load Data

In [30]:
purchase = read.csv2("data/NB1/purchaseorder.csv", header=TRUE, sep =";", dec=".")

### Inspect Data

In [11]:
glimpse(purchase)

Observations: 94
Variables: 10
$ Supplier           <fct> Alum Sheeting, Alum Sheeting, Alum Sheeting, Alu...
$ Order.No.          <fct> A0223, A0433, A0443, A0446, B0247, B0447, B0479,...
$ Item.No.           <int> 4224, 5417, 1243, 5417, 1243, 5634, 5634, 1243, ...
$ Item.Description   <fct> Bolt-nut package, Control Panel, Airframe fasten...
$ Item.Cost          <dbl> 3.95, 255.00, 4.25, 255.00, 4.25, 185.00, 185.00...
$ Quantity           <int> 4500, 500, 10000, 406, 9000, 150, 140, 10500, 12...
$ Cost.per.order     <dbl> 17775.00, 127500.00, 42500.00, 103530.00, 38250....
$ A.P.Terms..Months. <int> 30, 30, 30, 30, 30, 30, 30, 30, 45, 45, 45, 45, ...
$ Order.Date         <fct> 15.10.2011, 20.10.2011, 08.08.2011, 01.09.2011, ...
$ Arrival.Date       <fct> 20.10.2011, 27.10.2011, 14.08.2011, 10.09.2011, ...


### Run Analyses

In [14]:
print("Minimum qty")
min(purchase$Quantity)
print("Maximum Quantity")
max(purchase$Quantity)
print("Total Order Cost")
sum(purchase$Cost.per.order)
print("Average Number of A/P Months")
mean(purchase$A.P.Terms..Months.)
print("Number of Purchase Orders")
nrow(purchase)

[1] "Minimum qty"


[1] "Maximum Quantity"


[1] "Total Order Cost"


[1] "Average Number of A/P Months"


[1] "Number of Purchase Orders"


## Up to you 5: Flights Data Analysis

### Load Data

In [16]:
library(nycflights13) #now you can access "flights"

"package 'nycflights13' was built under R version 3.4.4"

### Find the following flights:
* To SFO or OAK
* In January
* Delayed by more than an hour
* That departed between midnight and five am
* Where the arrival delay was more than twice the departure delay


In [21]:
flights %>%
filter(dest %in% c("SFO", "OAK")) %>% head()
flights %>%
filter(month==1) %>% head()
flights %>%
filter(arr_delay>60) %>% head()
flights %>%
filter(dep_time<5) %>% head()
flights %>%
filter(arr_delay > 2 * dep_delay) %>% head() 

year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
2013,1,1,558,600,-2,923,937,-14,UA,1124,N53441,EWR,SFO,361,2565,6,0,2013-01-01 06:00:00
2013,1,1,611,600,11,945,931,14,UA,303,N532UA,JFK,SFO,366,2586,6,0,2013-01-01 06:00:00
2013,1,1,655,700,-5,1037,1045,-8,DL,1865,N705TW,JFK,SFO,362,2586,7,0,2013-01-01 07:00:00
2013,1,1,729,730,-1,1049,1115,-26,VX,11,N635VA,JFK,SFO,356,2586,7,30,2013-01-01 07:00:00
2013,1,1,734,737,-3,1047,1113,-26,B6,643,N625JB,JFK,SFO,350,2586,7,37,2013-01-01 07:00:00
2013,1,1,745,745,0,1135,1125,10,AA,59,N336AA,JFK,SFO,378,2586,7,45,2013-01-01 07:00:00


year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
2013,1,1,517,515,2,830,819,11,UA,1545,N14228,EWR,IAH,227,1400,5,15,2013-01-01 05:00:00
2013,1,1,533,529,4,850,830,20,UA,1714,N24211,LGA,IAH,227,1416,5,29,2013-01-01 05:00:00
2013,1,1,542,540,2,923,850,33,AA,1141,N619AA,JFK,MIA,160,1089,5,40,2013-01-01 05:00:00
2013,1,1,544,545,-1,1004,1022,-18,B6,725,N804JB,JFK,BQN,183,1576,5,45,2013-01-01 05:00:00
2013,1,1,554,600,-6,812,837,-25,DL,461,N668DN,LGA,ATL,116,762,6,0,2013-01-01 06:00:00
2013,1,1,554,558,-4,740,728,12,UA,1696,N39463,EWR,ORD,150,719,5,58,2013-01-01 05:00:00


year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
2013,1,1,811,630,101,1047,830,137,MQ,4576,N531MQ,LGA,CLT,118,544,6,30,2013-01-01 06:00:00
2013,1,1,848,1835,853,1001,1950,851,MQ,3944,N942MQ,JFK,BWI,41,184,18,35,2013-01-01 18:00:00
2013,1,1,957,733,144,1056,853,123,UA,856,N534UA,EWR,BOS,37,200,7,33,2013-01-01 07:00:00
2013,1,1,1114,900,134,1447,1222,145,UA,1086,N76502,LGA,IAH,248,1416,9,0,2013-01-01 09:00:00
2013,1,1,1120,944,96,1331,1213,78,EV,4495,N16561,EWR,SAV,117,708,9,44,2013-01-01 09:00:00
2013,1,1,1255,1200,55,1451,1330,81,MQ,4601,N518MQ,LGA,BNA,139,764,12,0,2013-01-01 12:00:00


year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
2013,1,9,2,2359,3,432,444,-12,B6,739,N603JB,JFK,PSE,193,1617,23,59,2013-01-09 23:00:00
2013,1,10,3,2359,4,426,437,-11,B6,727,N571JB,JFK,BQN,183,1576,23,59,2013-01-10 23:00:00
2013,1,13,1,2249,72,108,2357,71,B6,22,N206JB,JFK,SYR,41,209,22,49,2013-01-13 22:00:00
2013,1,13,2,2359,3,502,444,18,B6,739,N523JB,JFK,PSE,197,1617,23,59,2013-01-13 23:00:00
2013,1,13,3,2030,213,340,2350,230,B6,1069,N281JB,JFK,AUS,243,1521,20,30,2013-01-13 20:00:00
2013,1,16,2,2125,157,119,2250,149,MQ,4660,N504MQ,LGA,BNA,121,764,21,25,2013-01-16 21:00:00


year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
2013,1,1,517,515,2,830,819,11,UA,1545,N14228,EWR,IAH,227,1400,5,15,2013-01-01 05:00:00
2013,1,1,533,529,4,850,830,20,UA,1714,N24211,LGA,IAH,227,1416,5,29,2013-01-01 05:00:00
2013,1,1,542,540,2,923,850,33,AA,1141,N619AA,JFK,MIA,160,1089,5,40,2013-01-01 05:00:00
2013,1,1,554,558,-4,740,728,12,UA,1696,N39463,EWR,ORD,150,719,5,58,2013-01-01 05:00:00
2013,1,1,555,600,-5,913,854,19,B6,507,N516JB,EWR,FLL,158,1065,6,0,2013-01-01 06:00:00
2013,1,1,558,600,-2,753,745,8,AA,301,N3ALAA,LGA,ORD,138,733,6,0,2013-01-01 06:00:00


### Order the flights by departure date and time.
* Which flights were most delayed?
* Which flights caught up the most time during the flight?

In [26]:
flights %>% arrange(-arr_delay) %>% head()

flights %>% mutate(catchUp = dep_delay - arr_delay) %>% arrange(-catchUp) %>% select(flight, dep_delay, arr_delay, catchUp) %>% head()

year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
2013,1,9,641,900,1301,1242,1530,1272,HA,51,N384HA,JFK,HNL,640,4983,9,0,2013-01-09 09:00:00
2013,6,15,1432,1935,1137,1607,2120,1127,MQ,3535,N504MQ,JFK,CMH,74,483,19,35,2013-06-15 19:00:00
2013,1,10,1121,1635,1126,1239,1810,1109,MQ,3695,N517MQ,EWR,ORD,111,719,16,35,2013-01-10 16:00:00
2013,9,20,1139,1845,1014,1457,2210,1007,AA,177,N338AA,JFK,SFO,354,2586,18,45,2013-09-20 18:00:00
2013,7,22,845,1600,1005,1044,1815,989,MQ,3075,N665MQ,JFK,CVG,96,589,16,0,2013-07-22 16:00:00
2013,4,10,1100,1900,960,1342,2211,931,DL,2391,N959DL,JFK,TPA,139,1005,19,0,2013-04-10 19:00:00


flight,dep_delay,arr_delay,catchUp
4377,235,126,109
51,60,-27,87
51,206,126,80
1465,17,-62,79
51,24,-52,76
673,48,-26,74


### Create data pipelines to answer the following questions:

* Which destinations have the highest average delays?
* Which flights (i.e. carrier + flight) happen every day? Where do they fly to?
* Identify the most punctual airlines and the most punctual trips (origin – destination combinations). (Narrow down to where # occurrences > 30)


In [32]:
flights %>%
    na.omit() %>%
    group_by(dest) %>%
    summarize(meanDelay = mean(arr_delay)) %>%
    arrange(-meanDelay) %>%
    head()

dest,meanDelay
CAE,41.76415
TUL,33.65986
OKC,30.61905
JAC,28.09524
TYS,24.0692
MSN,20.19604
