In [None]:
# Show the current working directory (output only; nothing is changed).
getwd()
# Remove every object listed by ls() from the current environment so the
# notebook starts from a clean workspace.
rm(list = ls())

## Setup libraries

In [None]:
# Load a package, installing it first when it is not already available.
#
# Args:
#   libraryName: name of the package, as a single character string.
#
# Prints 'Package is loaded' once the package could be attached (either
# immediately or after a successful install) and 'Package not found' when it
# still cannot be loaded after the install attempt.
setupLibrary <- function(libraryName){
  # require() returns FALSE instead of raising an error, which is what allows
  # the fall-back to an install attempt.
  loaded <- require(libraryName, character.only = TRUE)
  if (!loaded){
    # Spell out `dependencies` instead of relying on partial matching of `dep`.
    install.packages(libraryName, dependencies = TRUE)
    loaded <- require(libraryName, character.only = TRUE)
  }
  # Report the final state on every path, including install-then-load.
  if (loaded){
    print('Package is loaded')
  } else {
    print('Package not found')
  }
}

# Attach each package used by the notebook, in order, via setupLibrary().
invisible(lapply(c("sparklyr", "dplyr", "ggplot2"), setupLibrary))

In [None]:
# Open a Spark connection against the 'yarn' master with explicit driver and
# executor sizing.
sc <- spark_connect(
  master = "yarn",
  config = list(
    spark.driver.memory = "8G",
    spark.executor.instances = 4,
    spark.executor.cores = 8,
    spark.executor.memory = "8G"
  )
)

## Airline Traffic Delay



In [None]:
# Read the comma-delimited airline data under the given path into Spark and
# register it under the name 'airline_data'.
airline_tbl <- spark_read_csv(
  sc,
  name = "airline_data",
  path = "/repository/airlines/data/",
  delimiter = ","
)

# Preview the resulting table reference.
airline_tbl

In [None]:
# Read the carrier metadata file into Spark and register it under the name
# 'carrier_data'.
carrier_tbl <- spark_read_csv(
  sc,
  name = "carrier_data",
  path = "/repository/airlines/metadata/carriers.csv",
  delimiter = ","
)

# Preview the resulting table reference.
carrier_tbl

## Machine Learning/Statistical Functions available on Big Data through Spark

- [ml_kmeans](https://spark.rstudio.com/reference/ml_kmeans.html)
- [ml_linear_regression](https://spark.rstudio.com/reference/ml_linear_regression/)
- [ml_logistic_regression](https://spark.rstudio.com/reference/ml_logistic_regression.html)
- [ml_survival_regression](https://spark.rstudio.com/reference/ml_aft_survival_regression.html)
- [ml_generalized_linear_regression](https://spark.rstudio.com/reference/ml_generalized_linear_regression.html)
- [ml_decision_tree](https://spark.rstudio.com/reference/ml_decision_tree.html)
- [ml_random_forest](https://spark.rstudio.com/reference/ml_random_forest.html)
- [ml_pca](https://spark.rstudio.com/reference/ft_pca.html)
- [ml_naive_bayes](https://spark.rstudio.com/reference/ml_naive_bayes.html)
- [ml_multilayer_perceptron](https://spark.rstudio.com/reference/ml_multilayer_perceptron_classifier.html)
- [ml_lda](https://spark.rstudio.com/reference/ml_lda.html)
- [ml_one_vs_rest](https://spark.rstudio.com/reference/ml_one_vs_rest.html)

**Example: Do planes with a delayed departure fly with a faster average speed to make up for the delay?**

We start with a simple regression analysis of average speed against departure delay.

In [None]:
# Build the data for the speed regression: departure delay and average speed
# (flight distance divided by air time) for flights whose departure delay lies
# strictly between 15 and 240.
# NOTE(review): delay and air-time units are presumably minutes - confirm
# against the data dictionary.
speed_data <- airline_tbl %>%
  # Cast explicitly so that rows without a usable numeric value can be dropped
  # by the NA filter below.
  mutate(
    Dep_delay = as.numeric(DepDelay),
    Flight_Distance = as.numeric(Distance),
    Air_time = as.numeric(AirTime)
  ) %>%
  filter(!is.na(Dep_delay) & !is.na(Flight_Distance) & !is.na(Air_time)) %>%
  # No clamping of negative delays is needed: this window already excludes
  # every delay of 15 or less.
  filter(Dep_delay > 15 & Dep_delay < 240) %>%
  # Guard the division below: a non-positive air time has no meaningful speed.
  filter(Air_time > 0) %>%
  mutate(Avg_speed = Flight_Distance / Air_time) %>%
  select(Dep_delay, Avg_speed)

# Preview the prepared table.
speed_data

In [None]:
# Report the dimensions of the prepared speed table.
sdf_dim(speed_data)

Running a simple linear regression: [ml_linear_regression](http://spark.rstudio.com/reference/ml_linear_regression/)

In [None]:
# Fit a linear regression with Dep_delay as the response and Avg_speed as the
# single predictor.
speed_model <- ml_linear_regression(
  speed_data,
  formula = Dep_delay ~ Avg_speed,
  max_iter = 100
)

In [None]:
# Display the summary of the fitted speed model.
summary(speed_model)

**Example: Predict time gained in flight by airline carriers (from [Nathan Stephens' repository](https://gist.github.com/nwstephens/9d5bc92412b4eb0288662a4da4f62c6c))**

In [None]:
# Build the training data: flights from every year except 2008, with the
# target variable Gain (departure delay minus arrival delay) and the carrier
# description joined in from carrier_tbl.
model_data <- airline_tbl %>%
  # Cast explicitly so that rows without a usable numeric value can be dropped
  # by the NA filter below.
  mutate(
    Arr_delay = as.numeric(ArrDelay),
    Dep_delay = as.numeric(DepDelay),
    Flight_Distance = as.numeric(Distance)
  ) %>%
  filter(!is.na(Arr_delay) & !is.na(Dep_delay) & !is.na(Flight_Distance)) %>%
  # Reference the column by its actual name (`Year`, as in the select() below);
  # a lower-case `year` only works if the backend resolves names
  # case-insensitively and no object called `year` exists in the R session.
  filter(Year != 2008) %>%
  # Keep moderately delayed departures and drop extreme arrival values.
  filter(Dep_delay > 15 & Dep_delay < 240) %>%
  filter(Arr_delay > -60 & Arr_delay < 360) %>%
  left_join(carrier_tbl, by = c("UniqueCarrier" = "Code")) %>%
  mutate(Gain = Dep_delay - Arr_delay) %>%
  select(Year, Month, Arr_delay, Dep_delay, Flight_Distance, UniqueCarrier, Description, Gain)

# Preview the prepared table.
model_data

In [None]:
# Report the dimensions of the prepared training table.
sdf_dim(model_data)

In [None]:
# Per-carrier averages of gain, flight distance and departure delay, labelled
# with the carrier description and sorted by average gain (ascending).
model_data %>%
  group_by(UniqueCarrier) %>%
  summarise(
    min_desc = min(Description),
    avg_gain = mean(Gain),
    avg_distance = mean(Flight_Distance),
    avg_depdelay = mean(Dep_delay)
  ) %>%
  select(
    min_desc,
    avg_gain,
    avg_distance,
    avg_depdelay
  ) %>%
  arrange(avg_gain)

In [None]:
# Split the model data into a 'train' (0.7) and a 'valid' (0.3) partition,
# using a fixed seed.
model_partition <- sdf_partition(
  model_data,
  train = 0.7,
  valid = 0.3,
  seed = 5555
)

# Show the resulting partitions.
model_partition

In [None]:
# Fit a linear regression for Gain on the training partition, using flight
# distance, departure delay and carrier as predictors.
ml1 <- ml_linear_regression(
  model_partition$train,
  formula = Gain ~ Flight_Distance + Dep_delay + UniqueCarrier
)

In [None]:
# Display the summary of the fitted gain model (ml1).
summary(ml1)

In [None]:
# Assess model performance

In [None]:
# Build the evaluation data: flights from 2008 only, prepared with the same
# casts and filters as the training data, including the target variable Gain
# (departure delay minus arrival delay).
data_2008 <- airline_tbl %>%
  # Cast explicitly so that rows without a usable numeric value can be dropped
  # by the NA filter below.
  mutate(
    Arr_delay = as.numeric(ArrDelay),
    Dep_delay = as.numeric(DepDelay),
    Flight_Distance = as.numeric(Distance)
  ) %>%
  filter(!is.na(Arr_delay) & !is.na(Dep_delay) & !is.na(Flight_Distance)) %>%
  # Reference the column by its actual name (`Year`, as in the select() below);
  # a lower-case `year` only works if the backend resolves names
  # case-insensitively and no object called `year` exists in the R session.
  filter(Year == 2008) %>%
  # Keep moderately delayed departures and drop extreme arrival values.
  filter(Dep_delay > 15 & Dep_delay < 240) %>%
  filter(Arr_delay > -60 & Arr_delay < 360) %>%
  left_join(carrier_tbl, by = c("UniqueCarrier" = "Code")) %>%
  mutate(Gain = Dep_delay - Arr_delay) %>%
  select(Year, Month, Arr_delay, Dep_delay, Flight_Distance, UniqueCarrier, Description, Gain)

# Preview the prepared table and report its dimensions.
data_2008
sdf_dim(data_2008)

In [None]:
# Score the 2008 data with the fitted model and average the actual and
# predicted gain per carrier, keeping only carriers with more than 10000
# scored flights. The result is collected into a local data frame.
carrier <- ml_predict(ml1, data_2008) %>%  # ml_predict() supersedes the deprecated sdf_predict()
  # Group by the column's actual name; `description` is not a column of the
  # scored table.
  group_by(Description) %>%
  summarize(gain = mean(Gain), prediction = mean(prediction), freq = n()) %>%
  filter(freq > 10000) %>%
  collect() %>%
  # Expose the label under the lower-case name that downstream cells use.
  rename(description = Description)

In [None]:
# Scatter plot of actual versus predicted average gain per carrier, with the
# y = x line as a reference and each point labelled by the first 20 characters
# of the carrier description.
ggplot(data = carrier, mapping = aes(x = gain, y = prediction)) +
  geom_point(shape = 3, color = "red", alpha = 0.75) +
  geom_abline(slope = 1, intercept = 0, color = "blue", alpha = 0.15) +
  geom_text(
    mapping = aes(label = substr(description, 1, 20)),
    vjust = -1,
    size = 3,
    alpha = 0.75
  ) +
  labs(x = "Actual", y = "Predicted", title = "Average Gains Forecast")

**Challenge:**

In his note, Nathan Stephens concludes that the best predictor of time gained is not the carrier but the flight distance. Test this conclusion by filtering the original data to keep only the flights whose distances are in the top 50%. You can also try a different regression model, such as `ml_logistic_regression` or `ml_generalized_linear_regression`.

In [None]:
# Close the Spark connection opened at the start of the notebook.
spark_disconnect(sc)