## Where am I?

In [None]:
# Show the current working directory of this R session.
getwd()

## Clear global environment

In [None]:
# Remove every object that ls() reports in the current (top-level) environment.
# ls() skips names that start with a dot, and attached packages stay attached.
rm(list = ls())

## Setup *sparklyr*

In [None]:
# Attach a package, installing it first when it is not available.
#
# Arguments:
#   libraryName  Name of the package as a character string.
#
# Prints 'Package is loaded' when the package could be attached (either
# directly or after installation) and 'Package not found' when it still
# cannot be attached after the installation attempt. The printed string is
# returned invisibly, as print() does.
setupLibrary <- function(libraryName){
  # require() returns FALSE (with a warning) instead of failing when the
  # package is missing, which is what makes the install-on-demand check work.
  loaded <- require(libraryName, character.only = TRUE)
  if (!loaded) {
    # Argument name spelled out in full; no reliance on partial matching.
    install.packages(libraryName, dependencies = TRUE)
    loaded <- require(libraryName, character.only = TRUE)
  }
  if (loaded) {
    print('Package is loaded')
  } else {
    print('Package not found')
  }
}

# Attach sparklyr first, then dplyr, installing either one if it is missing.
# invisible() keeps the list returned by lapply() from being displayed.
invisible(lapply(c('sparklyr', 'dplyr'), setupLibrary))

In [None]:
# Open a sparklyr connection to the YARN cluster with explicit driver and
# executor resources.
sc <- spark_connect(
  master = 'yarn',
  config = list(
    'spark.driver.memory' = '8G',
    'spark.executor.instances' = 4,
    'spark.executor.cores' = 8,
    'spark.executor.memory' = '8G',
    # PATH handed to the executors: the local PATH followed by the Anaconda
    # bin directory, joined with ":".
    'spark.executorEnv.PATH' = paste(
      Sys.getenv("PATH"),
      "/software/anaconda3/5.0.1/bin/",
      sep = ":"
    )
  )
)

### Writing custom R code using [`spark_apply()`](http://spark.rstudio.com/reference/spark_apply/)

- `spark_apply()` applies an R function to a Spark object
- The function can be applied to the default partitions of the object in the cluster or to groups defined via the `group_by` argument
- The R function must return another Spark DataFrame
- The outcomes of the application will be aggregated into a single Spark DataFrame

## Load data

In [None]:
# Build a Spark DataFrame with sdf_len(sc, 10), requesting 10 partitions.
test_df <- sdf_len(sc, 10, repartition = 10)

# Run an R function on test_df through spark_apply(); the function hands
# back its input wrapped in I().
index_df <- spark_apply(test_df, function(e) I(e))

In [None]:
# Compare the partition counts before and after spark_apply().
test_df %>% sdf_num_partitions()
index_df %>% sdf_num_partitions()

In [None]:
# Read the comma-delimited airline data under the given path into Spark and
# register it under the table name 'airline_data'.
airline_tbl <- spark_read_csv(
  sc,
  name = 'airline_data',
  path = '/repository/airlines/data/',
  delimiter = ','
)

# Display the resulting table reference.
airline_tbl

In [None]:
# Number of partitions Spark uses for the airline table.
sdf_num_partitions(airline_tbl)

In [None]:
# Keep only the carrier code column of the airline table.
carrier_tbl <- select(airline_tbl, UniqueCarrier)

# Display the single-column table.
carrier_tbl

In [None]:
# Rebuild the single-column carrier table so this cell can run on its own.
carrier_tbl <- select(airline_tbl, UniqueCarrier)

# Apply nrow() to each UniqueCarrier group, i.e. count the rows per carrier.
# NOTE(review): `columns` declares one output column named 'Unique_Carrier'
# of type 'character', although nrow() returns a number -- confirm this
# name/type is what is intended for the result.
spark_apply(
  carrier_tbl,
  nrow,
  group_by = 'UniqueCarrier',
  columns = list(Unique_Carrier = 'character')
)

Count number of flights per carrier

In [None]:
# Close the Spark connection opened earlier in this notebook.
spark_disconnect(sc)

### Accessing Spark native API

- We use R to send commands to interact with the Spark cluster via its Java API for data manipulation and analytical tasks. **Think remote surgery!**

- [Basic functionalities](https://spark.rstudio.com/reference/invoke/): `invoke`, `invoke_new`, `invoke_static`

In [None]:
# Make sure sparklyr is attached before using its invoke helpers.
setupLibrary('sparklyr')
# Count the lines of a text source by calling Spark's JVM API directly.
#
# Arguments:
#   sc    An open sparklyr connection.
#   file  Path passed to the SparkContext "textFile" method.
#
# Returns the value of the JVM "count" call on the object produced by
# "textFile".
count_lines <- function(sc, file) {
  # JVM-side SparkContext object behind this connection.
  context <- spark_context(sc)
  # 1L is forwarded as the second argument of "textFile"
  # (presumably the minimum number of partitions -- confirm against the
  # Spark API).
  lines <- invoke(context, "textFile", file, 1L)
  invoke(lines, "count")
}

In [None]:
# Open a sparklyr connection to the YARN cluster with explicit driver and
# executor resources.
sc <- spark_connect(
  master = 'yarn',
  config = list(
    'spark.driver.memory' = '8G',
    'spark.executor.instances' = 4,
    'spark.executor.cores' = 8,
    'spark.executor.memory' = '8G'
  )
)

In [None]:
# Count the lines of the airline data through the native Spark API.
count_lines(sc, '/repository/airlines/data/')

In [None]:
# Construct a java.math.BigInteger on the JVM from the string "1000000000";
# the result is a reference to the remote Java object.
billionBigInteger <- sc %>%
  invoke_new("java.math.BigInteger", "1000000000")

# Display the object reference.
billionBigInteger

In [None]:
# Join the average ratings with the movie information on 'movieId', keep the
# title, count and average rating columns, and collect the result.
# NOTE(review): `avg_ratings` and `info_tlb` are not defined anywhere in the
# visible part of this notebook -- confirm they exist (and that `info_tlb`
# is the intended name) before running this cell.
top_movies <- avg_ratings %>%
  inner_join(info_tlb, by = 'movieId') %>%
  select(title, count, avg_rating) %>%
  collect()

In [None]:
# Display the joined movie ratings table.
top_movies

In [None]:
# Call the Java method longValue() on the remote BigInteger object.
billion <- billionBigInteger %>%
  invoke("longValue")

# Display the returned value.
billion

In [None]:
# Call the static Java method java.lang.Math.hypot(10, 20) on the JVM.
invoke_static(sc, "java.lang.Math", "hypot", 10, 20)

In [None]:
# Call the static Java method java.lang.Math.sqrt(4) on the JVM.
invoke_static(sc, "java.lang.Math", "sqrt", 4)

In [None]:
# Close the Spark connection used for the native API examples.
spark_disconnect(sc)