### Setting up

In [None]:
import os
# Root directory for data files: the current working directory plus a trailing
# slash, so relative paths can be appended with plain string concatenation.
dir_root = os.getcwd() + '/'
import findspark
# Keep this call ahead of the pyspark import below: findspark.init() locates
# the Spark installation and adds pyspark to sys.path.
findspark.init()

from pyspark.sql import SparkSession
import numpy as np

# Reuse the active SparkSession if there is one, otherwise create it.
spark = SparkSession.builder.getOrCreate()
# Handle to the SparkContext that backs this session.
sc = spark.sparkContext

In [None]:
# Small demo DataFrame. The 'age' column mixes a NaN, a null (None) and
# ordinary floats, which the missing-value examples later on rely on.
people_rows = [
    [np.nan, 'John'],
    [None, 'Michael'],
    [30., 'Andy'],
    [19., 'Justin'],
    [30., 'James Dr No From Russia with Love Bond'],
]
ddf = spark.createDataFrame(people_rows, schema=['age', 'name'])

## Intermezzo: laziness in Spark
- Transformations (lazy, Catalyst)
    - filter
    - select
    - join
    - etc. (most)


- Actions (actual computation)
    - count
    - show
    - head


Quick question: what would be a good moment to cache?

## 3. Functions
- lots of functions (too many)
- know the fundamentals
- API Docs: https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/index.html

In [None]:
from pyspark.sql import functions as sf

### 3.1 when -> otherwise
2 ways of being Andy

In [None]:
# Two ways to build the flag: use the comparison itself as the column, or
# spell the branches out with when(...).otherwise(...).
name_is_andy = sf.col('name') == 'Andy'
andy_via_when = sf.when(name_is_andy, True).otherwise(False)

ddf_andy = ddf.withColumn('is_andy', name_is_andy)
ddf_andy = ddf_andy.withColumn('is_andy2', andy_via_when)
ddf_andy.show()

In [None]:
# Chained when() calls read like if / elif. There is no otherwise() here, so
# rows that match neither condition get null in 'whos_this'.
who_label = sf.when(sf.col('name') == 'Andy', 'Yup, Andy')
who_label = who_label.when(sf.col('name') == 'Justin', 'Justin here')
ddf.withColumn('whos_this', who_label).show()

### 3.2 isin()
2 ways of being Andy or Justin

In [None]:
# The same membership test written twice: OR-ing two equality checks versus a
# single isin() call.
is_andy = sf.col('name') == 'Andy'
is_justin = sf.col('name') == 'Justin'

ddf_flags = ddf.withColumn('is_andy_or_justin', is_andy | is_justin)
ddf_flags = ddf_flags.withColumn('is_andy_or_justin2',
                                 sf.col('name').isin('Andy', 'Justin'))
ddf_flags.show()

In [None]:
# isin() also accepts a list of values. A teen is aged 13 through 19, so the
# list is range(13, 20); starting the range at 0 would flag children as teens.
# The 'is_teen' column added here is reused by the negation example later on.
ddf = ddf.withColumn('is_teen', sf.col('age').isin(list(range(13, 20))))
ddf.show()

### 3.4 lit()

In [None]:
# lit() wraps a Python literal in a Column, so every row gets the value 5.
ddf_with_five = ddf.withColumn('5', sf.lit(5))
ddf_with_five.show()

### 3.5 ~ (negation)

In [None]:
# ~ is the boolean NOT for Column expressions; it needs the 'is_teen' column
# that an earlier cell added to ddf.
not_teen = ~sf.col('is_teen')
ddf.withColumn('aint_no_teen', not_teen).show()

### Intermezzo: raw SQL

In [None]:
# Expose the DataFrame to Spark SQL under the view name 'ddf'.
# createOrReplaceTempView is the supported API; registerTempTable has been
# deprecated since Spark 2.0.
ddf.createOrReplaceTempView('ddf')
(spark
 .sql("SELECT age, count(*) FROM ddf GROUP BY age")
 .show())

### 3.6 join()

In [None]:
# Left-hand table for the join examples: a single column 'a' holding 1 and 2.
left_rows = [[1], [2]]
ddf1 = spark.createDataFrame(left_rows, schema=['a'])
ddf1.show()

In [None]:
# Right-hand table for the join examples: a single column 'a' holding 2 and 3.
right_rows = [[2], [3]]
ddf2 = spark.createDataFrame(right_rows, schema=['a'])
ddf2.show()

In [None]:
# Joining on a list of column names keeps a single 'a' column in the result.
joined_on_name = ddf1.join(ddf2, on=['a'], how='inner')
joined_on_name.show()

In [None]:
# Joining on a Column expression (the join type defaults to inner) keeps both
# 'a' columns in the result, one from each side.
same_a = ddf1.a == ddf2.a
ddf1.join(ddf2, on=same_a).show()

### 3.7 isNull() / isNotNull() and isnan()
Other very useful functions are `isNull()` and `isNotNull()`. They're used like this

In [None]:
# Replace null ages with 40. isNull() does not match NaN, so the NaN row keeps
# its NaN.
age = sf.col('age')
age_without_null = sf.when(age.isNull(), 40).otherwise(age)
ddf.withColumn('imputed_age', age_without_null).show()

In [None]:
# Replace NaN ages with 40. isnan() does not match null, so the null row stays
# null.
age_is_nan = sf.isnan('age')
age_without_nan = sf.when(age_is_nan, 40).otherwise(sf.col('age'))
ddf.withColumn('imputed_age', age_without_nan).show()

### 3.8 fillna()
- fills both null and NaN
- fills with a single value per call (pass a dict `{column: value}` to use a different value per column)

In [None]:
# A single fillna() call handles both kinds of missing value in 'age'.
ddf_filled = ddf.fillna(40, subset='age')
ddf_filled.show()

### 3.9 dropna()
- drops both null and NaN

In [None]:
# Count rows per age, then discard the groups whose key is null or NaN.
age_counts = ddf.groupBy('age').count()
age_counts.dropna(subset='age').show()

### 3.10 sample()
- makes it possible to pull a small subset of the data into pandas with `toPandas()`


In [None]:
# Load the airlines dataset from the data/ folder under dir_root. No format is
# passed, so load() falls back to Spark's default data source.
airlines_path = dir_root + 'data/airlines.parquet'
ddf_air = spark.read.load(airlines_path)

In [None]:
# Preview the airlines data; show() prints the first 20 rows by default.
ddf_air.show()

In [None]:
# Draw a small random sample without replacement. `fraction` is the
# probability of keeping each row, so the sample size is only approximate.
# The fixed seed keeps the sampled rows reproducible across notebook re-runs.
(ddf_air.sample(withReplacement=False, fraction=0.0002, seed=42)
        .select('year', 'month')
        .show())

### 3.11 distinct() / countDistinct()

In [None]:
# Drop rows that are duplicated across all columns.
ddf_unique = ddf.distinct()
ddf_unique.show()

In [None]:
# Aggregate over the whole DataFrame: how many distinct ages are there?
n_distinct_ages = sf.countDistinct('age').alias('distinct_ages')
ddf.agg(n_distinct_ages).show()

### 3.12 User defined functions (UDF)
- executed in RDD-land (row by row in Python, outside the Catalyst optimizer)
- avoid where possible

In [None]:
from pyspark.sql.types import IntegerType


def _name_length(s):
    """Return the length of ``s``, or None when ``s`` is None.

    Spark passes null cells to a Python UDF as None; calling len() on that
    would raise a TypeError and fail the whole job, so null maps to null.
    """
    return None if s is None else len(s)


# Wrap the Python function as a Spark UDF that returns an integer column.
# For string length alone the built-in sf.length() avoids the UDF overhead;
# the UDF is kept here to demonstrate the mechanism.
slen = sf.udf(_name_length, IntegerType())

(ddf.withColumn('name_length', slen(ddf.name))
    .show())

### *Exercise*

1. Explore the `ddf_air` DF, and count how many NaN's you have in each column;
2. Fill the NaN with something that makes sense for each column.
3. With a UDF, capture the state in the `airport_name` column (e.g. 'NY' in 'New York, NY: John F. Kennedy International') and
4. make a new dataframe `ddf_states` with columns `airport, state`
5. Remove duplicates from `ddf_states` (hint: lookup `drop_duplicates()` in the docs)
6. Join `ddf_states` onto the original `ddf_air`
7. Add a column `weather_condition` that is
```
'rainy' if the `weather_delay` is greater than 1200
'stormy' if in addition to this the arrival is diverted by more than 15 minutes
'bright' otherwise
```
8. Split the DF into a train and test set sorted by time cols (hint: lookup `limit()` or `randomSplit()` in the docs)

In [None]:
# Fresh load of the airlines dataset as the starting point for the exercise.
airlines_path = dir_root + 'data/airlines.parquet'
ddf_air = spark.read.load(airlines_path)

Columns mean:

* `arr_flights`: flights arrived
* `arr_del15`: flights delayed more than 15';
* `carrier_ct`: delayed by carrier;
* `weather_ct`: by weather;
* `nas_ct`: by national aviation system;
* `security_ct`: by security;
* `late_aircraft_ct`: by late aircraft arrival;
* `arr_cancelled`: cancelled;
* `arr_diverted`: diverted;
* `arr_delay`: total delay and then breakdown below;
* `carrier_delay`;
* `weather_delay`;
* `nas_delay`;
* `security_delay`;
* `late_aircraft_delay`.