### Setting up

In [None]:
import os
# Working directory with a trailing slash; the data paths further down are built from it.
dir_root = os.getcwd() + '/'
import findspark
findspark.init()  # called before the pyspark imports below

from pyspark.sql import SparkSession

# Get the active SparkSession, or create one if none exists yet.
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext  # SparkContext handle, used for the RDD examples

from pyspark.sql import functions as sf

# DataFrames

## Why Spark dataframes?

- __Schema (strongly typed)__

- __Optimizations via Catalyst__
    - like SQL query planner + physical plan
    - more declarative
    - e.g. filter push down


* __SPEED__

<img src="images/dataframe-api-speed.png" width="80%" />

- __Familiar to SQL users__
    - similar expressions

- __Simpler code__

*RDDs*
```python
(rdd.map(lambda x: (x[0], (x[1], 1)))
    .reduceByKey(sum_pair)
    .mapValues(lambda s: s[0] / s[1]))
```

Where

```python    
def sum_pair(x, y=None):
    """Add two (total, count) pairs element-wise.

    ``reduceByKey`` calls its reducer with two values, so the pairs are
    taken as two arguments. A single ``(x, y)`` tuple is still accepted
    for backward compatibility.
    """
    if y is None:
        x, y = x
    return (x[0] + y[0], x[1] + y[1])
```

*DataFrame API*
```python
(ddf.groupBy('name')
    .agg({'age': 'avg'})
    .collect())
```

## 1. Getting data

### 1.1 Create

#### From rdd to dataframe



In [None]:
# Build a small RDD of [age, name] rows; the first row has a missing age (None).
rdd = sc.parallelize([[None, 'Michael'],
                      [30, 'Andy'],
                      [19, 'Justin'],
                      [30, 'James Dr No From Russia with Love Bond']])
rdd.collect()  # return all rows as a Python list

In [None]:
rdd.toDF()

In [None]:
ddf = rdd.toDF() 
ddf.show()

#### Directly from python

In [None]:
# Create the DataFrame straight from Python lists; `schema` supplies the
# column names in row order (age first, then name).
ddf_names = spark.createDataFrame([[None, 'Michael'],
                             [30, 'Andy'],
                             [19, 'Justin'],
                             [30, 'James Dr No From Russia with Love Bond']], 
                            schema = ['age', 'name'])
ddf_names.show()

In [None]:
ddf_names.columns

In [None]:
ddf_names.dtypes

In [None]:
ddf_names.schema

In [None]:
ddf_names.printSchema()

#### Still an rdd

In [None]:
ddf_names.first()

In [None]:
# Increment every known age via the underlying RDD of Rows; a missing age
# (None) is passed through unchanged. Compare to None by identity (PEP 8).
ddf_names.rdd.map(lambda r: r['age'] + 1 if r['age'] is not None else r['age']).collect()

#### If your name is too long

Mr. David Feirn

```James Dr No From Russia with Love Goldfinger Thunderball You Only Live Twice On Her Majestys Secret Service Diamonds Are Forever Live and Let Die The Man with the Golden Gun The Spy Who Loved Me Moonraker For Your Eyes Only Octopussy A View to a Kill The Living Daylights Licence to Kill Golden Eye Tomorrow Never Dies The World Is Not Enough Die Another Day Casino Royale Bond```

In [None]:
ddf_names.show()

In [None]:
ddf_names.show(n=10, truncate=False)

### 1.2 Importing data

In [None]:
local_prepend = 'file://'
hdfs_prepend = 'hdfs://'

#### CSV

In [None]:
spark.read.csv(dir_root + 'data/heroes.csv').show(6)

In [None]:
# Same file as the previous cell, now read with header=True so the first
# line is used for the column names.
ddf_heroes = spark.read.csv(dir_root + 'data/heroes.csv', header=True)
ddf_heroes.show(20)

In [None]:
ddf_heroes.columns

In [None]:
[c for c in ddf_heroes.columns if c != '_c0']

In [None]:
ddf_heroes.select([c for c in ddf_heroes.columns if c != '_c0'])

#### Pandas

In [None]:
import pandas as pd

In [None]:
# Read the same CSV with pandas; index_col=0 uses the first column as the index.
df = pd.read_csv(dir_root + 'data/heroes.csv', index_col = 0)

In [None]:
ddf = spark.createDataFrame(df)
ddf.show(3)

In [None]:
ddf.toPandas().head(3)

#### Parquet
- preferred format
    - small file size (efficient compression)
    - schema
    - works across machines (unlike pickle)
   

In [None]:
ddf_airlines = spark.read.parquet(dir_root + 'data/airlines.parquet')
ddf_airlines.show(2)

#### Other formats

* JDBC;
* HDFS;
* Avro*;
* HBase*;
* Cassandra*;
* etc.

${}^*$ external

## 2. Basics (do stuff)
- `filter`
- `select`
- `sort`
- `groupBy`

### 2.1 select()
- select columns
- make new columns

#### Selecting existing cols

In [None]:
from pyspark.sql import functions as sf # Spark functions

In [None]:
# Four ways to reference the same column; the result has four `name` columns.
(ddf_names.select('name', # plain string: use this when possible
            ddf_names.name, # attribute access: avoid
            ddf_names['name'], # item access on the DataFrame: avoid
            sf.col('name')) # column expression: use in all other cases
    .show())

#### Create new columns

In [None]:
ddf_names.age

In [None]:
ddf_names['age']

In [None]:
# Keep every existing column ('*') and append two derived ones.
(ddf_names.select('*',
            ddf_names['age'] + 1, # arithmetic on a Column gives a new Column
            sf.sqrt('age')) # apply Spark function
    .show())

In [None]:
# No alias is given for the derived column; inspect the name Spark assigned to it.
ddf_names2 = ddf_names.select(ddf_names.age + 1)
ddf_names2.columns

In [None]:
# The string below is used as a literal column name (the auto-generated one),
# which is awkward to type -- motivation for naming new columns explicitly.
(ddf_names2.select('(age + 1)')
     .show())

#### Hive

In [None]:
# Expose the DataFrame to Spark SQL under the name 'ddf_names'.
# createOrReplaceTempView replaces registerTempTable, deprecated since Spark 2.0.
ddf_names.createOrReplaceTempView('ddf_names')

In [None]:
# spark.sql() returns a DataFrame. Keep that in the variable and display it in
# a separate step: .show() only prints and returns None.
ddf_names_sql = spark.sql("""
SELECT age FROM ddf_names
""")
ddf_names_sql.show()

#### Naming new columns

In [None]:
(ddf_names.select((ddf_names.age + 1).alias('age_inc')) 
    .show())

In [None]:
# Selects all columns plus 'age' a second time, so two columns share the name 'age'.
ddf_names.select('*', 'age')

In [None]:
# withColumn names the new column directly; the result is not assigned, only displayed.
(ddf_names.withColumn('age_inc', ddf_names.age + 1))

#### *Intermezzo: query formatting*

Pythonistas
```python
(
    ddf.groupBy('name')
    .agg({'age': 'avg'})
    .collect()
)
```

Some others
```python
(ddf.groupBy('name')
    .agg({'age': 'avg'})
    .collect())
```

Another alternative
```python
(ddf
 .groupBy('name')
 .agg({'age': 'avg'})
 .collect())
```

### 2.2 sort()

In [None]:
(ddf_names
 .sort('age')
 .show())

In [None]:
(ddf_names
 .sort(sf.col('age').desc())
 .show())

### 2.3 filter()

In [None]:
(ddf_names
 .filter(sf.col('age') > 21)
 .show())

In [None]:
# Combine conditions with & (not `and`). Each comparison needs its own
# parentheses because & binds more tightly than > and != in Python.
(ddf_names
 .filter((sf.col('age') > 21) &
         (sf.col('name') != 'Andy'))
 .show())

### 2.4 groupBy() -> agg() (aggregate)

In [None]:
ddf_names.show()

In [None]:
(ddf_names
 .groupBy("age")
 .count()
 .show())

In [None]:
# Hint: the argument is an ordinary Python dict literal with the key 'age'
# repeated three times; evaluate the literal on its own to see what is
# actually passed to agg().
(ddf_names
 .groupBy("age")
 .agg({'age': 'max', 'age': 'first', 'age': 'stddev'}) # why does this happen?
 .show())

In [None]:
{'age': 'max', 'age': 'first', 'age': 'stddev'}

In [None]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

In [None]:
schema = StructType([StructField('age', StringType()), StructField('name', StringType())])

In [None]:
# Rebuild the DataFrame from the underlying RDD of Rows, this time applying
# the explicit schema in which both columns are declared as strings.
ddf_names_str = spark.createDataFrame(ddf_names.rdd, schema = schema)
ddf_names_str.printSchema()

In [None]:
# One first() aggregate per column, aliased as '<column>_first_agg'.
aggregate_fct = [sf.first(c).alias(c + '_first_agg') for c in ['age', 'name']]
aggregate_fct

In [None]:
# Passing aggregate expressions instead of a dict allows several aggregates
# on the same column, each with its own alias.
(ddf_names_str
 .groupBy("age")
 .agg(sf.max('age').alias('max_age'),
      sf.first('age').alias('first_age'),
      sf.stddev('age').alias('stddev_age'),
      *aggregate_fct)  # unpack the list into extra positional arguments
 .show())

In [None]:
# Add an integer-typed copy of the string 'age' column; the result is only displayed, not assigned.
ddf_names_str.withColumn('age_int',sf.col('age').cast('Int'))

### *Exercises*
1. Load the Heroes of the Storm dataset with read csv ('data/heroes.csv' in the hdfs homedir of user centos)
2. Check the dtypes:
    - What do you notice?
    - Fix it.
3. Manually explore the data and look for corrupted/malformed data
4. Find a way to remove these corrupted rows
5. Which hero has the most hp?
6. Add a column with the 'attack_momentum', computed as attack * attack_spd
7. Which role on average has the highest attack?
8. Figure out which roles and attack_type frequently co-occur
9. Deliver a dataframe with the highest attack per role
10. Export to Pandas

Bonus

11. Make a function that accepts a dataframe and a list of column names. Let it return the mean and stddev of the columns
12. Apply the function to the hp and attack columns such that the result has columns:

`hp_mean, hp_stddev, attack_mean, attack_stddev`