# Spark API Mini Exercises

In [28]:
import pandas as pd
import numpy as np
from pyspark.sql.functions import *

import pyspark

spark = pyspark.sql.SparkSession.builder.getOrCreate()

np.random.seed(13)

#### 1. Spark Dataframe Basics

i. Use the starter code below to create a pandas dataframe (just run the cell):

In [6]:
pandas_dataframe = pd.DataFrame({
    "n": np.random.randn(20),
    "group": np.random.choice(list("xyz"), 20),
    "abool": np.random.choice([True, False], 20),
})

ii. Convert the pandas dataframe to a spark dataframe. From this point forward, do all of your work with the spark dataframe, not the pandas dataframe.

In [14]:
df = spark.createDataFrame(pandas_dataframe)
df.show(5)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  1.3451017084510097|    z|false|
+--------------------+-----+-----+
only showing top 5 rows



iii. Show the first 3 rows of the dataframe.

In [13]:
df.show(3)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
+--------------------+-----+-----+
only showing top 3 rows



iv. Show the first 7 rows of the dataframe.

In [15]:
df.show(7)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  1.3451017084510097|    z|false|
|  0.5323378882945463|    y|false|
|  1.3501878997225267|    z|false|
+--------------------+-----+-----+
only showing top 7 rows



v. View a summary of the data using `.describe()`.
> Note that `.describe` returns another dataframe, so we still have to do `.show()` at the end.

In [19]:
df.describe().show()

+-------+------------------+-----+
|summary|                 n|group|
+-------+------------------+-----+
|  count|                20|   20|
|   mean|0.3664026449885216| null|
| stddev|0.8905322898155364| null|
|    min|-1.261605945319069|    x|
|    max|2.1503829673811126|    z|
+-------+------------------+-----+



vi. Use `.select()` to create a new dataframe with just the `n` and `abool` columns. View the first 5 rows of this dataframe.

In [20]:
df.select(df.n, df.abool).show(5)

+--------------------+-----+
|                   n|abool|
+--------------------+-----+
|  -0.712390662050588|false|
|   0.753766378659703|false|
|-0.04450307833805...|false|
| 0.45181233874578974|false|
|  1.3451017084510097|false|
+--------------------+-----+
only showing top 5 rows



vii. Use `.select()` to create a new dataframe with just the `group` and `abool` columns. View the first 5 rows of this dataframe.

In [21]:
df.select(df.group, df.abool).show(5)

+-----+-----+
|group|abool|
+-----+-----+
|    z|false|
|    x|false|
|    z|false|
|    y|false|
|    z|false|
+-----+-----+
only showing top 5 rows



viii. Use `.select()` to create a new dataframe with the `group` column and the `abool` column renamed to `a_boolean_value`. Show the first 3 rows of this dataframe.

In [22]:
df.select(df.group, df.abool.alias('a_boolean_value')).show(3)

+-----+---------------+
|group|a_boolean_value|
+-----+---------------+
|    z|          false|
|    x|          false|
|    z|          false|
+-----+---------------+
only showing top 3 rows



ix. Use `.select()` to create a new dataframe with the `group` column and the `n` column renamed to `a_numeric_value`. Show the first 6 rows of this dataframe.

In [23]:
df.select(df.group, df.n.alias('a_numeric_value')).show(6)

+-----+--------------------+
|group|     a_numeric_value|
+-----+--------------------+
|    z|  -0.712390662050588|
|    x|   0.753766378659703|
|    z|-0.04450307833805...|
|    y| 0.45181233874578974|
|    z|  1.3451017084510097|
|    y|  0.5323378882945463|
+-----+--------------------+
only showing top 6 rows



#### 2. Column Manipulation

i. Use the starter code above to re-create a spark dataframe. Store the spark dataframe in a variable named `df`

In [26]:
pandas_dataframe = pd.DataFrame({
    "n": np.random.randn(20),
    "group": np.random.choice(list("xyz"), 20),
    "abool": np.random.choice([True, False], 20),
})
df = spark.createDataFrame(pandas_dataframe)
df.show(5)

+-------------------+-----+-----+
|                  n|group|abool|
+-------------------+-----+-----+
|-0.8850620992868307|    x|false|
|0.07272674611277782|    x| true|
|  -0.82751910119974|    x|false|
| -0.591550921883219|    y|false|
| -2.186215625579764|    y| true|
+-------------------+-----+-----+
only showing top 5 rows



ii. Use `select()` to add 4 to the `n` column. Show the results.

In [38]:
(df.select(expr('n + 4 AS n_plus_4'))
).show(5)

+------------------+
|          n_plus_4|
+------------------+
|3.1149379007131692|
| 4.072726746112778|
|  3.17248089880026|
| 3.408449078116781|
|1.8137843744202362|
+------------------+
only showing top 5 rows



iii. Subtract 5 from the `n` column and view the results.

In [39]:
(df.select(expr('n -5 AS n_minus_5'))
).show(5)

+------------------+
|         n_minus_5|
+------------------+
|-5.885062099286831|
|-4.927273253887222|
| -5.82751910119974|
|-5.591550921883219|
|-7.186215625579764|
+------------------+
only showing top 5 rows



iv. Multiply the `n` column by 2. View the results along with the original numbers.

In [40]:
(df.select(expr('n * 2 AS n_times_2'))
).show(5)

+-------------------+
|          n_times_2|
+-------------------+
|-1.7701241985736613|
|0.14545349222555565|
|  -1.65503820239948|
| -1.183101843766438|
| -4.372431251159528|
+-------------------+
only showing top 5 rows



v. Add a new column named `n2` that is the `n` value multiplied by -1. Show the first 4 rows of your dataframe. You should see the original `n` value as well as `n2`.

In [41]:
(df.select(expr('n'),
           expr('n * -1 AS n2'))
).show(4)

+-------------------+--------------------+
|                  n|                  n2|
+-------------------+--------------------+
|-0.8850620992868307|  0.8850620992868307|
|0.07272674611277782|-0.07272674611277782|
|  -0.82751910119974|    0.82751910119974|
| -0.591550921883219|   0.591550921883219|
+-------------------+--------------------+
only showing top 4 rows



vi. Add a new column named `n3` that is the `n` value squared. Show the first 5 rows of your dataframe. You should see both `n`, `n2`, and `n3`.

In [45]:
(df.select(expr('n'),
           expr('n * -1 AS n2'),
           expr('pow(n, 2) AS n3'))
).show(5)

+-------------------+--------------------+--------------------+
|                  n|                  n2|                  n3|
+-------------------+--------------------+--------------------+
|-0.8850620992868307|  0.8850620992868307|  0.7833349195940117|
|0.07272674611277782|-0.07272674611277782|0.005289179600152444|
|  -0.82751910119974|    0.82751910119974|  0.6847878628504256|
| -0.591550921883219|   0.591550921883219| 0.34993249318088626|
| -2.186215625579764|   2.186215625579764|   4.779538761529118|
+-------------------+--------------------+--------------------+
only showing top 5 rows



vii. What happens when you run the code below?

In [46]:
df.group + df.abool

Column<'(group + abool)'>

**A**: A Column object is produced that represents the transformation of adding together the `group` and `abool` columns.

viii. What happens when you run the code below? What is the difference between this and the previous code sample?

In [48]:
df.select(df.group + df.abool)

AnalysisException: cannot resolve '(CAST(group AS DOUBLE) + abool)' due to data type mismatch: differing types in '(CAST(group AS DOUBLE) + abool)' (double and boolean).;
'Project [unresolvedalias((cast(group#739 as double) + abool#740), Some(org.apache.spark.sql.Column$$Lambda$3381/0x00000008013b4040@6628ea39))]
+- LogicalRDD [n#738, group#739, abool#740], false


An error is produced referencing the incompatible types. Unlike the previous code sample, this one is done within the context of a `.select`, so even though there are still no values produced (we haven't invoked an action yet), spark is aware that the types are incompatible.

ix. Try adding various other columns together. What are the results of combining the different data types?

In [49]:
df.dtypes

[('n', 'double'), ('group', 'string'), ('abool', 'boolean')]

In [50]:
df.n + df.group

Column<'(n + group)'>

In [51]:
df.n + df.abool

Column<'(n + abool)'>

#### 3. Type Casting

i. Use the starter code above to re-create a spark dataframe named `df`.

In [63]:
pandas_dataframe = pd.DataFrame({
    "n": np.random.randn(20),
    "group": np.random.choice(list("xyz"), 20),
    "abool": np.random.choice([True, False], 20),
})
df = spark.createDataFrame(pandas_dataframe)
df.show(5)

+-------------------+-----+-----+
|                  n|group|abool|
+-------------------+-----+-----+
|-0.8850620992868307|    x|false|
|0.07272674611277782|    x| true|
|  -0.82751910119974|    x|false|
| -0.591550921883219|    y|false|
| -2.186215625579764|    y| true|
+-------------------+-----+-----+
only showing top 5 rows



ii. Use `.printSchema()` to view the datatypes in your dataframe.

In [64]:
df.printSchema()

root
 |-- n: double (nullable = true)
 |-- group: string (nullable = true)
 |-- abool: boolean (nullable = true)



iii. Use `.dtypes` to view the datatypes in your dataframe.

In [65]:
df.dtypes

[('n', 'double'), ('group', 'string'), ('abool', 'boolean')]

iv. What is the difference between the two code samples below?

In [66]:
df.abool.cast('int')

Column<'CAST(abool AS INT)'>

In [67]:
df.select(df.abool.cast('int')).show()

+-----+
|abool|
+-----+
|    0|
|    1|
|    0|
|    0|
|    1|
|    1|
|    1|
|    1|
|    0|
|    1|
|    0|
|    1|
|    1|
|    0|
|    1|
|    0|
|    0|
|    1|
|    0|
|    1|
+-----+



**A:** One is a creating a Column and one is using that same column in a `.select()` in order to view the results of the cast.

v. Use `.select()` and `.cast()` to convert the abool column to an integer type. View the results.

In [68]:
df.select(df.abool.cast('int')).show(5)

+-----+
|abool|
+-----+
|    0|
|    1|
|    0|
|    0|
|    1|
+-----+
only showing top 5 rows



vi. Convert the `group` column to a integer data type and view the results. What happens?

In [69]:
df.select(df.group.cast('int')).show(5)

+-----+
|group|
+-----+
| null|
| null|
| null|
| null|
| null|
+-----+
only showing top 5 rows



vii. Convert the `n` column to a integer data type and view the results. What happens?

In [70]:
df.select(df.n.cast('int')).show(5)

+---+
|  n|
+---+
|  0|
|  0|
|  0|
|  0|
| -2|
+---+
only showing top 5 rows



viii. Convert the `abool` column to a string data type and view the results. What happens?

In [71]:
df.select(df.abool.cast('string')).show(5)

+-----+
|abool|
+-----+
|false|
| true|
|false|
|false|
| true|
+-----+
only showing top 5 rows



#### 4. Built-in Functions

i. Use the starter code above to re-create a spark dataframe named `df`.

In [72]:
pandas_dataframe = pd.DataFrame({
    "n": np.random.randn(20),
    "group": np.random.choice(list("xyz"), 20),
    "abool": np.random.choice([True, False], 20),
})
df = spark.createDataFrame(pandas_dataframe)
df.show(5)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
| -0.7209996734353815|    z| true|
|  0.5622617060581323|    x|false|
| -0.3148580761325759|    z| true|
|-0.42989708186880404|    z| true|
|  0.8149954430712237|    x| true|
+--------------------+-----+-----+
only showing top 5 rows



ii. Import the necessary functions from `pyspark.sql.functions`

In [73]:
from pyspark.sql.functions import *

iii. Find the highest `n` value.

In [79]:
df.select(round(max('n'), 2).alias('max value of n')).show()

+--------------+
|max value of n|
+--------------+
|          1.75|
+--------------+



iv. Find the lowest `n` value.

In [85]:
df.select(round(min('n'), 2).alias('min value of n')).show()

+--------------+
|min value of n|
+--------------+
|         -2.19|
+--------------+



v. Find the average `n` value.

In [90]:
df.select(round(avg('n'), 2).alias('average value of n')).show()

+------------------+
|average value of n|
+------------------+
|               0.0|
+------------------+



vi. Use `concat()` to change the group column to say "Group: x" or "Group: y"

In [92]:
df.withColumnRenamed('group', 'group: x').show(5)

+--------------------+--------+-----+
|                   n|group: x|abool|
+--------------------+--------+-----+
| -0.7209996734353815|       z| true|
|  0.5622617060581323|       x|false|
| -0.3148580761325759|       z| true|
|-0.42989708186880404|       z| true|
|  0.8149954430712237|       x| true|
+--------------------+--------+-----+
only showing top 5 rows



vii. Use `concat()` to combine the `n` and `group` columns to produce results that look like this: "x: -1.432" or "z: 2.352"

In [102]:
(df
 .select(concat(df.group, 
                lit(': '), 
                round(df.n, 
                4))
                .alias('group: n')).show(5, truncate = False))

+----------+
|group: n  |
+----------+
|z: -0.721 |
|x: 0.5623 |
|z: -0.3149|
|z: -0.4299|
|x: 0.815  |
+----------+
only showing top 5 rows



#### 5. When / Otherwise

i. Use the starter code above to re-create a spark dataframe named `df`.

In [103]:
pandas_dataframe = pd.DataFrame({
    "n": np.random.randn(20),
    "group": np.random.choice(list("xyz"), 20),
    "abool": np.random.choice([True, False], 20),
})
df = spark.createDataFrame(pandas_dataframe)
df.show(5)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  0.5345941466677115|    x| true|
|  0.8231227651701564|    x|false|
|  0.8862891040333676|    x|false|
| 0.17266171218872037|    z| true|
|-0.21312889268996238|    y|false|
+--------------------+-----+-----+
only showing top 5 rows



ii. Use `when()` and `.otherwise()` to create a column that contains the text "It is true" when abool is true and "It is false"" when abool is false.

In [105]:
(df.select(df.abool,
           when(df.abool == 'true', 'It is true')
           .otherwise('It is false')
           .alias('Is it t/f?')).show(5))

+-----+-----------+
|abool| Is it t/f?|
+-----+-----------+
| true| It is true|
|false|It is false|
|false|It is false|
| true| It is true|
|false|It is false|
+-----+-----------+
only showing top 5 rows



iii. Create a column that contains 0 if n is less than 0, otherwise, the original `n` value.

In [107]:
(df.select(df.n,
           when(df.n < 0, 0)
           .otherwise(df.n)
           .alias('zero if n is less than 0')).show(5))

+--------------------+------------------------+
|                   n|zero if n is less than 0|
+--------------------+------------------------+
|  0.5345941466677115|      0.5345941466677115|
|  0.8231227651701564|      0.8231227651701564|
|  0.8862891040333676|      0.8862891040333676|
| 0.17266171218872037|     0.17266171218872037|
|-0.21312889268996238|                     0.0|
+--------------------+------------------------+
only showing top 5 rows



#### 6. Filter / Where

i. Use the starter code above to re-create a spark dataframe named `df`.

In [108]:
pandas_dataframe = pd.DataFrame({
    "n": np.random.randn(20),
    "group": np.random.choice(list("xyz"), 20),
    "abool": np.random.choice([True, False], 20),
})
df = spark.createDataFrame(pandas_dataframe)
df.show(5)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  1.2636494824990419|    z| true|
|  1.5708990002014271|    y|false|
| -1.2615486912059428|    x|false|
|-0.07508917570719996|    y| true|
|  0.8501070283110412|    z|false|
+--------------------+-----+-----+
only showing top 5 rows



ii. Use `.filter()` or `.where()` to select just the rows where the group is y and view the results.

In [114]:
(df.filter(df.group == 'y').show()
)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  1.5708990002014271|    y|false|
|-0.07508917570719996|    y| true|
| -0.8673479540502249|    y| true|
| -0.7749046025022026|    y|false|
|  1.2732503628768688|    y| true|
|  0.9105807193557125|    y| true|
| -0.5484703109458559|    y|false|
+--------------------+-----+-----+



In [112]:
(df.where(df.group == 'y').show()
)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  1.5708990002014271|    y|false|
|-0.07508917570719996|    y| true|
| -0.8673479540502249|    y| true|
| -0.7749046025022026|    y|false|
|  1.2732503628768688|    y| true|
|  0.9105807193557125|    y| true|
| -0.5484703109458559|    y|false|
+--------------------+-----+-----+



iii. Select just the columns where the `abool` column is false and view the results.

In [115]:
df.where(df.abool == 'false').show()

+-------------------+-----+-----+
|                  n|group|abool|
+-------------------+-----+-----+
| 1.5708990002014271|    y|false|
|-1.2615486912059428|    x|false|
| 0.8501070283110412|    z|false|
| 0.7731666960765334|    z|false|
|-0.7749046025022026|    y|false|
| 1.0112322458862237|    x|false|
| 0.9416459862960309|    x|false|
|-0.7699411266400571|    z|false|
|-1.1303179237441572|    x|false|
| 0.9254890406588132|    x|false|
|-0.5484703109458559|    y|false|
|-0.9795224677574916|    z|false|
+-------------------+-----+-----+



iv. Find the columns where the group column is not y.

In [116]:
df.filter(df.group != 'y').show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  1.2636494824990419|    z| true|
| -1.2615486912059428|    x|false|
|  0.8501070283110412|    z|false|
| -0.9150316963547941|    x| true|
|  0.7731666960765334|    z|false|
|  1.0112322458862237|    x|false|
|  0.9416459862960309|    x|false|
| -0.7699411266400571|    z|false|
| -1.1303179237441572|    x|false|
|  0.9254890406588132|    x|false|
|-0.04059007575872...|    x| true|
| -1.6774958544832872|    z| true|
| -0.9795224677574916|    z|false|
+--------------------+-----+-----+



v. Find the columns where `n` is positive.

In [117]:
df.where(df.n > 0).show()

+------------------+-----+-----+
|                 n|group|abool|
+------------------+-----+-----+
|1.2636494824990419|    z| true|
|1.5708990002014271|    y|false|
|0.8501070283110412|    z|false|
|0.7731666960765334|    z|false|
|1.0112322458862237|    x|false|
|0.9416459862960309|    x|false|
|0.9254890406588132|    x|false|
|1.2732503628768688|    y| true|
|0.9105807193557125|    y| true|
+------------------+-----+-----+



vi. Find the columns where `abool` is true and the group column is z.

In [120]:
df.filter(df.abool == 'true').where(df.group == 'z').show()

+-------------------+-----+-----+
|                  n|group|abool|
+-------------------+-----+-----+
| 1.2636494824990419|    z| true|
|-1.6774958544832872|    z| true|
+-------------------+-----+-----+



vii. Find the columns where `abool` is true or the `group` column is z.

In [124]:
df.where((df.abool == 'true')|(df.group == 'z')).show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  1.2636494824990419|    z| true|
|-0.07508917570719996|    y| true|
|  0.8501070283110412|    z|false|
| -0.9150316963547941|    x| true|
| -0.8673479540502249|    y| true|
|  0.7731666960765334|    z|false|
| -0.7699411266400571|    z|false|
|  1.2732503628768688|    y| true|
|-0.04059007575872...|    x| true|
|  0.9105807193557125|    y| true|
| -1.6774958544832872|    z| true|
| -0.9795224677574916|    z|false|
+--------------------+-----+-----+



viii. Find the columns where `abool` is false and `n` is less than 1

In [125]:
df.where(df.abool == 'false').filter(df.n < 1).show()

+-------------------+-----+-----+
|                  n|group|abool|
+-------------------+-----+-----+
|-1.2615486912059428|    x|false|
| 0.8501070283110412|    z|false|
| 0.7731666960765334|    z|false|
|-0.7749046025022026|    y|false|
| 0.9416459862960309|    x|false|
|-0.7699411266400571|    z|false|
|-1.1303179237441572|    x|false|
| 0.9254890406588132|    x|false|
|-0.5484703109458559|    y|false|
|-0.9795224677574916|    z|false|
+-------------------+-----+-----+



ix. Find the columns where `abool` is false or `n` is less than 1

In [126]:
df.where((df.abool == 'false')|(df.n < 1)).show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  1.5708990002014271|    y|false|
| -1.2615486912059428|    x|false|
|-0.07508917570719996|    y| true|
|  0.8501070283110412|    z|false|
| -0.9150316963547941|    x| true|
| -0.8673479540502249|    y| true|
|  0.7731666960765334|    z|false|
| -0.7749046025022026|    y|false|
|  1.0112322458862237|    x|false|
|  0.9416459862960309|    x|false|
| -0.7699411266400571|    z|false|
| -1.1303179237441572|    x|false|
|  0.9254890406588132|    x|false|
|-0.04059007575872...|    x| true|
|  0.9105807193557125|    y| true|
| -1.6774958544832872|    z| true|
| -0.5484703109458559|    y|false|
| -0.9795224677574916|    z|false|
+--------------------+-----+-----+



#### 7. Sorting

i. Use the starter code above to re-create a spark dataframe named `df`.

In [127]:
pandas_dataframe = pd.DataFrame({
    "n": np.random.randn(20),
    "group": np.random.choice(list("xyz"), 20),
    "abool": np.random.choice([True, False], 20),
})
df = spark.createDataFrame(pandas_dataframe)
df.show(5)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  1.2690580976875576|    x| true|
|  0.2572490050399525|    z|false|
| -1.6642564360602479|    y|false|
| -0.1217464334245328|    y|false|
|-0.44151223119024857|    x| true|
+--------------------+-----+-----+
only showing top 5 rows



ii. Sort by the `n` value.

In [128]:
df.sort(df.n).show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
| -1.6642564360602479|    y|false|
| -1.5509569211976624|    x| true|
| -1.4141544662915098|    z|false|
| -1.2768778466596493|    x| true|
| -1.0669628403806435|    y|false|
| -0.6625811650314397|    x| true|
|-0.44151223119024857|    x| true|
| -0.3848490502028002|    y|false|
| -0.3051810275934747|    z| true|
| -0.1217464334245328|    y|false|
| 0.01997092245414585|    y| true|
|0.041447402165011596|    y| true|
| 0.12673002941390707|    x|false|
|  0.2572490050399525|    z|false|
|  0.5253295997343811|    z| true|
|  0.7435452461317569|    z| true|
|  0.8628168935353443|    z| true|
|  0.9552466924396553|    x|false|
|  1.2690580976875576|    x| true|
|  3.4011057189806557|    z| true|
+--------------------+-----+-----+



iii. Sort by the `group` value, both ascending and descending.

In [131]:
df.sort(asc(df.group)).show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|-0.44151223119024857|    x| true|
|  1.2690580976875576|    x| true|
| 0.12673002941390707|    x|false|
|  0.9552466924396553|    x|false|
| -0.6625811650314397|    x| true|
| -1.5509569211976624|    x| true|
| -1.2768778466596493|    x| true|
| -1.0669628403806435|    y|false|
| -0.3848490502028002|    y|false|
| 0.01997092245414585|    y| true|
| -1.6642564360602479|    y|false|
| -0.1217464334245328|    y|false|
|0.041447402165011596|    y| true|
| -0.3051810275934747|    z| true|
|  3.4011057189806557|    z| true|
|  0.7435452461317569|    z| true|
|  0.2572490050399525|    z|false|
|  0.5253295997343811|    z| true|
|  0.8628168935353443|    z| true|
| -1.4141544662915098|    z|false|
+--------------------+-----+-----+



In [132]:
df.sort(desc(df.group)).show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
| -1.4141544662915098|    z|false|
| -0.3051810275934747|    z| true|
|  0.7435452461317569|    z| true|
|  3.4011057189806557|    z| true|
|  0.5253295997343811|    z| true|
|  0.8628168935353443|    z| true|
|  0.2572490050399525|    z|false|
| 0.01997092245414585|    y| true|
| -0.3848490502028002|    y|false|
| -1.6642564360602479|    y|false|
| -0.1217464334245328|    y|false|
| -1.0669628403806435|    y|false|
|0.041447402165011596|    y| true|
| 0.12673002941390707|    x|false|
|-0.44151223119024857|    x| true|
| -0.6625811650314397|    x| true|
| -1.2768778466596493|    x| true|
|  1.2690580976875576|    x| true|
|  0.9552466924396553|    x|false|
| -1.5509569211976624|    x| true|
+--------------------+-----+-----+



iv. Sort by the `group` value first, then, within each group, sort by `n` value.

In [134]:
df.sort(df.group, df.n).show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
| -1.5509569211976624|    x| true|
| -1.2768778466596493|    x| true|
| -0.6625811650314397|    x| true|
|-0.44151223119024857|    x| true|
| 0.12673002941390707|    x|false|
|  0.9552466924396553|    x|false|
|  1.2690580976875576|    x| true|
| -1.6642564360602479|    y|false|
| -1.0669628403806435|    y|false|
| -0.3848490502028002|    y|false|
| -0.1217464334245328|    y|false|
| 0.01997092245414585|    y| true|
|0.041447402165011596|    y| true|
| -1.4141544662915098|    z|false|
| -0.3051810275934747|    z| true|
|  0.2572490050399525|    z|false|
|  0.5253295997343811|    z| true|
|  0.7435452461317569|    z| true|
|  0.8628168935353443|    z| true|
|  3.4011057189806557|    z| true|
+--------------------+-----+-----+



v. Sort by `abool`, `group`, and `n`. Does it matter in what order you specify the columns when sorting?

In [135]:
df.sort(df.abool, df.group, df.n).show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
| 0.12673002941390707|    x|false|
|  0.9552466924396553|    x|false|
| -1.6642564360602479|    y|false|
| -1.0669628403806435|    y|false|
| -0.3848490502028002|    y|false|
| -0.1217464334245328|    y|false|
| -1.4141544662915098|    z|false|
|  0.2572490050399525|    z|false|
| -1.5509569211976624|    x| true|
| -1.2768778466596493|    x| true|
| -0.6625811650314397|    x| true|
|-0.44151223119024857|    x| true|
|  1.2690580976875576|    x| true|
| 0.01997092245414585|    y| true|
|0.041447402165011596|    y| true|
| -0.3051810275934747|    z| true|
|  0.5253295997343811|    z| true|
|  0.7435452461317569|    z| true|
|  0.8628168935353443|    z| true|
|  3.4011057189806557|    z| true|
+--------------------+-----+-----+



In [136]:
df.sort(df.n, df.abool, df.group).show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
| -1.6642564360602479|    y|false|
| -1.5509569211976624|    x| true|
| -1.4141544662915098|    z|false|
| -1.2768778466596493|    x| true|
| -1.0669628403806435|    y|false|
| -0.6625811650314397|    x| true|
|-0.44151223119024857|    x| true|
| -0.3848490502028002|    y|false|
| -0.3051810275934747|    z| true|
| -0.1217464334245328|    y|false|
| 0.01997092245414585|    y| true|
|0.041447402165011596|    y| true|
| 0.12673002941390707|    x|false|
|  0.2572490050399525|    z|false|
|  0.5253295997343811|    z| true|
|  0.7435452461317569|    z| true|
|  0.8628168935353443|    z| true|
|  0.9552466924396553|    x|false|
|  1.2690580976875576|    x| true|
|  3.4011057189806557|    z| true|
+--------------------+-----+-----+



In [137]:
df.sort(df.group, df.n, df.abool).show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
| -1.5509569211976624|    x| true|
| -1.2768778466596493|    x| true|
| -0.6625811650314397|    x| true|
|-0.44151223119024857|    x| true|
| 0.12673002941390707|    x|false|
|  0.9552466924396553|    x|false|
|  1.2690580976875576|    x| true|
| -1.6642564360602479|    y|false|
| -1.0669628403806435|    y|false|
| -0.3848490502028002|    y|false|
| -0.1217464334245328|    y|false|
| 0.01997092245414585|    y| true|
|0.041447402165011596|    y| true|
| -1.4141544662915098|    z|false|
| -0.3051810275934747|    z| true|
|  0.2572490050399525|    z|false|
|  0.5253295997343811|    z| true|
|  0.7435452461317569|    z| true|
|  0.8628168935353443|    z| true|
|  3.4011057189806557|    z| true|
+--------------------+-----+-----+



**A:** It does matter as it determines in what order they will be sorted. When the values for the first specified column are the same, the next specified column will determine sort order.

#### 8. Spark SQL

i. Use the starter code above to re-create a spark dataframe named `df`.

In [138]:
pandas_dataframe = pd.DataFrame({
    "n": np.random.randn(20),
    "group": np.random.choice(list("xyz"), 20),
    "abool": np.random.choice([True, False], 20),
})
df = spark.createDataFrame(pandas_dataframe)
df.show(5)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|-0.01842180913757...|    y| true|
| 0.10767464455391365|    z| true|
|-0.15579970816999872|    y| true|
| -1.4634777566193453|    x|false|
| -0.5334945027167617|    x|false|
+--------------------+-----+-----+
only showing top 5 rows



ii. Turn your dataframe into a table that can be queried with spark SQL. Name the table `my_df`. Answer the rest of the questions in this section with a spark sql query (`spark.sql`) against `my_df`. After each step, view the first 7 records from the dataframe.


In [144]:
df.createOrReplaceTempView('my_df')

iii. Write a query that shows all of the columns from your dataframe.

In [145]:
spark.sql("""
SELECT * FROM my_df
""").show(7)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|-0.01842180913757...|    y| true|
| 0.10767464455391365|    z| true|
|-0.15579970816999872|    y| true|
| -1.4634777566193453|    x|false|
| -0.5334945027167617|    x|false|
| -1.9323560015091858|    z| true|
|  0.9399792195744116|    x| true|
+--------------------+-----+-----+
only showing top 7 rows



iv. Write a query that shows just the `n` and `abool` columns from the dataframe.

In [147]:
spark.sql("""
SELECT my_df.n, my_df.abool FROM my_df
""").show(7, truncate = False)

+---------------------+-----+
|n                    |abool|
+---------------------+-----+
|-0.018421809137576996|true |
|0.10767464455391365  |true |
|-0.15579970816999872 |true |
|-1.4634777566193453  |false|
|-0.5334945027167617  |false|
|-1.9323560015091858  |true |
|0.9399792195744116   |true |
+---------------------+-----+
only showing top 7 rows



v. Write a query that shows just the `n` and `group` columns. Rename the `group` column to `g`.

In [150]:
spark.sql("""
SELECT my_df.n, my_df.group AS g FROM my_df
""").show(7, truncate = False)

+---------------------+---+
|n                    |g  |
+---------------------+---+
|-0.018421809137576996|y  |
|0.10767464455391365  |z  |
|-0.15579970816999872 |y  |
|-1.4634777566193453  |x  |
|-0.5334945027167617  |x  |
|-1.9323560015091858  |z  |
|0.9399792195744116   |x  |
+---------------------+---+
only showing top 7 rows



vi. Write a query that selects `n`, and creates two new columns: `n2`, the original `n` values halved, and `n3`: the original `n` values minus 1.

In [153]:
spark.sql("""
SELECT my_df.n, my_df.n/2 AS n2, my_df.n - 1 AS n3 FROM my_df
""").show(7, truncate = False)

+---------------------+---------------------+--------------------+
|n                    |n2                   |n3                  |
+---------------------+---------------------+--------------------+
|-0.018421809137576996|-0.009210904568788498|-1.018421809137577  |
|0.10767464455391365  |0.053837322276956825 |-0.8923253554460864 |
|-0.15579970816999872 |-0.07789985408499936 |-1.1557997081699987 |
|-1.4634777566193453  |-0.7317388783096727  |-2.463477756619345  |
|-0.5334945027167617  |-0.26674725135838084 |-1.5334945027167617 |
|-1.9323560015091858  |-0.9661780007545929  |-2.932356001509186  |
|0.9399792195744116   |0.4699896097872058   |-0.06002078042558845|
+---------------------+---------------------+--------------------+
only showing top 7 rows



vii. What happens if you make a SQL syntax error in your query?

In [154]:
spark.sql("""
SELECT my_df.n, my_df.n/2 AS n2, my_df.n - 1 AS n3
""").show(7, truncate = False)

AnalysisException: cannot resolve 'my_df.n' given input columns: []; line 2 pos 7;
'Project ['my_df.n, ('my_df.n / 2) AS n2#1722, ('my_df.n - 1) AS n3#1723]
+- OneRowRelation


#### 9. Aggregating

i. Use the starter code above to re-create a spark dataframe named `df`.

In [155]:
pandas_dataframe = pd.DataFrame({
    "n": np.random.randn(20),
    "group": np.random.choice(list("xyz"), 20),
    "abool": np.random.choice([True, False], 20),
})
df = spark.createDataFrame(pandas_dataframe)
df.show(5)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
| -0.5248595180693141|    y|false|
| 0.06669160250137181|    y| true|
|  2.8140657814433814|    z| true|
|   0.903404851948607|    y|false|
|-0.25281199164778545|    y|false|
+--------------------+-----+-----+
only showing top 5 rows



ii. What is the average `n` value for each group in the `group` column?

In [159]:
df.groupBy(df.group).agg(round(avg(df.n), 4).alias('avg value of n')).show()

+-----+--------------+
|group|avg value of n|
+-----+--------------+
|    y|        0.4138|
|    z|        0.3919|
|    x|        1.2398|
+-----+--------------+



iii. What is the maximum `n` value for each group in the `group` column?

In [163]:
df.groupBy(df.group).agg(round(max(df.n), 2).alias('max value of n')).show()

+-----+--------------+
|group|max value of n|
+-----+--------------+
|    y|           1.9|
|    z|          2.81|
|    x|          2.41|
+-----+--------------+



iv. What is the minimum `n` value by `abool`?

In [164]:
df.groupBy(df.group).agg(round(min(df.n), 2).alias('min value of n')).show()

+-----+--------------+
|group|min value of n|
+-----+--------------+
|    y|         -0.55|
|    z|         -0.97|
|    x|         -0.05|
+-----+--------------+



v. What is the average `n` value for each unique combination of the `group` and `abool` column?

In [165]:
df.groupBy(df.group, df.abool).agg(round(avg(df.n), 2).alias('avg value of n')).show()

+-----+-----+--------------+
|group|abool|avg value of n|
+-----+-----+--------------+
|    y|false|           0.2|
|    y| true|          0.67|
|    z| true|          0.73|
|    x|false|          1.51|
|    x| true|          1.15|
|    z|false|         -0.97|
+-----+-----+--------------+



22/02/14 16:46:43 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 1523153 ms exceeds timeout 120000 ms
22/02/14 16:46:44 WARN SparkContext: Killing executors is not supported by current scheduler.
