In [123]:
import pandas as pd
import numpy as np
import pyspark
from pyspark.sql.functions import col, expr, lit, min, max, concat, sum, avg, count, mean


np.random.seed(13)

pandas_dataframe = pd.DataFrame(
    {
        "n": np.random.randn(20),
        "group": np.random.choice(list("xyz"), 20),
        "abool": np.random.choice([True, False], 20),
    }
)

# Spark Dataframe Basics

- Use the starter code above to create a pandas dataframe.


- Convert the pandas dataframe to a spark dataframe. From this point forward, do all of your work with the spark dataframe, not the pandas dataframe.


- Show the first 3 rows of the dataframe.


- Show the first 7 rows of the dataframe.


- View a summary of the data using .describe.


- Use .select to create a new dataframe with just the n and abool columns. View the first 5 rows of this dataframe.


- Use .select to create a new dataframe with just the group and abool columns. View the first 5 rows of this dataframe.


- Use .select to create a new dataframe with the group column and the abool column renamed to a_boolean_value. Show the first 3 rows of this dataframe.


- Use .select to create a new dataframe with the group column and the n column renamed to a_numeric_value. Show the first 6 rows of this dataframe.

In [5]:
spark = pyspark.sql.SparkSession.builder.getOrCreate()

In [7]:
df = spark.createDataFrame(pandas_dataframe)

In [8]:
df.show(3)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
+--------------------+-----+-----+
only showing top 3 rows



In [9]:
df.show(7)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  1.3451017084510097|    z|false|
|  0.5323378882945463|    y|false|
|  1.3501878997225267|    z|false|
+--------------------+-----+-----+
only showing top 7 rows



In [20]:
# shape of df

df.count(), len(df.columns)

(20, 3)

In [11]:
df.describe().show()

+-------+------------------+-----+
|summary|                 n|group|
+-------+------------------+-----+
|  count|                20|   20|
|   mean|0.3664026449885217| null|
| stddev|0.8905322898155363| null|
|    min|-1.261605945319069|    x|
|    max|2.1503829673811126|    z|
+-------+------------------+-----+



In [12]:
df.select(df.n, df.abool).show(5)

+--------------------+-----+
|                   n|abool|
+--------------------+-----+
|  -0.712390662050588|false|
|   0.753766378659703|false|
|-0.04450307833805...|false|
| 0.45181233874578974|false|
|  1.3451017084510097|false|
+--------------------+-----+
only showing top 5 rows



In [13]:
df.select(df.group, df.abool).show(5)

+-----+-----+
|group|abool|
+-----+-----+
|    z|false|
|    x|false|
|    z|false|
|    y|false|
|    z|false|
+-----+-----+
only showing top 5 rows



In [15]:
df.select(df.group, df.abool.alias('a_boolean_value')).show(3)

+-----+---------------+
|group|a_boolean_value|
+-----+---------------+
|    z|          false|
|    x|          false|
|    z|          false|
+-----+---------------+
only showing top 3 rows



In [18]:
df.select(df.group, df.n.alias('a_numeric_value')).show(6)

+-----+--------------------+
|group|     a_numeric_value|
+-----+--------------------+
|    z|  -0.712390662050588|
|    x|   0.753766378659703|
|    z|-0.04450307833805...|
|    y| 0.45181233874578974|
|    z|  1.3451017084510097|
|    y|  0.5323378882945463|
+-----+--------------------+
only showing top 6 rows



# Column Manipulation

- Use the starter code above to re-create a spark dataframe. Store the spark dataframe in a varaible named df


- Use .select to add 4 to the n column. Show the results.


- Subtract 5 from the n column and view the results.


- Multiply the n column by 2. View the results along with the original numbers.


- Add a new column named n2 that is the n value multiplied by -1. Show the first 4 rows of your dataframe. You should see the original n value as well as n2.


- Add a new column named n3 that is the n value squared. Show the first 5 rows of your dataframe. You should see both n, n2, and n3.


- What happens when you run the code below?

        df.group + df.abool
        
        
- What happens when you run the code below? What is the difference between this and the previous code sample?

        df.select(df.group + df.abool)


- Try adding various other columns together. What are the results of combining the different data types?

In [21]:
np.random.seed(13)

pandas_dataframe = pd.DataFrame(
    {
        "n": np.random.randn(20),
        "group": np.random.choice(list("xyz"), 20),
        "abool": np.random.choice([True, False], 20),
    }
)

In [23]:
df = spark.createDataFrame(pandas_dataframe)

In [27]:
df.select(df.n + 4).show(5)

+------------------+
|           (n + 4)|
+------------------+
|3.2876093379494122|
| 4.753766378659703|
|3.9554969216619464|
|  4.45181233874579|
|5.3451017084510095|
+------------------+
only showing top 5 rows



In [28]:
df.select(df.n - 5).show(5)

+-------------------+
|            (n - 5)|
+-------------------+
| -5.712390662050588|
| -4.246233621340297|
| -5.044503078338053|
|  -4.54818766125421|
|-3.6548982915489905|
+-------------------+
only showing top 5 rows



In [29]:
df.select(df.n, df.n * 2).show(5)

+--------------------+--------------------+
|                   n|             (n * 2)|
+--------------------+--------------------+
|  -0.712390662050588|  -1.424781324101176|
|   0.753766378659703|   1.507532757319406|
|-0.04450307833805...|-0.08900615667610691|
| 0.45181233874578974|  0.9036246774915795|
|  1.3451017084510097|  2.6902034169020195|
+--------------------+--------------------+
only showing top 5 rows



In [40]:
n2 = col('n') * -1
df.select(df.n, n2).show(4)

+--------------------+--------------------+
|                   n|            (n * -1)|
+--------------------+--------------------+
|  -0.712390662050588|   0.712390662050588|
|   0.753766378659703|  -0.753766378659703|
|-0.04450307833805...|0.044503078338053455|
| 0.45181233874578974|-0.45181233874578974|
+--------------------+--------------------+
only showing top 4 rows



In [67]:
# with expr

df.select(
    expr('n'),
    expr('n * -1 AS n2')
).show(5)

+--------------------+--------------------+
|                   n|                  n2|
+--------------------+--------------------+
|  -0.712390662050588|   0.712390662050588|
|   0.753766378659703|  -0.753766378659703|
|-0.04450307833805...|0.044503078338053455|
| 0.45181233874578974|-0.45181233874578974|
|  1.3451017084510097| -1.3451017084510097|
+--------------------+--------------------+
only showing top 5 rows



In [68]:
n3 = col('n') ** 2
df.select(df.n, n2, n3).show(5)

+--------------------+--------------------+--------------------+
|                   n|            (n * -1)|         POWER(n, 2)|
+--------------------+--------------------+--------------------+
|  -0.712390662050588|   0.712390662050588|   0.507500455376875|
|   0.753766378659703|  -0.753766378659703|  0.5681637535977627|
|-0.04450307833805...|0.044503078338053455|0.001980523981562...|
| 0.45181233874578974|-0.45181233874578974| 0.20413438944294027|
|  1.3451017084510097| -1.3451017084510097|  1.8092986060778251|
+--------------------+--------------------+--------------------+
only showing top 5 rows



In [77]:
# with expr

df.select(
    expr('n'), 
    expr('n * -1 AS n2'),
    expr('n ** 2 AS n3')
).show(5)

ParseException: "\nmismatched input '2' expecting <EOF>(line 1, pos 5)\n\n== SQL ==\nn ** 2 AS n3\n-----^^^\n"

In [52]:
# another way to rename cols

df.selectExpr('group as group','abool as abool_diff').show(3)

+-----+----------+
|group|abool_diff|
+-----+----------+
|    z|     false|
|    x|     false|
|    z|     false|
+-----+----------+
only showing top 3 rows



In [42]:
df.group + df.abool

Column<b'(group + abool)'>

In [45]:
df.show(2)

+------------------+-----+-----+
|                 n|group|abool|
+------------------+-----+-----+
|-0.712390662050588|    z|false|
| 0.753766378659703|    x|false|
+------------------+-----+-----+
only showing top 2 rows



In [47]:
# adding a number and a string

df.select(df.group + df.n).show(5)

+-----------+
|(group + n)|
+-----------+
|       null|
|       null|
|       null|
|       null|
|       null|
+-----------+
only showing top 5 rows



# Spark SQL

- Use the starter code above to re-create a spark dataframe.


- Turn your dataframe into a table that can be queried with spark SQL. Name the table my_df. Answer the rest of the questions in this section with a spark sql query (spark.sql) against my_df. After each step, view the first 7 records from the dataframe.


- Write a query that shows all of the columns from your dataframe.


- Write a query that shows just the n and abool columns from the dataframe.


- Write a query that shows just the n and group columns. Rename the group column to g.


- Write a query that selects n, and creates two new columns: n2, the original n values halved, and n3: the original n values minus 1.


- What happens if you make a SQL syntax error in your query?

In [60]:
df = spark.createDataFrame(pandas_dataframe)

In [78]:
df.createOrReplaceTempView('my_df')

In [79]:
spark.sql(
    """
SELECT *
FROM my_df
    """
).show(7)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  1.3451017084510097|    z|false|
|  0.5323378882945463|    y|false|
|  1.3501878997225267|    z|false|
+--------------------+-----+-----+
only showing top 7 rows



In [80]:
spark.sql("""
SELECT n, abool
FROM my_df
""").show(7)

+--------------------+-----+
|                   n|abool|
+--------------------+-----+
|  -0.712390662050588|false|
|   0.753766378659703|false|
|-0.04450307833805...|false|
| 0.45181233874578974|false|
|  1.3451017084510097|false|
|  0.5323378882945463|false|
|  1.3501878997225267|false|
+--------------------+-----+
only showing top 7 rows



In [81]:
spark.sql("""
SELECT n, group AS g
FROM my_df
""").show(7)

+--------------------+---+
|                   n|  g|
+--------------------+---+
|  -0.712390662050588|  z|
|   0.753766378659703|  x|
|-0.04450307833805...|  z|
| 0.45181233874578974|  y|
|  1.3451017084510097|  z|
|  0.5323378882945463|  y|
|  1.3501878997225267|  z|
+--------------------+---+
only showing top 7 rows



In [83]:
# Write a query that selects n, and creates two new columns: 
# n2,the original n values halved, and n3: the original n values minus 1.

spark.sql("""
SELECT n, (n / 2) AS n2, (n - 1) AS n3
FROM my_df
""").show(7)

+--------------------+--------------------+--------------------+
|                   n|                  n2|                  n3|
+--------------------+--------------------+--------------------+
|  -0.712390662050588|  -0.356195331025294|  -1.712390662050588|
|   0.753766378659703|  0.3768831893298515|-0.24623362134029703|
|-0.04450307833805...|-0.02225153916902...| -1.0445030783380536|
| 0.45181233874578974| 0.22590616937289487| -0.5481876612542103|
|  1.3451017084510097|  0.6725508542255049| 0.34510170845100974|
|  0.5323378882945463| 0.26616894414727316| -0.4676621117054537|
|  1.3501878997225267|  0.6750939498612634| 0.35018789972252673|
+--------------------+--------------------+--------------------+
only showing top 7 rows



# Type casting

- Use the starter code above to re-create a spark dataframe.


- Use .printSchema to view the datatypes in your dataframe.


- Use .dtypes to view the datatypes in your dataframe.


- What is the difference between the two code samples below?

        df.abool.cast('int')
        df.select(df.abool.cast('int')).show()


- Use .select and .cast to convert the abool column to an integer type. View the results.


- Convert the group column to a integer data type and view the results. What happens?


- Convert the n column to a integer data type and view the results. What happens?


- Convert the abool column to a string data type and view the results. What happens?

In [85]:
df = spark.createDataFrame(pandas_dataframe)

In [90]:
df.printSchema()

root
 |-- n: double (nullable = true)
 |-- group: string (nullable = true)
 |-- abool: boolean (nullable = true)



In [88]:
df.dtypes

[('n', 'double'), ('group', 'string'), ('abool', 'boolean')]

In [95]:
df.abool.cast('int')

Column<b'CAST(abool AS INT)'>

In [99]:
df.select(df.abool.cast('int')).show(5)

+-----+
|abool|
+-----+
|    0|
|    0|
|    0|
|    0|
|    0|
+-----+
only showing top 5 rows



In [101]:
df.select(df.abool.cast('int')).dtypes

[('abool', 'int')]

In [102]:
df.select(df.group.cast('int')).show(5)

+-----+
|group|
+-----+
| null|
| null|
| null|
| null|
| null|
+-----+
only showing top 5 rows



In [105]:
# casting to an int truncates the number

df.select(df.n, df.n.cast('int')).show(5)

+--------------------+---+
|                   n|  n|
+--------------------+---+
|  -0.712390662050588|  0|
|   0.753766378659703|  0|
|-0.04450307833805...|  0|
| 0.45181233874578974|  0|
|  1.3451017084510097|  1|
+--------------------+---+
only showing top 5 rows



In [112]:
# casting the abool column to a string instead of a bool

df.select(df.abool.cast('string')).printSchema()

root
 |-- abool: string (nullable = true)



# Built-in Functions

- Use the starter code above to re-create a spark dataframe.


- Import the necessary functions from pyspark.sql.functions


- Find the highest n value.


- Find the lowest n value.


- Find the average n value.


- Use concat to change the group column to say, e.g. "Group: x" or "Group: y"


- Use concat to combine the n and group columns to produce results that look like this: "x: -1.432" or "z: 2.352"

In [109]:
df = spark.createDataFrame(pandas_dataframe)

In [118]:
df.select(max(df.n)).show()

+------------------+
|            max(n)|
+------------------+
|2.1503829673811126|
+------------------+



In [119]:
df.select(min(df.n)).show()

+------------------+
|            min(n)|
+------------------+
|-1.261605945319069|
+------------------+



In [120]:
df.select(avg(df.n)).show()

+------------------+
|            avg(n)|
+------------------+
|0.3664026449885217|
+------------------+



In [126]:
#df.show(2)
df.select(concat(lit('Group: '), df.group)).show()

+----------------------+
|concat(Group: , group)|
+----------------------+
|              Group: z|
|              Group: x|
|              Group: z|
|              Group: y|
|              Group: z|
|              Group: y|
|              Group: z|
|              Group: x|
|              Group: z|
|              Group: y|
|              Group: x|
|              Group: y|
|              Group: y|
|              Group: y|
|              Group: y|
|              Group: x|
|              Group: z|
|              Group: y|
|              Group: x|
|              Group: x|
+----------------------+



In [130]:
from pyspark.sql.functions import round

df.select(concat(df.group, lit(': '), round(df.n, 3))).show()

+------------------------------+
|concat(group, : , round(n, 3))|
+------------------------------+
|                     z: -0.712|
|                      x: 0.754|
|                     z: -0.045|
|                      y: 0.452|
|                      z: 1.345|
|                      y: 0.532|
|                       z: 1.35|
|                      x: 0.861|
|                      z: 1.479|
|                     y: -1.045|
|                     x: -0.789|
|                     y: -1.262|
|                      y: 0.563|
|                     y: -0.243|
|                      y: 0.914|
|                      x: 0.317|
|                      z: 0.127|
|                       y: 2.15|
|                      x: 0.606|
|                     x: -0.027|
+------------------------------+



In [132]:
df.select(
    concat(lit('Group: '), df.group),
    concat(df.group, lit(': '), round(df.n, 3))
).show(5)

+----------------------+------------------------------+
|concat(Group: , group)|concat(group, : , round(n, 3))|
+----------------------+------------------------------+
|              Group: z|                     z: -0.712|
|              Group: x|                      x: 0.754|
|              Group: z|                     z: -0.045|
|              Group: y|                      y: 0.452|
|              Group: z|                      z: 1.345|
+----------------------+------------------------------+
only showing top 5 rows



# Filter / Where

- Use the starter code above to re-create a spark dataframe.


- Use .filter or .where to select just the rows where the group is y and view the results.


- Select just the columns where the abool column is false and view the results.


- Find the columns where the group column is not y.


- Find the columns where n is positive.


- Find the columns where abool is true and the group column is z.


- Find the columns where abool is true or the group column is z.


- Find the columns where abool is false and n is less than 1


- Find the columns where abool is false or n is less than 1


In [133]:
df = spark.createDataFrame(pandas_dataframe)

In [137]:
# filter using filter for rows where the group is y

df.filter(df.group == 'y').show(5)

+-------------------+-----+-----+
|                  n|group|abool|
+-------------------+-----+-----+
|0.45181233874578974|    y|false|
| 0.5323378882945463|    y|false|
|-1.0453771305385342|    y| true|
| -1.261605945319069|    y|false|
| 0.5628467852810314|    y| true|
+-------------------+-----+-----+
only showing top 5 rows



In [138]:
# filter using where for rows where the group is y

df.where(df.group == 'y').show(5)

+-------------------+-----+-----+
|                  n|group|abool|
+-------------------+-----+-----+
|0.45181233874578974|    y|false|
| 0.5323378882945463|    y|false|
|-1.0453771305385342|    y| true|
| -1.261605945319069|    y|false|
| 0.5628467852810314|    y| true|
+-------------------+-----+-----+
only showing top 5 rows



In [140]:
# filter for rows where abool is False

df.filter(df.abool == False).show(5)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  1.3451017084510097|    z|false|
+--------------------+-----+-----+
only showing top 5 rows



In [141]:
# filter for rows where the group is not y

df.filter(df.group != 'y').show(5)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
|  1.3451017084510097|    z|false|
|  1.3501878997225267|    z|false|
+--------------------+-----+-----+
only showing top 5 rows



In [142]:
# filter for rows where n is positive

df.filter(df.n > 0).show(5)

+-------------------+-----+-----+
|                  n|group|abool|
+-------------------+-----+-----+
|  0.753766378659703|    x|false|
|0.45181233874578974|    y|false|
| 1.3451017084510097|    z|false|
| 0.5323378882945463|    y|false|
| 1.3501878997225267|    z|false|
+-------------------+-----+-----+
only showing top 5 rows



In [144]:
# Find the columns where abool is true and the group column is z.

df.filter(df.group == 'z').where(df.abool == True).show(5)

+------------------+-----+-----+
|                 n|group|abool|
+------------------+-----+-----+
|1.4786857374358966|    z| true|
+------------------+-----+-----+



In [149]:
# Find the columns where abool is true OR the group column is z.

df.filter((df.group == 'z') | (df['abool'] == True)).show(10)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|-0.04450307833805...|    z|false|
|  1.3451017084510097|    z|false|
|  1.3501878997225267|    z|false|
|  1.4786857374358966|    z| true|
| -1.0453771305385342|    y| true|
|  0.5628467852810314|    y| true|
|-0.24332625188556253|    y| true|
| 0.12730328020698067|    z|false|
|  2.1503829673811126|    y| true|
+--------------------+-----+-----+
only showing top 10 rows



In [151]:
# Find the columns where abool is false and n is less than 1

df.filter((df.abool == False) & (df.n < 1)).show(10)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  0.5323378882945463|    y|false|
|  0.8612113741693206|    x|false|
| -0.7889890249515489|    x|false|
|  -1.261605945319069|    y|false|
|  0.9137407048596775|    y|false|
| 0.31735092273633597|    x|false|
+--------------------+-----+-----+
only showing top 10 rows



In [152]:
# Find the columns where abool is false or n is less than 1

df.filter((df.abool == False) | (df.n < 1)).show(10)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  1.3451017084510097|    z|false|
|  0.5323378882945463|    y|false|
|  1.3501878997225267|    z|false|
|  0.8612113741693206|    x|false|
| -1.0453771305385342|    y| true|
| -0.7889890249515489|    x|false|
+--------------------+-----+-----+
only showing top 10 rows



# When / Otherwise

- Use the starter code above to re-create a spark dataframe.


- Use when and .otherwise to create a column that contains the text "It is true" when abool is true and "It is false"" when abool is false.


- Create a column that contains 0 if n is less than 0, otherwise, the original n value.


In [155]:
from pyspark.sql.functions import when

df = spark.createDataFrame(pandas_dataframe)

In [161]:
# Use when and .otherwise to create a column that contains the text 
#"It is true" when abool is True and "It is false"" when abool is False.

df.select(
    df.abool, when(df.abool == True, 'It is true')
    .otherwise('It is false').alias('True or False?')
).show(10)

+-----+--------------+
|abool|True or False?|
+-----+--------------+
|false|   It is false|
|false|   It is false|
|false|   It is false|
|false|   It is false|
|false|   It is false|
|false|   It is false|
|false|   It is false|
|false|   It is false|
| true|    It is true|
| true|    It is true|
+-----+--------------+
only showing top 10 rows



In [163]:
# Create a column that contains 0 if n is less than 0, 
# otherwise, the original n value.

df.select(
    when(df.n > 0, df.n)
    .otherwise(0)
).show(10)

+-----------------------------------+
|CASE WHEN (n > 0) THEN n ELSE 0 END|
+-----------------------------------+
|                                0.0|
|                  0.753766378659703|
|                                0.0|
|                0.45181233874578974|
|                 1.3451017084510097|
|                 0.5323378882945463|
|                 1.3501878997225267|
|                 0.8612113741693206|
|                 1.4786857374358966|
|                                0.0|
+-----------------------------------+
only showing top 10 rows



# Sorting

- Use the starter code above to re-create a spark dataframe.


- Sort by the n value.


- Sort by the group value, both ascending and descending.


- Sort by the group value first, then, within each group, sort by n value.


- Sort by abool, group, and n. Does it matter in what order you specify the columns when sorting?