# Chapter 3:
Christoph Windheuser    
March 19, 2022   
Python examples from Chapter 3 of the book *Learning Spark*

In [1]:
# Import required python spark libraries
import findspark
import pyspark

from pyspark.sql.types import *
from pyspark.sql.functions import col, expr, when, concat, lit
from pyspark.sql import SparkSession


In [2]:
# Connect the Jupyter notebook to the Spark installation and obtain the SparkContext.
findspark.init()
# getOrCreate() hands back the already-running context when there is one, so this
# cell can be re-run in the same kernel. Calling the SparkContext constructor a
# second time would fail with "Cannot run multiple SparkContexts at once".
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setAppName("chapter_3"))


In [3]:
# Obtain the SparkSession that the DataFrame cells below work with
# (getOrCreate returns an existing session when there is one).
# NOTE(review): a SparkContext named "chapter_3" is created in the previous cell;
# confirm whether the different app name given here is intentional.
spark = SparkSession.builder.appName("Example-3_6").getOrCreate()


In [18]:
# Schema for the blog data, expressed in two ways.

# 1) Programmatically, as a StructType. Every field is declared non-nullable
#    (the third StructField argument is False).
schema = StructType([
    StructField("Id", IntegerType(), False),
    StructField("First", StringType(), False),
    StructField("Last", StringType(), False),
    StructField("Url", StringType(), False),
    StructField("Published", StringType(), False),
    StructField("Hits", IntegerType(), False),
    StructField("Campaigns", ArrayType(StringType()), False),
])

# 2) As a DDL string. This is the form the cells below pass to Spark.
ddl_schema = "`Id` INT,`First` STRING,`Last` STRING,`Url` STRING,`Published` STRING,`Hits` INT,`Campaigns` ARRAY<STRING>"


In [19]:
# Sample blog records, one inner list per row, in the column order
# Id, First, Last, Url, Published, Hits, Campaigns.
data = [
    [1, "Jules", "Damji", "https://tinyurl.1", "1/4/2016", 4535, ["twitter", "LinkedIn"]],
    [2, "Brooke", "Wenig", "https://tinyurl.2", "5/5/2018", 8908, ["twitter", "LinkedIn"]],
    [3, "Denny", "Lee", "https://tinyurl.3", "6/7/2019", 7659, ["web", "twitter", "FB", "LinkedIn"]],
    [4, "Tathagata", "Das", "https://tinyurl.4", "5/12/2018", 10568, ["twitter", "FB"]],
    [5, "Matei", "Zaharia", "https://tinyurl.5", "5/14/2014", 40578, ["web", "twitter", "FB", "LinkedIn"]],
    [6, "Reynold", "Xin", "https://tinyurl.6", "3/2/2015", 25568, ["twitter", "LinkedIn"]],
]


In [21]:
# Build the blogs DataFrame from the in-memory rows, typed by the DDL schema string
blogs_df = spark.createDataFrame(data, schema=ddl_schema)
# Print the DataFrame as a text table
blogs_df.show()

+---+---------+-------+-----------------+---------+-----+--------------------+
| Id|    First|   Last|              Url|Published| Hits|           Campaigns|
+---+---------+-------+-----------------+---------+-----+--------------------+
|  1|    Jules|  Damji|https://tinyurl.1| 1/4/2016| 4535| [twitter, LinkedIn]|
|  2|   Brooke|  Wenig|https://tinyurl.2| 5/5/2018| 8908| [twitter, LinkedIn]|
|  3|    Denny|    Lee|https://tinyurl.3| 6/7/2019| 7659|[web, twitter, FB...|
|  4|Tathagata|    Das|https://tinyurl.4|5/12/2018|10568|       [twitter, FB]|
|  5|    Matei|Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB...|
|  6|  Reynold|    Xin|https://tinyurl.6| 3/2/2015|25568| [twitter, LinkedIn]|
+---+---------+-------+-----------------+---------+-----+--------------------+



In [25]:
# Build the blogs DataFrame from `data`, typed by the DDL schema string
blogs_df = spark.createDataFrame(data, schema=ddl_schema)


In [26]:
# Print the schema Spark uses to process the DataFrame.
# printSchema() writes the schema tree to stdout itself and returns None, so it
# must not be wrapped in print() (that adds a stray "None" line to the output).
blogs_df.printSchema()


root
 |-- Id: integer (nullable = true)
 |-- First: string (nullable = true)
 |-- Last: string (nullable = true)
 |-- Url: string (nullable = true)
 |-- Published: string (nullable = true)
 |-- Hits: integer (nullable = true)
 |-- Campaigns: array (nullable = true)
 |    |-- element: string (containsNull = true)

None


In [27]:
# Register the DataFrame as a temporary view named "blogs"
blogs_df.createOrReplaceTempView("blogs")

In [28]:
# Compute Hits * 2 for every row and display the first two results
blogs_df.select(expr("Hits * 2")).show(n=2)

+----------+
|(Hits * 2)|
+----------+
|      9070|
|     17816|
+----------+
only showing top 2 rows



In [29]:
# Add the Hits and Id columns row by row and display the sums without truncation
blogs_df.select(expr("Hits + Id")).show(truncate=False)

+-----------+
|(Hits + Id)|
+-----------+
|4536       |
|8910       |
|7662       |
|10572      |
|40583      |
|25574      |
+-----------+



In [30]:
# Append a boolean "Big Hitters" column that is true where Hits exceeds 10000
blogs_df.withColumn("Big Hitters", expr("Hits > 10000")).show()

+---+---------+-------+-----------------+---------+-----+--------------------+-----------+
| Id|    First|   Last|              Url|Published| Hits|           Campaigns|Big Hitters|
+---+---------+-------+-----------------+---------+-----+--------------------+-----------+
|  1|    Jules|  Damji|https://tinyurl.1| 1/4/2016| 4535| [twitter, LinkedIn]|      false|
|  2|   Brooke|  Wenig|https://tinyurl.2| 5/5/2018| 8908| [twitter, LinkedIn]|      false|
|  3|    Denny|    Lee|https://tinyurl.3| 6/7/2019| 7659|[web, twitter, FB...|      false|
|  4|Tathagata|    Das|https://tinyurl.4|5/12/2018|10568|       [twitter, FB]|       true|
|  5|    Matei|Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB...|       true|
|  6|  Reynold|    Xin|https://tinyurl.6| 3/2/2015|25568| [twitter, LinkedIn]|       true|
+---+---------+-------+-----------------+---------+-----+--------------------+-----------+



In [31]:
# Concatenate First, Last and Id into an "AuthorsId" column and show its first four values
(blogs_df
    .withColumn("AuthorsId", concat(col("First"), col("Last"), col("Id")))
    .select("AuthorsId")
    .show(4))


+-------------+
|    AuthorsId|
+-------------+
|  JulesDamji1|
| BrookeWenig2|
|    DennyLee3|
|TathagataDas4|
+-------------+
only showing top 4 rows



# Page 53: Read the data from a JSON file

In [33]:
# Path of the JSON file that holds the blog records
jsonFile = "blogs.json"

# Load the file into a DataFrame, applying the DDL schema string
blogs2_df = spark.read.json(jsonFile, schema=ddl_schema)


In [34]:
# Display the first two rows read from the JSON file
blogs2_df.show(n=2)


+---+------+-----+-----------------+---------+----+-------------------+
| Id| First| Last|              Url|Published|Hits|          Campaigns|
+---+------+-----+-----------------+---------+----+-------------------+
|  1| Jules|Damji|https://tinyurl.1| 1/4/2016|4535|[twitter, LinkedIn]|
|  2|Brooke|Wenig|https://tinyurl.2| 5/5/2018|8908|[twitter, LinkedIn]|
+---+------+-----+-----------------+---------+----+-------------------+
only showing top 2 rows



In [35]:
# Print the schema of the DataFrame that was read from the JSON file
blogs2_df.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- First: string (nullable = true)
 |-- Last: string (nullable = true)
 |-- Url: string (nullable = true)
 |-- Published: string (nullable = true)
 |-- Hits: integer (nullable = true)
 |-- Campaigns: array (nullable = true)
 |    |-- element: string (containsNull = true)



# Chapter 3: Columns, Rows and Expressions
March 22, 2022

In [36]:
from pyspark.sql import Row

# Build a standalone Row from positional values; its items are accessed by index below.
# NOTE(review): the number (255568) comes before the date here, whereas ddl_schema lists
# Published before Hits, and 255568 differs from the 25568 used for row 6 in `data` - confirm intended.
blog_row = Row(6, "Reynold", "Xin", "https://tinyurl.6", 255568, "3/2/2015", ["twitter", "LinkedIn"])


In [37]:
# Access individual items of the row using the index:
blog_row[1]

'Reynold'

## Create DataFrames out of rows

In [38]:
# Two author rows given as positional (name, state) values
# NOTE(review): "Matai" is spelled "Matei" in the `data` rows - confirm the intended spelling
rows = [
    Row("Matai Zaharia", "CA"),
    Row("Reynold Xin", "MA"),
]
# Turn the rows into a DataFrame with explicit column names, then display it
authors_df = spark.createDataFrame(rows, schema=["Authors", "State"])
authors_df.show()


+-------------+-----+
|      Authors|State|
+-------------+-----+
|Matai Zaharia|   CA|
|  Reynold Xin|   MA|
+-------------+-----+



In [39]:
import pyspark.sql.functions

In [40]:
# List the DataFrame's column names (returned as a Python list of strings)
blogs2_df.columns

['Id', 'First', 'Last', 'Url', 'Published', 'Hits', 'Campaigns']

In [41]:
# Indexing the DataFrame by column name yields a Column object, not the column's values
blogs2_df["Id"]

Column<'Id'>

In [42]:
# Use an expression to compute a value:
# blogs2_df.col("Hits") * 2 - WRONG
blogs2_df.select(expr("Hits") * 2).show(2)


+----------+
|(Hits * 2)|
+----------+
|      9070|
|     17816|
+----------+
only showing top 2 rows

