# Chapter 3:
Christoph Windheuser    
April 5, 2022   
Python examples of chapter 3 in the book *Learning Spark*

In [12]:
# Import required python spark libraries
import findspark
import pyspark

from pyspark.sql.types import *
from pyspark.sql.functions import col, expr, when, concat, lit, avg
from pyspark.sql import SparkSession


In [2]:
# Connect the Jupyter kernel with the local Spark installation and obtain the SparkContext
findspark.init()
# Use getOrCreate() so that re-running this cell reuses the context that is already
# running; constructing pyspark.SparkContext(...) directly fails with
# "Cannot run multiple SparkContexts at once" on a second execution of the cell.
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setAppName("chapter_3"))


In [3]:
# Obtain the SparkSession used by all DataFrame examples below.
# getOrCreate() hands back an already active session when one exists.
spark = SparkSession.builder.appName("Example-3_6").getOrCreate()


## Example page 45 ff
We want to solve a simple data analytics task.    
We have the following data points of persons and their age:
* Brooke: 20
* Denny: 31
* Jules: 30
* TD: 35
* Brooke: 25

Be aware that there are two Brookes with different ages.   
The task is to summarize the datapoints by name and average over their ages.

First we solve it with an RDD (Resilient Distributed Dataset).    


In [4]:
# Distribute the (name, age) pairs as an RDD via the SparkContext
dataRDD = sc.parallelize(
    [
        ("Brooke", 20),
        ("Denny", 31),
        ("Jules", 30),
        ("TD", 35),
        ("Brooke", 25),
    ]
)


In [5]:
# Average age per name, spelled out step by step on the RDD
agesRDD = (
    dataRDD
    # (name, age) -> (name, (age, 1)): attach a count of one to every age
    .map(lambda person: (person[0], (person[1], 1)))
    # per name: add up the ages and the counts
    .reduceByKey(lambda left, right: (left[0] + right[0], left[1] + right[1]))
    # (name, (age_sum, count)) -> (name, age_sum / count)
    .map(lambda entry: (entry[0], entry[1][0] / entry[1][1]))
)


In [11]:
# Bring up to four (name, average age) pairs back to the driver and print them
print(agesRDD.take(4))

[('Brooke', 22.5), ('Denny', 31.0), ('TD', 35.0), ('Jules', 30.0)]


Now we solve the same task with Spark's high-level domain-specific language (DSL), used here from Python. We use Spark's DataFrame API to tell Spark *what to do* instead of *how to do it*, as in the previous code with RDDs.

In [14]:
# Build a two-column DataFrame (name, age) from the same records used for the RDD
data_df = spark.createDataFrame(
    data=[
        ("Brooke", 20),
        ("Denny", 31),
        ("Jules", 30),
        ("TD", 35),
        ("Brooke", 25),
    ],
    schema=["name", "age"],
)


In [15]:
# Declarative version of the RDD computation: group the rows, then average
avg_df = (
    data_df
    .groupBy("name")    # one group per distinct name
    .agg(avg("age"))    # mean of the "age" column within each group
)

In [16]:
# Display the per-name average ages as a formatted table
avg_df.show()

+------+--------+
|  name|avg(age)|
+------+--------+
|Brooke|    22.5|
| Denny|    31.0|
| Jules|    30.0|
|    TD|    35.0|
+------+--------+



In [None]:
# Define the schema for the blog data.
#
# Programmatic variant, kept for reference only (not executed). Note that it
# declares every field with nullable=False (the third StructField argument):
#
# schema = StructType([
#     StructField("Id", IntegerType(), False),
#     StructField("First", StringType(), False),
#     StructField("Last", StringType(), False),
#     StructField("Url", StringType(), False),
#     StructField("Published", StringType(), False),
#     StructField("Hits", IntegerType(), False),
#     StructField("Campaigns", ArrayType(StringType()), False)])

# The same column names and types as a DDL string; this is the schema the
# cells below pass to createDataFrame() and to the JSON reader.
ddl_schema = "`Id` INT,`First` STRING,`Last` STRING,`Url` STRING,`Published` STRING,`Hits` INT,`Campaigns` ARRAY<STRING>"


In [None]:
# Sample blog records; each row is
# [Id, First, Last, Url, Published, Hits, Campaigns] -- the column order of ddl_schema
data = [
    [1, "Jules", "Damji", "https://tinyurl.1", "1/4/2016", 4535,
     ["twitter", "LinkedIn"]],
    [2, "Brooke", "Wenig", "https://tinyurl.2", "5/5/2018", 8908,
     ["twitter", "LinkedIn"]],
    [3, "Denny", "Lee", "https://tinyurl.3", "6/7/2019", 7659,
     ["web", "twitter", "FB", "LinkedIn"]],
    [4, "Tathagata", "Das", "https://tinyurl.4", "5/12/2018", 10568,
     ["twitter", "FB"]],
    [5, "Matei", "Zaharia", "https://tinyurl.5", "5/14/2014", 40578,
     ["web", "twitter", "FB", "LinkedIn"]],
    [6, "Reynold", "Xin", "https://tinyurl.6", "3/2/2015", 25568,
     ["twitter", "LinkedIn"]],
]


In [None]:
# Build the blogs DataFrame from the in-memory rows, typed by the DDL schema string
blogs_df = spark.createDataFrame(data=data, schema=ddl_schema)
# Render the DataFrame as a table; it should mirror the rows defined in `data`
blogs_df.show()

In [None]:
# Create a DataFrame using the schema defined above.
# NOTE: this repeats the createDataFrame call of the previous cell with the same
# data and schema; blogs_df is simply rebuilt (this time without showing it).
blogs_df = spark.createDataFrame(data, ddl_schema)


In [None]:
# Print the schema Spark uses to process the DataFrame.
# printSchema() writes the schema tree to stdout itself and returns None, so it
# must not be wrapped in print() -- that would append a stray "None" line.
blogs_df.printSchema()


In [None]:
# Register blogs_df as a temporary view under the name "blogs"
blogs_df.createOrReplaceTempView("blogs")

In [None]:
# Compute Hits * 2 for every row and display the first two results
(
    blogs_df
    .select(expr("Hits") * 2)
    .show(n=2)
)

In [None]:
# Add the Hits and Id columns row by row; truncate=False keeps cell contents unabridged
(
    blogs_df
    .select(expr("Hits") + expr("Id"))
    .show(truncate=False)
)

In [None]:
# Append a boolean column "Big Hitters" that flags rows with more than 10000 hits
(
    blogs_df
    .withColumn("Big Hitters", expr("Hits") > 10000)
    .show()
)

In [None]:
# Concatenate First, Last and Id into a new "AuthorsId" column and show only
# that column for the first four rows
(
    blogs_df
    .withColumn("AuthorsId", concat(expr("First"), expr("Last"), expr("Id")))
    .select(expr("AuthorsId"))
    .show(n=4)
)


# Page 53: Read the data from a JSON file

In [None]:
# Path of the JSON input, relative to the notebook's working directory
jsonFile = "blogs.json"

# Read the JSON file with the explicit DDL schema instead of letting Spark infer one
blogs2_df = spark.read.json(jsonFile, schema=ddl_schema)


In [None]:
# Peek at the first two rows read from the JSON file
blogs2_df.show(n=2)


In [None]:
# Print the schema of the DataFrame that was read from the JSON file
blogs2_df.printSchema()

# Chapter 3: Columns, Rows and Expressions
March 22 2022

In [None]:
from pyspark.sql import Row

# A single Row built from positional values (no field names are attached)
blog_row = Row(
    6,
    "Reynold",
    "Xin",
    "https://tinyurl.6",
    255568,
    "3/2/2015",
    ["twitter", "LinkedIn"],
)


In [None]:
# Access an individual item of the row using its positional index
# (index 1 is the second value passed to Row above)
blog_row[1]

## Create DataFrames out of rows

In [None]:
# Build a small DataFrame directly from Row objects; the column names are
# supplied separately as the second argument.
# The author's name is spelled "Matei", matching the blog data defined earlier.
rows = [Row("Matei Zaharia", "CA"), Row("Reynold Xin", "MA")]
authors_df = spark.createDataFrame(rows, ["Authors", "State"])
authors_df.show()


In [None]:
import pyspark.sql.functions

In [None]:
# Show the names of all columns of blogs2_df as a Python list
blogs2_df.columns

In [None]:
# Look up a single column by name; this evaluates to a Column object, not to the column's data
blogs2_df["Id"]

In [None]:
# Use an expression to compute a value.
# blogs2_df.col("Hits") * 2  -- WRONG, use expr() as below instead
(
    blogs2_df
    .select(expr("Hits") * 2)
    .show(n=2)
)
