# Init Spark

In [None]:
import findspark
findspark.init()

from pyspark.sql import SparkSession

# Obtain the notebook's SparkSession under the "Spark Basics" application
# name; getOrCreate() hands back an existing session if one is already running.
spark = (
    SparkSession.builder
    .appName("Spark Basics")
    .getOrCreate()
)

# Prepare Data

The examples and sample data used below come from the companion repository of the book [Spark-The-Definitive-Guide](https://github.com/databricks/Spark-The-Definitive-Guide).

In [None]:
!wget https://github.com/databricks/Spark-The-Definitive-Guide/archive/refs/heads/master.zip

In [None]:
!mv master.zip book.zip

In [None]:
!unzip -qq book.zip

## Path Setup

In [None]:
import os
# Working directory of the notebook process. Later cells prefix their file://
# data URIs with it, so the book archive has to be extracted in this directory.
path = os.getcwd()
print(path)

## A Gentle Introduction to Spark / Chapter 2

### Creating DFs

In [None]:
myRange = spark.range(1000).toDF("number")

In [None]:
divisBy2 = myRange.where("number % 2 = 0")

In [None]:
# Load the 2015 flight summary CSV: the header row supplies the column names
# and inferSchema asks Spark to infer the column types from the data.
flightData2015 = (
    spark.read
    .option("inferSchema", "true")
    .option("header", "true")
    .csv(f"file:///{path}/Spark-The-Definitive-Guide-master/data/flight-data/csv/2015-summary.csv")
)

### SQL vs DF

In [None]:
flightData2015.createOrReplaceTempView("flight_data_2015")

In [None]:
# SQL formulation: number of rows per destination country, queried against
# the flight_data_2015 temp view.
sqlWay = spark.sql("""
SELECT DEST_COUNTRY_NAME, count(1)
FROM flight_data_2015
GROUP BY DEST_COUNTRY_NAME
""")

In [None]:
# DataFrame formulation of the same per-destination row count.
dataFrameWay = (
    flightData2015
    .groupBy("DEST_COUNTRY_NAME")
    .count()
)

In [None]:
sqlWay.explain()

In [None]:
dataFrameWay.explain()

### Aggregates

In [None]:
from pyspark.sql.functions import max
# https://spark.apache.org/docs/3.2.0/api/python/reference/api/pyspark.sql.functions.max.html

# SELECT max(count) FROM flight_data_2015 LIMIT 1
flightData2015.select(max("count")).take(1)

In [None]:
# Five destination countries with the largest summed `count`, highest first.
maxSql = spark.sql("""
SELECT DEST_COUNTRY_NAME, sum(count) as destination_total
FROM flight_data_2015
GROUP BY DEST_COUNTRY_NAME
ORDER BY sum(count) DESC
LIMIT 5
""")

In [None]:
maxSql.show()

In [None]:
from pyspark.sql.functions import desc

# DataFrame version of the top-five query: sum `count` per destination,
# rename the aggregate column, sort it descending and print the first 5 rows.
(
    flightData2015
    .groupBy("DEST_COUNTRY_NAME")
    .sum("count")
    .withColumnRenamed("sum(count)", "destination_total")
    .sort(desc("destination_total"))
    .limit(5)
    .show()
)


## Structured API Overview / Chapter 4

In [None]:
df = spark.range(500).toDF("number")

In [None]:
df.select(df["number"] + 10).show(3)

In [None]:
df.select(df.number + 10).show(3)

In [None]:
from pyspark.sql.functions import col
df.select(col("number") + 10).show(3)

In [None]:
from pyspark.sql.functions import expr
df.select(expr("number + 10")).show(3)

In [None]:
df.selectExpr("number + 10").show(3)

In [None]:
df.limit(4).collect()

## Basic Structured Operations / Chapter 5

### Schemas

In [None]:
df = spark.read.format("json").\
    load(f"file:///{path}/Spark-The-Definitive-Guide-master/data/flight-data/json/2015-summary.json")

In [None]:
df.schema

In [None]:
df.printSchema()

In [None]:
from pyspark.sql.types import StructField, StructType, StringType, LongType

# Hand-written schema for the flight summary JSON, listed as
# (column name, type, nullable): two nullable strings and a non-nullable long.
myManualSchema = StructType([
    StructField(fieldName, fieldType, isNullable)
    for fieldName, fieldType, isNullable in (
        ("DEST_COUNTRY_NAME", StringType(), True),
        ("ORIGIN_COUNTRY_NAME", StringType(), True),
        ("count", LongType(), False),
    )
])

# Read the JSON with the explicit schema instead of letting Spark infer one.
df = (
    spark.read.format("json")
    .schema(myManualSchema)
    .load(f"file:///{path}/Spark-The-Definitive-Guide-master/data/flight-data/json/2015-summary.json")
)


### Columns and Expressions

In [None]:
from pyspark.sql.functions import col, column
print(col("someColumnName"))
print(column("someColumnName"))

In [None]:
from pyspark.sql.functions import expr
expr("(((someCol + 5) * 200) - 6) < otherCol")

### Rows

In [None]:
from pyspark.sql import Row
myRow = Row("Hello", None, 1, False)

In [None]:
print(myRow[0])
print(myRow[2])

### DataFrame Transformations

In [None]:
df = spark.read.format("json").\
    load(f"file:///{path}/Spark-The-Definitive-Guide-master/data/flight-data/json/2015-summary.json")
df.createOrReplaceTempView("dfTable")

#### Select and SelectExpr

In [None]:
df.select("DEST_COUNTRY_NAME").show(2)


In [None]:
df.select("DEST_COUNTRY_NAME", "ORIGIN_COUNTRY_NAME").show(2)

In [None]:
from pyspark.sql.functions import expr, col, column
df.select(
    expr("DEST_COUNTRY_NAME"),
    col("DEST_COUNTRY_NAME"),
    column("DEST_COUNTRY_NAME"))\
  .show(2)

In [None]:
df.select(expr("DEST_COUNTRY_NAME AS destination")).show(2)


In [None]:
df.select(expr("DEST_COUNTRY_NAME as destination").alias("DEST_COUNTRY_NAME"))\
  .show(2)

In [None]:
df.selectExpr("DEST_COUNTRY_NAME as newColumnName", "DEST_COUNTRY_NAME").show(2)

In [None]:
df.selectExpr(
  "*", # all original columns
  "(DEST_COUNTRY_NAME = ORIGIN_COUNTRY_NAME) as withinCountry")\
  .show(2)

In [None]:
df.selectExpr("avg(count)", "count(distinct(DEST_COUNTRY_NAME))").show(2)


#### Literals

In [None]:
from pyspark.sql.functions import lit
df.select(expr("*"), lit(1).alias("One")).show(2)

In [None]:
df.selectExpr("*", "1 as One").show(2)

#### Adding Columns

In [None]:
df.withColumn("numberOne", lit(1)).show(2)


In [None]:
df.withColumn("withinCountry", expr("ORIGIN_COUNTRY_NAME == DEST_COUNTRY_NAME"))\
  .show(2)


#### Renaming Columns

In [None]:
df.withColumnRenamed("DEST_COUNTRY_NAME", "dest").columns


In [None]:
dfWithLongColName = df.withColumn(
    "This Long Column-Name",
    expr("ORIGIN_COUNTRY_NAME"))

In [None]:
dfWithLongColName.selectExpr(
    "`This Long Column-Name`",
    "`This Long Column-Name` as `new col`")\
  .show(2)

In [None]:
dfWithLongColName.select(expr("`This Long Column-Name`")).columns


#### Dropping Columns

In [None]:
df.show(2)

In [None]:
df.drop("ORIGIN_COUNTRY_NAME").show(2)

#### Filtering Rows

In [None]:
# filter == where
df.where(col("count") < 2).where(col("ORIGIN_COUNTRY_NAME") != "Croatia")\
  .show(2)

#### Unique Rows

In [None]:
df.select("ORIGIN_COUNTRY_NAME", "DEST_COUNTRY_NAME").distinct().count()

In [None]:
df.select("ORIGIN_COUNTRY_NAME").distinct().count()

#### Random Samples / Random Splits

In [None]:
df.count()

In [None]:
# Count a sample drawn without replacement at a 0.5 fraction; `seed` is also
# reused by the random-split cell that follows.
withReplacement, fraction, seed = False, 0.5, 5
df.sample(withReplacement=withReplacement, fraction=fraction, seed=seed).count()

In [None]:
dataFrames = df.randomSplit([0.25, 0.75], seed)
dataFrames[0].count() > dataFrames[1].count() # False

#### Concatenating and Appending Rows (Union)

In [None]:
from pyspark.sql import Row

# Build a two-row DataFrame that reuses df's schema, so its columns line up
# with df's when the two are unioned.
schema = df.schema
newRows = [
    Row(*values)
    for values in (
        ("New Country", "Other Country", 5),
        ("New Country 2", "Other Country 3", 1),
    )
]
parallelizedRows = spark.sparkContext.parallelize(newRows)
newDF = spark.createDataFrame(parallelizedRows, schema)

In [None]:
df.union(newDF)\
  .where("count = 1")\
  .where(col("ORIGIN_COUNTRY_NAME") != "United States")\
  .show()

#### Sorting Rows

In [None]:
# sort and orderBy are interchangeable: both order the rows by the given
# columns (ascending by default), passed either as names or as Column objects.
df.sort("count").show(5)
df.orderBy("count", "DEST_COUNTRY_NAME").show(5)
df.orderBy(col("count"), col("DEST_COUNTRY_NAME")).show(5)

In [None]:
from pyspark.sql.functions import desc, asc
df.orderBy(expr("count desc")).show(2)
df.orderBy(col("count").desc(), col("DEST_COUNTRY_NAME").asc()).show(2)

#### Limit

In [None]:
df.orderBy(expr("count desc")).limit(2).show(30)

#### Repartition 

In [None]:
df.rdd.getNumPartitions() # 1

In [None]:
df = df.repartition(5)

In [None]:
df.rdd.getNumPartitions() # 5

In [None]:
df.repartition(col("DEST_COUNTRY_NAME"))

In [None]:
df.repartition(5, col("DEST_COUNTRY_NAME"))

#### Collecting Rows To Driver 

In [None]:
collectDF = df.limit(10)
collectDF.take(5) # take works with an Integer count

In [None]:
collectDF.show() # this prints it out nicely
collectDF.show(5, False)
collectDF.collect()

## Working with Different Types of Data / Chapter 6

In [None]:
df = spark.read.format("csv")\
  .option("header", "true")\
  .option("inferSchema", "true")\
  .load(f"file:///{path}/Spark-The-Definitive-Guide-master/data/retail-data/by-day/2010-12-01.csv")
df.printSchema()
df.createOrReplaceTempView("dfTable")

In [None]:
from pyspark.sql.functions import lit
df.select(lit(5), lit("five"), lit(5.0))


### Working with Booleans 

In [None]:
from pyspark.sql.functions import col
df.where(col("InvoiceNo") != 536365)\
  .select("InvoiceNo", "Description")\
  .show(5, False)

In [None]:
df.where("InvoiceNo <> 536365")\
  .select("InvoiceNo", "Description")\
  .show(5, False)

In [None]:
from pyspark.sql.functions import instr
priceFilter = col("UnitPrice") > 600
descripFilter = instr(df.Description, "POSTAGE") >= 1

df.where("StockCode = 'DOT'").where(priceFilter | descripFilter).show(20)


In [None]:
from pyspark.sql.functions import instr

# Three reusable boolean Column expressions; instr() is 1-based, so ">= 1"
# means the substring occurs somewhere in Description.
descripFilter = instr(col("Description"), "POSTAGE") >= 1
priceFilter = col("UnitPrice") > 600
DOTCodeFilter = col("StockCode") == "DOT"

# Materialise the combined predicate as a boolean column, then filter on it.
(
    df.withColumn("isExpensive", DOTCodeFilter & (priceFilter | descripFilter))
    .where("isExpensive")
    .select("unitPrice", "isExpensive")
    .show(5)
)


In [None]:
from pyspark.sql.functions import expr
df.withColumn("isExpensive", expr("NOT UnitPrice <= 250"))\
  .where("isExpensive")\
  .select("Description", "UnitPrice").show(5)



### Working with Numbers 

In [None]:
from pyspark.sql.functions import expr, pow

fabricatedQuantity = pow(col("Quantity") * col("UnitPrice"), 2) + 5
df.select(expr("CustomerId"), fabricatedQuantity.alias("realQuantity")).show(2)

In [None]:
df.selectExpr(
  "CustomerId",
  "(POWER((Quantity * UnitPrice), 2.0) + 5) as realQuantity").show(2)


In [None]:
from pyspark.sql.functions import lit, round, bround

df.select(round(lit("2.5")), bround(lit("2.5"))).show(2)


### Stats 

In [None]:
df.stat.corr("Quantity", "UnitPrice")

In [None]:
from pyspark.sql.functions import corr
df.select(corr("Quantity", "UnitPrice")).show()


In [None]:
df.describe().show()

In [None]:
from pyspark.sql.functions import count, mean, stddev_pop, min, max

In [None]:
# Approximate median (probability 0.5) of `colName`, computed with a relative
# error of `relError`.
colName = "UnitPrice"
quantileProbs = [0.5]
relError = 0.05
# Pass the variable instead of repeating the literal so the column only has to
# be changed in one place.
df.stat.approxQuantile(colName, quantileProbs, relError) # 2.51

In [None]:
df.stat.crosstab("StockCode", "Quantity").show()


In [None]:
df.stat.freqItems(["StockCode", "Quantity"]).show()


In [None]:
from pyspark.sql.functions import monotonically_increasing_id
df.select(monotonically_increasing_id()).show(2)


### Working with Strings 

In [None]:
from pyspark.sql.functions import initcap
df.select(initcap(col("Description"))).show()

In [None]:
from pyspark.sql.functions import lower, upper
df.select(col("Description"),
    lower(col("Description")),
    upper(lower(col("Description")))).show(2)

In [None]:
from pyspark.sql.functions import lit, ltrim, rtrim, rpad, lpad, trim
df.select(
    ltrim(lit("    HELLO    ")).alias("ltrim"),
    rtrim(lit("    HELLO    ")).alias("rtrim"),
    trim(lit("    HELLO    ")).alias("trim"),
    lpad(lit("HELLO"), 10, " ").alias("lp"),
    rpad(lit("HELLO"), 10, " ").alias("rp")).show(2)

### Regexp

In [None]:
from pyspark.sql.functions import regexp_replace
regex_string = "BLACK|WHITE|RED|GREEN|BLUE"
df.select(
  regexp_replace(col("Description"), regex_string, "COLOR").alias("color_clean"),
  col("Description")).show(2)

In [None]:
from pyspark.sql.functions import translate
df.select(translate(col("Description"), "LEET", "1337"),col("Description"))\
  .show(2)

In [None]:
from pyspark.sql.functions import regexp_extract
extract_str = "(BLACK|WHITE|RED|GREEN|BLUE)"
df.select(
     regexp_extract(col("Description"), extract_str, 1).alias("color_clean"),
     col("Description")).show(2)

In [None]:
from pyspark.sql.functions import instr
containsBlack = instr(col("Description"), "BLACK") >= 1
containsWhite = instr(col("Description"), "WHITE") >= 1

df.withColumn("hasSimpleColor", containsBlack | containsWhite)\
  .where("hasSimpleColor")\
  .select("Description").show(3, False)

### Dates and Timestamps 

In [None]:
from pyspark.sql.functions import current_date, current_timestamp

dateDF = spark.range(10)\
  .withColumn("today", current_date())\
  .withColumn("now", current_timestamp())


In [None]:
dateDF.printSchema()

In [None]:
from pyspark.sql.functions import date_add, date_sub

dateDF.select(date_sub(col("today"), 5), date_add(col("today"), 5)).show(1)


In [None]:
from pyspark.sql.functions import datediff, months_between, to_date

dateDF.withColumn("week_ago", date_sub(col("today"), 7))\
  .select(datediff(col("week_ago"), col("today"))).show(1)


In [None]:
dateDF.select(
    to_date(lit("2016-01-01")).alias("start"),
    to_date(lit("2017-05-22")).alias("end"))\
  .select(months_between(col("start"), col("end"))).show(1)

In [None]:
from pyspark.sql.functions import to_date, lit
spark.range(5).withColumn("date", lit("2017-01-01"))\
  .select(to_date(col("date"))).show(1)


In [None]:
from pyspark.sql.functions import to_date
dateFormat = "yyyy-dd-MM"
cleanDateDF = spark.range(1).select(
    to_date(lit("2017-12-11"), dateFormat).alias("date"),
    to_date(lit("2017-20-12"), dateFormat).alias("date2"))

In [None]:
cleanDateDF.show()

In [None]:
from pyspark.sql.functions import to_timestamp
cleanDateDF.select(to_timestamp(col("date"), dateFormat)).show()

### Working with Nulls in Data

In [None]:
df.na.drop("all", subset=["StockCode", "InvoiceNo"])

In [None]:
df.na.fill("all", subset=["StockCode", "InvoiceNo"])

In [None]:
fill_cols_vals = {"StockCode": 5, "Description" : "No Value"}
df.na.fill(fill_cols_vals)

### Working with Complex Types 

In [None]:
from pyspark.sql.functions import struct
complexDF = df.select(struct("Description", "InvoiceNo").alias("complex"))

In [None]:
complexDF.show(2, False)

In [None]:
complexDF.printSchema()

In [None]:
from pyspark.sql.functions import split
df.select(split(col("Description"), " ")).show(2, False)

In [None]:
df.select(split(col("Description"), " ").alias("array_col"))\
  .selectExpr("array_col[0]").show(2)

In [None]:
from pyspark.sql.functions import size
df.select(size(split(col("Description"), " "))).show(2) # shows 5 and 3

In [None]:
from pyspark.sql.functions import array_contains
df.select(array_contains(split(col("Description"), " "), "WHITE")).show(2)

In [None]:
from pyspark.sql.functions import split, explode

df.withColumn("splitted", split(col("Description"), " "))\
  .withColumn("exploded", explode(col("splitted")))\
  .select("Description", "InvoiceNo", "exploded").show(2)

In [None]:
from pyspark.sql.functions import create_map
df.select(create_map(col("Description"), col("InvoiceNo")).alias("complex_map"))\
  .show(2,False)

In [None]:
df.select(create_map(col("Description"), col("InvoiceNo")).alias("complex_map"))\
  .selectExpr("complex_map['WHITE METAL LANTERN']").show(2)

In [None]:
df.select(create_map(col("Description"), col("InvoiceNo")).alias("complex_map"))\
  .selectExpr("explode(complex_map)").show(2)

In [None]:
jsonDF = spark.range(1).selectExpr("""
  '{"myJSONKey" : {"myJSONValue" : [1, 2, 3]}}' as jsonString""")

In [None]:
jsonDF.show(1, False)

In [None]:
from pyspark.sql.functions import get_json_object, json_tuple

jsonDF.select(
    get_json_object(col("jsonString"), "$.myJSONKey.myJSONValue[1]").alias("column"),
    json_tuple(col("jsonString"), "myJSONKey")).show(2, False)

In [None]:
from pyspark.sql.functions import to_json
df.selectExpr("(InvoiceNo, Description) as myStruct")\
  .select(to_json(col("myStruct")))

In [None]:
df.selectExpr("(InvoiceNo, Description) as myStruct")\
  .select(to_json(col("myStruct"))).show(20,False)

In [None]:
from pyspark.sql.functions import from_json
# Import only the types this cell needs instead of `import *`, which would
# dump every name from pyspark.sql.types into the notebook namespace.
from pyspark.sql.types import StringType, StructField, StructType

# Schema used to parse the JSON strings back into a struct: two nullable
# string fields. StructType documents `fields` as a list, so pass a list.
parseSchema = StructType([
    StructField("InvoiceNo", StringType(), True),
    StructField("Description", StringType(), True),
])

# Round trip: struct -> JSON string (to_json) -> struct again (from_json),
# shown next to the intermediate JSON text.
(
    df.selectExpr("(InvoiceNo, Description) as myStruct")
    .select(to_json(col("myStruct")).alias("newJSON"))
    .select(from_json(col("newJSON"), parseSchema), col("newJSON"))
    .show(2, False)
)


## Aggregations / Chapter 7

In [None]:
# Read every CSV under retail-data/all (header row, inferred types) into one
# DataFrame and coalesce it to 5 partitions.
df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(f"file:///{path}/Spark-The-Definitive-Guide-master/data/retail-data/all/*.csv")
    .coalesce(5)
)
# Mark the DataFrame for caching; the aggregation cells below all reuse it.
df.cache()

### Counting 

In [None]:
from pyspark.sql.functions import count
df.select(count("StockCode")).show() # 541909

In [None]:
from pyspark.sql.functions import countDistinct
df.select(countDistinct("StockCode")).show() # 4070

In [None]:
from pyspark.sql.functions import approx_count_distinct
df.select(approx_count_distinct("StockCode", 0.1)).show() # 3364


### First and Last 

In [None]:
from pyspark.sql.functions import first, last
df.select(first("StockCode"), last("StockCode")).show()

### Min and Max

In [None]:
from pyspark.sql.functions import min, max
df.select(min("Quantity"), max("Quantity")).show()

### Sum

In [None]:
from pyspark.sql.functions import sum
df.select(sum("Quantity")).show() # 5176450

In [None]:
from pyspark.sql.functions import sum_distinct
df.select(sum_distinct("Quantity")).show() # 29310

### Avg

In [None]:
from pyspark.sql.functions import sum, count, avg, expr

# Compute the average of Quantity three ways (sum/count by hand, avg(), and
# the SQL mean() expression) and show them side by side.
(
    df.select(
        count("Quantity").alias("total_transactions"),
        sum("Quantity").alias("total_purchases"),
        avg("Quantity").alias("avg_purchases"),
        expr("mean(Quantity)").alias("mean_purchases"),
    )
    .selectExpr(
        "total_purchases/total_transactions",
        "avg_purchases",
        "mean_purchases",
    )
    .show()
)

### Variance / STD / Skewness etc

In [None]:
from pyspark.sql.functions import var_pop, stddev_pop
from pyspark.sql.functions import var_samp, stddev_samp
df.select(var_pop("Quantity"), var_samp("Quantity"),
  stddev_pop("Quantity"), stddev_samp("Quantity")).show()

In [None]:
from pyspark.sql.functions import skewness, kurtosis
df.select(skewness("Quantity"), kurtosis("Quantity")).show()

In [None]:
from pyspark.sql.functions import corr, covar_pop, covar_samp
df.select(corr("InvoiceNo", "Quantity"), covar_samp("InvoiceNo", "Quantity"),
    covar_pop("InvoiceNo", "Quantity")).show()

### Aggregating to Complex Types

In [None]:
from pyspark.sql.functions import collect_set, collect_list
df.agg(collect_set("Country"), collect_list("Country")).show()

### Grouping 

In [None]:
from pyspark.sql.functions import count

df.groupBy("InvoiceNo").agg(
    count("Quantity").alias("quan"),
    expr("count(Quantity)")).show()


In [None]:
df.groupBy("InvoiceNo").agg(expr("avg(Quantity)"),expr("stddev_pop(Quantity)"))\
  .show()

## Joins / Chapter 8

In [None]:
# Three small in-memory tables used by the join examples below.

# People: graduate_program refers to graduateProgram.id and spark_status
# holds a list of sparkStatus.id values.
person = (
    spark.createDataFrame([
        (0, "Bill Chambers", 0, [100]),
        (1, "Matei Zaharia", 1, [500, 250, 100]),
        (2, "Michael Armbrust", 1, [250, 100]),
    ])
    .toDF("id", "name", "graduate_program", "spark_status")
)

# Graduate programs, keyed by id.
graduateProgram = (
    spark.createDataFrame([
        (0, "Masters", "School of Information", "UC Berkeley"),
        (2, "Masters", "EECS", "UC Berkeley"),
        (1, "Ph.D.", "EECS", "UC Berkeley"),
    ])
    .toDF("id", "degree", "department", "school")
)

# Status lookup table, keyed by id.
sparkStatus = (
    spark.createDataFrame([
        (500, "Vice President"),
        (250, "PMC Member"),
        (100, "Contributor"),
    ])
    .toDF("id", "status")
)

In [None]:
person.show()

In [None]:
graduateProgram.show()

In [None]:
sparkStatus.show()

In [None]:
person.join(graduateProgram, person["graduate_program"] == graduateProgram['id']).show(6)

In [None]:
person.join(graduateProgram, person["graduate_program"] == graduateProgram['id'], "right_outer").show(6)

In [None]:
from pyspark.sql.functions import expr

# Join each person to every sparkStatus row whose id appears in that person's
# spark_status array. person.id is renamed first so that the bare `id` in the
# join expression is not ambiguous between the two tables.
(
    person.withColumnRenamed("id", "personId")
    .join(sparkStatus, expr("array_contains(spark_status, id)"))
    .show()
)