# Basic Structured Operations

## Import Libraries

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *

## Create Spark Session

In [2]:
# Create (or reuse) the SparkSession for this notebook.
# "local[*]" runs Spark in-process, using all available local cores.
spark = (
    SparkSession.builder
    .appName("Test App")
    .master("local[*]")
    .getOrCreate()
)

## Read Data

In [3]:
# Load the 2015 flight summary. No schema is passed to the reader here,
# so the column types come from Spark's inference over the JSON.
df = (
    spark.read
    .format("json")
    .load("../../data/flight-data/json/2015-summary.json")
)

In [4]:
# Print the schema of df as a tree: column name, type and nullability.
df.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: long (nullable = true)



In [5]:
# Hand-written schema for the flight data: two string columns and a long
# count, all nullable. The "count" field additionally carries user metadata.
manual_schema = StructType([
    StructField(
        name="DEST_COUNTRY_NAME",
        dataType=StringType(),
        nullable=True,
    ),
    StructField(
        name="ORIGIN_COUNTRY_NAME",
        dataType=StringType(),
        nullable=True,
    ),
    StructField(
        name="count",
        dataType=LongType(),
        nullable=True,
        metadata={"hello": "world"},
    ),
])

In [6]:
# Compare the hand-written schema with the one on df. Names, types and
# nullability are the same in both; the only visible difference is the
# metadata attached to "count" -- presumably why this evaluates to False.
manual_schema == df.schema

False

In [7]:
# Display df's schema as a StructType object (the programmatic counterpart
# of the printSchema() tree).
df.schema

StructType(List(StructField(DEST_COUNTRY_NAME,StringType,true),StructField(ORIGIN_COUNTRY_NAME,StringType,true),StructField(count,LongType,true)))

In [8]:
# Re-read the same JSON file, this time applying manual_schema explicitly
# instead of letting Spark infer the column types.
df = (
    spark.read
    .format("json")
    .schema(manual_schema)
    .load("../../data/flight-data/json/2015-summary.json")
)

## Operations

### Columns

In [9]:
# Build a column reference purely from a name; no DataFrame is involved here.
F.col("hello")

Column<b'hello'>

In [10]:
# F.column is an alternative spelling of F.col; it yields the same kind of
# Column reference for the given name.
F.column("hello")

Column<b'hello'>

In [11]:
# Preview the rows of the whole DataFrame (all columns, not a single one).
df.show()

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|       United States|            Romania|   15|
|       United States|            Croatia|    1|
|       United States|            Ireland|  344|
|               Egypt|      United States|   15|
|       United States|              India|   62|
|       United States|          Singapore|    1|
|       United States|            Grenada|   62|
|          Costa Rica|      United States|  588|
|             Senegal|      United States|   40|
|             Moldova|      United States|    1|
|       United States|       Sint Maarten|  325|
|       United States|   Marshall Islands|   39|
|              Guyana|      United States|   64|
|               Malta|      United States|    1|
|            Anguilla|      United States|   41|
|             Bolivia|      United States|   30|
|       United States|           Paraguay|    6|
|             Algeri

Columns are expressions, and an expression is just a transformation applied to one or more values of a record.

In [17]:
# Select the "count" column through a column reference; print the first 3 rows.
df.select(F.col('count')).show(3)

+-----+
|count|
+-----+
|   15|
|    1|
|  344|
+-----+
only showing top 3 rows



In [18]:
# Same selection, with the column given as an expression string instead.
df.select(F.expr('count')).show(3)

+-----+
|count|
+-----+
|   15|
|    1|
|  344|
+-----+
only showing top 3 rows



In [22]:
# Arithmetic on a Column object produces a new column expression.
df.select(F.col('count') - 5).show(3)

+-----------+
|(count - 5)|
+-----------+
|         10|
|         -4|
|        339|
+-----------+
only showing top 3 rows



This is the same as:

In [23]:
# The whole computation written as one expression string.
df.select(F.expr('count - 5')).show(3)

+-----------+
|(count - 5)|
+-----------+
|         10|
|         -4|
|        339|
+-----------+
only showing top 3 rows



And it is also the same as:

In [24]:
# Mixed form: the column comes from expr(), the subtraction is done in Python.
df.select(F.expr('count') - 5).show(3)

+-----------+
|(count - 5)|
+-----------+
|         10|
|         -4|
|        339|
+-----------+
only showing top 3 rows



### Retail Data

In [94]:
# Remove all cached tables/DataFrames from this session's cache --
# presumably to start clean before the retail data is cached below.
spark.catalog.clearCache()

In [106]:
# Read every per-day retail CSV: the first line of each file is the header and
# column types are inferred. The DataFrame is marked for caching because it is
# reused by several cells below.
retail_data = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv("../../data/retail-data/by-day/*.csv")
    .cache()
)

In [110]:
# count() is an action, so it forces the data to be evaluated and returns the
# total number of rows.
retail_data.count()

541909

In [111]:
# Peek at the first 10 rows to see the columns and typical values.
retail_data.show(10)

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|   580538|    23084|  RABBIT NIGHT LIGHT|      48|2011-12-05 08:38:00|     1.79|   14075.0|United Kingdom|
|   580538|    23077| DOUGHNUT LIP GLOSS |      20|2011-12-05 08:38:00|     1.25|   14075.0|United Kingdom|
|   580538|    22906|12 MESSAGE CARDS ...|      24|2011-12-05 08:38:00|     1.65|   14075.0|United Kingdom|
|   580538|    21914|BLUE HARMONICA IN...|      24|2011-12-05 08:38:00|     1.25|   14075.0|United Kingdom|
|   580538|    22467|   GUMBALL COAT RACK|       6|2011-12-05 08:38:00|     2.55|   14075.0|United Kingdom|
|   580538|    21544|SKULLS  WATER TRA...|      48|2011-12-05 08:38:00|     0.85|   14075.0|United Kingdom|
|   580538|    23126|FELTCRA

In [112]:
# Top 10 customers by average line total, where a line total is
# UnitPrice * Quantity for a single retail_data row.
result = (
    retail_data
    .withColumn("TOTAL", F.expr("UnitPrice * Quantity"))
    .groupBy("CustomerID")
    # Name the aggregate directly instead of renaming Spark's generated
    # "avg(TOTAL)" column afterwards: withColumnRenamed does nothing when the
    # name it is given does not exist, so the alias would hinge on that string.
    .agg(F.avg("TOTAL").alias("AvgCost"))
    # Highest average first; any null averages are placed ahead of the numbers.
    .orderBy(F.col("AvgCost").desc_nulls_first())
    .limit(10)
)

In [113]:
# Print the physical plan Spark would use to compute `result`.
result.explain()

== Physical Plan ==
TakeOrderedAndProject(limit=10, orderBy=[AvgCost#3214 DESC NULLS FIRST], output=[CustomerID#2448,AvgCost#3214])
+- *(2) HashAggregate(keys=[CustomerID#2448], functions=[avg(TOTAL#3191)])
   +- Exchange hashpartitioning(CustomerID#2448, 200), true, [id=#1415]
      +- *(1) HashAggregate(keys=[knownfloatingpointnormalized(normalizenanandzero(CustomerID#2448)) AS CustomerID#2448], functions=[partial_avg(TOTAL#3191)])
         +- *(1) Project [CustomerID#2448, (UnitPrice#2447 * cast(Quantity#2445 as double)) AS TOTAL#3191]
            +- InMemoryTableScan [CustomerID#2448, Quantity#2445, UnitPrice#2447]
                  +- InMemoryRelation [InvoiceNo#2442, StockCode#2443, Description#2444, Quantity#2445, InvoiceDate#2446, UnitPrice#2447, CustomerID#2448, Country#2449], StorageLevel(disk, memory, deserialized, 1 replicas)
                        +- FileScan csv [InvoiceNo#2442,StockCode#2443,Description#2444,Quantity#2445,InvoiceDate#2446,UnitPrice#2447,CustomerID#2448,

In [114]:
# Run the aggregation and print the resulting rows.
result.show()

+----------+------------------+
|CustomerID|           AvgCost|
+----------+------------------+
|   15195.0|            3861.0|
|   13135.0|            3096.0|
|   17846.0|            2033.1|
|   16532.0|1687.1999999999998|
|   15749.0|1435.7266666666667|
|   16000.0|1377.0777777777778|
|   16754.0|            1001.2|
|   12798.0|            872.13|
|   17553.0|             743.8|
|   17949.0| 667.7321518987342|
+----------+------------------+



In [150]:
from pyspark.sql import Row

In [151]:
# Build a Row from positional values only -- no field names are given, so the
# values are identified by position.
new_row = Row("Hello", "World", 1, 2.3, True)

In [152]:
# Display the Row's repr.
new_row

<Row('Hello', 'World', 1, 2.3, True)>