# DataFrame Operations



In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import*

spark = SparkSession.builder.appName('00a DataFrame Operations').getOrCreate()
sc = spark.sparkContext

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/02/13 10:25:41 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Read a CSV into a dataframe, inferring the schema.
dataPath = "../data/Open_Parking_and_Camera_Violations_100.csv"
fines = spark.read.format("csv")\
  .option("header","true")\
  .option("inferSchema", "true")\
  .load(dataPath)
  
# inferSchema means we will automatically figure out column types 
# at a cost of reading the data more than once

In [3]:
fines.printSchema()

root
 |-- Plate: string (nullable = true)
 |-- State: string (nullable = true)
 |-- License Type: string (nullable = true)
 |-- Summons Number: long (nullable = true)
 |-- Issue Date: string (nullable = true)
 |-- Violation Time: string (nullable = true)
 |-- Violation: string (nullable = true)
 |-- Judgment Entry Date: string (nullable = true)
 |-- Fine Amount: double (nullable = true)
 |-- Penalty Amount: double (nullable = true)
 |-- Interest Amount: double (nullable = true)
 |-- Reduction Amount: double (nullable = true)
 |-- Payment Amount: double (nullable = true)
 |-- Amount Due: double (nullable = true)
 |-- Precinct: integer (nullable = true)
 |-- County: string (nullable = true)
 |-- Issuing Agency: string (nullable = true)
 |-- Violation Status: string (nullable = true)
 |-- Summons Image: string (nullable = true)



In [4]:
# A schema is a StructType made up of a number of fields, StructFields, that have a name, type, and a Boolean flag which specifies whether that column can contain missing or null values.
fines.schema

StructType(List(StructField(Plate,StringType,true),StructField(State,StringType,true),StructField(License Type,StringType,true),StructField(Summons Number,LongType,true),StructField(Issue Date,StringType,true),StructField(Violation Time,StringType,true),StructField(Violation,StringType,true),StructField(Judgment Entry Date,StringType,true),StructField(Fine Amount,DoubleType,true),StructField(Penalty Amount,DoubleType,true),StructField(Interest Amount,DoubleType,true),StructField(Reduction Amount,DoubleType,true),StructField(Payment Amount,DoubleType,true),StructField(Amount Due,DoubleType,true),StructField(Precinct,IntegerType,true),StructField(County,StringType,true),StructField(Issuing Agency,StringType,true),StructField(Violation Status,StringType,true),StructField(Summons Image,StringType,true)))

In [5]:
fines.show(2)

+-------+-----+------------+--------------+----------+--------------+----------------+-------------------+-----------+--------------+---------------+----------------+--------------+----------+--------+------+-----------------+----------------+--------------------+
|  Plate|State|License Type|Summons Number|Issue Date|Violation Time|       Violation|Judgment Entry Date|Fine Amount|Penalty Amount|Interest Amount|Reduction Amount|Payment Amount|Amount Due|Precinct|County|   Issuing Agency|Violation Status|       Summons Image|
+-------+-----+------------+--------------+----------+--------------+----------------+-------------------+-----------+--------------+---------------+----------------+--------------+----------+--------+------+-----------------+----------------+--------------------+
|1994439|   ME|         PAS|    1438400834|06/05/2018|        07:58A|DETACHED TRAILER|               null|       45.0|          10.0|            0.0|             0.0|           0.0|      55.0|      94|    

In [6]:
fines.select(['Plate', 'Violation']).show(5)

+-------+--------------------+
|  Plate|           Violation|
+-------+--------------------+
|1994439|    DETACHED TRAILER|
|GJK1149|           CROSSWALK|
|HXP4226|REG. STICKER-EXPI...|
|1742308|    DETACHED TRAILER|
| RR2L42|NO STANDING-DAY/T...|
+-------+--------------------+
only showing top 5 rows



Let's see how to manually specify a known schema for a data file, so we can skip the costly "Infer Schema":



In [7]:
from pyspark.sql.types import StructField, StructType, StringType, DoubleType, IntegerType

schema = StructType([StructField('Plate',StringType(), True),
                     StructField('State',StringType(),True),
                     StructField('License Type',StringType()),
                     StructField('Summons Number',IntegerType()),
                     StructField('Issue Date',StringType()),
                     StructField('Violation Time',StringType()),
                     StructField('Violation',StringType()),
                     StructField('Judgment Entry Date',StringType()),
                     StructField('Fine Amount',DoubleType(),True),
                     StructField('Penalty Amount',DoubleType(),True),
                     StructField('Interest Amount',DoubleType(),True),
                     StructField('Reduction Amount',DoubleType(),True),
                     StructField('Payment Amount',DoubleType(),True),
                     StructField('Amount Due',DoubleType(),True),
                     StructField('Precinct',IntegerType(),True),
                     StructField('County',StringType(),True),
                     StructField('Issuing Agency',StringType(),True),
                     StructField('Violation Status',StringType(),True),
                     StructField('Summons Image',StringType(),True)])


fine2 = spark.read.format("csv")\
  .option("header","true")\
  .schema(schema)\
  .load(dataPath)

fine2.printSchema()
#fine2.show(5)

root
 |-- Plate: string (nullable = true)
 |-- State: string (nullable = true)
 |-- License Type: string (nullable = true)
 |-- Summons Number: integer (nullable = true)
 |-- Issue Date: string (nullable = true)
 |-- Violation Time: string (nullable = true)
 |-- Violation: string (nullable = true)
 |-- Judgment Entry Date: string (nullable = true)
 |-- Fine Amount: double (nullable = true)
 |-- Penalty Amount: double (nullable = true)
 |-- Interest Amount: double (nullable = true)
 |-- Reduction Amount: double (nullable = true)
 |-- Payment Amount: double (nullable = true)
 |-- Amount Due: double (nullable = true)
 |-- Precinct: integer (nullable = true)
 |-- County: string (nullable = true)
 |-- Issuing Agency: string (nullable = true)
 |-- Violation Status: string (nullable = true)
 |-- Summons Image: string (nullable = true)




Now that we've explored the data, let's return to understanding
**transformations** and **actions**.  
Let's create several transformations and then an action. After that we
will inspect exactly what's happening under the hood.

These transformations are simple, first we group by two variables and then compute the average fine.
Then we're going to inner join that to the original dataset on the column State.
Then we'll select the Violation Status from that new dataset.

In [9]:
# a simple grouping
df1 = fines.groupBy(["State","Violation"]).avg("Fine Amount") 

# a simple join and selecting some columns
df2 = df1 .join(fines, on=["Violation"], how='inner').select(["Plate","Violation","Violation Status"])

df1.show(10)
df2.show(10)

+-----+--------------------+----------------+
|State|           Violation|avg(Fine Amount)|
+-----+--------------------+----------------+
|   NY|NO STOPPING-DAY/T...|           115.0|
|   ME|NO PARKING-STREET...|            45.0|
|   IN|        TRAFFIC LANE|           115.0|
|   SC|  EXPIRED MUNI METER|            65.0|
|   NJ|PHTO SCHOOL ZN SP...|            50.0|
|   AZ|PHTO SCHOOL ZN SP...|            50.0|
|   NY|FAIL TO DSPLY MUN...|            47.0|
|   PA|NO STANDING-DAY/T...|           115.0|
|   NY|OBSTRUCTING TRAFF...|           115.0|
|   PA|STORAGE-3HR COMME...|            65.0|
+-----+--------------------+----------------+
only showing top 10 rows

+-------+--------------------+-------------------+
|  Plate|           Violation|   Violation Status|
+-------+--------------------+-------------------+
|1994439|    DETACHED TRAILER|    HEARING PENDING|
|GJK1149|           CROSSWALK|               null|
|GJK1149|           CROSSWALK|               null|
|HXP4226|REG. STICKER-EX

In [10]:
df2.explain()   

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Project [Plate#334, Violation#22, Violation Status#351]
   +- BroadcastHashJoin [Violation#22], [Violation#340], Inner, BuildLeft, false
      :- BroadcastExchange HashedRelationBroadcastMode(List(input[0, string, true]),false), [id=#445]
      :  +- HashAggregate(keys=[State#17, Violation#22], functions=[])
      :     +- Exchange hashpartitioning(State#17, Violation#22, 200), ENSURE_REQUIREMENTS, [id=#442]
      :        +- HashAggregate(keys=[State#17, Violation#22], functions=[])
      :           +- Filter isnotnull(Violation#22)
      :              +- FileScan csv [State#17,Violation#22] Batched: false, DataFilters: [isnotnull(Violation#22)], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/home/jovyan/data/Open_Parking_and_Camera_Violations_100.csv], PartitionFilters: [], PushedFilters: [IsNotNull(Violation)], ReadSchema: struct<State:string,Violation:string>
      +- Filter isnotnull(Violation#340)
         +- F

In [11]:
# This will execute the plan that Apache Spark built up previously. 
df2.count()

271

In [12]:
# we can convert to Pandas IF the data can fit into one node
df2.to_pandas_on_spark()

23/02/13 10:29:44 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/02/13 10:29:44 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/02/13 10:29:44 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/02/13 10:29:44 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


Unnamed: 0,Plate,Violation,Violation Status
0,1994439,DETACHED TRAILER,HEARING PENDING
1,GJK1149,CROSSWALK,
2,GJK1149,CROSSWALK,
3,HXP4226,REG. STICKER-EXPIRED/MISSING,HEARING HELD-GUILTY
4,1742308,DETACHED TRAILER,HEARING PENDING
5,RR2L42,NO STANDING-DAY/TIME LIMITS,HEARING PENDING
6,RR2L42,NO STANDING-DAY/TIME LIMITS,HEARING PENDING
7,RR2L42,NO STANDING-DAY/TIME LIMITS,HEARING PENDING
8,RR2L42,NO STANDING-DAY/TIME LIMITS,HEARING PENDING
9,46886MM,NO PARKING-DAY/TIME LIMITS,HEARING PENDING
