# DataFrame Operations



In [None]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import*

# Create (or reuse) the notebook's Spark session and keep a handle on its
# underlying SparkContext.
spark = (
    SparkSession.builder
    .appName('DataFrame Operations')
    .getOrCreate()
)
sc = spark.sparkContext

In [None]:
# Load the CSV into a DataFrame, treating the first line as the header.
# inferSchema makes Spark figure out the column types automatically,
# at the cost of reading the data more than once.
dataPath = "../data/Open_Parking_and_Camera_Violations_100.csv"
fines = (
    spark.read.format("csv")
    .options(header="true", inferSchema="true")
    .load(dataPath)
)

In [None]:
# Print the inferred schema as a tree: one entry per column with its type
# and nullability.
fines.printSchema()

In [None]:
# A schema is a StructType made up of a number of fields (StructFields), each
# of which has a name, a type, and a Boolean flag specifying whether that
# column can contain missing or null values.
fines.schema

In [None]:
# Display the first 2 rows of the DataFrame as a text table.
fines.show(2)

In [None]:
# Project just the Plate and Violation columns and display the first 5 rows.
fines.select('Plate', 'Violation').show(5)

Let's see how to manually specify a known schema for a data file, so that we can skip the costly schema inference (the `inferSchema` option):



In [None]:
from pyspark.sql.types import (
    DoubleType,
    IntegerType,
    LongType,
    StringType,
    StructField,
    StructType,
)

# Explicit schema for the violations CSV: one StructField(name, type, nullable)
# per column, in file order. Every column is declared nullable.
#
# 'Summons Number' is a LongType rather than an IntegerType: summons numbers
# are 10-digit values that can exceed the 32-bit integer maximum
# (2,147,483,647), and a value that does not fit the declared type is read
# back as null instead of the real number.
# NOTE(review): assumes the file contains such large summons numbers -- confirm
# against the data.
schema = StructType([
    StructField('Plate', StringType(), True),
    StructField('State', StringType(), True),
    StructField('License Type', StringType(), True),
    StructField('Summons Number', LongType(), True),
    StructField('Issue Date', StringType(), True),
    StructField('Violation Time', StringType(), True),
    StructField('Violation', StringType(), True),
    StructField('Judgment Entry Date', StringType(), True),
    StructField('Fine Amount', DoubleType(), True),
    StructField('Penalty Amount', DoubleType(), True),
    StructField('Interest Amount', DoubleType(), True),
    StructField('Reduction Amount', DoubleType(), True),
    StructField('Payment Amount', DoubleType(), True),
    StructField('Amount Due', DoubleType(), True),
    StructField('Precinct', IntegerType(), True),
    StructField('County', StringType(), True),
    StructField('Issuing Agency', StringType(), True),
    StructField('Violation Status', StringType(), True),
    StructField('Summons Image', StringType(), True),
])

# Read the same file again, this time supplying the schema up front so Spark
# does not need the extra pass over the data that inferSchema requires.
fine2 = (
    spark.read.format("csv")
    .option("header", "true")
    .schema(schema)
    .load(dataPath)
)

fine2.printSchema()
# fine2.show(5)  # uncomment to preview the first rows


Now that we've explored the data, let's return to understanding
**transformations** and **actions**.  
Let's create several transformations and then an action. After that we
will inspect exactly what's happening under the hood.

These transformations are simple: first we group by two columns (State and Violation) and compute the average fine.
Then we inner join that result back to the original dataset on the column Violation.
Finally, we select the Plate, Violation, and Violation Status columns from the joined dataset.

In [None]:
# Transformation 1: average fine amount for each (State, Violation) pair.
df1 = fines.groupBy("State", "Violation").avg("Fine Amount")

# Transformation 2: inner-join the aggregates back onto the original rows by
# Violation, then keep only three columns.
df2 = (
    df1
    .join(fines, on=["Violation"], how='inner')
    .select("Plate", "Violation", "Violation Status")
)

# Actions: display the first 10 rows of each result.
df1.show(10)
df2.show(10)

In [None]:
# Print the physical plan Spark has built for df2 (the groupBy, the join and
# the select defined above).
df2.explain()

In [None]:
# count() is an action: it executes the plan that Apache Spark built up
# previously and returns the number of rows in df2.
df2.count()

In [None]:
# We can convert to a local pandas DataFrame IF the data can fit into one node:
# toPandas() collects every row of df2 to the driver. (to_pandas_on_spark()
# would instead return a still-distributed pandas-on-Spark DataFrame, which is
# not what this step describes, and it is deprecated in newer Spark releases.)
df2.toPandas()