In [2]:
import sys
from pyspark.sql import SparkSession
from pyspark.sql.functions import count

In [3]:
# Obtain the notebook's SparkSession through the builder API.
# getOrCreate() returns the session that is already running when there
# is one, and only builds a new session (application name
# "PythonMnMCount") when none exists yet.
spark = (
    SparkSession.builder
    .appName("PythonMnMCount")
    .getOrCreate()
)

22/11/20 15:13:37 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [4]:
# Path to the M&M dataset CSV (columns: State, Color, Count). The leading
# "./" makes it relative to the directory the notebook is run from.
mnm_file = "./mnm_dataset.csv"

In [10]:
# Sanity-check the raw file: print its first five lines with the shell's
# `head`. IPython substitutes {mnm_file} with the value of the Python
# variable before running the command.
!head -5 {mnm_file}

State,Color,Count
NM,Orange,50
NM,Blue,86
UT,Green,68
NM,Orange,77


In [6]:
# Load the M&M CSV into a Spark DataFrame.
#   header=true      -> take the column names from the file's first line
#   inferSchema=true -> let Spark infer each column's type from the data
#                       (Count is reported as int in the schema below)
mnm_df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(mnm_file)
)
# Printing a DataFrame shows its column names and types only, not its rows.
print(mnm_df)

DataFrame[State: string, Color: string, Count: int]


In [15]:
# Total the M&Ms per (State, Color) using only the high-level DataFrame
# API (no RDDs). Each call returns a new object, so the steps chain:
# 1. select() keeps the three columns the query needs.
# 2. groupBy() forms one group per State/Color pair.
# 3. sum("Count") adds up the Count values inside each group.
#    (count("Count") would only count the rows in each group and ignore
#    the values, so "Total" would be a record count, not an M&M total.)
# 4. withColumnRenamed() renames the generated "sum(Count)" column to "Total".
# 5. orderBy() sorts the groups by Total, largest first.
count_mnm_df = (
    mnm_df.select("State", "Color", "Count")
    .groupBy("State", "Color")
    .sum("Count")
    .withColumnRenamed("sum(Count)", "Total")
    .orderBy("Total", ascending=False)
)
# The chain above only describes the query; it runs when an action is called.
# Uncomment show() to display the total for every State/Color pair:
# count_mnm_df.show(n=60, truncate=False)
# count() is the action used here: it executes the query and returns the
# number of State/Color groups in the result.
print("Total Rows = %d" % count_mnm_df.count())

22/11/20 15:27:07 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: <!DOCTYPE html>, 
 Schema: State, Color
Expected: State but found: <!DOCTYPE html>
CSV file: file:///data/mnm_dataset.csv
Total Rows = 15
