# Spark Recitation

![title](img/chopsticks_title.png)

![title](img/how_to_use_chopsticks.png)

# Initialize Spark

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
# Build (or reuse) the session that every later cell reaches through `spark`.
spark = (
    SparkSession.builder
    .appName('Spark_Rec')
    .getOrCreate()
)

Exception: Java gateway process exited before sending the driver its port number

# Reading In and Exploring Data

In [None]:
# Load the raw CSV, using its first line as the column names.
chopsticks_sdf = spark.read.csv('data/chopstick-effectiveness.csv', header=True)

In [None]:
# Print the first five rows.
chopsticks_sdf.show(n=5)

![title](img/arrangement.png)

In [None]:
# Total number of rows in the DataFrame.
chopsticks_sdf.count()

In [None]:
# Doesn't work. Why?
# (describe() computes summary statistics for the columns; show() prints them.)
chopsticks_sdf.describe().show()

In [None]:
# Print every column's name and data type.
chopsticks_sdf.printSchema()

# Jupyter Command and Edit Modes and Shift + Tab for Documentation

# Projecting Columns

In [None]:
# Bracket indexing with a single name gives a Column object, not a DataFrame.
chopsticks_sdf['Individual']

In [None]:
# Attribute access is another way to get a Column object.
chopsticks_sdf.Individual

In [None]:
# Indexing with a list of names gives a DataFrame holding just those columns.
chopsticks_sdf[['Individual']]

In [None]:
# How many distinct individuals took part in the experiment?
chopsticks_sdf.select('Individual').distinct().count()

In [None]:
# Distinct chopstick lengths under test; the backticks quote the dotted column name.
chopsticks_sdf.select('`Chopstick.Length`').distinct().show()

# Renaming Columns

In [None]:
# Replace the original CSV headers with short lowercase names.
chopsticks_sdf = (
    chopsticks_sdf
    .withColumnRenamed('Food.Pinching.Effeciency', 'efficiency')
    .withColumnRenamed('Individual', 'individual')
    .withColumnRenamed('Chopstick.Length', 'length')
)
chopsticks_sdf.show(n=5)

In [None]:
# One of many ways to rename: SQL-style aliases inside selectExpr.
chopsticks_sdf.selectExpr(
    'efficiency AS eff',
    'individual AS ind',
    'length',
).show(n=5)

In [None]:
# Register the DataFrame as a temp view so it can be queried with SQL,
# then list the column names produced by the aliased query.
chopsticks_sdf.createOrReplaceTempView('chopsticks_view')
aliased_sdf = spark.sql('SELECT efficiency AS eff, individual AS ind, length AS len FROM chopsticks_view')
aliased_sdf.columns

In [None]:
# The new names need no backticks, so several columns can be selected directly.
chopsticks_sdf.select('individual', 'efficiency').show(n=5)

# Changing Column Types

In [None]:
# Convert each column to a numeric type with SQL CAST expressions.
cast_exprs = [
    'CAST(efficiency AS double)',
    'CAST(individual AS int)',
    'CAST(length AS int)',
]
chopsticks_sdf = chopsticks_sdf.selectExpr(*cast_exprs)

In [None]:
# Print the schema to check the column types.
chopsticks_sdf.printSchema()

In [None]:
# Compute and print summary statistics for the columns.
chopsticks_sdf.describe().show()

# Creating Schema and Reading in Data with Schema

In [None]:
# uses pyspark.sql.types
# Declare the column types up front so the CSV is parsed into them on read.
schema = StructType([
    StructField("efficiency", DoubleType()),
    StructField("individual", IntegerType()),
    StructField("length", IntegerType()),
])
chopsticks2_sdf = spark.read.csv('data/chopstick-effectiveness.csv', schema=schema, header=True)
chopsticks2_sdf.show(n=5)
chopsticks2_sdf.printSchema()

# Computing Mean Efficiency by Chopstick Length

In [None]:
# Average efficiency for each chopstick length, highest average first.
(
    chopsticks_sdf
    .groupBy('length')
    .mean('efficiency')
    .orderBy('avg(efficiency)', ascending=False)
    .show()
)

In [None]:
# Looks a bit strange. Why?
avg_by_length_sql = """SELECT length, AVG(efficiency)
             FROM chopsticks_view
             GROUP BY length
             ORDER BY AVG(efficiency) DESC"""
spark.sql(avg_by_length_sql).show()

# Computing Mean Efficiency by Individual and Joining to get Names

In [None]:
# Read in names: the schema pairs an integer `id` with a string `name`.
# Fixed: the id field's type was misspelled `IntegeraType`, which raised a
# NameError; the class exported by pyspark.sql.types is `IntegerType`.
schema = StructType([StructField("id", IntegerType()),
                     StructField("name", StringType())])
individuals_sdf = spark.read.schema(schema).option('header', 'true').csv('data/individual-names.csv')

In [None]:
# Attach each individual's name, then rank individuals by mean efficiency
# and print the top ten.
(
    chopsticks_sdf
    .join(individuals_sdf, chopsticks_sdf.individual == individuals_sdf.id)
    .groupBy('individual', 'name')
    .mean('efficiency')
    .orderBy('avg(efficiency)', ascending=False)
    .select('name', 'avg(efficiency)')
    .show(n=10)
)

In [None]:
%%time
individuals_sdf.createOrReplaceTempView('names_view')
spark.sql("""SELECT name, AVG(efficiency) AS avg_efficiency
             FROM chopsticks_view
             JOIN names_view
             ON chopsticks_view.individual = names_view.id
             GROUP BY individual, name
             ORDER BY avg_efficiency DESC""").show(10)