## Figuring out Spark 2.0

aka the *Decimal Project*

### Notes/Discoveries

* dates are a real headache

### References / REad

* http://www.agildata.com/apache-spark-rdd-vs-dataframe-vs-dataset/
* https://databricks.com/blog/2016/01/04/introducing-apache-spark-datasets.html
* http://blog.cloudera.com/blog/2015/07/how-to-do-data-quality-checks-using-apache-spark-dataframes/
* https://www.infoq.com/articles/apache-spark-sql
* http://blog.brakmic.com/data-science-for-losers-part-5-spark-dataframes/


In [1]:
##### CELL A #####

# Basic setup required in all notebooks
 
appName='archetest'
master='local[*]' #local spark-master

# Explicitly define python 2 since we have both 2 & 3 installed
import os
os.environ['PYSPARK_PYTHON'] = '/opt/conda/envs/python2/bin/python'

# Establish spark context or session
#from pyspark import SparkContext, SparkConf
#conf = SparkConf().setAppName(appName).setMaster(master)
#sc = SparkContext(conf=conf)


# A SparkSession can be created using a builder pattern
from pyspark.sql import SparkSession
spark = (SparkSession
         .builder
         .master(master)
         .appName(appName)
         .getOrCreate())

In [2]:
##### CELL A-2 #####

# do something to prove it works

# Old way
#rdd = sc.parallelize(range(1000))
#rdd.takeSample(False, 5)

# Spark 2
# ok- rdd's are still available, but the spark dataframe/set is now the preferred data structure, 
# albeit a bit more complex.  For example, you don't create a dataframe or rdd from a random  
# list, you create a range that is ALREADY a dataframe.

r = spark.range(10) #creates a dataframe/dataset
print type(r),r
r.sample(False,.5)
r.take(5)

<class 'pyspark.sql.dataframe.DataFrame'> DataFrame[id: bigint]


[Row(id=0), Row(id=1), Row(id=2), Row(id=3), Row(id=4)]

In [3]:
# Somewhat Handy methods

def prSomeDS(inputDS,howMany,msg=None):
    """
    Debugging shorthand

    Args: A Spark 2 DataSet, how many items to print, and optional message

    Returns: None
        
    """
    print msg
    #inputDS.take(howMany).foreach(println)  #doesn't work with all spark2 datasets
    for x in inputDS.take(howMany):
        print x    


In [4]:
#stop here  # halt execution with an error

## File Access

In [5]:
##### CELL B #####
# read local file into a dataset

datafile_local = "/home/jovyan/work/fotd.csv"

#dataRDD = sc.textFile(datafile_local)  #old way

dataDS = spark.read.text(datafile_local) # spark2
#spark2 type is <class 'pyspark.sql.dataframe.DataFrame'>
print type(dataDS)
dataDS.take(2)

<class 'pyspark.sql.dataframe.DataFrame'>


[Row(value=u'sheet,Date,Event,Lesson,attendees_reg,attendees_student,attendees_special,attendees_board,addendees_free,addendees_coupon,addendees_comp,fee_reg,fee_student,fee_special,fee_board,donations,caller,band'),
 Row(value=u'4/16/2016,4/16/2016,Contra Dance Financial Worksheet,-1,37,11,0,4,6,0,6,$9.00,$5.00,$8.00,$9.00,$37.00,Patricia Dancen,Steam')]

In [6]:
##### CELL C #####

# Define the datafile and establish HDFS access, spark 2

datafile_HDFS = r'hdfs://172.18.0.2:9000/user/root/testData/fotd.csv'
d = spark.read.text(datafile_HDFS)
print type(d)
prSomeDS(d,2,'fotd sample from hdfs store')


<class 'pyspark.sql.dataframe.DataFrame'>
fotd sample from hdfs store
Row(value=u'sheet,Date,Event,Lesson,attendees_reg,attendees_student,attendees_special,attendees_board,addendees_free,addendees_coupon,addendees_comp,fee_reg,fee_student,fee_special,fee_board,donations,caller,band')
Row(value=u'4/16/2016,4/16/2016,Contra Dance Financial Worksheet,-1,37,11,0,4,6,0,6,$9.00,$5.00,$8.00,$9.00,$37.00,Patricia Dancen,Steam')


In [7]:
# Read as default csv file
d = spark.read.csv(datafile_HDFS, header = "true")
print type(d)
d.printSchema()

<class 'pyspark.sql.dataframe.DataFrame'>
root
 |-- sheet: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Event: string (nullable = true)
 |-- Lesson: string (nullable = true)
 |-- attendees_reg: string (nullable = true)
 |-- attendees_student: string (nullable = true)
 |-- attendees_special: string (nullable = true)
 |-- attendees_board: string (nullable = true)
 |-- addendees_free: string (nullable = true)
 |-- addendees_coupon: string (nullable = true)
 |-- addendees_comp: string (nullable = true)
 |-- fee_reg: string (nullable = true)
 |-- fee_student: string (nullable = true)
 |-- fee_special: string (nullable = true)
 |-- fee_board: string (nullable = true)
 |-- donations: string (nullable = true)
 |-- caller: string (nullable = true)
 |-- band: string (nullable = true)



## Data Conversion
Get Data into a useable format

In [8]:
##### Cell Data Conversion-A #####

from pyspark.sql.functions import regexp_extract, regexp_replace, trim, col, lower, to_date

# extraction patterns
re_us_currency = r'(\d*\.\d\d)'
re_integer = r'(\d*)'
re_float = r'(\d*\.?\d*)'

In [9]:
##### Cell Data Conversion-B #####

# Transform into new dataset/dataframe with appropriate data types
ds = dataDS.select(to_date(regexp_replace(d.Date,r'/',r'-')).alias('newdate'),
                   lower(d.Event).alias('event'),
                   lower(d.caller).alias('caller'),
                   lower(d.band).alias('band'),
                   regexp_extract(d.Lesson,re_integer,1).alias('inlesson').cast("int"), 
                   regexp_extract(d.attendees_reg,re_integer,1).alias('attendees_reg').cast("int"), 
                   regexp_extract(d.attendees_student,re_integer,1).alias('attendees_student').cast("int"), 
                   regexp_extract(d.attendees_special,re_integer,1).alias('attendees_special').cast("int"), 
                   regexp_extract(d.attendees_board,re_integer,1).alias('attendees_board').cast("int"), 
                   regexp_extract(d.addendees_coupon,re_integer,1).alias('addendees_coupon').cast("int"), 
                   regexp_extract(d.addendees_coupon,re_integer,1).alias('addendees_coupon').cast("int"), 
                   regexp_extract(d.fee_reg,re_us_currency,1).alias("fee_reg").cast("decimal(10,2)"),
                   regexp_extract(d.fee_student,re_us_currency,1).alias("fee_student").cast("decimal(10,2)"),
                   regexp_extract(d.fee_special,re_us_currency,1).alias("fee_special").cast("decimal(10,2)"),
                   regexp_extract(d.fee_board,re_us_currency,1).alias("fee_board").cast("decimal(10,2)"),
                   regexp_extract(d.donations,re_us_currency,1).alias("donations").cast("decimal(10,2)")
                  )
              
ds.printSchema()
#prSomeDS(ds,3,"transformed data")
ds.show(3)


root
 |-- newdate: date (nullable = true)
 |-- event: string (nullable = true)
 |-- caller: string (nullable = true)
 |-- band: string (nullable = true)
 |-- inlesson: integer (nullable = true)
 |-- attendees_reg: integer (nullable = true)
 |-- attendees_student: integer (nullable = true)
 |-- attendees_special: integer (nullable = true)
 |-- attendees_board: integer (nullable = true)
 |-- addendees_coupon: integer (nullable = true)
 |-- addendees_coupon: integer (nullable = true)
 |-- fee_reg: decimal(10,2) (nullable = true)
 |-- fee_student: decimal(10,2) (nullable = true)
 |-- fee_special: decimal(10,2) (nullable = true)
 |-- fee_board: decimal(10,2) (nullable = true)
 |-- donations: decimal(10,2) (nullable = true)

+-------+--------------------+---------------+--------------------+--------+-------------+-----------------+-----------------+---------------+----------------+----------------+-------+-----------+-----------+---------+---------+
|newdate|               event|         cal

In [10]:
#once I have the data, do something with it


In [11]:
# what CAN I do with this spark session object?
dir(spark)

['Builder',
 '__class__',
 '__delattr__',
 '__dict__',
 '__doc__',
 '__enter__',
 '__exit__',
 '__format__',
 '__getattribute__',
 '__hash__',
 '__init__',
 '__module__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_conf',
 '_createFromLocal',
 '_createFromRDD',
 '_inferSchema',
 '_inferSchemaFromList',
 '_instantiatedContext',
 '_jsc',
 '_jsparkSession',
 '_jvm',
 '_jwrapped',
 '_sc',
 '_wrapped',
 'builder',
 'catalog',
 'conf',
 'createDataFrame',
 'newSession',
 'range',
 'read',
 'readStream',
 'sparkContext',
 'sql',
 'stop',
 'streams',
 'table',
 'udf',
 'version']