## Figuring out Spark 2.0

aka the *Decimal Project*

### Notes/Discoveries



In [1]:
##### CELL A #####

# Basic setup required in all notebooks
 
appName='archetest'
master='local[*]' #local spark-master

# Explicitly define python 2 since we have both 2 & 3 installed
import os
os.environ['PYSPARK_PYTHON'] = '/opt/conda/envs/python2/bin/python'

# Establish spark context or session
#from pyspark import SparkContext, SparkConf

#conf = SparkConf().setAppName(appName).setMaster(master)
#sc = SparkContext(conf=conf)


# A SparkSession can be created using a builder pattern
from pyspark.sql import SparkSession
spark = (SparkSession
         .builder
         .master(master)
         .appName(appName)
         .getOrCreate())

In [2]:
# do something to prove it works

# Old way
#rdd = sc.parallelize(range(1000))
#rdd.takeSample(False, 5)

# Spark 2
# ok- rdd's are still available, but the spark dataframe/set is now the preferred data structure, 
# albeit a bit more complex.  For example, you don't create a dataframe or rdd from a random  
# list, you create a range that is ALREADY a dataframe.
r = spark.range(10) #creates a dataframe/dataset
print type(r),r
r.sample(False,.5)
r.take(5)

<class 'pyspark.sql.dataframe.DataFrame'> DataFrame[id: bigint]


[Row(id=0), Row(id=1), Row(id=2), Row(id=3), Row(id=4)]

In [3]:
# Handy methods

def prSomeDS(inputDS,howMany,msg=None):
    """
    Debugging shorthand

    Args: A Spark 2 DataSet, how many items to print, and optional message

    Returns: None
        
    """
    print msg
    #inputDS.take(howMany).foreach(println)
    for x in inputDS.take(howMany):
        print x    


In [4]:
#stop here  # halt execution with an error

## File Access

In [5]:
##### CELL B #####

# read local file into RDD
datafile_local = "/home/jovyan/work/fotd.csv"
#d = sc.textFile(datafile_local)  #old way
d = spark.read.text(datafile_local) # spark2
#spark2 type is <class 'pyspark.sql.dataframe.DataFrame'>
print type(d)
d.take(2)

<class 'pyspark.sql.dataframe.DataFrame'>


[Row(value=u'sheet,Date,Event,Lesson,attendees_reg,attendees_student,attendees_special,attendees_board,addendees_free,addendees_coupon,addendees_comp,fee_reg,fee_student,fee_special,fee_board,donations,caller,band'),
 Row(value=u'4/16/2016,4/16/2016,Contra Dance Financial Worksheet,-1,37,11,0,4,6,0,6,$9.00,$5.00,$8.00,$9.00,$37.00,Patricia Dancen,Steam')]

In [6]:
##### CELL C #####

# Define the datafile and establish HDFS access, spark 2

datafile_HDFS = r'hdfs://172.18.0.2:9000/user/root/testData/fotd.csv'
d = spark.read.text(datafile_HDFS)
print type(d)
prSomeDS(d,2,'fotd sample from hdfs store')


<class 'pyspark.sql.dataframe.DataFrame'>
fotd sample from hdfs store
Row(value=u'sheet,Date,Event,Lesson,attendees_reg,attendees_student,attendees_special,attendees_board,addendees_free,addendees_coupon,addendees_comp,fee_reg,fee_student,fee_special,fee_board,donations,caller,band')
Row(value=u'4/16/2016,4/16/2016,Contra Dance Financial Worksheet,-1,37,11,0,4,6,0,6,$9.00,$5.00,$8.00,$9.00,$37.00,Patricia Dancen,Steam')


In [7]:

d = spark.read.csv(datafile_HDFS, header = "true")
print type(d)
d.printSchema()

<class 'pyspark.sql.dataframe.DataFrame'>
root
 |-- sheet: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Event: string (nullable = true)
 |-- Lesson: string (nullable = true)
 |-- attendees_reg: string (nullable = true)
 |-- attendees_student: string (nullable = true)
 |-- attendees_special: string (nullable = true)
 |-- attendees_board: string (nullable = true)
 |-- addendees_free: string (nullable = true)
 |-- addendees_coupon: string (nullable = true)
 |-- addendees_comp: string (nullable = true)
 |-- fee_reg: string (nullable = true)
 |-- fee_student: string (nullable = true)
 |-- fee_special: string (nullable = true)
 |-- fee_board: string (nullable = true)
 |-- donations: string (nullable = true)
 |-- caller: string (nullable = true)
 |-- band: string (nullable = true)



In [38]:
from pyspark.sql.functions import regexp_replace, trim, col, lower
def returnCurrency(column):
    """Returns only legal us currency values from input column

    Note:

    Args:
        column (Column): A Column containing a string.

    Returns:
        Column: A Column 
    """

    return (regexp_replace(column, '^([0-9]*\.[0-9][0-9])', ''))

In [39]:

ds = d.select(d.sheet, d.Date, d.attendees_reg.cast("double"), returnCurrency(d.fee_reg).alias("nip")) #cast("decimal(10,2)"))
ds.printSchema()
prSomeDS(ds,3,"transformed data")
prSomeDS(d,3,"original data")

root
 |-- sheet: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- attendees_reg: double (nullable = true)
 |-- nip: string (nullable = true)

transformed data
Row(sheet=u'4/16/2016', Date=u'4/16/2016', attendees_reg=37.0, nip=u'$9.00')
Row(sheet=u'4/2/2016', Date=u'4/2/2016', attendees_reg=41.0, nip=u'$9.00')
Row(sheet=u'3/19/2016', Date=u'3/19/2016', attendees_reg=56.0, nip=u'$9.00')
original data
Row(sheet=u'4/16/2016', Date=u'4/16/2016', Event=u'Contra Dance Financial Worksheet', Lesson=u'-1', attendees_reg=u'37', attendees_student=u'11', attendees_special=u'0', attendees_board=u'4', addendees_free=u'6', addendees_coupon=u'0', addendees_comp=u'6', fee_reg=u'$9.00', fee_student=u'$5.00', fee_special=u'$8.00', fee_board=u'$9.00', donations=u'$37.00', caller=u'Patricia Dancen', band=u'Steam')
Row(sheet=u'4/2/2016', Date=u'4/2/2016', Event=u'Contra Dance Financial Worksheet', Lesson=u'0', attendees_reg=u'41', attendees_student=u'6', attendees_special=u'0', attendees_boa

In [9]:
#once I have the data, do something with it
