### Test to Compare & Contrast RDD & Dataframe

In [1]:
SPARK_MASTER = 'local[*]'
APPLICATION_NAME = 'explore'
PYTHON_2_LOCATION = "/usr/bin/python2.7"
DATAFILE = r'/home/jovyan/work/data/hamlet.txt'

In [2]:
import os
from pyspark.sql import SparkSession
from pyspark.sql.types import *

# Explicitly define python 2 since we have both versions 2 & 3 installed
os.environ["PYSPARK_PYTHON"]=PYTHON_2_LOCATION

# Create spark context with which we will reference the Spark API
spark = (SparkSession
         .builder
         .master(SPARK_MASTER)
         .appName(APPLICATION_NAME)
         .getOrCreate())

spark.version

u'2.0.2'

In [3]:
# Word Count Methods 

import re
import json
from pyspark.sql.functions import regexp_replace, trim, col, lower, split,explode, desc

def removePunctuationCol(aStr): 
    return trim(lower(regexp_replace(regexp_replace(aStr, '\t', ' '), '[^ 0-9a-zA-Z]', '')))


def wordCount_DF(datafile):

    raw_df = spark.read.text(datafile)
    words = (raw_df
             .select(removePunctuationCol(col('value')).alias('value'))
             .filter(col('value') != '')
             .select(split(col('value'), ' ').alias('value'))
             .select(explode('value').alias('word')).filter(col('word') != '')
             .groupby('word').count()
             .sort(desc('count'))
            )
    return str(words.toJSON().collect())



def wordCount_RDD(datafile):

    words_rdd = spark.sparkContext.textFile(datafile)
    wordCount = (words_rdd
                 .map(lambda line: re.sub("[\t]", ' ', line))
                 .flatMap(lambda line: line.split(' '))
                 .filter(lambda w: w != '')
                 .map(lambda w: re.sub('[^a-z| |\'|0-9]', '', w.strip().lower()))
                 .map(lambda word: (word, 1))
                 .reduceByKey(lambda a, b: a+b)
                 .sortBy(lambda a: a[1],0)
                 .map(lambda (a, b): {'word': a, 'count': str(b)})
                )

    return json.dumps(wordCount.collect())



print 'Method definition complete'

Method definition complete


In [5]:
print '\n DF '
%time wc = wordCount_DF(DATAFILE)   
print wc[:200] 

print '\n RDD '
%time wc = wordCount_RDD(DATAFILE) 
print wc[:200] 




 DF 
CPU times: user 108 ms, sys: 16 ms, total: 124 ms
Wall time: 2.64 s
[u'{"word":"the","count":929}', u'{"word":"and","count":842}', u'{"word":"to","count":629}', u'{"word":"of","count":562}', u'{"word":"you","count":488}', u'{"word":"i","count":463}', u'{"word":"my","c

 RDD 
CPU times: user 76 ms, sys: 0 ns, total: 76 ms
Wall time: 694 ms
[{"count": "924", "word": "the"}, {"count": "841", "word": "and"}, {"count": "628", "word": "to"}, {"count": "562", "word": "of"}, {"count": "488", "word": "you"}, {"count": "442", "word": "i"}, {"cou
