In [1]:
# Check that SparkSession is available
spark

In [3]:
# Start H2OContext in internal backend
from pysparkling import *
hc = H2OContext.getOrCreate(spark)

Connecting to H2O server at http://172.16.2.17:54321... successful.


0,1
H2O cluster uptime:,07 secs
H2O cluster version:,3.16.0.1
H2O cluster version age:,6 days
H2O cluster name:,sparkling-water-kuba_local-1512088902004
H2O cluster total nodes:,1
H2O cluster free memory:,3.416 Gb
H2O cluster total cores:,8
H2O cluster allowed cores:,8
H2O cluster status:,"accepting new members, healthy"
H2O connection url:,http://172.16.2.17:54321



Sparkling Water Context:
 * H2O name: sparkling-water-kuba_local-1512088902004
 * cluster size: 1
 * list of used nodes:
  (executorId, host, port)
  ------------------------
  (driver,172.16.2.17,54321)
  ------------------------

  Open H2O Flow in browser: http://172.16.2.17:54321 (CMD + click in Mac OSX)

    


In [6]:
# Load 
import h2o
reviews_h2o = h2o.upload_file("Reviews.csv")
reviews_spark = hc.as_spark_frame(reviews_h2o)
schema = reviews_spark.schema



Parse progress: |█████████████████████████████████████████████████████████| 100%


In [10]:
# Drop unnecessary columns
from pyspark.ml.feature import SQLTransformer
colSelect = SQLTransformer(
    statement="SELECT Score, Time, Summary FROM __THIS__")

# Show slice of transformed data
selected = colSelect.transform(reviews_spark)
selected.take(10)

[Row(Score=5, Time=1303862400, Summary=u'Good Quality Dog Food'),
 Row(Score=1, Time=1346976000, Summary=u'Not as Advertised'),
 Row(Score=4, Time=1219017600, Summary=u'""Delight"" says it a'),
 Row(Score=2, Time=1307923200, Summary=u'Cough Medicine'),
 Row(Score=5, Time=1350777600, Summary=u'Great taffy'),
 Row(Score=4, Time=1342051200, Summary=u'Nice Taffy'),
 Row(Score=5, Time=1340150400, Summary=u'Great!  Just as good as the expensive brands!'),
 Row(Score=5, Time=1336003200, Summary=u'Wonderful, tasty taffy'),
 Row(Score=5, Time=1322006400, Summary=u'Yay Barley'),
 Row(Score=5, Time=1351209600, Summary=u'Healthy Dog Food')]

In [16]:
refineTime = SQLTransformer(
    statement="SELECT dayofweek(from_unixtime(Time) FROM __THIS__")

refined = refineTime.transform(selected)
refined.head()


Row(from_unixtime(CAST(Time AS BIGINT), yyyy-MM-dd HH:mm:ss)=u'2011-04-26 17:00:00')

In [6]:
# Convert data to spark Data Frame
df = hc.as_spark_frame(reviews)
# Show the schema
df.printSchema()

root
 |-- Id: integer (nullable = false)
 |-- ProductId: string (nullable = false)
 |-- UserId: string (nullable = false)
 |-- ProfileName: string (nullable = false)
 |-- HelpfulnessNumerator: short (nullable = false)
 |-- HelpfulnessDenominator: short (nullable = false)
 |-- Score: byte (nullable = false)
 |-- Time: integer (nullable = false)
 |-- Summary: string (nullable = false)
 |-- Text: string (nullable = false)
 |-- Day: byte (nullable = false)
 |-- Month: byte (nullable = false)
 |-- Year: short (nullable = false)
 |-- WeekNum: byte (nullable = false)
 |-- WeekDay: string (nullable = false)
 |-- HourOfDay: byte (nullable = false)
 |-- Weekend: byte (nullable = false)
 |-- Season: string (nullable = false)



In [7]:
from pyspark.sql.functions import *
# Calculate average score per Year
avg_score_per_year = hc.as_h2o_frame(df.groupBy("Year").agg(mean("Score"), count("Score")), "avgScorePerYear")

In [8]:
# Calculate average score per Month
avg_score_per_month = hc.as_h2o_frame(df.groupBy("Month").agg(mean("Score"), count("Score")), "avgScorePerMonth")

In [9]:
# Calculate average score per Day
avg_score_per_month = hc.as_h2o_frame(df.groupBy("WeekDay").agg(mean("Score"), count("Score")), "avgScorePerDay")

In [11]:
# Calculate input data frame for sentiment analysis
sentimentDF = hc.as_spark_frame(reviews[["Score", "Month", "Day", "WeekDay", "Summary"]])

In [None]:
from pyspark.sql.types import FloatType
from pyspark.sql.functions import col, udf
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, RegexTokenizer, StopWordsRemover, IDF, HashingTF


# Prepare data for the pipeline
toFloat = udf(lambda score: float(score), FloatType())
sentimentDF.where("Score != 3").withColumn("Score", toFloat(col("Score")))

# Define the pipeline stages
regexTokenizer = RegexTokenizer(inputCol="Summary",
                                outputCol="TokenizedSummary",
                                pattern="[, ]",
                                toLowercase=True)

stopWordsRemover = StopWordsRemover(inputCol=regexTokenizer.getOutputCol(),
                                    outputCol="CleanedSummary",
                                    caseSensitive=False)
## Hash the words
hashingTF = HashingTF(inputCol=stopWordsRemover.getOutputCol(),
                      outputCol="wordToIndex",
                      numFeatures=1 << 10)


## Create inverse document frequencies model
idf = IDF(inputCol=hashingTF.getOutputCol(),
          outputCol="tf_idf",
          minDocFreq=1)

from pysparkling.ml import ColumnPruner, H2OGBM

## Create GBM model
gbm = H2OGBM(ratio=0.8,
             featuresCols=[idf.getOutputCol()],
             predictionCol="Summary")

## Remove all helper columns
colPruner = ColumnPruner(columns=[idf.getOutputCol(), hashingTF.getOutputCol(), stopWordsRemover.getOutputCol(), regexTokenizer.getOutputCol()])

##  Create the pipeline by defining all the stages
pipeline = Pipeline(stages=[regexTokenizer, stopWordsRemover, hashingTF, idf, gbm, colPruner])

## Train the pipeline model
model = pipeline.fit(sentimentDF)

model


In [45]:
model.transform(sentimentDF).take(10)


[Row(Score=5, Month=1, Day=16, WeekDay=u'Fri', Summary=u'Good Quality Dog Food', value=4.054493898772716),
 Row(Score=1, Month=1, Day=16, WeekDay=u'Fri', Summary=u'Not as Advertised', value=4.054493898772716),
 Row(Score=4, Month=1, Day=15, WeekDay=u'Thu', Summary=u'""Delight"" says it a', value=4.054493898772716),
 Row(Score=2, Month=1, Day=16, WeekDay=u'Fri', Summary=u'Cough Medicine', value=4.054493898772716),
 Row(Score=5, Month=1, Day=16, WeekDay=u'Fri', Summary=u'Great taffy', value=4.613358328249387),
 Row(Score=4, Month=1, Day=16, WeekDay=u'Fri', Summary=u'Nice Taffy', value=4.054493898772716),
 Row(Score=5, Month=1, Day=16, WeekDay=u'Fri', Summary=u'Great!  Just as good as the expensive brands!', value=4.18077098095041),
 Row(Score=5, Month=1, Day=16, WeekDay=u'Fri', Summary=u'Wonderful, tasty taffy', value=4.502975996597751),
 Row(Score=5, Month=1, Day=16, WeekDay=u'Fri', Summary=u'Yay Barley', value=4.054493898772716),
 Row(Score=5, Month=1, Day=16, WeekDay=u'Fri', Summary=u

In [43]:
sentimentDF.head()

Row(Score=5, Month=1, Day=16, WeekDay=u'Fri', Summary=u'Good Quality Dog Food')