In [1]:
# Check that SparkSession is available
spark

In [2]:
# Start H2OContext in internal backend
from pysparkling import *
hc = H2OContext.getOrCreate(spark)

Connecting to H2O server at http://172.16.2.17:54321... successful.


0,1
H2O cluster uptime:,07 secs
H2O cluster version:,3.16.0.1
H2O cluster version age:,6 days
H2O cluster name:,sparkling-water-kuba_local-1512095833528
H2O cluster total nodes:,1
H2O cluster free memory:,3.416 Gb
H2O cluster total cores:,8
H2O cluster allowed cores:,8
H2O cluster status:,"accepting new members, healthy"
H2O connection url:,http://172.16.2.17:54321



Sparkling Water Context:
 * H2O name: sparkling-water-kuba_local-1512095833528
 * cluster size: 1
 * list of used nodes:
  (executorId, host, port)
  ------------------------
  (driver,172.16.2.17,54321)
  ------------------------

  Open H2O Flow in browser: http://172.16.2.17:54321 (CMD + click in Mac OSX)

    


In [3]:
# Load data using H2O because we belive in our CSV parser correctly identifies the schema
import h2o
reviews_h2o = h2o.upload_file("Reviews.csv")

Parse progress: |███████████████████████████████████████████████████████████| 100%


In [4]:
# Convert to Spark frame for the input to the pipeline
reviews_spark = hc.as_spark_frame(reviews_h2o)

In [5]:
# Save the origianl spark schema. We will reuse this schema during online streaming for each row on the input
reviews_spark.printSchema()

f = open('schema.txt','w')
f.write(str(reviews_spark.schema))
f.close()

root
 |-- Id: integer (nullable = false)
 |-- ProductId: string (nullable = false)
 |-- UserId: string (nullable = false)
 |-- ProfileName: string (nullable = false)
 |-- HelpfulnessNumerator: short (nullable = false)
 |-- HelpfulnessDenominator: short (nullable = false)
 |-- Score: byte (nullable = false)
 |-- Time: integer (nullable = false)
 |-- Summary: string (nullable = false)
 |-- Text: string (nullable = false)



In [6]:
# Now we define all the stages for the pipeline

# Drop unnecessary columns
from pyspark.ml.feature import SQLTransformer
colSelect = SQLTransformer(
    statement="SELECT Score, from_unixtime(Time) as Time, Summary FROM __THIS__")

# Show slice of transformed data
selected = colSelect.transform(reviews_spark)
selected.take(10)

[Row(Score=5, Time=u'2011-04-26 17:00:00', Summary=u'Good Quality Dog Food'),
 Row(Score=1, Time=u'2012-09-06 17:00:00', Summary=u'Not as Advertised'),
 Row(Score=4, Time=u'2008-08-17 17:00:00', Summary=u'""Delight"" says it a'),
 Row(Score=2, Time=u'2011-06-12 17:00:00', Summary=u'Cough Medicine'),
 Row(Score=5, Time=u'2012-10-20 17:00:00', Summary=u'Great taffy'),
 Row(Score=4, Time=u'2012-07-11 17:00:00', Summary=u'Nice Taffy'),
 Row(Score=5, Time=u'2012-06-19 17:00:00', Summary=u'Great!  Just as good as the expensive brands!'),
 Row(Score=5, Time=u'2012-05-02 17:00:00', Summary=u'Wonderful, tasty taffy'),
 Row(Score=5, Time=u'2011-11-22 16:00:00', Summary=u'Yay Barley'),
 Row(Score=5, Time=u'2012-10-25 17:00:00', Summary=u'Healthy Dog Food')]

In [7]:
# Create More human readable date columns
refineTime = SQLTransformer(
    statement="SELECT Score, Summary, dayofmonth(Time) as Day, month(Time) as Month, year(Time) as Year, weekofyear(Time) as WeekNum, date_format(Time, 'EEE') as WeekDay, hour(Time) as HourOfDay, IF(date_format(Time, 'EEE')='Sat' OR date_format(Time, 'EEE')='Sun', 1, 0) as Weekend, CASE WHEN month(TIME)=12 OR month(Time)<=2 THEN 'Winter' WHEN month(TIME)>=3 OR month(Time)<=5 THEN 'Spring' WHEN month(TIME)>=6 AND month(Time)<=9 THEN 'Summer' ELSE 'Autumn' END as Seasson FROM __THIS__")

refined = refineTime.transform(selected)
refined.head()



Row(Score=5, Summary=u'Good Quality Dog Food', Day=26, Month=4, Year=2011, WeekNum=17, WeekDay=u'Tue', HourOfDay=17, Weekend=0, Seasson=u'Spring')

In [8]:
# Show the schema
refined.printSchema()

root
 |-- Score: byte (nullable = false)
 |-- Summary: string (nullable = false)
 |-- Day: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- Year: integer (nullable = true)
 |-- WeekNum: integer (nullable = true)
 |-- WeekDay: string (nullable = true)
 |-- HourOfDay: integer (nullable = true)
 |-- Weekend: integer (nullable = false)
 |-- Seasson: string (nullable = false)



In [9]:
# Remove neutral reviews and classify the Scores
from pyspark.sql.types import FloatType
from pyspark.sql.functions import col, udf
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, RegexTokenizer, StopWordsRemover, IDF, CountVectorizer

filterScore = SQLTransformer(
    statement="SELECT IF(Score<3,'NEGATIVE', 'POSITIVE') as Sentiment, Summary, Day, Month, Year, WeekNum, WeekDay, HourOfDay, Weekend, Seasson FROM __THIS__ WHERE Score !=3 ")

filtered = filterScore.transform(refined)
filtered.take(10)

[Row(Sentiment=u'POSITIVE', Summary=u'Good Quality Dog Food', Day=26, Month=4, Year=2011, WeekNum=17, WeekDay=u'Tue', HourOfDay=17, Weekend=0, Seasson=u'Spring'),
 Row(Sentiment=u'NEGATIVE', Summary=u'Not as Advertised', Day=6, Month=9, Year=2012, WeekNum=36, WeekDay=u'Thu', HourOfDay=17, Weekend=0, Seasson=u'Spring'),
 Row(Sentiment=u'POSITIVE', Summary=u'""Delight"" says it a', Day=17, Month=8, Year=2008, WeekNum=33, WeekDay=u'Sun', HourOfDay=17, Weekend=1, Seasson=u'Spring'),
 Row(Sentiment=u'NEGATIVE', Summary=u'Cough Medicine', Day=12, Month=6, Year=2011, WeekNum=23, WeekDay=u'Sun', HourOfDay=17, Weekend=1, Seasson=u'Spring'),
 Row(Sentiment=u'POSITIVE', Summary=u'Great taffy', Day=20, Month=10, Year=2012, WeekNum=42, WeekDay=u'Sat', HourOfDay=17, Weekend=1, Seasson=u'Spring'),
 Row(Sentiment=u'POSITIVE', Summary=u'Nice Taffy', Day=11, Month=7, Year=2012, WeekNum=28, WeekDay=u'Wed', HourOfDay=17, Weekend=0, Seasson=u'Spring'),
 Row(Sentiment=u'POSITIVE', Summary=u'Great!  Just as 

In [10]:
# Tokenize the message
regexTokenizer = RegexTokenizer(inputCol="Sentiment",
                                outputCol="TokenizedSummary",
                                pattern="[, ]",
                                toLowercase=True)

In [11]:
# Remove unnecessary words
stopWordsRemover = StopWordsRemover(inputCol=regexTokenizer.getOutputCol(),
                                    outputCol="CleanedSummary",
                                    caseSensitive=False)

In [12]:
# Hash the words
countVectorizer = CountVectorizer(inputCol=stopWordsRemover.getOutputCol(),
                      outputCol="wordToIndex")

In [13]:
# Create inverse document frequencies model
idf = IDF(inputCol=countVectorizer.getOutputCol(),
          outputCol="tf_idf",
          minDocFreq=1)

In [14]:
from pysparkling.ml import ColumnPruner, H2OGBM

# Create GBM model
gbm = H2OGBM(ratio=0.8,
             featuresCols=[idf.getOutputCol()],
             predictionCol="Sentiment")

In [15]:
# Remove all helper columns
colPruner = ColumnPruner(columns=[idf.getOutputCol(), countVectorizer.getOutputCol(), stopWordsRemover.getOutputCol(), regexTokenizer.getOutputCol()])

In [16]:
#  Create the pipeline by defining all the stages
pipeline = Pipeline(stages=[colSelect, refineTime, filterScore, regexTokenizer, stopWordsRemover, countVectorizer, idf, gbm, colPruner])

In [17]:
# Train the pipeline model
model = pipeline.fit(reviews_spark)

In [18]:
# Try predicting on the same input data
model.transform(reviews_spark).take(10)


[Row(Sentiment=u'POSITIVE', Summary=u'Good Quality Dog Food', Day=26, Month=4, Year=2011, WeekNum=17, WeekDay=u'Tue', HourOfDay=17, Weekend=0, Seasson=u'Spring', NEGATIVE=0.0010524337424877572, POSITIVE=0.9989475662575122),
 Row(Sentiment=u'NEGATIVE', Summary=u'Not as Advertised', Day=6, Month=9, Year=2012, WeekNum=36, WeekDay=u'Thu', HourOfDay=17, Weekend=0, Seasson=u'Spring', NEGATIVE=0.994873070841184, POSITIVE=0.0051269291588159115),
 Row(Sentiment=u'POSITIVE', Summary=u'""Delight"" says it a', Day=17, Month=8, Year=2008, WeekNum=33, WeekDay=u'Sun', HourOfDay=17, Weekend=1, Seasson=u'Spring', NEGATIVE=0.0010524337424877572, POSITIVE=0.9989475662575122),
 Row(Sentiment=u'NEGATIVE', Summary=u'Cough Medicine', Day=12, Month=6, Year=2011, WeekNum=23, WeekDay=u'Sun', HourOfDay=17, Weekend=1, Seasson=u'Spring', NEGATIVE=0.994873070841184, POSITIVE=0.0051269291588159115),
 Row(Sentiment=u'POSITIVE', Summary=u'Great taffy', Day=20, Month=10, Year=2012, WeekNum=42, WeekDay=u'Sat', HourOfDay

In [19]:
# Save the pipeline model
model.save("Pipeline.model")