In [1]:
# Check that SparkSession is available. We do not need to explicitly create SparkSession as it is created for us
# automatically during start of the Jupiter notebook. This works because of the Jupiter is set up with Spark kernel.
spark

In [2]:
# Start H2OContext in the internal backend.

# This call initializes H2O on each Spark executors in the Spark cluster.
from pysparkling import *
hc = H2OContext.getOrCreate(spark)

Connecting to H2O server at http://172.16.2.43:54323... successful.


0,1
H2O cluster uptime:,11 secs
H2O cluster version:,3.16.0.2
H2O cluster version age:,23 hours and 40 minutes
H2O cluster name:,sparkling-water-kuba_local-1512169151902
H2O cluster total nodes:,1
H2O cluster free memory:,6.975 Gb
H2O cluster total cores:,8
H2O cluster allowed cores:,8
H2O cluster status:,"accepting new members, healthy"
H2O connection url:,http://172.16.2.43:54323



Sparkling Water Context:
 * H2O name: sparkling-water-kuba_local-1512169151902
 * cluster size: 1
 * list of used nodes:
  (executorId, host, port)
  ------------------------
  (driver,172.16.2.43,54323)
  ------------------------

  Open H2O Flow in browser: http://172.16.2.43:54323 (CMD + click in Mac OSX)

    


In [3]:
# Load data using H2O because we belive in our CSV parser correctly identifies the schema
import h2o
reviews_h2o = h2o.upload_file("Reviews.csv")

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [4]:
# Convert to Spark frame for the input to the pipeline
reviews_spark = hc.as_spark_frame(reviews_h2o)

In [5]:
# Save the original spark schema. We will reuse this schema during online streaming for each row on the input. 
# This can be handy as during data streaming, it can be hard to correctly identify the type of the data on the input 
# because it is just a few rows. This allows us to use the schema from the training time during the production time to
# maximize the best results

reviews_spark.printSchema()

f = open('schema.json','w')
f.write(str(reviews_spark.schema.json()))
f.close()

root
 |-- Id: integer (nullable = false)
 |-- ProductId: string (nullable = false)
 |-- UserId: string (nullable = false)
 |-- ProfileName: string (nullable = false)
 |-- HelpfulnessNumerator: short (nullable = false)
 |-- HelpfulnessDenominator: short (nullable = false)
 |-- Score: byte (nullable = false)
 |-- Time: integer (nullable = false)
 |-- Summary: string (nullable = false)
 |-- Text: string (nullable = false)



In [6]:
# Now we define all the stages for the pipeline
# The pipeline stages are not executed right away, the are executed during each fit and transform call.

# Define transformer to drop unnecessary columns
# Here, we use SQLTransformer which allows us to transform data using Spark SQL.

# As part of this transformer, we convert timestamp to the human readable date string
from pyspark.ml.feature import SQLTransformer
colSelect = SQLTransformer(
    statement="SELECT Score, from_unixtime(Time) as Time, Summary FROM __THIS__")

# The pipeline transformers are automatically invoked during the pipeline execution, however we can also call them
# directly just to see the intermediate results
selected = colSelect.transform(reviews_spark)
selected.take(10)

[Row(Score=5, Time=u'2011-04-26 17:00:00', Summary=u'Good Quality Dog Food'),
 Row(Score=1, Time=u'2012-09-06 17:00:00', Summary=u'Not as Advertised'),
 Row(Score=4, Time=u'2008-08-17 17:00:00', Summary=u'""Delight"" says it a'),
 Row(Score=2, Time=u'2011-06-12 17:00:00', Summary=u'Cough Medicine'),
 Row(Score=5, Time=u'2012-10-20 17:00:00', Summary=u'Great taffy'),
 Row(Score=4, Time=u'2012-07-11 17:00:00', Summary=u'Nice Taffy'),
 Row(Score=5, Time=u'2012-06-19 17:00:00', Summary=u'Great!  Just as good as the expensive brands!'),
 Row(Score=5, Time=u'2012-05-02 17:00:00', Summary=u'Wonderful, tasty taffy'),
 Row(Score=5, Time=u'2011-11-22 16:00:00', Summary=u'Yay Barley'),
 Row(Score=5, Time=u'2012-10-25 17:00:00', Summary=u'Healthy Dog Food')]

In [7]:
# Create transformer which creates several time columns based on the Time column
refineTime = SQLTransformer(
    statement="""
    SELECT  Score,
            Summary, 
            dayofmonth(Time) as Day, 
            month(Time) as Month, year(Time) as Year, 
            weekofyear(Time) as WeekNum, 
            date_format(Time, 'EEE') as WeekDay, 
            hour(Time) as HourOfDay, 
            IF(date_format(Time, 'EEE')='Sat' OR date_format(Time, 'EEE')='Sun', 1, 0) as Weekend, 
            CASE 
                WHEN month(TIME)=12 OR month(Time)<=2 THEN 'Winter' 
                WHEN month(TIME)>=3 OR month(Time)<=5 THEN 'Spring' 
                WHEN month(TIME)>=6 AND month(Time)<=9 THEN 'Summer' 
                ELSE 'Autumn' END as Seasson 
    FROM __THIS__""")

# Just inspect the data after 
refined = refineTime.transform(selected)
print(refined.head())
print("")
# Show the schema
refined.printSchema()

Row(Score=5, Summary=u'Good Quality Dog Food', Day=26, Month=4, Year=2011, WeekNum=17, WeekDay=u'Tue', HourOfDay=17, Weekend=0, Seasson=u'Spring')

root
 |-- Score: byte (nullable = false)
 |-- Summary: string (nullable = false)
 |-- Day: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- Year: integer (nullable = true)
 |-- WeekNum: integer (nullable = true)
 |-- WeekDay: string (nullable = true)
 |-- HourOfDay: integer (nullable = true)
 |-- Weekend: integer (nullable = false)
 |-- Seasson: string (nullable = false)



In [8]:
# Remove neutral reviews and classify the Scores
from pyspark.sql.types import FloatType
from pyspark.sql.functions import col, udf
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, RegexTokenizer, StopWordsRemover, IDF, CountVectorizer

filterScore = SQLTransformer(
    statement="""
    SELECT  IF(Score<3,'NEGATIVE', 'POSITIVE') as Sentiment, Summary, Day, Month, Year,
            WeekNum, WeekDay, HourOfDay, Weekend, Seasson 
    FROM __THIS__ WHERE Score !=3 """)

# Inspect the data
filtered = filterScore.transform(refined)
filtered.head()

Row(Sentiment=u'POSITIVE', Summary=u'Good Quality Dog Food', Day=26, Month=4, Year=2011, WeekNum=17, WeekDay=u'Tue', HourOfDay=17, Weekend=0, Seasson=u'Spring')

In [9]:
# Tokenize the message
regexTokenizer = RegexTokenizer(inputCol="Summary",
                                outputCol="TokenizedSummary",
                                pattern="[, ]",
                                toLowercase=True)

# Inspect the data
tokenized = regexTokenizer.transform(filtered)
tokenized.head()

Row(Sentiment=u'POSITIVE', Summary=u'Good Quality Dog Food', Day=26, Month=4, Year=2011, WeekNum=17, WeekDay=u'Tue', HourOfDay=17, Weekend=0, Seasson=u'Spring', TokenizedSummary=[u'good', u'quality', u'dog', u'food'])

In [10]:
# Remove unnecessary words
stopWordsRemover = StopWordsRemover(inputCol=regexTokenizer.getOutputCol(),
                                    outputCol="CleanedSummary",
                                    caseSensitive=False)

# Inspect the data
stopWordsRemoved = stopWordsRemover.transform(tokenized)
stopWordsRemoved.head()

Row(Sentiment=u'POSITIVE', Summary=u'Good Quality Dog Food', Day=26, Month=4, Year=2011, WeekNum=17, WeekDay=u'Tue', HourOfDay=17, Weekend=0, Seasson=u'Spring', TokenizedSummary=[u'good', u'quality', u'dog', u'food'], CleanedSummary=[u'good', u'quality', u'dog', u'food'])

In [11]:
# Hash the words
countVectorizer = CountVectorizer(inputCol=stopWordsRemover.getOutputCol(),
                      outputCol="wordToIndex")

# Manually train the count vectorizer
countVecModel = countVectorizer.fit(stopWordsRemoved)
# See the vocabulary
print("Vocabulary size is " + str(len(countVecModel.vocabulary)))

print(countVecModel.vocabulary[:10])
# Inspect the data
vectorized = countVecModel.transform(stopWordsRemoved)
vectorized.take(5)

Vocabulary size is 68859
[u'great', u'good', u'best', u'love', u'coffee', u'tea', u'product', u'taste', u'delicious', u'excellent']


[Row(Sentiment=u'POSITIVE', Summary=u'Good Quality Dog Food', Day=26, Month=4, Year=2011, WeekNum=17, WeekDay=u'Tue', HourOfDay=17, Weekend=0, Seasson=u'Spring', TokenizedSummary=[u'good', u'quality', u'dog', u'food'], CleanedSummary=[u'good', u'quality', u'dog', u'food'], wordToIndex=SparseVector(68859, {1: 1.0, 10: 1.0, 12: 1.0, 35: 1.0})),
 Row(Sentiment=u'NEGATIVE', Summary=u'Not as Advertised', Day=6, Month=9, Year=2012, WeekNum=36, WeekDay=u'Thu', HourOfDay=17, Weekend=0, Seasson=u'Spring', TokenizedSummary=[u'not', u'as', u'advertised'], CleanedSummary=[u'advertised'], wordToIndex=SparseVector(68859, {605: 1.0})),
 Row(Sentiment=u'POSITIVE', Summary=u'""Delight"" says it a', Day=17, Month=8, Year=2008, WeekNum=33, WeekDay=u'Sun', HourOfDay=17, Weekend=1, Seasson=u'Spring', TokenizedSummary=[u'""delight""', u'says', u'it', u'a'], CleanedSummary=[u'""delight""', u'says'], wordToIndex=SparseVector(68859, {422: 1.0, 42239: 1.0})),
 Row(Sentiment=u'NEGATIVE', Summary=u'Cough Medicine

In [12]:
# Create inverse document frequencies model
idf = IDF(inputCol=countVectorizer.getOutputCol(),
          outputCol="tf_idf",
          minDocFreq=1)

In [13]:
from pysparkling.ml import ColumnPruner, H2OGBM

# Create GBM model
gbm = H2OGBM(ratio=0.8,
             featuresCols=[idf.getOutputCol()],
             predictionCol="Sentiment")

In [14]:
# Remove all helper columns
colPruner = ColumnPruner(columns=[
        idf.getOutputCol(),
        stopWordsRemover.getOutputCol(),
        regexTokenizer.getOutputCol()])

In [None]:
#  Create the pipeline by defining all the stages
pipeline = Pipeline(stages=[colSelect,
                            refineTime,
                            filterScore,
                            regexTokenizer,
                            stopWordsRemover,
                            countVectorizer,
                            idf,
                            gbm,
                            colPruner])

In [None]:
# Train the pipeline model
model = pipeline.fit(reviews_spark)

In [None]:
# Try predicting on the same input data
model.transform(reviews_spark).take(10)


In [None]:
# Save the pipeline model
model.write().overwrite().save("Pipeline.model")