In [2]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("lab-ml").getOrCreate()

In [3]:
spark

In [4]:
from pyspark.sql.functions import UserDefinedFunction
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorIndexer
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import functions
from pyspark.ml.classification import LogisticRegression
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.ml.feature import RFormula
import matplotlib.pyplot as plt
import numpy as np
import datetime

## Import natural language processing toolkit 
import re
import nltk
import string
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from pyspark.sql.functions import udf


In [5]:
path = "s3://bailey-bucket-dtb/user_dedup.json"
reviews = spark.read.json(path)

In [6]:
reviews.printSchema()

root
 |-- asin: string (nullable = true)
 |-- helpful: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- overall: double (nullable = true)
 |-- reviewText: string (nullable = true)
 |-- reviewTime: string (nullable = true)
 |-- reviewerID: string (nullable = true)
 |-- reviewerName: string (nullable = true)
 |-- summary: string (nullable = true)
 |-- unixReviewTime: long (nullable = true)



In [7]:
reviews.groupBy('overall').agg(count('reviewerID').alias('count')).sort('overall').show()

+-------+--------+
|overall|   count|
+-------+--------+
|    1.0| 6712117|
|    2.0| 4265230|
|    3.0| 7049302|
|    4.0|15480820|
|    5.0|49169670|
+-------+--------+



In [8]:
sample_rev = reviews.sample(False, .001, 12345)

In [9]:
# remove punctuation
def remove_punct(text):
    regex = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]')
    nopunct = regex.sub(" ", text)
    return nopunct

# binarize rating
def convert_rating(rating):
    rating = int(rating)
    if rating >=4: return 1
    else: return 0

# udf
punct_remover = udf(lambda x: remove_punct(x))
rating_convert = udf(lambda x: convert_rating(x))

# apply to review raw data
review_df = sample_rev.select('asin', 'helpful', 'overall', 'reviewerID', punct_remover('reviewText'), rating_convert('overall'), 'reviewTime', 'summary', 'unixReviewTime') #, rating_convert('overall')

# review_df = review_df.withColumnRenamed('<lambda>(reviewText)', 'reviewText')\
#                      .withColumn('label', review_df["<lambda>(overall)"].cast(IntegerType()))\
#                      .drop('<lambda>(overall)')\
#                      .limit(1000000)

review_df.show(5)

+----------+-------+-------+--------------------+--------------------+-----------------+-----------+--------------------+--------------+
|      asin|helpful|overall|          reviewerID|<lambda>(reviewText)|<lambda>(overall)| reviewTime|             summary|unixReviewTime|
+----------+-------+-------+--------------------+--------------------+-----------------+-----------+--------------------+--------------+
|B003EO1H7E| [0, 0]|    5.0|A000187635I595IAV...|I am a newbie at ...|                1|01 26, 2013|Was asked for my ...|    1359158400|
|B00EBQRSTK| [0, 0]|    4.0|A00316981NM2QRXZ3...|It is entertainin...|                1|09 28, 2013|        entertaining|    1380326400|
|0399159606| [0, 0]|    5.0|A00338282E99B8OR2...|These series are ...|                1|07 24, 2013|     Wonderful Book!|    1374624000|
|1439876363| [0, 1]|    5.0|A00418961HZF1HI8M...|I would give   st...|                1|02 15, 2014| Meet my expectation|    1392422400|
|B008E6ZXA4| [0, 0]|    5.0|A0057832QE3XH

In [10]:
review_df.printSchema()

root
 |-- asin: string (nullable = true)
 |-- helpful: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- overall: double (nullable = true)
 |-- reviewerID: string (nullable = true)
 |-- <lambda>(reviewText): string (nullable = true)
 |-- <lambda>(overall): string (nullable = true)
 |-- reviewTime: string (nullable = true)
 |-- summary: string (nullable = true)
 |-- unixReviewTime: long (nullable = true)



In [11]:
review_df.groupBy('overall').agg(count('reviewerID').alias('count')).sort('overall').show()

+-------+-----+
|overall|count|
+-------+-----+
|    1.0| 6580|
|    2.0| 4273|
|    3.0| 6973|
|    4.0|15381|
|    5.0|48984|
+-------+-----+



In [56]:
# from pyspark.ml.feature import * 
# from nltk.stem.porter import *
# from nltk.tokenize import word_tokenize, sent_tokenize
# from nltk.corpus import stopwords

# # tokenize
# tok = Tokenizer(inputCol="<lambda>(reviewText)", outputCol="words")
# review_tokenized = tok.transform(review_df)

# # remove stop words
# stopword_rm = StopWordsRemover(inputCol='words', outputCol='words_nsw')
# review_tokenized = stopword_rm.transform(review_tokenized)

# review_tokenized.select('<lambda>(reviewText)', 'words', 'words_nsw').show(5)

In [41]:
# # add ngram column
# n = 3
# ngram = NGram(inputCol = 'words', outputCol = 'ngram', n = n)
# add_ngram = ngram.transform(review_tokenized)

# # generate the top frequent ngram
# ngrams = add_ngram.rdd.flatMap(lambda x: x[-1]).filter(lambda x: len(x.split())==n)
# ngram_tally = ngrams.map(lambda x: (x, 1))\
#                       .reduceByKey(lambda x,y: x+y)\
#                       .sortBy(lambda x: x[1], ascending=False)\
#                       .filter(lambda x: x[1]>=20)
# ngram_list = ngram_tally.map(lambda x: x[0]).collect()

In [55]:
# # replace the word with selected ngram
# def ngram_concat(text):
#     text1 = text.lower()
#     for ngram in ngram_list:
#         return text1.replace(ngram, ngram.replace(' ', '_'))

# ngram_df = udf(lambda x: ngram_concat(x))
# # ngram_df = review_tokenized.select(ngram_df('text'), 'label')\
# #                           .withColumnRenamed('<lambda>(text)', 'text')

# # tokenize and remove stop words with ngram
# tok = Tokenizer(inputCol="<lambda>(reviewText)", outputCol="words")
# review_tokenized = tok.transform(review_df)
# tokenized_ngram = tok.transform(ngram_df)
# tokenized_ngram = stopword_rm.transform(tokenized_ngram)

# stopword_rm = StopWordsRemover(inputCol='words', outputCol='words_nsw')
# review_tokenized = stopword_rm.transform(review_tokenized)

# # count vectorizer and tfidf
# cv = CountVectorizer(inputCol='words_nsw', outputCol='tf')
# cvModel = cv.fit(review_tokenized)
# count_vectorized = cvModel.transform(review_tokenized)

# tfidfModel = idf.fit(count_vectorized)
# tfidf_df = tfidfModel.transform(count_vectorized)

In [12]:
from pyspark.sql.functions import col, expr, udf, trim
from pyspark.sql.types import IntegerType
import re

remove_punctuation = udf(lambda line: re.sub('[^A-Za-z\s]', '', line))
make_binary = udf(lambda rating: 0 if rating in [1, 2] else 1, IntegerType())

reviews = (sample_rev
    .filter(col('overall').isin([1, 2, 3, 4, 5]))
    .withColumn('label', make_binary(col('overall')))
    .select(col('label').cast('int'), remove_punctuation('summary').alias('summary'))
    .filter(trim(col('summary')) != ''))

In [13]:
train, test = reviews.randomSplit([.8, .2], seed=123)

In [14]:
def multiply_dataset(dataset, n):
    return dataset if n <= 1 else dataset.union(multiply_dataset(dataset, n - 1))

reviews_good = train.filter('label == 1')
reviews_bad = train.filter('label == 0')

reviews_bad_multiplied = multiply_dataset(reviews_bad, reviews_good.count() / reviews_bad.count())


train_reviews = reviews_bad_multiplied.union(reviews_good)

In [15]:
accuracy = reviews_good.count() / float(train_reviews.count())
print('Always predicting 5 stars accuracy: {0}'.format(accuracy))

Always predicting 5 stars accuracy: 0.4851215548124133


In [17]:
from pyspark.ml.feature import Tokenizer, HashingTF, IDF, StopWordsRemover
from pyspark.ml.pipeline import Pipeline
from pyspark.ml.classification import LogisticRegression

tokenizer = Tokenizer(inputCol='summary', outputCol='words')

pipeline = Pipeline(stages=[
    tokenizer, 
    StopWordsRemover(inputCol='words', outputCol='filtered_words'),
    HashingTF(inputCol='filtered_words', outputCol='rawFeatures', numFeatures=120000),
    IDF(inputCol='rawFeatures', outputCol='features'),
    LogisticRegression(regParam=.3, elasticNetParam=.01)
])

In [18]:
model = pipeline.fit(train_reviews)

KeyboardInterrupt: 

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

prediction = model.transform(test)
BinaryClassificationEvaluator().evaluate(prediction)

In [1]:
spark.stop()

NameError: name 'spark' is not defined