In [1]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("lab-ml").getOrCreate()

In [2]:
spark

In [14]:
from pyspark.sql.functions import UserDefinedFunction
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorIndexer
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import functions
from pyspark.ml.classification import LogisticRegression
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.ml.feature import RFormula
import matplotlib.pyplot as plt
import numpy as np
import datetime

## Import natural language processing toolkit 
import re
import nltk
import string
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from pyspark.sql.functions import udf


In [4]:
path = "s3://bailey-bucket-dtb/user_dedup.json"
reviews = spark.read.json(path)

In [8]:
reviews.printSchema()

root
 |-- asin: string (nullable = true)
 |-- helpful: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- overall: double (nullable = true)
 |-- reviewText: string (nullable = true)
 |-- reviewTime: string (nullable = true)
 |-- reviewerID: string (nullable = true)
 |-- reviewerName: string (nullable = true)
 |-- summary: string (nullable = true)
 |-- unixReviewTime: long (nullable = true)



In [23]:
reviews.groupBy('overall').agg(count('reviewerID').alias('count')).sort('overall').show()

+-------+--------+
|overall|   count|
+-------+--------+
|    1.0| 6712117|
|    2.0| 4265230|
|    3.0| 7049302|
|    4.0|15480820|
|    5.0|49169670|
+-------+--------+



In [24]:
sample_rev = reviews.sample(False, .01, 12345)

In [51]:
# remove punctuation
def remove_punct(text):
    regex = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]')
    nopunct = regex.sub(" ", text)
    return nopunct

# binarize rating
def convert_rating(rating):
    rating = int(rating)
    if rating >=4: return 1
    else: return 0

# udf
punct_remover = udf(lambda x: remove_punct(x))
rating_convert = udf(lambda x: convert_rating(x))

# apply to review raw data
review_df = sample_rev.select('asin', 'helpful', 'overall', 'reviewerID', punct_remover('reviewText'), rating_convert('overall'), 'reviewTime', 'summary', 'unixReviewTime') #, rating_convert('overall')

# review_df = review_df.withColumnRenamed('<lambda>(reviewText)', 'reviewText')\
#                      .withColumn('label', review_df["<lambda>(overall)"].cast(IntegerType()))\
#                      .drop('<lambda>(overall)')\
#                      .limit(1000000)

review_df.show(5)

+----------+-------+-------+--------------------+--------------------+-----------------+-----------+--------------------+--------------+
|      asin|helpful|overall|          reviewerID|<lambda>(reviewText)|<lambda>(overall)| reviewTime|             summary|unixReviewTime|
+----------+-------+-------+--------------------+--------------------+-----------------+-----------+--------------------+--------------+
|B003ESE4TI| [0, 0]|    5.0|A000063614T1OE0BU...|I buy these for m...|                1| 04 5, 2013|         Great value|    1365120000|
|B004GWQBWY| [0, 0]|    5.0|A00009661LC9LQPGK...|Loftek is an awes...|                1|10 26, 2012|      Awsome Camera!|    1351209600|
|1479294608| [1, 1]|    5.0|A00009921ASVLX5LO...|This was a great ...|                1|03 23, 2014|         great book!|    1395532800|
|B003EO1H7E| [0, 0]|    5.0|A000187635I595IAV...|I am a newbie at ...|                1|01 26, 2013|Was asked for my ...|    1359158400|
|B00I8NZWWW| [0, 1]|    5.0|A000387412WV6

In [52]:
review_df.printSchema()

root
 |-- asin: string (nullable = true)
 |-- helpful: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- overall: double (nullable = true)
 |-- reviewerID: string (nullable = true)
 |-- <lambda>(reviewText): string (nullable = true)
 |-- <lambda>(overall): string (nullable = true)
 |-- reviewTime: string (nullable = true)
 |-- summary: string (nullable = true)
 |-- unixReviewTime: long (nullable = true)



In [53]:
review_df.groupBy('overall').agg(count('reviewerID').alias('count')).sort('overall').show()

+-------+------+
|overall| count|
+-------+------+
|    1.0| 67031|
|    2.0| 42286|
|    3.0| 70503|
|    4.0|153976|
|    5.0|492050|
+-------+------+



In [54]:
from pyspark.ml.feature import * 
from nltk.stem.porter import *
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

# tokenize
tok = Tokenizer(inputCol="<lambda>(reviewText)", outputCol="words")
review_tokenized = tok.transform(review_df)

# remove stop words
stopword_rm = StopWordsRemover(inputCol='words', outputCol='words_nsw')
review_tokenized = stopword_rm.transform(review_tokenized)

review_tokenized.select('<lambda>(reviewText)', 'words', 'words_nsw').show(5)

+--------------------+--------------------+--------------------+
|<lambda>(reviewText)|               words|           words_nsw|
+--------------------+--------------------+--------------------+
|I buy these for m...|[i, buy, these, f...|[buy, om, every, ...|
|Loftek is an awes...|[loftek, is, an, ...|[loftek, awesome,...|
|This was a great ...|[this, was, a, gr...|[great, book, , e...|
|I am a newbie at ...|[i, am, a, newbie...|[newbie, buying, ...|
|I have just recei...|[i, have, just, r...|[received, , , , ...|
+--------------------+--------------------+--------------------+
only showing top 5 rows



In [41]:
# add ngram column
n = 3
ngram = NGram(inputCol = 'words', outputCol = 'ngram', n = n)
add_ngram = ngram.transform(review_tokenized)

# generate the top frequent ngram
ngrams = add_ngram.rdd.flatMap(lambda x: x[-1]).filter(lambda x: len(x.split())==n)
ngram_tally = ngrams.map(lambda x: (x, 1))\
                      .reduceByKey(lambda x,y: x+y)\
                      .sortBy(lambda x: x[1], ascending=False)\
                      .filter(lambda x: x[1]>=20)
ngram_list = ngram_tally.map(lambda x: x[0]).collect()

In [49]:
# replace the word with selected ngram
def ngram_concat(text):
    text1 = text.lower()
    for ngram in ngram_list:
        return text1.replace(ngram, ngram.replace(' ', '_'))

ngram_df = udf(lambda x: ngram_concat(x))
# ngram_df = review_tokenized.select(ngram_df('text'), 'label')\
#                           .withColumnRenamed('<lambda>(text)', 'text')

# tokenize and remove stop words with ngram
tok = Tokenizer(inputCol="<lambda>(reviewText)", outputCol="words")
review_tokenized = tok.transform(review_df)
tokenized_ngram = tok.transform(ngram_df)
tokenized_ngram = stopword_rm.transform(tokenized_ngram)

stopword_rm = StopWordsRemover(inputCol='words', outputCol='words_nsw')
review_tokenized = stopword_rm.transform(review_tokenized)

# count vectorizer and tfidf
cv = CountVectorizer(inputCol='words_nsw', outputCol='tf')
cvModel = cv.fit(review_tokenized)
count_vectorized = cvModel.transform(review_tokenized)

tfidfModel = idf.fit(count_vectorized)
tfidf_df = tfidfModel.transform(count_vectorized)

AttributeError: 'function' object has no attribute '_jdf'