## Sentiment Analysis on Reviews: Are They Positive or Negative?
### Binary Classification

### Exploring the Amazon Dataset

In [3]:
# Read the Amazon data20K Parquet part file into a DataFrame.
amazon_parquet_path = "databricks-datasets/amazon/data20K/part-r-00000-112e73de-1ab1-447b-b167-0919dd731adf.gz.parquet"
parquetFile = sqlContext.read.parquet(amazon_parquet_path)

# Peek at the first ten rows to see what the data looks like.
parquetFile.take(10)

In [4]:
# Count how many reviews fall under each rating value, sorted by rating.
result = (
    parquetFile
    .select("rating", "review")
    .groupBy("rating")
    .count()
    .orderBy("rating")
)
result.show()

In [5]:
# Show the same per-rating counts through display() instead of the plain-text show().
# NOTE(review): display() is not defined in this notebook; presumably it is the hosting
# environment's rich renderer (e.g. Databricks) — confirm.
display(result)

In [6]:
# Register the DataFrame as the temporary table "amazon" so it can be queried with SQL.
parquetFile.registerTempTable("amazon")

# Same per-rating count as the DataFrame version, expressed in SQL.
rating_counts = sqlContext.sql(
    "SELECT rating, COUNT(*) as cnt FROM amazon GROUP BY rating ORDER BY rating"
)
display(rating_counts)

In [7]:
# Cutoff at 4: ratings greater than 4 count as positive ('good'), everything else as
# negative ('bad').
# The query groups by the bucket expression itself and gives the bucket its own alias
# (`sentiment`). A bare `GROUP BY rating` binds to the table's numeric rating column,
# which yields one row per star value instead of the two intended buckets.
display(sqlContext.sql("""
    SELECT IF(rating > 4, 'good', 'bad') AS sentiment, COUNT(*) AS cnt
    FROM amazon
    GROUP BY IF(rating > 4, 'good', 'bad')
    ORDER BY sentiment
"""))

## Featurization
Look for common words in the reviews.

In [9]:
from pyspark.sql.functions import *
# Keep only the reviews whose text contains the substring "return".
# Note: .alias() here labels the resulting DataFrame "has_word"; it does not add a column.
contains_return = col("review").like("%return%")
df = parquetFile.select("*").where(contains_return).alias("has_word")
df

In [10]:
# Flag every review with has_word = TRUE/FALSE depending on whether its text contains
# "return".
# The flag is computed over the full "amazon" temp table (registered earlier from
# parquetFile) rather than over `df`: `df` only holds rows that already match
# '%return%', so has_word would be TRUE on every row and a rating-vs-has_word
# comparison would have no FALSE side.
query = """SELECT *, CASE WHEN
    amazon.review LIKE '%return%' THEN TRUE ELSE FALSE END AS has_word
    FROM amazon"""

result = sqlContext.sql(query)
result

In [11]:
# Cross-tabulate rating against has_word: one row per rating value and one count
# column per has_word value, i.e. how often the word "return" does or does not
# appear in reviews at each rating.
df2 = result.crosstab(col1="rating", col2="has_word")
df2.show()

## ML Pipeline

In [13]:
from pyspark.ml import *
from pyspark.ml.feature import *
from pyspark.ml.classification import *
from pyspark.ml.tuning import *
from pyspark.ml.regression import *

# Pipeline stages, in order:
#   1. Binarizer          - maps the numeric rating to a 0/1 "label" column; ratings above
#                           the 4.5 threshold become 1.0 (positive review).
#   2. Tokenizer          - splits the review text into a "words" column.
#   3. HashingTF          - hashes the words into a 1000-dimensional term-frequency
#                           "features" vector.
#   4. LogisticRegression - 10 iterations, small regParam, elasticNetParam=1.0 (L1 penalty).
# The first stage is named `binarizer` (not `bin`) so Python's built-in bin() is not shadowed.
binarizer = Binarizer(inputCol="rating", outputCol="label", threshold=4.5)
tok = Tokenizer(inputCol="review", outputCol="words")
hashTF = HashingTF(inputCol=tok.getOutputCol(), numFeatures=1000, outputCol="features")
lr = LogisticRegression(maxIter=10, regParam=0.0001, elasticNetParam=1.0)
pipeline = Pipeline(stages=[binarizer, tok, hashTF, lr])

In [14]:
# Re-point `df` at the full "amazon" temp table for training (it previously held only
# the reviews containing "return").
# sqlContext.table() is used explicitly: a bare table() is neither defined nor imported
# anywhere in this notebook, and is presumably only available as a convenience global
# in some hosted environments.
df = sqlContext.table("amazon")
df

In [15]:
# Fit on a training split and show predictions for held-out reviews, so the displayed
# predictions are not for rows the model was trained on. The fixed seed keeps the
# 80/20 split reproducible across runs.
train_df, test_df = df.randomSplit([0.8, 0.2], seed=42)
model = pipeline.fit(train_df)
display(model.transform(test_df).select("label", "prediction", "probability", "review"))